In [52]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, RobustScaler

In [53]:
# -----------------------------
# 학습 데이터 로딩 및 전처리
# -----------------------------
call_df = pd.read_csv("call119_train.csv", index_col=0)

# 컬럼명 통일
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

# 숫자형 처리 및 결측 제거
num_cols = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']
for col in num_cols:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')
    call_df[col] = call_df[col].replace(-99.0, np.nan)

call_df = call_df.dropna(subset=num_cols + ['call_total'])

In [54]:
# 파생 변수 생성
call_df["tm"] = pd.to_datetime(call_df["tm"].astype(str))
call_df["month"] = call_df["tm"].dt.month
call_df["day"] = call_df["tm"].dt.day
call_df["weekday"] = call_df["tm"].dt.weekday
call_df["is_weekend"] = call_df["weekday"].isin([5, 6]).astype(int)
call_df["region"] = call_df["city"] + "_" + call_df["gu"]
call_df["ta_mean"] = (call_df["ta_min"] + call_df["ta_max"]) / 2
call_df["m_day_bin"] = pd.cut(call_df["rn_day"], bins=[-1, 0, 10, 30, 70, np.inf], labels=False)
call_df["ws_ins_max_bin"] = pd.cut(call_df["ws_ins_max"], bins=[-1, 5, 10, 15, 20, np.inf], labels=False)
call_df["ws_max_bin"] = pd.cut(call_df["ws_max"], bins=[-1, 3, 6, 9, 12, np.inf], labels=False)

# 불필요한 컬럼 제거
drop_cols = ["tm", "city", "gu", "dong", "rn_day", "ws_ins_max", "ws_max"]
call_df = call_df.drop(columns=drop_cols)

In [55]:
# 학습/검증 분할
X = call_df.drop(columns=["call_total"])
y = call_df["call_total"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 인코딩 및 스케일링
categorical_cols = ["region", "stn"]
le_dict = {}
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()
for col in categorical_cols:
    le = LabelEncoder()
    X_train_xgb[col] = le.fit_transform(X_train_xgb[col])
    X_valid_xgb[col] = le.transform(X_valid_xgb[col])
    le_dict[col] = le

scaler = RobustScaler()
X_train_xgb = scaler.fit_transform(X_train_xgb)
X_valid_xgb = scaler.transform(X_valid_xgb)

# CatBoost는 원본 카테고리 유지
X_train_cat = X_train.copy()
X_valid_cat = X_valid.copy()

In [56]:
# 모델 학습
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=2.0,
    random_state=42,
    tree_method="hist"
)
xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_valid_xgb, y_valid)], verbose=100)

cat_model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=5,
    bagging_temperature=1.0,
    random_strength=1.5,
    random_seed=42,
    verbose=100,
    loss_function="RMSE",
    early_stopping_rounds=100
)
cat_model.fit(X_train_cat, y_train, eval_set=(X_valid_cat, y_valid), cat_features=categorical_cols)

[0]	validation_0-rmse:1.57734
[100]	validation_0-rmse:1.40906
[200]	validation_0-rmse:1.41412
[300]	validation_0-rmse:1.41687
[400]	validation_0-rmse:1.41860
[500]	validation_0-rmse:1.41805
[600]	validation_0-rmse:1.42134
[700]	validation_0-rmse:1.41898
[800]	validation_0-rmse:1.42150
[900]	validation_0-rmse:1.42156
[999]	validation_0-rmse:1.42222
0:	learn: 1.7535010	test: 1.5787404	best: 1.5787404 (0)	total: 17.8ms	remaining: 26.7s
100:	learn: 1.4814583	test: 1.4203793	best: 1.4202986 (99)	total: 1.47s	remaining: 20.3s
200:	learn: 1.4339523	test: 1.4053933	best: 1.4053933 (200)	total: 2.96s	remaining: 19.1s
300:	learn: 1.4179329	test: 1.3996680	best: 1.3996680 (300)	total: 4.39s	remaining: 17.5s
400:	learn: 1.4072985	test: 1.3977756	best: 1.3977756 (400)	total: 5.86s	remaining: 16.1s
500:	learn: 1.3973031	test: 1.3969445	best: 1.3969445 (500)	total: 7.34s	remaining: 14.6s
600:	learn: 1.3876235	test: 1.3969708	best: 1.3962824 (576)	total: 8.83s	remaining: 13.2s
Stopped by overfitting d

<catboost.core.CatBoostRegressor at 0x292029e80e0>

In [59]:
# 예측
cat_preds = cat_model.predict(X_valid_cat)
xgb_preds = xgb_model.predict(X_valid_xgb)

# RMSE 계산
cat_rmse = np.sqrt(mean_squared_error(y_valid, cat_preds))
xgb_rmse = np.sqrt(mean_squared_error(y_valid, xgb_preds))

# 앙상블 예측 및 RMSE
final_preds = 0.45 * xgb_preds + 0.55 * cat_preds
ensemble_rmse = np.sqrt(mean_squared_error(y_valid, final_preds))

# 출력
print(f"CatBoost RMSE: {cat_rmse:.4f}")
print(f"XGBoost RMSE: {xgb_rmse:.4f}")
print(f"앙상블 RMSE: {ensemble_rmse:.4f}")

CatBoost RMSE: 1.3963
XGBoost RMSE: 1.4222
앙상블 RMSE: 1.3985


In [66]:
# 테스트 데이터 로드 및 전처리
test_df = pd.read_csv("test_call119.csv")

# 컬럼명 통일
test_df = test_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn',
    'ta_max': 'ta_max',
    'ta_min': 'ta_min',
    'ta_max_min': 'ta_max_min',
    'hm_min': 'hm_min',
    'hm_max': 'hm_max',
    'ws_max': 'ws_max',
    'ws_ins_max': 'ws_ins_max',
    'rn_day': 'rn_day',
    'call_count': 'call_total'  # 실제 예측 대상
})

# 수치형 처리
num_cols = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']
for col in num_cols:
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
    test_df[col] = test_df[col].replace(-99.0, np.nan)

# 파생변수 생성
test_df["tm"] = pd.to_datetime(test_df["tm"].astype(str))
test_df["month"] = test_df["tm"].dt.month
test_df["day"] = test_df["tm"].dt.day
test_df["weekday"] = test_df["tm"].dt.weekday
test_df["is_weekend"] = test_df["weekday"].isin([5, 6]).astype(int)
test_df["region"] = test_df["city"] + "_" + test_df["gu"]
test_df["ta_mean"] = (test_df["ta_min"] + test_df["ta_max"]) / 2
test_df["m_day_bin"] = pd.cut(test_df["rn_day"], bins=[-1, 0, 10, 30, 70, np.inf], labels=False)
test_df["ws_ins_max_bin"] = pd.cut(test_df["ws_ins_max"], bins=[-1, 5, 10, 15, 20, np.inf], labels=False)
test_df["ws_max_bin"] = pd.cut(test_df["ws_max"], bins=[-1, 3, 6, 9, 12, np.inf], labels=False)

# 원래 학습에서 제거했던 컬럼과 동일하게 제거
drop_cols = ["tm", "city", "gu", "dong", "rn_day", "ws_ins_max", "ws_max", "call_total"]
test_features = test_df.drop(columns=drop_cols).copy()

In [67]:
# -------------------------------
# ✅ XGBoost용 인코딩 + 스케일링
# -------------------------------
# 저장된 label encoder 적용
for col in ["region", "stn"]:
    le = le_dict[col]
    test_features[col] = le.transform(test_features[col])

# 스케일링
test_scaled = scaler.transform(test_features)

# -------------------------------
# ✅ CatBoost용 원본 카테고리 유지
# -------------------------------
test_cat = test_features.copy()

In [68]:
# -------------------------------
# ✅ 예측
# -------------------------------
xgb_preds = xgb_model.predict(test_scaled)
cat_preds = cat_model.predict(test_cat)

# 앙상블
final_preds = 0.55 * xgb_preds + 0.45 * cat_preds

# -------------------------------
# ✅ 결과 저장
# -------------------------------
output_df = pd.read_csv("test_call119.csv")
output_df["call_count"] = np.round(final_preds).astype(int)
output_df.to_csv("call119_ensemble_predictions.csv", index=False)

print("✅ 예측 완료: call119_ensemble_predictions.csv")

✅ 예측 완료: call119_ensemble_predictions.csv


In [69]:
from sklearn.metrics import mean_squared_error

# 파일 경로 설정
file1 = 'call119_ensemble_predictions.csv'

# CSV 파일 읽기
df1 = pd.read_csv(file1)

# call_count 열 평균 및 합계 계산
mean_squared1 = (df1['call_count'] ** 2).mean()
sum1 = df1['call_count'].sum()

print(f"평균제곱값: {mean_squared1}, 합계: {sum1}")

평균제곱값: 5.02749713571503, 합계: 20869
