In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
## 데이터 로딩 및 전처리
# 데이터 불러오기
call_df = pd.read_csv("call119_train.csv", index_col=0)
cat_df = pd.read_csv("cat119_train.csv", index_col=0)

# 컬럼명 변경
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

cat_df = cat_df.rename(columns={
    'cat119_train.tm': 'tm',
    'cat119_train.address_city': 'city',
    'cat119_train.address_gu': 'gu',
    'cat119_train.sub_address': 'dong',
    'cat119_train.cat': 'cat',
    'cat119_train.sub_cat': 'sub_cat',
    'cat119_train.stn': 'stn',
    'cat119_train.call_count': 'call_cat_sum'
})

# 숫자형 변환
num_cols = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']
for col in num_cols:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')
    call_df[col] = call_df[col].replace(-99.0, np.nan)

# 결측치 제거
call_df = call_df.dropna(subset=num_cols + ['call_total'])
cat_df = cat_df.dropna(subset=['call_cat_sum'])

# 병합
merged_df = pd.merge(call_df, cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')
merged_df['call_cat_sum'] = merged_df['call_cat_sum'].fillna(0)

# 불필요한 컬럼 제거 및 결측 처리
merged_df = merged_df.drop(columns=['city'])
cat_cols = ['gu', 'dong', 'cat', 'sub_cat']
for col in cat_cols:
    merged_df[col] = merged_df[col].fillna('unknown')

# Feature / Target
features = num_cols + cat_cols
target = 'call_total'
X = merged_df[features]
y = np.log1p(merged_df[target])  # 로그 변환

# Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
## 범주형 인코딩
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_enc = X_train.copy()
X_val_enc = X_val.copy()

X_train_enc[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_val_enc[cat_cols] = encoder.transform(X_val[cat_cols])

In [29]:
###  XGB 파라미터 자동 튜닝 (튜닝할때만 실행) ###
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 7600, 7800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.02, log=True),
        "max_depth": trial.suggest_int("max_depth", 7, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 0.05),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 0.5),
        "random_state": 42,
        "tree_method": "hist"
    }

    model = XGBRegressor(**params)
    model.fit(X_train_enc, y_train, verbose=False)

    y_pred = model.predict(X_val_enc)
    mse = mean_squared_error(y_val, y_pred)
    
    return mse

print("=== XGBoost 자동 튜닝 시작 ===")
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=5)                   # 시도횟수 조정 가능
print("XGBoost Best hyperparameters:", study_xgb.best_params)
print("XGBoost Best MSE:", study_xgb.best_value)

[I 2025-06-19 00:46:07,099] A new study created in memory with name: no-name-6069cc80-4c4a-471b-ac53-31d243731aaa


=== XGBoost 자동 튜닝 시작 ===


[I 2025-06-19 00:46:28,388] Trial 0 finished with value: 0.0809871936183765 and parameters: {'n_estimators': 7758, 'learning_rate': 0.01601508803717952, 'max_depth': 10, 'colsample_bytree': 0.7837466263273496, 'reg_alpha': 0.021938679013374984, 'reg_lambda': 0.337157861326587}. Best is trial 0 with value: 0.0809871936183765.
[I 2025-06-19 00:46:43,807] Trial 1 finished with value: 0.07921298911218252 and parameters: {'n_estimators': 7765, 'learning_rate': 0.016551629647974898, 'max_depth': 9, 'colsample_bytree': 0.8393246337765126, 'reg_alpha': 0.009364029289188626, 'reg_lambda': 0.16994061165057966}. Best is trial 1 with value: 0.07921298911218252.
[I 2025-06-19 00:46:55,178] Trial 2 finished with value: 0.08077215582102927 and parameters: {'n_estimators': 7702, 'learning_rate': 0.014017523376476815, 'max_depth': 8, 'colsample_bytree': 0.8671552969405703, 'reg_alpha': 0.031620850147733685, 'reg_lambda': 0.4650489856427915}. Best is trial 1 with value: 0.07921298911218252.
[I 2025-06-1

XGBoost Best hyperparameters: {'n_estimators': 7765, 'learning_rate': 0.016551629647974898, 'max_depth': 9, 'colsample_bytree': 0.8393246337765126, 'reg_alpha': 0.009364029289188626, 'reg_lambda': 0.16994061165057966}
XGBoost Best MSE: 0.07921298911218252


In [30]:
def objective_cat(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 3500, 5500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.02, log=True),
        "depth": trial.suggest_int("depth", 4, 7),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.0, 0.05),
        "random_seed": 42,
        "verbose": False,
        "allow_writing_files": False
    }

    model = CatBoostRegressor(**params)
    
    # 원본 데이터 사용 (X_train, X_val)
    model.fit(
        X_train, y_train, 
        cat_features=cat_cols,  # ['gu', 'dong', 'cat', 'sub_cat']
        eval_set=(X_val, y_val), 
        early_stopping_rounds=100, 
        verbose=False
    )

    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    
    return mse

print("\n=== CatBoost 자동 튜닝 시작 ===")
study_cat = optuna.create_study(direction="minimize")
study_cat.optimize(objective_cat, n_trials=5)         # 시도횟수 조정 가능
print("CatBoost Best hyperparameters:", study_cat.best_params)
print("CatBoost Best MSE:", study_cat.best_value)

[I 2025-06-19 00:47:40,941] A new study created in memory with name: no-name-2440291d-01c8-4fea-9201-cdc4056b7833



=== CatBoost 자동 튜닝 시작 ===


[I 2025-06-19 00:49:36,904] Trial 0 finished with value: 0.10429527083474514 and parameters: {'iterations': 4494, 'learning_rate': 0.013176405588378027, 'depth': 6, 'l2_leaf_reg': 0.04060902773509378}. Best is trial 0 with value: 0.10429527083474514.
[I 2025-06-19 00:51:33,448] Trial 1 finished with value: 0.10362850562982219 and parameters: {'iterations': 3874, 'learning_rate': 0.011154014468089687, 'depth': 7, 'l2_leaf_reg': 0.009949388091395117}. Best is trial 1 with value: 0.10362850562982219.
[I 2025-06-19 00:53:53,100] Trial 2 finished with value: 0.10269891221256804 and parameters: {'iterations': 5372, 'learning_rate': 0.016026868169833466, 'depth': 6, 'l2_leaf_reg': 0.019033782575824543}. Best is trial 2 with value: 0.10269891221256804.
[I 2025-06-19 00:55:59,141] Trial 3 finished with value: 0.10362094504491597 and parameters: {'iterations': 4904, 'learning_rate': 0.014033483813078975, 'depth': 6, 'l2_leaf_reg': 0.023443443705447223}. Best is trial 2 with value: 0.102698912212

CatBoost Best hyperparameters: {'iterations': 5372, 'learning_rate': 0.016026868169833466, 'depth': 6, 'l2_leaf_reg': 0.019033782575824543}
CatBoost Best MSE: 0.10269891221256804


In [31]:
# ===================================
# 최적 파라미터로 모델 자동 학습
# ===================================

print("\n=== 최적 파라미터로 모델 학습 ===")

# XGBoost 최적 모델 학습
print("XGBoost 모델 학습 중...")
best_xgb_params = study_xgb.best_params.copy()
best_xgb_params.update({
    "random_state": 42,
    "tree_method": "hist"
})

xgb_model = XGBRegressor(**best_xgb_params)
xgb_model.fit(X_train_enc, y_train, verbose=100)

# CatBoost 최적 모델 학습
print("\nCatBoost 모델 학습 중...")
best_cat_params = study_cat.best_params.copy()
best_cat_params.update({
    "random_seed": 42,
    "verbose": 100,
    "loss_function": "RMSE"
})

catboost_model = CatBoostRegressor(**best_cat_params)
catboost_model.fit(
    X_train, y_train, 
    cat_features=cat_cols, 
    eval_set=(X_val, y_val), 
    early_stopping_rounds=100
)

# ===================================
# 모델 성능 평가
# ===================================

print("\n=== 모델 성능 평가 ===")

# XGBoost 평가
xgb_pred = xgb_model.predict(X_val_enc)
xgb_mse = mean_squared_error(y_val, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)

# CatBoost 평가
cat_pred = catboost_model.predict(X_val)
cat_mse = mean_squared_error(y_val, cat_pred)
cat_rmse = np.sqrt(cat_mse)

print(f"XGBoost - MSE: {xgb_mse:.6f}, RMSE: {xgb_rmse:.6f}")
print(f"CatBoost - MSE: {cat_mse:.6f}, RMSE: {cat_rmse:.6f}")


=== 최적 파라미터로 모델 학습 ===
XGBoost 모델 학습 중...

CatBoost 모델 학습 중...
0:	learn: 0.4368159	test: 0.4360544	best: 0.4360544 (0)	total: 24.6ms	remaining: 2m 12s
100:	learn: 0.3553560	test: 0.3502739	best: 0.3502739 (100)	total: 2.16s	remaining: 1m 52s
200:	learn: 0.3458492	test: 0.3404817	best: 0.3404817 (200)	total: 4.39s	remaining: 1m 53s
300:	learn: 0.3428234	test: 0.3379581	best: 0.3379581 (300)	total: 6.73s	remaining: 1m 53s
400:	learn: 0.3409327	test: 0.3367136	best: 0.3367136 (400)	total: 9.14s	remaining: 1m 53s
500:	learn: 0.3394442	test: 0.3358038	best: 0.3358038 (500)	total: 11.6s	remaining: 1m 52s
600:	learn: 0.3380003	test: 0.3350690	best: 0.3350690 (600)	total: 14.1s	remaining: 1m 51s
700:	learn: 0.3365111	test: 0.3342823	best: 0.3342823 (700)	total: 16.6s	remaining: 1m 50s
800:	learn: 0.3348376	test: 0.3333151	best: 0.3333151 (800)	total: 19.1s	remaining: 1m 49s
900:	learn: 0.3333906	test: 0.3325301	best: 0.3325301 (900)	total: 21.6s	remaining: 1m 47s
1000:	learn: 0.3319712	test: 

In [None]:
#################
## 모델 수동학습 ##
#################

# CatBoost
catboost_model = CatBoostRegressor(
    iterations=3500,
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=5,
    bagging_temperature=1.0,
    random_strength=1.5,
    random_seed=42,
    verbose=100,
    loss_function="RMSE"
)
catboost_model.fit(X_train, y_train, cat_features=cat_cols, eval_set=(X_val, y_val), early_stopping_rounds=100)

# XGBoost
xgb_model = XGBRegressor(
    n_estimators=7765,
    learning_rate=0.01666987439817238,
    max_depth=9,
    colsample_bytree=0.952258400174878,
    reg_alpha=0.004261234885452697,
    reg_lambda=0.1446413754321237,
    random_state=42,
    tree_method="hist"
)
xgb_model.fit(
    X_train_enc,
    y_train,
    verbose=True
)

0:	learn: 0.4363060	test: 0.4353259	best: 0.4353259 (0)	total: 187ms	remaining: 10m 54s
100:	learn: 0.3529093	test: 0.3472666	best: 0.3472666 (100)	total: 2.25s	remaining: 1m 15s
200:	learn: 0.3455895	test: 0.3398420	best: 0.3398420 (200)	total: 4.4s	remaining: 1m 12s
300:	learn: 0.3433462	test: 0.3380826	best: 0.3380826 (300)	total: 6.59s	remaining: 1m 10s
400:	learn: 0.3417831	test: 0.3370352	best: 0.3370352 (400)	total: 8.82s	remaining: 1m 8s
500:	learn: 0.3404344	test: 0.3362846	best: 0.3362846 (500)	total: 11.1s	remaining: 1m 6s
600:	learn: 0.3385933	test: 0.3352084	best: 0.3352084 (600)	total: 13.4s	remaining: 1m 4s
700:	learn: 0.3370072	test: 0.3342321	best: 0.3342321 (700)	total: 15.7s	remaining: 1m 2s
800:	learn: 0.3354768	test: 0.3334118	best: 0.3334118 (800)	total: 18.1s	remaining: 1m
900:	learn: 0.3343135	test: 0.3328501	best: 0.3328501 (900)	total: 20.4s	remaining: 58.8s
1000:	learn: 0.3331610	test: 0.3322517	best: 0.3322517 (999)	total: 22.7s	remaining: 56.7s
1100:	learn:

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.952258400174878
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
## 예측 및 평가
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_true), np.expm1(y_pred)))

print("CatBoost RMSE:", rmse(y_val, catboost_model.predict(X_val)))
print("XGBoost  RMSE:", rmse(y_val, xgb_model.predict(X_val_enc)))

CatBoost RMSE: 1.4067781579327756
XGBoost  RMSE: 1.1250465511039793


In [35]:
## 테스트 데이터에 적용
# 테스트 데이터 불러오기
test_call_df = pd.read_csv("test_call119.csv")
test_cat_df = pd.read_csv("test_cat119.csv")
output_base = test_call_df.copy()

# 컬럼명 변경
test_call_df = test_call_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn',
    'call_count': 'call_total'
})

test_cat_df = test_cat_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn'
})

# 숫자형 처리
for col in num_cols:
    test_call_df[col] = pd.to_numeric(test_call_df[col], errors='coerce')
    test_call_df[col] = test_call_df[col].replace(-99.0, np.nan)

# 병합 및 결측 처리
test_df = pd.merge(test_call_df, test_cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')
test_df['cat'] = test_df.get('cat', 'unknown').fillna('unknown')
test_df['sub_cat'] = test_df.get('sub_cat', 'unknown').fillna('unknown')
test_df = test_df.drop(columns=['city'])

# 범주형 결측 처리
for col in cat_cols:
    test_df[col] = test_df[col].fillna('unknown')

# merge_key 생성
test_df['merge_key'] = (
    test_df['tm'].astype(str) + '_' +
    test_df['gu'] + '_' +
    test_df['dong'] + '_' +
    test_df['stn'].astype(str)
)

# 예측용 feature
X_test = test_df[features]
X_test_enc = X_test.copy()
X_test_enc[cat_cols] = encoder.transform(X_test[cat_cols])

# 예측 수행 및 앙상블 평균
pred_cb = np.expm1(catboost_model.predict(X_test))
pred_xgb = np.expm1(xgb_model.predict(X_test_enc))

# 앙상블 가중치
test_df['predicted_call_total'] = (pred_cb * 0.50) + (pred_xgb * 0.50)

# 그룹 평균
agg_preds_df = test_df.groupby('merge_key')['predicted_call_total'].mean().reset_index()

# 원본 테스트셋과 merge
output_base['merge_key'] = (
    output_base['TM'].astype(str) + '_' +
    output_base['address_gu'] + '_' +
    output_base['sub_address'] + '_' +
    output_base['STN'].astype(str)
)

output_base = output_base.merge(agg_preds_df, on='merge_key', how='left')
output_base['call_count'] = output_base['predicted_call_total'].round().astype(int)
output_base = output_base.drop(columns=['merge_key', 'predicted_call_total'])

# 결과 저장
output_base.to_csv("call119_ensemble_predictions.csv", index=False)
print("예측 결과 저장 완료: call119_ensemble_predictions.csv")

예측 결과 저장 완료: call119_ensemble_predictions.csv


In [36]:
from sklearn.metrics import mean_squared_error

# 파일 경로 설정
file1 = 'call119_ensemble_predictions.csv'

# CSV 파일 읽기
df1 = pd.read_csv(file1)

# call_count 열 평균 및 합계 계산
mean = df1['call_count'].mean()
mean_squared1 = (df1['call_count'] ** 2).mean()
sum1 = df1['call_count'].sum()

print(f"평균: {mean}, 평균제곱값: {mean_squared1}, 합계: {sum1}")

평균: 1.9138631392563275, 평균제곱값: 4.437766899281325, 합계: 18375
