In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
## 데이터 로딩 및 전처리
# 데이터 불러오기
call_df = pd.read_csv("call119_train.csv", index_col=0)
cat_df = pd.read_csv("cat119_train.csv", index_col=0)

# 컬럼명 변경
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

cat_df = cat_df.rename(columns={
    'cat119_train.tm': 'tm',
    'cat119_train.address_city': 'city',
    'cat119_train.address_gu': 'gu',
    'cat119_train.sub_address': 'dong',
    'cat119_train.cat': 'cat',
    'cat119_train.sub_cat': 'sub_cat',
    'cat119_train.stn': 'stn',
    'cat119_train.call_count': 'call_cat_sum'
})

# 숫자형 변환
num_cols = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']
for col in num_cols:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')
    call_df[col] = call_df[col].replace(-99.0, np.nan)

# 결측치 제거
call_df = call_df.dropna(subset=num_cols + ['call_total'])
cat_df = cat_df.dropna(subset=['call_cat_sum'])

# 병합
merged_df = pd.merge(call_df, cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')
merged_df['call_cat_sum'] = merged_df['call_cat_sum'].fillna(0)

# 불필요한 컬럼 제거 및 결측 처리
merged_df = merged_df.drop(columns=['city'])
cat_cols = ['gu', 'dong', 'cat', 'sub_cat']
for col in cat_cols:
    merged_df[col] = merged_df[col].fillna('unknown')

# Feature / Target
features = num_cols + cat_cols
target = 'call_total'
X = merged_df[features]
y = np.log1p(merged_df[target])  # 로그 변환

# Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
## 범주형 인코딩
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_enc = X_train.copy()
X_val_enc = X_val.copy()

X_train_enc[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_val_enc[cat_cols] = encoder.transform(X_val[cat_cols])

In [4]:
## 모델학습
# CatBoost
catboost_model = CatBoostRegressor(
    iterations=1500, learning_rate=0.05, depth=6, l2_leaf_reg=3,
    bagging_temperature=1, random_seed=42, verbose=0
)
catboost_model.fit(X_train, y_train, cat_features=cat_cols, eval_set=(X_val, y_val), early_stopping_rounds=100)

# XGBoost
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(
    X_train_enc,
    y_train,
    verbose=True
)

In [6]:
## 예측 및 평가
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_true), np.expm1(y_pred)))

print("CatBoost RMSE:", rmse(y_val, catboost_model.predict(X_val)))
print("XGBoost  RMSE:", rmse(y_val, xgb_model.predict(X_val_enc)))

CatBoost RMSE: 1.4315863891642449
XGBoost  RMSE: 1.3561988379165264


In [9]:
## 테스트 데이터에 적용
# 테스트 데이터 불러오기
test_call_df = pd.read_csv("test_call119.csv")
test_cat_df = pd.read_csv("test_cat119.csv")
output_base = test_call_df.copy()

# 컬럼명 변경
test_call_df = test_call_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn',
    'call_count': 'call_total'
})

test_cat_df = test_cat_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn'
})

# 숫자형 처리
for col in num_cols:
    test_call_df[col] = pd.to_numeric(test_call_df[col], errors='coerce')
    test_call_df[col] = test_call_df[col].replace(-99.0, np.nan)

# 병합 및 결측 처리
test_df = pd.merge(test_call_df, test_cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')
test_df['cat'] = test_df.get('cat', 'unknown').fillna('unknown')
test_df['sub_cat'] = test_df.get('sub_cat', 'unknown').fillna('unknown')
test_df = test_df.drop(columns=['city'])

# 범주형 결측 처리
for col in cat_cols:
    test_df[col] = test_df[col].fillna('unknown')

# merge_key 생성
test_df['merge_key'] = (
    test_df['tm'].astype(str) + '_' +
    test_df['gu'] + '_' +
    test_df['dong'] + '_' +
    test_df['stn'].astype(str)
)

# 예측용 feature
X_test = test_df[features]
X_test_enc = X_test.copy()
X_test_enc[cat_cols] = encoder.transform(X_test[cat_cols])

# 예측 수행 및 앙상블 평균
pred_cb = np.expm1(catboost_model.predict(X_test))
pred_xgb = np.expm1(xgb_model.predict(X_test_enc))

test_df['predicted_call_total'] = (pred_cb + pred_xgb) / 2

# 그룹 평균
agg_preds_df = test_df.groupby('merge_key')['predicted_call_total'].mean().reset_index()

# 원본 테스트셋과 merge
output_base['merge_key'] = (
    output_base['TM'].astype(str) + '_' +
    output_base['address_gu'] + '_' +
    output_base['sub_address'] + '_' +
    output_base['STN'].astype(str)
)

output_base = output_base.merge(agg_preds_df, on='merge_key', how='left')
output_base['call_count'] = output_base['predicted_call_total'].round().astype(int)
output_base = output_base.drop(columns=['merge_key', 'predicted_call_total'])

# 결과 저장
output_base.to_csv("call119_ensemble_predictions.csv", index=False)
print("예측 결과 저장 완료: call119_ensemble_predictions.csv")

예측 결과 저장 완료: call119_ensemble_predictions.csv


In [10]:
from sklearn.metrics import mean_squared_error

# 파일 경로 설정
file1 = 'call119_ensemble_predictions.csv'

# CSV 파일 읽기
df1 = pd.read_csv(file1)

# call_count 열 평균 및 합계 계산
mean_squared1 = (df1['call_count'] ** 2).mean()
sum1 = df1['call_count'].sum()

print(f"평균제곱값: {mean_squared1}, 합계: {sum1}")

평균제곱값: 4.628267888761587, 합계: 18836
