In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 데이터 불러오기
call_df = pd.read_csv("call119_train.csv", index_col=0)
cat_df = pd.read_csv("cat119_train.csv", index_col=0)

# 컬럼명 통일 (test셋에도 동일한 컬럼 사용 전제)
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

cat_df = cat_df.rename(columns={
    'cat119_train.tm': 'tm',
    'cat119_train.address_city': 'city',
    'cat119_train.address_gu': 'gu',
    'cat119_train.sub_address': 'dong',
    'cat119_train.cat': 'cat',
    'cat119_train.sub_cat': 'sub_cat',
    'cat119_train.stn': 'stn',
    'cat119_train.call_count': 'call_cat_sum'
})

# 숫자형 변환
for col in ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')

# 결측치 제거 (기상데이터 품질을 위해)
call_df = call_df.dropna(subset=['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day', 'call_total'])
cat_df = cat_df.dropna(subset=['call_cat_sum'])

# 병합
merged_df = pd.merge(call_df, cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')

# cat_df에 없는 행은 call_cat_sum = 0
merged_df['call_cat_sum'] = merged_df['call_cat_sum'].fillna(0)

# city는 제거 (일관성 있게)
merged_df = merged_df.drop(columns=['city'])

# 범주형 결측값 처리
cat_features = ['gu', 'dong', 'cat', 'sub_cat']
for col in cat_features:
    merged_df[col] = merged_df[col].fillna('unknown')

# 학습용 feature, target 설정
features = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day'] + cat_features
target = 'call_total'

X = merged_df[features]
y = merged_df[target]

# 로그 변환 (0 대응을 위해 log1p 사용)
y_log = np.log1p(y)

# Train/Val 분리
X_train, X_val, y_train_log, y_val_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# CatBoost에 범주형 인덱스 전달
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_features]

# 하이퍼파라미터 튜닝 (성능 고려해 수동 설정)
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_seed=42,
    verbose=100
)

# 모델 학습
model.fit(
    X_train, y_train_log,
    cat_features=cat_feature_indices,
    eval_set=(X_val, y_val_log),
    early_stopping_rounds=100
)

# 로그 복원 후 평가
y_val_pred_log = model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)

rmse = np.sqrt(mean_squared_error(np.expm1(y_val_log), y_val_pred))
r2 = r2_score(np.expm1(y_val_log), y_val_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

0:	learn: 0.4406824	test: 0.4383066	best: 0.4383066 (0)	total: 166ms	remaining: 4m 9s
100:	learn: 0.3391201	test: 0.3425110	best: 0.3425110 (100)	total: 3.18s	remaining: 44s
200:	learn: 0.3321337	test: 0.3393926	best: 0.3393926 (200)	total: 6.19s	remaining: 40s
300:	learn: 0.3253354	test: 0.3361875	best: 0.3361875 (300)	total: 9.5s	remaining: 37.8s
400:	learn: 0.3202477	test: 0.3341860	best: 0.3341860 (400)	total: 12.6s	remaining: 34.6s
500:	learn: 0.3158461	test: 0.3324359	best: 0.3324357 (499)	total: 15.9s	remaining: 31.7s
600:	learn: 0.3117607	test: 0.3308415	best: 0.3308415 (600)	total: 19.1s	remaining: 28.6s
700:	learn: 0.3082302	test: 0.3296120	best: 0.3296120 (700)	total: 22.3s	remaining: 25.4s
800:	learn: 0.3045259	test: 0.3283496	best: 0.3283496 (800)	total: 25.5s	remaining: 22.2s
900:	learn: 0.3015593	test: 0.3274623	best: 0.3274623 (900)	total: 28.6s	remaining: 19s
1000:	learn: 0.2982548	test: 0.3265116	best: 0.3265116 (1000)	total: 31.8s	remaining: 15.8s
1100:	learn: 0.2951

In [2]:
import pandas as pd
import numpy as np

# 테스트 데이터 불러오기
test_call_df = pd.read_csv("test_call119.csv")
test_cat_df = pd.read_csv("test_cat119.csv")

# 원본 백업
output_base = test_call_df.copy()

# 컬럼명 변경
test_call_df = test_call_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn',
    'call_count': 'call_total'
})

test_cat_df = test_cat_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn'
})

# 숫자형 처리
for col in ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']:
    test_call_df[col] = pd.to_numeric(test_call_df[col], errors='coerce')

# 병합
test_merged_df = pd.merge(test_call_df, test_cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')

# 결측치 처리
test_merged_df['cat'] = test_merged_df.get('cat', 'unknown')
test_merged_df['sub_cat'] = test_merged_df.get('sub_cat', 'unknown')
test_merged_df['cat'] = test_merged_df['cat'].fillna('unknown')
test_merged_df['sub_cat'] = test_merged_df['sub_cat'].fillna('unknown')

# city 제거
test_merged_df = test_merged_df.drop(columns=['city'])

# 범주형 처리
cat_features = ['gu', 'dong', 'cat', 'sub_cat']
for col in cat_features:
    test_merged_df[col] = test_merged_df[col].fillna('unknown')

# feature 구성
features = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day'] + cat_features

# merge_key 생성
test_merged_df['merge_key'] = (
    test_merged_df['tm'].astype(str) + '_' +
    test_merged_df['gu'] + '_' +
    test_merged_df['dong'] + '_' +
    test_merged_df['stn'].astype(str)
)

X_test = test_merged_df[features]

# 로그 복원 적용
test_preds = np.expm1(model.predict(X_test))

test_merged_df['predicted_call_total'] = test_preds

# 그룹 평균으로 병합
agg_preds_df = test_merged_df.groupby('merge_key')['predicted_call_total'].mean().reset_index()

# output_base도 merge_key 생성
output_base['merge_key'] = (
    output_base['TM'].astype(str) + '_' +
    output_base['address_gu'] + '_' +
    output_base['sub_address'] + '_' +
    output_base['STN'].astype(str)
)

# merge & 예측 결과 삽입
output_base = output_base.merge(agg_preds_df, on='merge_key', how='left')
output_base['call_count'] = output_base['predicted_call_total'].round().astype(int)
output_base = output_base.drop(columns=['merge_key', 'predicted_call_total'])

# 저장
output_base.to_csv("call119_predictions.csv", index=False)
print("예측 결과 저장 완료: call119_predictions.csv")
print(output_base.head())

예측 결과 저장 완료: call119_predictions.csv
         TM address_city address_gu sub_address  STN  ta_max  ta_min  \
0  20240501        부산광역시        강서구        대저2동  904    18.5    11.1   
1  20240501        부산광역시        강서구         생곡동  904    18.5    11.1   
2  20240501        부산광역시        강서구         송정동  937    16.9     9.9   
3  20240501        부산광역시        강서구         신호동  950    16.6    11.4   
4  20240501        부산광역시        금정구         구서동  940    16.9    10.2   

   ta_max_min  hm_min  hm_max  ws_max  ws_ins_max  rn_day  call_count  
0         7.4    42.5    82.5     6.5        11.6     0.0           1  
1         7.4    42.5    82.5     6.5        11.6     0.0           1  
2         7.0    55.3    93.9     4.5         9.7     0.0           1  
3         5.2    48.1    84.6     6.4        13.5     0.0           1  
4         6.7    46.8    91.3     3.3         8.7     0.0           2  
