In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

# 데이터 불러오기
call_df = pd.read_csv("call119_train.csv", index_col=0)
cat_df = pd.read_csv("cat119_train.csv", index_col=0)

# 컬럼명 변경
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

cat_df = cat_df.rename(columns={
    'cat119_train.tm': 'tm',
    'cat119_train.address_city': 'city',
    'cat119_train.address_gu': 'gu',
    'cat119_train.sub_address': 'dong',
    'cat119_train.cat': 'cat',
    'cat119_train.sub_cat': 'sub_cat',
    'cat119_train.stn': 'stn',
    'cat119_train.call_count': 'call_cat_sum'
})

# 숫자형 변환
for col in ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')

# 결측치 제거
call_df = call_df.dropna(subset=['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day', 'call_total'])
cat_df = cat_df.dropna(subset=['call_cat_sum'])

# 데이터 병합
merged_df = pd.merge(call_df, cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')

# 결측치 0으로 채우기 (cat 관련)
merged_df['call_cat_sum'] = merged_df['call_cat_sum'].fillna(0)

# city 제거
merged_df = merged_df.drop(columns=['city'])

# 범주형 변수
cat_features = ['gu', 'dong', 'cat', 'sub_cat']

# 결측값 문자열로 대체
for col in cat_features:
    merged_df[col] = merged_df[col].fillna('unknown')

# feature, target 정의
features = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day'] + cat_features
target = 'call_total'

X = merged_df[features]
y = merged_df[target]

# train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# cat_features 인덱스 찾기
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_features]

# 모델 생성 및 학습
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_seed=42,
    verbose=100
)

model.fit(X_train, y_train, cat_features=cat_feature_indices, eval_set=(X_val, y_val), early_stopping_rounds=100)

# 예측 및 평가
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

0:	learn: 2.9454944	test: 3.0167066	best: 3.0167066 (0)	total: 21.3ms	remaining: 31.9s
100:	learn: 1.6571329	test: 1.5478280	best: 1.5478280 (100)	total: 2.08s	remaining: 28.8s
200:	learn: 1.5652063	test: 1.5111041	best: 1.5111041 (200)	total: 4.2s	remaining: 27.1s
300:	learn: 1.5139972	test: 1.4850485	best: 1.4850485 (300)	total: 6.33s	remaining: 25.2s
400:	learn: 1.4748765	test: 1.4671461	best: 1.4668394 (399)	total: 8.47s	remaining: 23.2s
500:	learn: 1.4394252	test: 1.4514888	best: 1.4514888 (500)	total: 10.6s	remaining: 21.2s
600:	learn: 1.4134675	test: 1.4412430	best: 1.4412430 (600)	total: 12.8s	remaining: 19.2s
700:	learn: 1.3926910	test: 1.4332646	best: 1.4332646 (700)	total: 15s	remaining: 17.1s
800:	learn: 1.3718154	test: 1.4257355	best: 1.4257355 (800)	total: 17.2s	remaining: 15s
900:	learn: 1.3551997	test: 1.4177455	best: 1.4177455 (900)	total: 19.3s	remaining: 12.9s
1000:	learn: 1.3399157	test: 1.4119797	best: 1.4116134 (998)	total: 21.6s	remaining: 10.8s
1100:	learn: 1.32

In [10]:
model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=5,
    bagging_temperature=1,
    random_seed=42,
    verbose=100
)

model.fit(
    X_train, y_train,
    cat_features=cat_feature_indices,
    eval_set=(X_val, y_val),
    early_stopping_rounds=150
)

# 예측 및 평가
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

0:	learn: 2.9696573	test: 3.0481349	best: 3.0481349 (0)	total: 22.3ms	remaining: 1m 6s
100:	learn: 1.7288688	test: 1.6057296	best: 1.6057296 (100)	total: 2.33s	remaining: 1m 6s
200:	learn: 1.6061998	test: 1.5365506	best: 1.5365471 (198)	total: 4.72s	remaining: 1m 5s
300:	learn: 1.5531786	test: 1.5159666	best: 1.5159666 (300)	total: 7.17s	remaining: 1m 4s
400:	learn: 1.5150228	test: 1.4980841	best: 1.4980841 (400)	total: 9.62s	remaining: 1m 2s
500:	learn: 1.4877354	test: 1.4884697	best: 1.4884246 (496)	total: 12.1s	remaining: 1m
600:	learn: 1.4647549	test: 1.4785929	best: 1.4785929 (600)	total: 14.6s	remaining: 58.4s
700:	learn: 1.4428835	test: 1.4709520	best: 1.4709434 (699)	total: 17.1s	remaining: 56.1s
800:	learn: 1.4218511	test: 1.4619329	best: 1.4619329 (800)	total: 19.6s	remaining: 53.9s
900:	learn: 1.4048349	test: 1.4552403	best: 1.4552403 (900)	total: 22.2s	remaining: 51.6s
1000:	learn: 1.3892269	test: 1.4490514	best: 1.4490514 (1000)	total: 24.7s	remaining: 49.3s
1100:	learn: 1