In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# 학습 데이터 불러오기
call_df = pd.read_csv("call119_train.csv", index_col=0)
cat_df = pd.read_csv("cat119_train.csv", index_col=0)

In [4]:
# 컬럼명 변경
call_df = call_df.rename(columns={
    'call119_train.tm': 'tm',
    'call119_train.address_city': 'city',
    'call119_train.address_gu': 'gu',
    'call119_train.sub_address': 'dong',
    'call119_train.stn': 'stn',
    'call119_train.ta_max': 'ta_max',
    'call119_train.ta_min': 'ta_min',
    'call119_train.ta_max_min': 'ta_max_min',
    'call119_train.hm_min': 'hm_min',
    'call119_train.hm_max': 'hm_max',
    'call119_train.ws_max': 'ws_max',
    'call119_train.ws_ins_max': 'ws_ins_max',
    'call119_train.rn_day': 'rn_day',
    'call119_train.call_count': 'call_total'
})

In [5]:
cat_df = cat_df.rename(columns={
    'cat119_train.tm': 'tm',
    'cat119_train.address_city': 'city',
    'cat119_train.address_gu': 'gu',
    'cat119_train.sub_address': 'dong',
    'cat119_train.cat': 'cat',
    'cat119_train.sub_cat': 'sub_cat',
    'cat119_train.stn': 'stn',
    'cat119_train.call_count': 'call_cat_sum'
})

In [6]:
# 숫자형 변환
for col in ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']:
    call_df[col] = pd.to_numeric(call_df[col], errors='coerce')

In [7]:
# 결측치 제거
call_df = call_df.dropna(subset=['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day', 'call_total'])
cat_df = cat_df.dropna(subset=['call_cat_sum'])

In [8]:
# 병합
merged_df = pd.merge(call_df, cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')
merged_df['call_cat_sum'] = merged_df['call_cat_sum'].fillna(0)

In [9]:
# city 제거
merged_df = merged_df.drop(columns=['city'])

In [10]:
# 범주형 처리
cat_features = ['gu', 'dong', 'cat', 'sub_cat']
for col in cat_features:
    merged_df[col] = merged_df[col].fillna('unknown')

In [11]:
# 학습용 feature 구성
features = ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day'] + cat_features
target = 'call_total'

In [12]:
X = merged_df[features]
y = merged_df[target]

In [13]:
# train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_features]

In [14]:
# 모델 학습
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, cat_features=cat_feature_indices, eval_set=(X_val, y_val), early_stopping_rounds=100)

0:	learn: 2.9454944	test: 3.0167066	best: 3.0167066 (0)	total: 93.9ms	remaining: 2m 20s
100:	learn: 1.6571329	test: 1.5478280	best: 1.5478280 (100)	total: 5.56s	remaining: 1m 16s
200:	learn: 1.5652063	test: 1.5111041	best: 1.5111041 (200)	total: 9.46s	remaining: 1m 1s
300:	learn: 1.5139972	test: 1.4850485	best: 1.4850485 (300)	total: 13.5s	remaining: 53.7s
400:	learn: 1.4748765	test: 1.4671461	best: 1.4668394 (399)	total: 18.9s	remaining: 51.8s
500:	learn: 1.4394252	test: 1.4514888	best: 1.4514888 (500)	total: 23.1s	remaining: 46s
600:	learn: 1.4134675	test: 1.4412430	best: 1.4412430 (600)	total: 28.3s	remaining: 42.3s
700:	learn: 1.3926910	test: 1.4332646	best: 1.4332646 (700)	total: 33s	remaining: 37.6s
800:	learn: 1.3718154	test: 1.4257355	best: 1.4257355 (800)	total: 37.3s	remaining: 32.5s
900:	learn: 1.3551997	test: 1.4177455	best: 1.4177455 (900)	total: 42.9s	remaining: 28.5s
1000:	learn: 1.3399157	test: 1.4119797	best: 1.4116134 (998)	total: 47.2s	remaining: 23.5s
1100:	learn: 1

<catboost.core.CatBoostRegressor at 0x7e71572026d0>

In [15]:
# 예측 및 평가
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

RMSE: 1.3812
R2 Score: 0.8007


In [16]:
# --------------------------------------------
# 🧪 테스트 예측 파트 시작
# --------------------------------------------

In [18]:
# 테스트 데이터 불러오기
test_call_df = pd.read_csv("test_call119.csv", )
test_cat_df = pd.read_csv("test_cat119.csv")

In [19]:
# 원본 백업
output_base = test_call_df.copy()

In [20]:
# 컬럼명 변경
test_call_df = test_call_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn',
    'call_count': 'call_total'
})

test_cat_df = test_cat_df.rename(columns={
    'TM': 'tm',
    'address_city': 'city',
    'address_gu': 'gu',
    'sub_address': 'dong',
    'STN': 'stn'
})

In [21]:
# 숫자형 처리
for col in ['ta_max', 'ta_min', 'ta_max_min', 'hm_min', 'hm_max', 'ws_max', 'ws_ins_max', 'rn_day']:
    test_call_df[col] = pd.to_numeric(test_call_df[col], errors='coerce')

In [22]:
# 병합
test_merged_df = pd.merge(test_call_df, test_cat_df, on=['tm', 'city', 'gu', 'dong', 'stn'], how='left')

In [23]:
# 결측치 처리
test_merged_df['cat'] = test_merged_df.get('cat', 'unknown')
test_merged_df['sub_cat'] = test_merged_df.get('sub_cat', 'unknown')
test_merged_df['cat'] = test_merged_df['cat'].fillna('unknown')
test_merged_df['sub_cat'] = test_merged_df['sub_cat'].fillna('unknown')

In [24]:
# city 제거
test_merged_df = test_merged_df.drop(columns=['city'])

In [25]:
# 범주형 처리
for col in cat_features:
    test_merged_df[col] = test_merged_df[col].fillna('unknown')

In [26]:
# feature 구성
test_merged_df['merge_key'] = (
    test_merged_df['tm'].astype(str) + '_' +
    test_merged_df['gu'] + '_' +
    test_merged_df['dong'] + '_' +
    test_merged_df['stn'].astype(str)
)

X_test = test_merged_df[features]
test_preds = model.predict(X_test)
test_merged_df['predicted_call_total'] = test_preds

In [27]:
# 그룹 평균으로 병합
agg_preds_df = test_merged_df.groupby('merge_key')['predicted_call_total'].mean().reset_index()

In [28]:
# output_base도 merge_key 생성
output_base['merge_key'] = (
    output_base['TM'].astype(str) + '_' +
    output_base['address_gu'] + '_' +
    output_base['sub_address'] + '_' +
    output_base['STN'].astype(str)
)

In [29]:
# merge & 예측 결과 삽입
output_base = output_base.merge(agg_preds_df, on='merge_key', how='left')
output_base['call_count'] = output_base['predicted_call_total'].round().astype(int)
output_base = output_base.drop(columns=['merge_key', 'predicted_call_total'])

In [30]:
# 저장
output_base.to_csv("call119_predictions.csv", index=False)
print("✅ 예측 결과 저장 완료: call119_predictions.csv")
print(output_base.head())

✅ 예측 결과 저장 완료: call119_predictions.csv
         TM address_city address_gu sub_address  STN  ta_max  ta_min  \
0  20240501        부산광역시        강서구        대저2동  904    18.5    11.1   
1  20240501        부산광역시        강서구         생곡동  904    18.5    11.1   
2  20240501        부산광역시        강서구         송정동  937    16.9     9.9   
3  20240501        부산광역시        강서구         신호동  950    16.6    11.4   
4  20240501        부산광역시        금정구         구서동  940    16.9    10.2   

   ta_max_min  hm_min  hm_max  ws_max  ws_ins_max  rn_day  call_count  
0         7.4    42.5    82.5     6.5        11.6     0.0           1  
1         7.4    42.5    82.5     6.5        11.6     0.0           1  
2         7.0    55.3    93.9     4.5         9.7     0.0           2  
3         5.2    48.1    84.6     6.4        13.5     0.0           1  
4         6.7    46.8    91.3     3.3         8.7     0.0           2  
