In [1]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [2]:
!pip install catboost holidays scikit-learn matplotlib

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [10]:
!pip install --upgrade scikit-learn



In [29]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [4]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# 데이터 불러오기
train_df = pd.read_csv("train.csv")

# 날짜 컬럼 처리 및 파생 변수 생성
train_df['일시'] = pd.to_datetime(train_df['일시'].astype(str), format="%Y%m%d")
train_df['year'] = train_df['일시'].dt.year
train_df['month'] = train_df['일시'].dt.month
train_df['day'] = train_df['일시'].dt.day
train_df['weekday'] = train_df['일시'].dt.weekday  # 0: 월요일, 6: 일요일
train_df['is_weekend'] = (train_df['weekday'] >= 5).astype(int)

# 컬럼명 영문화
train_df = train_df.rename(columns={
    '광진구': 'Gwangjin',
    '동대문구': 'Dongdaemun',
    '성동구': 'Seongdong',
    '중랑구': 'Jungnang'
})

In [5]:
# 공휴일 정보 파생 변수
import holidays
kr_holidays = holidays.KR(years=[2018, 2019, 2020, 2021, 2022])

# 임시공휴일 및 선거일 직접 추가
extra_holidays = {
    '2018-06-13': '제7회 전국동시지방선거',
    '2020-04-15': '제21대 국회의원선거',
    '2020-08-17': '임시공휴일',
    '2022-03-09': '제20대 대통령선거',
    '2022-06-01': '제8회 전국동시지방선거'
}
for date, name in extra_holidays.items():
    kr_holidays[date] = name


train_df['holiday_name'] = train_df['일시'].dt.date.map(kr_holidays)
train_df['holiday_name'] = train_df['holiday_name'].fillna('None')
train_df['is_holiday'] = (train_df['holiday_name'] != 'None').astype(int)

# 활동형/휴식형 공휴일 구분
activity_holidays = ['어린이날']
rest_holidays = ['설날', '추석', '크리스마스']
train_df['is_activity_holiday'] = train_df['holiday_name'].isin(activity_holidays).astype(int)
train_df['is_rest_holiday'] = train_df['holiday_name'].isin(rest_holidays).astype(int)

In [6]:
# 급증일/급감일 변수 생성

for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
    ma = train_df[gu].rolling(window=3, center=True).mean()
    residual = train_df[gu] - ma
    spike_days = train_df.loc[residual.nlargest(10).index, '일시']
    drop_days = train_df.loc[residual.nsmallest(10).index, '일시']
    train_df[f'{gu}_is_spike_day'] = train_df['일시'].isin(spike_days).astype(int)
    train_df[f'{gu}_is_drop_day'] = train_df['일시'].isin(drop_days).astype(int)


# 급증/급감일 날짜순 확인
for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
    ma = train_df[gu].rolling(window=3, center=True).mean()
    residual = train_df[gu] - ma

    # 급증일
    spike_idx = residual.nlargest(10).index
    spike_days = train_df.loc[spike_idx, ['일시', gu]].copy()
    spike_days['residual'] = residual.loc[spike_idx]
    print(f"\n=== {gu} 급증일(날짜순) ===")
    print(spike_days.sort_values(by='일시'))

    # 급감일
    drop_idx = residual.nsmallest(10).index
    drop_days = train_df.loc[drop_idx, ['일시', gu]].copy()
    drop_days['residual'] = residual.loc[drop_idx]
    print(f"\n=== {gu} 급감일(날짜순) ===")
    print(drop_days.sort_values(by='일시'))


=== Gwangjin 급증일(날짜순) ===
             일시  Gwangjin  residual
618  2019-09-11     9.790  4.068000
838  2020-04-18    11.258  5.116000
867  2020-05-17    10.754  3.494000
981  2020-09-08     9.450  3.950667
986  2020-09-13    11.150  3.198000
1220 2021-05-05    12.446  3.618000
1248 2021-06-02    11.884  3.274000
1329 2021-08-22    11.918  5.234000
1337 2021-08-30    13.242  3.863333
1377 2021-10-09    11.336  4.372667

=== Gwangjin 급감일(날짜순) ===
             일시  Gwangjin  residual
503  2019-05-19     1.724 -5.288000
639  2019-10-02     3.418 -4.632000
644  2019-10-07     2.700 -4.910667
1188 2021-04-03     1.636 -5.700667
1219 2021-05-04     2.734 -6.044667
1249 2021-06-03     3.776 -5.024000
1328 2021-08-21     3.664 -5.677333
1338 2021-08-31     2.690 -6.109333
1345 2021-09-07     1.178 -5.558667
1378 2021-10-10     3.480 -5.189333

=== Dongdaemun 급증일(날짜순) ===
             일시  Dongdaemun  residual
838  2020-04-18       6.128  2.544000
981  2020-09-08       7.024  2.821333
1218 2021-0

In [7]:
# 공휴일 제외 급증/급감일
for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
    non_holiday_df = train_df[train_df['is_holiday'] == 0].copy()
    ma = non_holiday_df[gu].rolling(window=3, center=True).mean()
    residual = non_holiday_df[gu] - ma

    spike_idx = residual.nlargest(10).index
    spike_days = non_holiday_df.loc[spike_idx, ['일시', gu]].copy()
    spike_days['residual'] = residual.loc[spike_idx]
    print(f"\n=== {gu} 급증일(공휴일 제외, 날짜순) ===")
    print(spike_days.sort_values(by='일시'))

    drop_idx = residual.nsmallest(10).index
    drop_days = non_holiday_df.loc[drop_idx, ['일시', gu]].copy()
    drop_days['residual'] = residual.loc[drop_idx]
    print(f"\n=== {gu} 급감일(공휴일 제외, 날짜순) ===")
    print(drop_days.sort_values(by='일시'))


=== Gwangjin 급증일(공휴일 제외, 날짜순) ===
             일시  Gwangjin  residual
838  2020-04-18    11.258  5.116000
867  2020-05-17    10.754  3.494000
981  2020-09-08     9.450  3.950667
986  2020-09-13    11.150  3.198000
1221 2021-05-06    11.304  3.757333
1233 2021-05-18    11.084  3.381333
1248 2021-06-02    11.884  3.274000
1329 2021-08-22    11.918  5.234000
1337 2021-08-30    13.242  3.863333
1346 2021-09-08    11.990  3.193333

=== Gwangjin 급감일(공휴일 제외, 날짜순) ===
             일시  Gwangjin  residual
503  2019-05-19     1.724 -5.288000
639  2019-10-02     3.418 -5.354000
644  2019-10-07     2.700 -4.910667
837  2020-04-17     3.798 -4.608000
1188 2021-04-03     1.636 -5.700667
1219 2021-05-04     2.734 -5.664000
1249 2021-06-03     3.776 -5.024000
1328 2021-08-21     3.664 -5.677333
1338 2021-08-31     2.690 -6.109333
1345 2021-09-07     1.178 -5.558667

=== Dongdaemun 급증일(공휴일 제외, 날짜순) ===
             일시  Dongdaemun  residual
838  2020-04-18       6.128  2.544000
981  2020-09-08       7.0

In [23]:
# Feature/Target/범주형 변수 정의
feature_cols = [
    'year', 'month', 'day', 'weekday', 'is_weekend',
    'is_holiday', 'holiday_name', 'is_activity_holiday', 'is_rest_holiday',
    'Gwangjin_is_spike_day', 'Dongdaemun_is_spike_day', 'Seongdong_is_spike_day', 'Jungnang_is_spike_day',

]
cat_features = [
    'month', 'weekday', 'is_weekend', 'is_holiday', 'holiday_name',
    'is_activity_holiday', 'is_rest_holiday',
    'Gwangjin_is_spike_day', 'Dongdaemun_is_spike_day', 'Seongdong_is_spike_day', 'Jungnang_is_spike_day',
]
target_cols = ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']


In [24]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

results = {}
summary_data = []

# 학습/검증 인덱스 정의
train_idx = train_df['year'] < 2021    # 2018~2020년
val_idx = train_df['year'] == 2021     # 2021년

X_train = train_df.loc[train_idx, feature_cols]
X_val = train_df.loc[val_idx, feature_cols]

In [31]:
import optuna
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# 예시: target_cols와 해당 구별로 train/val 데이터, cat_features는 미리 준비
# target_cols = ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']

# ‘최종 best params’ 사전
best_params_dict = {
'Gwangjin': {'learning_rate': 0.07463933708677424, 'depth': 5, 'l2_leaf_reg': 1.074777222256696,
'subsample': 0.805469615390896, 'colsample_bylevel': 0.7879853811582616},
'Dongdaemun': {'learning_rate': 0.05090532965560029, 'depth': 4, 'l2_leaf_reg': 2.4336961951246585,
'subsample': 0.9860732586553317, 'colsample_bylevel': 0.8329465092538842},
'Seongdong': {'learning_rate': 0.06865667868137201, 'depth': 5, 'l2_leaf_reg': 3.548575891319757,
'subsample': 0.7118705479838677, 'colsample_bylevel': 0.7334892361378472},
'Jungnang': {'learning_rate': 0.05691151851732044, 'depth': 4, 'l2_leaf_reg': 2.5595777632181074,
'subsample': 0.9751001756331396, 'colsample_bylevel': 0.5490555159536538}
}

results_summary = []

for target in target_cols:
    params = {
        # 구별로 best_params_dict에서 직접 꺼낸 값!
        "iterations": 1000,
        "learning_rate": best_params_dict[target]["learning_rate"],
        "depth": best_params_dict[target]["depth"],
        "l2_leaf_reg": best_params_dict[target]["l2_leaf_reg"],
        "subsample": best_params_dict[target]["subsample"],
        "colsample_bylevel": best_params_dict[target]["colsample_bylevel"],
        "random_seed": 42,
        "cat_features": cat_features,
        "loss_function": "RMSE",
        "verbose": 0
    }
    y_train = train_df.loc[train_idx, target]
    y_val = train_df.loc[val_idx, target]
    model = cb.CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
    preds = model.predict(X_val)


    # 결과 저장 등 — for문 안에서 처리!
    results_summary.append({
        '구': target,
        'Best RMSE': round(study.best_value, 4),
        'Best Params': study.best_params
    })

# for-loop 종료 후, 결과 출력 등
summary_df = pd.DataFrame(results_summary)
print(summary_df)

summary_df = pd.DataFrame(results_summary)
print("\n=== 모든 구별 CatBoost Best RMSE & Params 요약 ===")
print(summary_df.to_markdown(index=False))


            구  Best RMSE                                        Best Params
0    Gwangjin     1.6932  {'learning_rate': 0.049888915335499294, 'depth...
1  Dongdaemun     1.6932  {'learning_rate': 0.049888915335499294, 'depth...
2   Seongdong     1.6932  {'learning_rate': 0.049888915335499294, 'depth...
3    Jungnang     1.6932  {'learning_rate': 0.049888915335499294, 'depth...

=== 모든 구별 CatBoost Best RMSE & Params 요약 ===
| 구         |   Best RMSE | Best Params                                                                                                                                                     |
|:-----------|------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Gwangjin   |      1.6932 | {'learning_rate': 0.049888915335499294, 'depth': 5, 'l2_leaf_reg': 2.508146089854441, 'subsample': 0.9479138127775345, 'colsample_bylevel': 0.5544152702759939} |
| 