In [1]:
from google.colab import files
uploaded = files.upload()

Saving catboost_valid_preds_2021.csv to catboost_valid_preds_2021.csv
Saving knn_valid_preds_2021.csv to knn_valid_preds_2021.csv
Saving lgbm_valid_preds_2021.csv to lgbm_valid_preds_2021.csv
Saving randomforest_valid_preds_2021.csv to randomforest_valid_preds_2021.csv
Saving train.csv to train.csv


In [31]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet

In [16]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [25]:
# 데이터 불러오기
train_df = pd.read_csv("train.csv")

# 날짜 컬럼 처리 및 파생 변수 생성
train_df['일시'] = pd.to_datetime(train_df['일시'].astype(str), format="%Y%m%d")
train_df['year'] = train_df['일시'].dt.year
train_df['month'] = train_df['일시'].dt.month
train_df['day'] = train_df['일시'].dt.day
train_df['weekday'] = train_df['일시'].dt.weekday  # 0: 월요일, 6: 일요일
train_df['is_weekend'] = (train_df['weekday'] >= 5).astype(int)

# 컬럼명 영문화
train_df = train_df.rename(columns={
    '광진구': 'Gwangjin',
    '동대문구': 'Dongdaemun',
    '성동구': 'Seongdong',
    '중랑구': 'Jungnang'
})

In [26]:
# 각 모델별 예측 결과 로드
catboost_preds = pd.read_csv('catboost_valid_preds_2021.csv')
lgbm_preds = pd.read_csv('lgbm_valid_preds_2021.csv')
randomforest_preds = pd.read_csv('randomforest_valid_preds_2021.csv')
knn_preds = pd.read_csv('knn_valid_preds_2021.csv')



# 일시 컬럼 제거
catboost_preds = catboost_preds.drop(columns=['catboost_일시'], errors='ignore')
lgbm_preds = lgbm_preds.drop(columns=['lgbm_일시'], errors='ignore')
randomforest_preds = randomforest_preds.drop(columns=['randomforest_일시'], errors='ignore')
knn_preds = knn_preds.drop(columns=['knn_일시'], errors='ignore')



# catboost 컬럼 매핑
rename_map_catboost = {
    '광진구': 'catboost_Gwangjin',
    '동대문구': 'catboost_Dongdaemun',
    '성동구': 'catboost_Seongdong',
    '중랑구': 'catboost_Jungnang'
}
catboost_preds = catboost_preds.rename(columns=rename_map_catboost)

# lgbm 컬럼 매핑
rename_map_lgbm = {
    '광진구': 'lgbm_Gwangjin',
    '동대문구': 'lgbm_Dongdaemun',
    '성동구': 'lgbm_Seongdong',
    '중랑구': 'lgbm_Jungnang'
}
lgbm_preds = lgbm_preds.rename(columns=rename_map_lgbm)

# randomforest 컬럼 매핑
rename_map_rf = {
    'Gwangjin': 'randomforest_Gwangjin',
    'Dongdaemun': 'randomforest_Dongdaemun',
    'Seongdong': 'randomforest_Seongdong',
    'Jungnang': 'randomforest_Jungnang'
}
randomforest_preds = randomforest_preds.rename(columns=rename_map_rf)

# knn 컬럼 매핑
rename_map_knn = {
    '광진구': 'knn_Gwangjin',
    '동대문구': 'knn_Dongdaemun',
    '성동구': 'knn_Seongdong',
    '중랑구': 'knn_Jungnang'
}
knn_preds = knn_preds.rename(columns=rename_map_knn)


In [27]:
# 두 예측 DataFrame 합치기
X_meta_c_l = pd.concat([catboost_preds, lgbm_preds], axis=1)
X_meta_c_r = pd.concat([catboost_preds, randomforest_preds], axis=1)
X_meta_c_k = pd.concat([catboost_preds,knn_preds], axis=1)
X_meta_l_r = pd.concat([lgbm_preds, randomforest_preds], axis=1)
X_meta_l_k = pd.concat([lgbm_preds, knn_preds], axis=1)
X_meta_r_k = pd.concat([randomforest_preds, knn_preds], axis=1)
X_meta_c_l_r = pd.concat([catboost_preds, lgbm_preds, randomforest_preds], axis=1)
X_meta_l_r_k = pd.concat([lgbm_preds, randomforest_preds, knn_preds], axis=1)
X_meta_all = pd.concat([catboost_preds, lgbm_preds, randomforest_preds, knn_preds], axis=1)


y_valid = train_df.loc[train_df['year'] == 2021, ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']]



# 모든 조합을 딕셔너리로 관리
X_meta_dict = {
    "catboost + lightGBM": X_meta_c_l,
    "catboost + randomforest": X_meta_c_r,
    "catboost + knn": X_meta_c_k,
    "lightGBM + randomforest": X_meta_l_r,
    "lightGBM + knn": X_meta_l_k,
    "randomforest + knn": X_meta_r_k,
    "catboost + lightGBM + randomforest": X_meta_c_l_r,
    "lightGBM + randomforest + knn": X_meta_l_r_k,
    "all": X_meta_all
}


In [28]:
### Linear Regression

results_linear = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_linear[combo_name] = {}
    linear_meta = {}
    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        # 해당 구에 맞는 컬럼만 추출
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)   # ← 여기 필수
        y_gu = y_valid[gu].reset_index(drop=True)            # ← 같이 인덱스 맞추기

        # 선형 회귀 메타 모델 학습
        meta_model = LinearRegression()
        meta_model.fit(X_meta_gu, y_gu)

        # 예측 및 평가
        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        # 결과 저장 및 출력
        linear_meta[gu] = meta_model
        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_linear[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }

    print("")  # 조합별 구분선


=== catboost + lightGBM ===
Gwangjin MAE: 1.8208, RMSE: 2.4568
Dongdaemun MAE: 1.5101, RMSE: 2.0256
Seongdong MAE: 1.6865, RMSE: 2.2450
Jungnang MAE: 1.0972, RMSE: 1.4647

=== catboost + randomforest ===
Gwangjin MAE: 3.0260, RMSE: 3.5351
Dongdaemun MAE: 2.4201, RMSE: 2.8340
Seongdong MAE: 2.6152, RMSE: 3.0495
Jungnang MAE: 1.7794, RMSE: 2.0589

=== catboost + knn ===
Gwangjin MAE: 1.9995, RMSE: 2.5679
Dongdaemun MAE: 1.5503, RMSE: 2.0041
Seongdong MAE: 1.8290, RMSE: 2.3363
Jungnang MAE: 1.1204, RMSE: 1.4588

=== lightGBM + randomforest ===
Gwangjin MAE: 1.8218, RMSE: 2.4507
Dongdaemun MAE: 1.5110, RMSE: 2.0249
Seongdong MAE: 1.6862, RMSE: 2.2381
Jungnang MAE: 1.0967, RMSE: 1.4643

=== lightGBM + knn ===
Gwangjin MAE: 1.7632, RMSE: 2.3812
Dongdaemun MAE: 1.4149, RMSE: 1.8966
Seongdong MAE: 1.6354, RMSE: 2.1808
Jungnang MAE: 1.0253, RMSE: 1.3728

=== randomforest + knn ===
Gwangjin MAE: 1.9945, RMSE: 2.5622
Dongdaemun MAE: 1.5491, RMSE: 2.0023
Seongdong MAE: 1.8091, RMSE: 2.3212
Jungnan

In [29]:
### Randomforest

results_rf = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_rf[combo_name] = {}
    rf_meta = {}

    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)
        y_gu = y_valid[gu].reset_index(drop=True)

        meta_model = RandomForestRegressor(n_estimators=100, random_state=42)
        meta_model.fit(X_meta_gu, y_gu)
        rf_meta[gu] = meta_model

        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_rf[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }
    print("")


=== catboost + lightGBM ===
Gwangjin MAE: 0.7892, RMSE: 1.1098
Dongdaemun MAE: 0.7248, RMSE: 0.9712
Seongdong MAE: 0.7404, RMSE: 1.0238
Jungnang MAE: 0.4929, RMSE: 0.6544

=== catboost + randomforest ===
Gwangjin MAE: 3.0234, RMSE: 3.5354
Dongdaemun MAE: 2.4189, RMSE: 2.8341
Seongdong MAE: 2.6134, RMSE: 3.0497
Jungnang MAE: 1.7780, RMSE: 2.0591

=== catboost + knn ===
Gwangjin MAE: 0.8160, RMSE: 1.0943
Dongdaemun MAE: 0.6939, RMSE: 0.9179
Seongdong MAE: 0.7861, RMSE: 1.0284
Jungnang MAE: 0.4708, RMSE: 0.6394

=== lightGBM + randomforest ===
Gwangjin MAE: 0.7880, RMSE: 1.1165
Dongdaemun MAE: 0.7149, RMSE: 0.9569
Seongdong MAE: 0.7222, RMSE: 0.9920
Jungnang MAE: 0.4972, RMSE: 0.6609

=== lightGBM + knn ===
Gwangjin MAE: 0.7326, RMSE: 1.0023
Dongdaemun MAE: 0.6093, RMSE: 0.8211
Seongdong MAE: 0.6500, RMSE: 0.8862
Jungnang MAE: 0.4211, RMSE: 0.5700

=== randomforest + knn ===
Gwangjin MAE: 0.8050, RMSE: 1.0835
Dongdaemun MAE: 0.6915, RMSE: 0.9151
Seongdong MAE: 0.7714, RMSE: 1.0170
Jungnan

In [30]:
### CatBoost


results_cb = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_cb[combo_name] = {}
    catboost_meta = {}

    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        # 해당 구에 맞는 컬럼만 추출
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)   # ← 인덱스 정렬 필수
        y_gu = y_valid[gu].reset_index(drop=True)            # ← 타깃도 동일하게 처리

        # CatBoost 메타 모델 학습
        meta_model = CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=0)
        meta_model.fit(X_meta_gu, y_gu)

        # 예측 및 평가
        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        # 결과 저장 및 출력
        catboost_meta[gu] = meta_model
        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_cb[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }

    print("")  # 조합별 구분선


=== catboost + lightGBM ===
Gwangjin MAE: 1.6500, RMSE: 2.2210
Dongdaemun MAE: 1.3589, RMSE: 1.8239
Seongdong MAE: 1.4961, RMSE: 1.9828
Jungnang MAE: 0.9950, RMSE: 1.3179

=== catboost + randomforest ===
Gwangjin MAE: 3.0260, RMSE: 3.5351
Dongdaemun MAE: 2.4201, RMSE: 2.8340
Seongdong MAE: 2.6152, RMSE: 3.0495
Jungnang MAE: 1.7794, RMSE: 2.0589

=== catboost + knn ===
Gwangjin MAE: 1.7404, RMSE: 2.2408
Dongdaemun MAE: 1.3747, RMSE: 1.7794
Seongdong MAE: 1.6238, RMSE: 2.0752
Jungnang MAE: 1.0061, RMSE: 1.2988

=== lightGBM + randomforest ===
Gwangjin MAE: 1.6571, RMSE: 2.2196
Dongdaemun MAE: 1.3595, RMSE: 1.8298
Seongdong MAE: 1.4821, RMSE: 1.9673
Jungnang MAE: 1.0038, RMSE: 1.3215

=== lightGBM + knn ===
Gwangjin MAE: 1.4675, RMSE: 1.9456
Dongdaemun MAE: 1.1643, RMSE: 1.5541
Seongdong MAE: 1.2875, RMSE: 1.6981
Jungnang MAE: 0.8417, RMSE: 1.1134

=== randomforest + knn ===
Gwangjin MAE: 1.7594, RMSE: 2.2486
Dongdaemun MAE: 1.3843, RMSE: 1.7916
Seongdong MAE: 1.6072, RMSE: 2.0518
Jungnan

In [34]:
### XGBoost


results_xgb = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_xgb[combo_name] = {}
    xgb_meta = {}

    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)
        y_gu = y_valid[gu].reset_index(drop=True)

        meta_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbosity=0)
        meta_model.fit(X_meta_gu, y_gu)

        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        xgb_meta[gu] = meta_model


        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_xgb[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }
    print("")


=== catboost + lightGBM ===
Gwangjin MAE: 1.1532, RMSE: 1.6035
Dongdaemun MAE: 0.9049, RMSE: 1.2473
Seongdong MAE: 1.1048, RMSE: 1.5527
Jungnang MAE: 0.7075, RMSE: 0.9419

=== catboost + randomforest ===
Gwangjin MAE: 3.0260, RMSE: 3.5351
Dongdaemun MAE: 2.4201, RMSE: 2.8340
Seongdong MAE: 2.6152, RMSE: 3.0495
Jungnang MAE: 1.7794, RMSE: 2.0589

=== catboost + knn ===
Gwangjin MAE: 1.1369, RMSE: 1.5011
Dongdaemun MAE: 0.9706, RMSE: 1.3013
Seongdong MAE: 1.0929, RMSE: 1.4428
Jungnang MAE: 0.6956, RMSE: 0.9613

=== lightGBM + randomforest ===
Gwangjin MAE: 1.1065, RMSE: 1.5523
Dongdaemun MAE: 0.8736, RMSE: 1.1936
Seongdong MAE: 1.0427, RMSE: 1.4706
Jungnang MAE: 0.6892, RMSE: 0.9236

=== lightGBM + knn ===
Gwangjin MAE: 0.6164, RMSE: 0.8463
Dongdaemun MAE: 0.5338, RMSE: 0.7250
Seongdong MAE: 0.5233, RMSE: 0.7154
Jungnang MAE: 0.3647, RMSE: 0.4844

=== randomforest + knn ===
Gwangjin MAE: 1.0955, RMSE: 1.4573
Dongdaemun MAE: 0.9600, RMSE: 1.2898
Seongdong MAE: 1.0463, RMSE: 1.3831
Jungnan

In [35]:
### SVR


results_svr = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_svr[combo_name] = {}
    svr_meta = {}

    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)
        y_gu = y_valid[gu].reset_index(drop=True)

        meta_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
        meta_model.fit(X_meta_gu, y_gu)

        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        svr_meta[gu] = meta_model

        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_svr[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }
    print("")

=== catboost + lightGBM ===
Gwangjin MAE: 1.7439, RMSE: 2.4752
Dongdaemun MAE: 1.4680, RMSE: 2.0609
Seongdong MAE: 1.6135, RMSE: 2.2684
Jungnang MAE: 1.0510, RMSE: 1.4553

=== catboost + randomforest ===
Gwangjin MAE: 3.0000, RMSE: 3.5880
Dongdaemun MAE: 2.4041, RMSE: 2.8689
Seongdong MAE: 2.5990, RMSE: 3.0864
Jungnang MAE: 1.7594, RMSE: 2.1159

=== catboost + knn ===
Gwangjin MAE: 1.9184, RMSE: 2.5646
Dongdaemun MAE: 1.4592, RMSE: 1.9839
Seongdong MAE: 1.7409, RMSE: 2.3560
Jungnang MAE: 1.0523, RMSE: 1.4592

=== lightGBM + randomforest ===
Gwangjin MAE: 1.7449, RMSE: 2.4747
Dongdaemun MAE: 1.4629, RMSE: 2.0590
Seongdong MAE: 1.6159, RMSE: 2.2632
Jungnang MAE: 1.0513, RMSE: 1.4544

=== lightGBM + knn ===
Gwangjin MAE: 1.6976, RMSE: 2.4286
Dongdaemun MAE: 1.3409, RMSE: 1.9184
Seongdong MAE: 1.5796, RMSE: 2.2117
Jungnang MAE: 0.9617, RMSE: 1.3740

=== randomforest + knn ===
Gwangjin MAE: 1.9182, RMSE: 2.5654
Dongdaemun MAE: 1.4580, RMSE: 1.9848
Seongdong MAE: 1.7414, RMSE: 2.3545
Jungnan

In [36]:
### Elastic(L1+L2)


results_elastic = {}

for combo_name, X_meta in X_meta_dict.items():
    print(f"=== {combo_name} ===")
    results_elastic[combo_name] = {}
    elastic_meta = {}

    for gu in ['Gwangjin', 'Dongdaemun', 'Seongdong', 'Jungnang']:
        gu_cols = [col for col in X_meta.columns if col.endswith(f"_{gu}")]
        X_meta_gu = X_meta[gu_cols].reset_index(drop=True)
        y_gu = y_valid[gu].reset_index(drop=True)

        meta_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
        meta_model.fit(X_meta_gu, y_gu)

        pred_gu = meta_model.predict(X_meta_gu)
        rmse = np.sqrt(mean_squared_error(y_gu, pred_gu))
        mae = mean_absolute_error(y_gu, pred_gu)

        elastic_meta[gu] = meta_model

        print(f"{gu} MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        results_elastic[combo_name][gu] = {
            'MAE': mae,
            'RMSE': rmse
        }
    print("")

=== catboost + lightGBM ===
Gwangjin MAE: 1.8949, RMSE: 2.4789
Dongdaemun MAE: 1.6511, RMSE: 2.0856
Seongdong MAE: 1.7849, RMSE: 2.2775
Jungnang MAE: 1.2750, RMSE: 1.5504

=== catboost + randomforest ===
Gwangjin MAE: 3.0262, RMSE: 3.5352
Dongdaemun MAE: 2.4192, RMSE: 2.8386
Seongdong MAE: 2.6170, RMSE: 3.0524
Jungnang MAE: 1.7794, RMSE: 2.0589

=== catboost + knn ===
Gwangjin MAE: 2.0958, RMSE: 2.6104
Dongdaemun MAE: 1.6911, RMSE: 2.0813
Seongdong MAE: 1.9354, RMSE: 2.3821
Jungnang MAE: 1.3025, RMSE: 1.5671

=== lightGBM + randomforest ===
Gwangjin MAE: 1.8949, RMSE: 2.4789
Dongdaemun MAE: 1.6511, RMSE: 2.0856
Seongdong MAE: 1.7849, RMSE: 2.2775
Jungnang MAE: 1.2750, RMSE: 1.5504

=== lightGBM + knn ===
Gwangjin MAE: 1.8301, RMSE: 2.4044
Dongdaemun MAE: 1.5351, RMSE: 1.9513
Seongdong MAE: 1.7210, RMSE: 2.2109
Jungnang MAE: 1.1849, RMSE: 1.4572

=== randomforest + knn ===
Gwangjin MAE: 2.0958, RMSE: 2.6104
Dongdaemun MAE: 1.6911, RMSE: 2.0813
Seongdong MAE: 1.9354, RMSE: 2.3821
Jungnan