## Import

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [2]:
import random
import pandas as pd
import numpy as np
import os
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings(action='ignore')

## Fixed RandomSeed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data & Pre-Processing

In [22]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
building_info = pd.read_csv('./building_info.csv')

In [23]:
import pandas as pd

train_df = pd.merge(train_df, building_info, how='inner', on='건물번호')
train_df = train_df.replace('-', 0.0)
train_df = train_df.apply(pd.to_numeric, errors='ignore')

test_df = pd.merge(test_df, building_info, how='inner', on='건물번호')
test_df = test_df.replace('-', 0.0)
test_df = test_df.apply(pd.to_numeric, errors='ignore')

In [24]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))

In [25]:
train_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', '건물유형', '연면적(m2)', '냉방면적(m2)',
       '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'month', 'day', 'time'],
      dtype='object')

In [26]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),month,day,time
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,호텔,82912.71,77586.0,0.0,0.0,0.0,6,1,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,호텔,82912.71,77586.0,0.0,0.0,0.0,6,1,1
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,호텔,82912.71,77586.0,0.0,0.0,0.0,6,1,2
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,호텔,82912.71,77586.0,0.0,0.0,0.0,6,1,3
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,호텔,82912.71,77586.0,0.0,0.0,0.0,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,호텔,162070.24,152943.0,0.0,0.0,0.0,8,24,19
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,호텔,162070.24,152943.0,0.0,0.0,0.0,8,24,20
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,호텔,162070.24,152943.0,0.0,0.0,0.0,8,24,21
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,호텔,162070.24,152943.0,0.0,0.0,0.0,8,24,22


## 특성 공학

In [27]:
def getDIScore(temp, humidity):
    return 1.8 * temp - 0.55 * (1 - humidity / 100) * (1.8 * temp - 26) + 32

In [28]:
def getATScore(T: float, V: float):
    V = round(V * 3.6, 2)
    apparent_temp = 13.12 + 0.6215 * T - 11.37 * V**0.16 + 0.3965 * V**0.16 * T
    return round(apparent_temp, 2)

In [29]:
def getAbsolhumidity(T, RH):
    numerator = 6.112 * np.exp((17.67 * T) / (T + 243.5)) * RH * 2.1674
    denominator = 273.15 + T
    return numerator / denominator

In [36]:
import numpy as np

def Feature_Engineering(df):
    # 불쾌지수
    df['DI'] = getDIScore(df['기온(°C)'], df['습도(%)'])

    # 체감온도
    df['AT'] = getATScore(df['기온(°C)'], df['풍속(m/s)'])

    # 절대습도
    df['Absolhumidity'] = getAbsolhumidity(df['기온(°C)'], df['습도(%)'])

    # 면적당 냉방비율
    df['Cooling_Area_Ratio'] = df['냉방면적(m2)'] / df['연면적(m2)']

    # 체감 습도 지수
    df['Perceived_Humidity_Index'] = df['Absolhumidity'] * (df['습도(%)'] / 100)

    # 0 나누기 방지
    solar_kw = df['태양광용량(kW)'].replace(0, np.nan)
    ess_kwh  = df['ESS저장용량(kWh)'].replace(0, np.nan)
    pcs_kw   = df['PCS용량(kW)'].replace(0, np.nan)

    # 파생 변수 계산 (음수 값 방지: 0보다 작으면 0으로)
    df['ess_solar_ratio'] = (ess_kwh / solar_kw).clip(lower=0)
    df['pcs_solar_ratio'] = (pcs_kw / solar_kw).clip(lower=0)
    df['pcs_ess_ratio']   = (pcs_kw / ess_kwh).clip(lower=0)

    # 병목 용량 (태양광 vs PCS 중 최소값)
    bottleneck_two = df[['태양광용량(kW)', 'PCS용량(kW)']].min(axis=1)
    df['bottleneck_kw'] = bottleneck_two.clip(lower=0)

    # NaN, Inf → 0 처리
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    return df

In [37]:
train_df = Feature_Engineering(train_df)
test_df = Feature_Engineering(test_df)

In [38]:
train_df.isnull().sum()

Unnamed: 0,0
num_date_time,0
건물번호,0
일시,0
기온(°C),0
강수량(mm),0
풍속(m/s),0
습도(%),0
일조(hr),0
일사(MJ/m2),0
전력소비량(kWh),0


In [39]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),...,time,DI,AT,Absolhumidity,Cooling_Area_Ratio,Perceived_Humidity_Index,ess_solar_ratio,pcs_solar_ratio,pcs_ess_ratio,bottleneck_kw
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,...,0,64.25294,18.61,12.817075,0.935755,10.510002,0.0,0.0,0.0,0.0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,...,1,64.25294,18.57,12.817075,0.935755,10.510002,0.0,0.0,0.0,0.0
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,...,2,63.85620,18.37,12.356809,0.935755,9.885447,0.0,0.0,0.0,0.0
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,...,3,63.73120,18.25,12.437095,0.935755,10.074047,0.0,0.0,0.0,0.0
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,...,4,63.40882,18.66,12.289890,0.935755,9.954811,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,20240824 19,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,...,19,80.89784,31.47,21.966761,0.943683,16.694738,0.0,0.0,0.0,0.0
203996,100_20240824 20,100,20240824 20,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,...,20,79.83636,30.85,20.811678,0.943683,15.400642,0.0,0.0,0.0,0.0
203997,100_20240824 21,100,20240824 21,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,...,21,79.37358,30.49,20.471950,0.943683,15.149243,0.0,0.0,0.0,0.0
203998,100_20240824 22,100,20240824 22,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,...,22,79.17920,30.16,20.681253,0.943683,15.717752,0.0,0.0,0.0,0.0


## 데이터 분활

In [40]:
train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)','건물유형'])
train_y = train_df['전력소비량(kWh)']

In [41]:
# 시계열 특성을 학습에 반영하기 위해 test 데이터도 동일하게 처리합니다
test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [42]:
test_x = test_df.drop(columns=['num_date_time', '일시','건물유형'])

## Train

In [43]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial, X, y):
    params = {
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.05, log=True),
    'max_depth': trial.suggest_int('max_depth', 3, 8),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
    'gamma': trial.suggest_float('gamma', 0, 5),
    'subsample': trial.suggest_float('subsample', 0.6, 0.95),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
    'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
    'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    'n_estimators': 3000,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': 'rmse'   # 여기로 이동
    }

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    return rmse


# 데이터 하나로 튜닝
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, train_x, train_y), n_trials=50)

print("Best trial:")
print(study.best_trial)

best_params = study.best_trial.params
best_params.update({'n_estimators': 300, 'objective': 'reg:squarederror', 'random_state': 42})

model = XGBRegressor(**best_params)
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"최적 파라미터로 테스트 RMSE: {rmse:.2f}")

# 데이터 하나로 튜닝
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, train_x, train_y), n_trials=50)

print("Best trial:")
print(study.best_trial)

best_params = study.best_trial.params
best_params.update({'n_estimators': 300, 'objective': 'reg:squarederror', 'random_state': 42})

model = XGBRegressor(**best_params)
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"최적 파라미터로 테스트 RMSE: {rmse:.2f}")

[I 2025-08-10 08:01:42,063] A new study created in memory with name: no-name-529be121-e79b-42e6-a187-d2d0596ba244
[I 2025-08-10 08:01:56,065] Trial 0 finished with value: 399.6310562941456 and parameters: {'learning_rate': 0.0029047845511527347, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 3.0461334393501915, 'subsample': 0.7654893902256505, 'colsample_bytree': 0.8948590544971486, 'reg_alpha': 3.715241967606053, 'reg_lambda': 0.193407830135433}. Best is trial 0 with value: 399.6310562941456.
[I 2025-08-10 08:02:06,134] Trial 1 finished with value: 2746.8871564130745 and parameters: {'learning_rate': 0.00011497850053109983, 'max_depth': 7, 'min_child_weight': 11, 'gamma': 3.341099296289306, 'subsample': 0.8644587449984441, 'colsample_bytree': 0.9320599337805682, 'reg_alpha': 0.016569660341608894, 'reg_lambda': 0.2728216640800999}. Best is trial 0 with value: 399.6310562941456.
[I 2025-08-10 08:02:12,690] Trial 2 finished with value: 454.067229711136 and parameters: {'learning_rate': 

Best trial:
FrozenTrial(number=42, state=1, values=[270.27805578055046], datetime_start=datetime.datetime(2025, 8, 10, 8, 8, 58, 126487), datetime_complete=datetime.datetime(2025, 8, 10, 8, 9, 10, 356873), params={'learning_rate': 0.03687391448021568, 'max_depth': 8, 'min_child_weight': 8, 'gamma': 0.3233803584750722, 'subsample': 0.9442439028061014, 'colsample_bytree': 0.7747299461794387, 'reg_alpha': 2.574647577483356, 'reg_lambda': 0.08610785603688484}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.05, log=True, low=0.0001, step=None), 'max_depth': IntDistribution(high=8, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=15, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=0.95, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=0.95, log=False, low=0.6, step=None), 'reg_alpha': FloatDistributi

[I 2025-08-10 08:10:25,428] A new study created in memory with name: no-name-62842887-4a92-4ef6-a8c0-648c735a140d


최적 파라미터로 테스트 RMSE: 376.80


[I 2025-08-10 08:10:31,110] Trial 0 finished with value: 506.82317191369606 and parameters: {'learning_rate': 0.01593441693843738, 'max_depth': 4, 'min_child_weight': 8, 'gamma': 3.5327801406908006, 'subsample': 0.6387539207688455, 'colsample_bytree': 0.8049879595153102, 'reg_alpha': 4.997327849406261, 'reg_lambda': 0.0011890860808987773}. Best is trial 0 with value: 506.82317191369606.
[I 2025-08-10 08:10:41,002] Trial 1 finished with value: 325.28614754462336 and parameters: {'learning_rate': 0.013919408272271177, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 3.4724403099154433, 'subsample': 0.7939173130775132, 'colsample_bytree': 0.9004897933576778, 'reg_alpha': 0.08380874980779995, 'reg_lambda': 0.19042589300332488}. Best is trial 1 with value: 325.28614754462336.
[I 2025-08-10 08:10:46,684] Trial 2 finished with value: 835.2409536506212 and parameters: {'learning_rate': 0.0028949078823268704, 'max_depth': 4, 'min_child_weight': 14, 'gamma': 2.8244435626784457, 'subsample': 0.671

Best trial:
FrozenTrial(number=36, state=1, values=[263.1439909386774], datetime_start=datetime.datetime(2025, 8, 10, 8, 16, 19, 384369), datetime_complete=datetime.datetime(2025, 8, 10, 8, 16, 32, 468552), params={'learning_rate': 0.04963280156589795, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.3053150721034219, 'subsample': 0.7841917774211156, 'colsample_bytree': 0.9322733599504766, 'reg_alpha': 3.2561364847695473, 'reg_lambda': 0.003877689980132002}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.05, log=True, low=0.0001, step=None), 'max_depth': IntDistribution(high=8, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=15, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=0.95, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=0.95, log=False, low=0.6, step=None), 'reg_alpha': FloatDistrib

In [44]:
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)

rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
rmse_valid = np.sqrt(mean_squared_error(y_valid, valid_pred))

print(f"Train RMSE: {rmse_train:.4f}")
print(f"Valid RMSE: {rmse_valid:.4f}")
print(f"Gap: {rmse_valid - rmse_train:.4f}")

Train RMSE: 301.2840
Valid RMSE: 345.5672
Gap: 44.2832


In [45]:
import numpy as np

def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_true - y_pred) / denominator
    # 0으로 나누는 경우를 방지하기 위해 denominator가 0인 경우 제외
    diff = diff[denominator != 0]
    return np.mean(diff) * 100  # % 단위로 반환

# 사용 예시
smape_val = smape(y_valid, y_pred)
print(f"SMAPE: {smape_val:.2f}%")

SMAPE: 11.88%


## Prediction

In [46]:
X_valid

Unnamed: 0,건물번호,기온(°C),강수량(mm),풍속(m/s),습도(%),연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),...,time,DI,AT,Absolhumidity,Cooling_Area_Ratio,Perceived_Humidity_Index,ess_solar_ratio,pcs_solar_ratio,pcs_ess_ratio,bottleneck_kw
184895,91,26.1,0.0,0.8,91.0,364652.77,243493.983,0.00,0.0,0.0,...,23,77.94149,28.13,22.286849,0.667742,20.281033,0.000000,0.000000,0.000000,0.0
187594,92,26.6,3.1,6.4,97.0,27915.29,5628.000,322.90,209.0,100.0,...,10,79.51898,28.29,24.427810,0.201610,23.694976,0.647259,0.309693,0.478469,100.0
119228,59,22.9,1.3,0.4,96.0,169052.00,33867.000,0.00,0.0,0.0,...,20,72.88516,24.92,19.619665,0.200335,18.834878,0.000000,0.000000,0.000000,0.0
148570,73,30.1,0.0,1.5,75.0,139042.61,85996.600,103.00,0.0,0.0,...,10,82.30525,32.57,22.889339,0.618491,17.167004,0.000000,0.000000,0.000000,0.0
150165,74,28.1,0.0,1.9,83.0,146518.00,98493.000,0.00,0.0,0.0,...,21,80.28177,30.27,22.710738,0.672225,18.849912,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90934,45,23.8,0.0,2.4,89.0,196028.20,74378.250,0.00,0.0,0.0,...,22,73.82118,25.18,19.147379,0.379426,17.041167,0.000000,0.000000,0.000000,0.0
39625,20,24.9,0.0,4.1,90.0,44259.80,32639.290,100.00,0.0,0.0,...,1,75.78490,26.29,20.606606,0.737448,18.545945,0.000000,0.000000,0.000000,0.0
178855,88,27.3,0.0,0.2,89.0,129520.65,85751.070,38.88,0.0,0.0,...,7,79.74003,29.57,23.300914,0.662065,20.737814,0.000000,0.000000,0.000000,0.0
137902,68,27.7,0.0,1.8,86.0,111365.97,36356.070,94.38,0.0,0.0,...,22,80.02278,29.81,23.018717,0.326456,19.796097,0.000000,0.000000,0.000000,0.0


In [47]:
# X_train의 컬럼 순서로 test_x 재정렬
test_x = test_x[X_train.columns]
test_x

Unnamed: 0,건물번호,기온(°C),강수량(mm),풍속(m/s),습도(%),연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),...,time,DI,AT,Absolhumidity,Cooling_Area_Ratio,Perceived_Humidity_Index,ess_solar_ratio,pcs_solar_ratio,pcs_ess_ratio,bottleneck_kw
0,1,26.5,0.0,0.7,80.0,82912.71,77586.0,0.0,0.0,0.0,...,0,77.31300,28.59,20.034817,0.935755,16.027854,0.0,0.0,0.0,0.0
1,1,26.1,0.0,0.0,80.0,82912.71,77586.0,0.0,0.0,0.0,...,1,76.67220,29.34,19.592834,0.935755,15.674268,0.0,0.0,0.0,0.0
2,1,25.9,0.0,0.3,83.0,82912.71,77586.0,0.0,0.0,0.0,...,2,76.69203,28.10,20.101579,0.935755,16.684310,0.0,0.0,0.0,0.0
3,1,25.7,0.0,1.1,83.0,82912.71,77586.0,0.0,0.0,0.0,...,3,76.36569,27.62,19.877763,0.935755,16.498543,0.0,0.0,0.0,0.0
4,1,25.5,0.0,1.0,86.0,82912.71,77586.0,0.0,0.0,0.0,...,4,76.36770,27.42,20.366563,0.935755,17.515244,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,27.1,0.0,2.7,68.0,162070.24,152943.0,0.0,0.0,0.0,...,19,76.77072,29.06,17.606824,0.943683,11.972640,0.0,0.0,0.0,0.0
16796,100,26.5,0.0,2.9,73.0,162070.24,152943.0,0.0,0.0,0.0,...,20,76.47755,28.33,18.281771,0.943683,13.345693,0.0,0.0,0.0,0.0
16797,100,26.3,0.0,3.5,73.0,162070.24,152943.0,0.0,0.0,0.0,...,21,76.17101,28.05,18.079146,0.943683,13.197777,0.0,0.0,0.0,0.0
16798,100,26.0,0.0,3.4,68.0,162070.24,152943.0,0.0,0.0,0.0,...,22,75.13920,27.69,16.561113,0.943683,11.261557,0.0,0.0,0.0,0.0


In [48]:
 preds = model.predict(test_x)

## Submission

In [49]:
submission = pd.read_csv('./sample_submission.csv')
submission['answer'] = preds

In [50]:
submission.to_csv('./XGBoost_feature_add_v4.csv', index=False)