In [2]:
import pandas as pd

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
bi = pd.read_csv('building_info.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
bi = bi.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power',
    'ESS저장용량(kWh)': 'ess',
    'PCS용량(kW)': 'pcs'
})

In [6]:
replace_dict = {
    '건물기타': 'Other',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data_Center',
    '백화점및아울렛': 'Department_Store_and_Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Laboratory',
    '지식산업센터': 'Knowledge_Industry_Center',
    '할인마트': 'Discount_Mart',
    '호텔및리조트': 'Hotel_and_Resort'
}

bi['building_type'] = bi['building_type'].replace(replace_dict)
bi.head()

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power,ess,pcs
0,1,Other,110634.0,39570.0,-,-,-
1,2,Other,122233.47,99000.0,-,-,-
2,3,Other,171243.0,113950.0,40,-,-
3,4,Other,74312.98,34419.62,60,-,-
4,5,Other,205884.0,150000.0,-,2557,1000


In [7]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'precipitation',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

In [8]:
test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'precipitation',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity'
})

In [9]:
train.drop(['num_date_time', 'sunshine', 'solar_radiation'], axis=1, inplace=True)
test.drop('num_date_time', axis=1, inplace=True)

In [10]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')

In [None]:
train['year'] = train['date_time'].dt.year
train['month'] = train['date_time'].dt.month
train['day'] = train['date_time'].dt.day
train['hour'] = train['date_time'].dt.hour

In [11]:
test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')

In [None]:
test['month'] = test['date_time'].dt.month
test['hour'] = test['date_time'].dt.hour

In [12]:
train = pd.merge(train, bi, on='building_number', how='left')
test = pd.merge(test, bi, on='building_number', how='left')

In [None]:
train['day_of_year'] = train['date_time'].dt.dayofyear
train['day_of_week'] = train['date_time'].dt.dayofweek

In [13]:
train.replace('-', 0, inplace=True)
train = train.astype({'solar_power':'float', 'ess':'float', 'pcs':'float'})
train = train.astype({'solar_power':'int', 'ess':'int', 'pcs':'int'})

In [None]:
train.loc[train['solar_power'] > 0, 'solar_power'] = 1
train.loc[train['ess'] > 0, 'ess'] = 1
train.loc[train['pcs'] > 0, 'pcs'] = 1

In [None]:
test['day_of_week'] = test['date_time'].dt.dayofweek

In [14]:
test.replace('-', 0, inplace=True)
test = test.astype({'solar_power':'float', 'ess':'float', 'pcs':'float'})
test = test.astype({'solar_power':'int', 'ess':'int', 'pcs':'int'})

In [None]:
test.loc[test['solar_power'] > 0, 'solar_power'] = 1
test.loc[test['ess'] > 0, 'ess'] = 1
test.loc[test['pcs'] > 0, 'pcs'] = 1

In [15]:
train.head()

Unnamed: 0,building_number,date_time,temperature,precipitation,windspeed,humidity,power_consumption,building_type,total_area,cooling_area,solar_power,ess,pcs
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,1085.28,Other,110634.0,39570.0,0,0,0
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,1047.36,Other,110634.0,39570.0,0,0,0
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,974.88,Other,110634.0,39570.0,0,0,0
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,953.76,Other,110634.0,39570.0,0,0,0
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,986.4,Other,110634.0,39570.0,0,0,0


In [17]:
train.to_csv('train_bt.csv', index=False)
test.to_csv('test_bt.csv', index=False)

In [20]:
train.columns

Index(['building_number', 'date_time', 'temperature', 'precipitation',
       'windspeed', 'humidity', 'power_consumption', 'year', 'month', 'day',
       'hour', 'building_type', 'total_area', 'cooling_area', 'solar_power',
       'ess', 'pcs', 'day_of_year', 'day_of_week'],
      dtype='object')

In [32]:
train_ohe = pd.get_dummies(train, columns=['building_type'])

In [79]:
train_ohe = pd.get_dummies(train_ohe, columns=['day_of_week'])

In [33]:
train_ohe.head()

Unnamed: 0,building_number,date_time,temperature,precipitation,windspeed,humidity,power_consumption,total_area,cooling_area,solar_power,ess,pcs,building_type_Apartment,building_type_Commercial,building_type_Data_Center,building_type_Department_Store_and_Outlet,building_type_Discount_Mart,building_type_Hospital,building_type_Hotel_and_Resort,building_type_Knowledge_Industry_Center,building_type_Laboratory,building_type_Other,building_type_Public,building_type_University
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,1085.28,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,1047.36,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,974.88,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,953.76,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,986.4,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [34]:
train_ohe.drop(['precipitation'], axis=1, inplace=True)

In [35]:
test_ohe = pd.get_dummies(test, columns=['building_type'])
test_ohe.drop(['precipitation'], axis=1, inplace=True)

In [80]:
train_ohe.drop(['date_time', 'precipitation', 'year', 'day', 'day_of_year', 'total_area','cooling_area'], axis=1, inplace=True)

In [36]:
train_ohe.head()

Unnamed: 0,building_number,date_time,temperature,windspeed,humidity,power_consumption,total_area,cooling_area,solar_power,ess,pcs,building_type_Apartment,building_type_Commercial,building_type_Data_Center,building_type_Department_Store_and_Outlet,building_type_Discount_Mart,building_type_Hospital,building_type_Hotel_and_Resort,building_type_Knowledge_Industry_Center,building_type_Laboratory,building_type_Other,building_type_Public,building_type_University
0,1,2022-06-01 00:00:00,18.6,0.9,42.0,1085.28,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,2022-06-01 01:00:00,18.0,1.1,45.0,1047.36,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,2022-06-01 02:00:00,17.7,1.5,45.0,974.88,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,2022-06-01 03:00:00,16.7,1.4,48.0,953.76,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,2022-06-01 04:00:00,18.4,2.8,43.0,986.4,110634.0,39570.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [62]:
test_ohe = pd.get_dummies(test, columns=['day_of_week', 'building_type'])

In [63]:
test_ohe.drop(['date_time','precipitation', 'total_area', 'cooling_area'], axis=1, inplace=True)

In [27]:
train_ohe.isnull().sum()

building_number                               0
temperature                                   0
windspeed                                    19
humidity                                      9
power_consumption                             0
total_area                                    0
cooling_area                                  0
solar_power                                   0
ess                                           0
pcs                                           0
building_type_Apartment                       0
building_type_Commercial                      0
building_type_Data_Center                     0
building_type_Department_Store_and_Outlet     0
building_type_Discount_Mart                   0
building_type_Hospital                        0
building_type_Hotel_and_Resort                0
building_type_Knowledge_Industry_Center       0
building_type_Laboratory                      0
building_type_Other                           0
building_type_Public                    

In [23]:
train_ohe.dropna(inplace=True)

In [37]:
train_ohe.to_csv('train_ohe.csv', index=False)
test_ohe.to_csv('test_ohe.csv', index=False)

## Simple Prediction

In [70]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import copy
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [71]:
def smape(true, pred):
    v = 2 * abs(pred - true) / (abs(pred) + abs(true))
    output = np.mean(v) * 100
    return output

In [90]:
buildings = dict()
models = dict()
for i in range(0, 100):
    models['RF'] = RandomForestRegressor(random_state=41)
    models['XGB'] = XGBRegressor(random_state=41)
    buildings[f'building_number_{i}'] = copy.deepcopy(models)

In [87]:
ds_sep = dict()
datasets = dict()
for i in range(0, 100):
    datasets['train'] = train_ohe[train_ohe['building_number']==(i+1)].drop('building_number', axis=1).reset_index()
    datasets['test'] = test_ohe[test_ohe['building_number']==(i+1)].drop('building_number', axis=1).reset_index()
    ds_sep[f'building_number_{i}'] = copy.deepcopy(datasets)

In [88]:
metrics_history = dict()

for i in range(0, 100):
    X = np.array(ds_sep[f'building_number_{i}']['train'].drop('power_consumption', axis=1))
    y = ds_sep[f'building_number_{i}']['train']['power_consumption']

    kf = KFold(n_splits = 5, shuffle=True, random_state=41)

    met = dict()
    
    for tr_i, ts_i in kf.split(X):
        X_tr, X_ts = X[tr_i], X[ts_i]
        y_tr, y_ts = y[tr_i], y[ts_i]

        buildings[f'building_number_{i}']['RF'].fit(X_tr, y_tr)
        buildings[f'building_number_{i}']['XGB'].fit(X_tr, y_tr)

        y_pred_rf = buildings[f'building_number_{i}']['RF'].predict(X_ts)
        y_pred_xgb = buildings[f'building_number_{i}']['XGB'].predict(X_ts)

        met['RF'] = [
            mean_absolute_error(y_ts, y_pred_rf),
            mean_squared_error(y_ts, y_pred_rf),
            smape(y_ts, y_pred_rf)
        ]
        met['XGB'] = [
            mean_absolute_error(y_ts, y_pred_rf),
            mean_squared_error(y_ts, y_pred_rf),
            smape(y_ts, y_pred_rf)
        ]

    metrics_history[f'building_number_{i}'] = copy.deepcopy(met)

In [89]:
metrics_history

{'building_number_0': {'RF': [124.29667058823536,
   32626.27607376003,
   5.203373208785136],
  'XGB': [124.29667058823536, 32626.27607376003, 5.203373208785136]},
 'building_number_1': {'RF': [96.21001764705893,
   22088.71730625353,
   6.0471727459103795],
  'XGB': [96.21001764705893, 22088.71730625353, 6.0471727459103795]},
 'building_number_2': {'RF': [114.59840735294117,
   37862.82710003381,
   7.90260261118301],
  'XGB': [114.59840735294117, 37862.82710003381, 7.90260261118301]},
 'building_number_3': {'RF': [30.576156617647058,
   1563.4040801697804,
   3.2603671471697364],
  'XGB': [30.576156617647058, 1563.4040801697804, 3.2603671471697364]},
 'building_number_4': {'RF': [152.46010588235293,
   75148.7041725669,
   5.264273295420993],
  'XGB': [152.46010588235293, 75148.7041725669, 5.264273295420993]},
 'building_number_5': {'RF': [58.651594117647086,
   9104.006335976474,
   3.310267814835202],
  'XGB': [58.651594117647086, 9104.006335976474, 3.310267814835202]},
 'building

In [98]:
prediction = dict()
for i in range(0, 100):
    pred = dict()
    X = np.array(ds_sep[f'building_number_{i}']['train'].drop('power_consumption', axis=1))
    y = ds_sep[f'building_number_{i}']['train']['power_consumption']

    buildings[f'building_number_{i}']['RF'].fit(X, y)
    buildings[f'building_number_{i}']['XGB'].fit(X, y)

    y_pred_rf = buildings[f'building_number_{i}']['RF'].predict(np.array(ds_sep[f'building_number_{i}']['test']))
    y_pred_xgb = buildings[f'building_number_{i}']['XGB'].predict(ds_sep[f'building_number_{i}']['test'])

    pred['RF'] = copy.deepcopy(y_pred_rf)
    pred['XGB'] = copy.deepcopy(y_pred_xgb)
    prediction[f'building_number_{i}'] = copy.deepcopy(pred)

In [99]:
prediction

{'building_number_0': {'RF': array([1152.3696, 1145.4   , 1148.0448, 1147.944 , 1140.0864, 1149.7344,
         1163.736 , 1158.5424, 1171.776 , 1158.9216, 1148.8128, 1148.6496,
         1161.6336, 1176.816 , 1176.36  , 1134.8352, 1153.0032, 1162.392 ,
         1159.6848, 1155.4704, 1150.656 , 1149.864 , 1140.312 , 1146.3264,
         1140.4944, 1151.1504, 1157.9952, 1150.8048, 1154.712 , 1151.3136,
         1150.0224, 1150.2336, 1161.6624, 1157.6496, 1156.5024, 1206.5952,
         1207.2432, 1213.5024, 1216.464 , 1227.3672, 1227.8808, 1222.5648,
         1210.3824, 1167.144 , 1165.656 , 1167.7392, 1173.6624, 1178.6112,
         1179.1488, 1172.6352, 1175.1936, 1161.5424, 1154.3808, 1148.1984,
         1132.8672, 1138.3872, 1112.544 , 1112.1552, 1131.2544, 1164.4416,
         1154.0784, 1166.328 , 1167.1776, 1197.9744, 1205.5392, 1194.528 ,
         1154.0448, 1153.1856, 1141.7808, 1130.1792, 1123.8528, 1127.7792,
         1108.9728, 1105.2528, 1099.7904, 1097.8416, 1104.4176, 1109.6352

In [94]:
sub = pd.read_csv('sample_submission.csv')

In [96]:
sub.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0


In [105]:
rf_pred = []
for i in range(0, 100):
    rf_pred.extend(prediction[f'building_number_{i}']['RF'])

In [106]:
rf_pred

[1152.3696000000002,
 1145.4,
 1148.0448000000004,
 1147.9440000000002,
 1140.0864000000004,
 1149.7344000000005,
 1163.7360000000006,
 1158.5424000000005,
 1171.776,
 1158.9216,
 1148.8128000000004,
 1148.6496000000004,
 1161.6336000000001,
 1176.8160000000003,
 1176.3600000000004,
 1134.8352000000004,
 1153.0031999999999,
 1162.392,
 1159.6848,
 1155.4704000000002,
 1150.6560000000004,
 1149.8640000000005,
 1140.3120000000001,
 1146.3264000000008,
 1140.494400000001,
 1151.150400000001,
 1157.9952000000012,
 1150.804800000001,
 1154.7120000000014,
 1151.313600000001,
 1150.0224000000012,
 1150.2336000000012,
 1161.6624000000008,
 1157.6495999999997,
 1156.5024000000005,
 1206.5952000000007,
 1207.2432000000006,
 1213.502400000001,
 1216.4640000000009,
 1227.3672000000008,
 1227.8808000000008,
 1222.5648000000008,
 1210.3824000000006,
 1167.1440000000007,
 1165.6560000000004,
 1167.7392000000002,
 1173.6624000000002,
 1178.6112000000003,
 1179.1488,
 1172.6352,
 1175.1936000000014,
 1

In [107]:
sub['answer'] = rf_pred

In [108]:
sub.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1152.3696
1,1_20220825 01,1145.4
2,1_20220825 02,1148.0448
3,1_20220825 03,1147.944
4,1_20220825 04,1140.0864


In [109]:
sub.to_csv('rf_sub_01.csv', index=False)

In [110]:
xgb_pred = []
for i in range(0, 100):
    xgb_pred.extend(prediction[f'building_number_{i}']['XGB'])

sub['answer'] = xgb_pred

In [112]:
sub.to_csv('xgb_sub_02.csv', index=False)

In [114]:
train_ohe.to_csv('train_ohe.csv', index=False)
test_ohe.to_csv('test_ohe.csv', index=False)

성능이 만족스럽지 못했다.

빌딩 넘버 별로 구분하는 것보다 건물 유형에 따라 예측 모델을 만드는 것이 좋다고 판단된다.

In [127]:
building_types = dict()
models = dict()
t = ['Apartment','Commercial','Data_Center','Department_Store_and_Outlet','Discount_Mart',
     'Hospital','Hotel_and_Resort','Knowledge_Industry_Center','Laboratory','Other',
     'Public','University']

for i in t:
    models['RF'] = RandomForestRegressor(random_state=41)
    models['XGB'] = XGBRegressor(random_state=41)
    building_types[i] = copy.deepcopy(models)

In [135]:
test_ohe[test_ohe[f'building_type_{i}']==1].drop('building_number', axis=1).index

Int64Index([10080, 10081, 10082, 10083, 10084, 10085, 10086, 10087, 10088,
            10089,
            ...
            11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11422,
            11423],
           dtype='int64', length=1344)

In [136]:
ds_sep = dict()
datasets = dict()

for i in t:
    datasets['train'] = train_ohe[train_ohe[f'building_type_{i}']==1].drop('building_number', axis=1).reset_index()
    datasets['test'] = test_ohe[test_ohe[f'building_type_{i}']==1].drop('building_number', axis=1).reset_index()
    datasets['test_index'] = test_ohe[test_ohe[f'building_type_{i}']==1].drop('building_number', axis=1).index
    ds_sep[i] = copy.deepcopy(datasets)

In [124]:
metrics_history = dict()

for i in t:
    X = np.array(ds_sep[i]['train'].drop('power_consumption', axis=1))
    y = ds_sep[i]['train']['power_consumption']

    kf = KFold(n_splits = 5, shuffle=True, random_state=41)

    met = dict()
    
    for tr_i, ts_i in kf.split(X):
        X_tr, X_ts = X[tr_i], X[ts_i]
        y_tr, y_ts = y[tr_i], y[ts_i]

        building_types[i]['RF'].fit(X_tr, y_tr)
        building_types[i]['XGB'].fit(X_tr, y_tr)

        y_pred_rf = building_types[i]['RF'].predict(X_ts)
        y_pred_xgb = building_types[i]['XGB'].predict(X_ts)

        met['RF'] = [
            mean_absolute_error(y_ts, y_pred_rf),
            mean_squared_error(y_ts, y_pred_rf),
            smape(y_ts, y_pred_rf)
        ]
        met['XGB'] = [
            mean_absolute_error(y_ts, y_pred_rf),
            mean_squared_error(y_ts, y_pred_rf),
            smape(y_ts, y_pred_rf)
        ]

    metrics_history[i] = copy.deepcopy(met)

In [126]:
metrics_history

{'Apartment': {'RF': [58.49574981617645,
   10701.523118080546,
   4.399703620725961],
  'XGB': [58.49574981617645, 10701.523118080546, 4.399703620725961]},
 'Commercial': {'RF': [72.40540459558828,
   20568.843588564385,
   4.115088366538639],
  'XGB': [72.40540459558828, 20568.843588564385, 4.115088366538639]},
 'Data_Center': {'RF': [18.6375173529413,
   1608.0163448073715,
   0.37894569513891213],
  'XGB': [18.6375173529413, 1608.0163448073715, 0.37894569513891213]},
 'Department_Store_and_Outlet': {'RF': [77.22928501379099,
   28523.7129012725,
   4.42844092745197],
  'XGB': [77.22928501379099, 28523.7129012725, 4.42844092745197]},
 'Discount_Mart': {'RF': [65.0818121091355,
   15758.315713208653,
   5.425387558104109],
  'XGB': [65.0818121091355, 15758.315713208653, 5.425387558104109]},
 'Hospital': {'RF': [71.06813729696601,
   12534.801637772016,
   2.6172910544514147],
  'XGB': [71.06813729696601, 12534.801637772016, 2.6172910544514147]},
 'Hotel_and_Resort': {'RF': [67.502638

In [137]:
prediction = dict()
for i in t:
    pred = dict()
    X = np.array(ds_sep[i]['train'].drop('power_consumption', axis=1))
    y = ds_sep[i]['train']['power_consumption']

    building_types[i]['RF'].fit(X, y)
    building_types[i]['XGB'].fit(X, y)

    y_pred_rf = building_types[i]['RF'].predict(np.array(ds_sep[i]['test']))
    y_pred_xgb = building_types[i]['XGB'].predict(ds_sep[i]['test'])

    pred['RF'] = copy.deepcopy(y_pred_rf)
    pred['XGB'] = copy.deepcopy(y_pred_xgb)
    prediction[i] = copy.deepcopy(pred)

In [138]:
prediction

{'Apartment': {'RF': array([2360.1672, 2338.8516, 2340.5364, ..., 2282.6898, 2268.1944,
         2268.8172]),
  'XGB': array([2395.421 , 2379.8093, 2368.6707, ..., 2230.317 , 2225.8384,
         2201.3035], dtype=float32)},
 'Commercial': {'RF': array([746.04  , 708.6576, 684.3024, ..., 673.6704, 671.6544, 699.0384]),
  'XGB': array([684.3635 , 631.0952 , 658.7235 , ..., 692.6644 , 701.8624 ,
         594.03845], dtype=float32)},
 'Data_Center': {'RF': array([9564.2712, 9556.4016, 9559.9584, 9554.3208, 9532.188 , 9559.08  ,
         9557.1648, 9553.5216, 9562.536 , 9563.9976, 9566.9208, 9587.4624,
         9575.2944, 9557.0136, 9561.348 , 9482.4936, 9538.9344, 9559.188 ,
         9566.964 , 9558.576 , 9551.3904, 9544.7664, 9549.8352, 9517.9536,
         9508.0536, 9494.8596, 9506.0052, 9476.0928, 9481.032 , 9485.3376,
         9472.5504, 9468.4536, 9548.784 , 9566.0352, 9587.6496, 9537.228 ,
         9529.128 , 9527.85  , 9521.1756, 9528.4188, 9518.4036, 9514.9512,
         9539.3808, 

In [146]:
ans_rf = [0 for x in range(len(test_ohe))]
ans_xgb = [0 for x in range(len(test_ohe))]

In [152]:
i

'Apartment'

In [157]:
for i in t:
    idx = 0
    for j in ds_sep[i]['test_index']:
        ans_rf[j] = prediction[i]['RF'][idx]
        ans_xgb[j] = prediction[i]['XGB'][idx]

        idx += 1

In [160]:
ans_rf = np.round(ans_rf, 0)
ans_xgb = np.round(ans_xgb, 0)

In [162]:
sub['answer'] = ans_rf

In [165]:
sub.to_csv('rf_sub_03.csv', index=False)

In [166]:
sub['answer'] = ans_xgb
sub.to_csv('xgb_sub_04.csv', index=False)

### xgb_sub_04.csv 제출요망

성능이 좋지 않았다.

그러므로 비슷한 추세를 보이는 유형끼리 묶는 것으로 학습 데이터를 늘린다.

분류는 네가지
- 시간대에 따른 격차가 매우 큰 유형 (University, Department_Store_and_Outlet)
- 시간대에 따른 격차가 보통인 유형 (Commercial, Hospital, Discount_Mart, Hotel_and_Resort, Knowledge_Industry_Center, Laboratory, Other, Public)
- 저녁 시간대에 증가하는 유형 (Apartment)
- 일정한 유형 (Data_Center) 

In [115]:
usage_types = dict()
models = dict()
for i in range(0, 4):
    models['RF'] = RandomForestRegressor(random_state=41)
    models['XGB'] = XGBRegressor(random_state=41)
    usage_types[f'usage_{i}'] = copy.deepcopy(models)

In [None]:
ds_sep = dict()
datasets = dict()
t = [['University','Department_Store_and_Outlet'],
     ['Hospital','Commercial','Discount_Mar','Hotel_and_Resort','Knowledge_Industry_Center','Laboratory','Other','Public'],
     ['Apartment'],
     ['Data_Center']
]

for i in t:
    for j in i:
        
        datasets['train'] = train_ohe[train_ohe[f'building_type_{j}']==1].drop('building_number', axis=1).reset_index()
        datasets['test'] = test_ohe[test_ohe['building_number']==(i+1)].drop('building_number', axis=1).reset_index()
        ds_sep[f'building_number_{i}'] = copy.deepcopy(datasets)