In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import copy
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
def smape(true, pred):
    """Return the symmetric mean absolute percentage error (SMAPE) in percent.

    Each term is ``2 * |pred - true| / (|pred| + |true|)``; the mean of all
    terms is multiplied by 100.

    Parameters
    ----------
    true : array-like
        Observed values.
    pred : array-like
        Predicted values, same shape as ``true`` (or broadcastable to it).

    Returns
    -------
    float
        SMAPE in percent. A term where both the observed and the predicted
        value are 0 counts as zero error instead of turning the result NaN.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    numerator = 2 * np.abs(pred - true)
    denominator = np.abs(pred) + np.abs(true)

    # Only divide where the denominator is non-zero; the remaining terms
    # (exact prediction of a zero target) keep the 0 from `out`.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms) * 100

In [4]:
# Load the building metadata and the train/test tables from the working directory.
bi, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('building_info.csv', 'train.csv', 'test.csv')
)

In [5]:
# Give the building-info columns English snake_case names.
bi = bi.rename(
    {
        '건물번호': 'building_number',
        '건물유형': 'building_type',
        '연면적(m2)': 'total_area',
        '냉방면적(m2)': 'cooling_area',
        '태양광용량(kW)': 'solar_power',
        'ESS저장용량(kWh)': 'ess',
        'PCS용량(kW)': 'pcs',
    },
    axis='columns',
)

In [6]:
# Korean building-type label -> English label.
replace_dict = dict([
    ('건물기타', 'Other'),
    ('공공', 'Public'),
    ('대학교', 'University'),
    ('데이터센터', 'Data_Center'),
    ('백화점및아울렛', 'Department_Store_and_Outlet'),
    ('병원', 'Hospital'),
    ('상용', 'Commercial'),
    ('아파트', 'Apartment'),
    ('연구소', 'Laboratory'),
    ('지식산업센터', 'Knowledge_Industry_Center'),
    ('할인마트', 'Discount_Mart'),
    ('호텔및리조트', 'Hotel_and_Resort'),
])

# Labels that are not keys of the mapping are left unchanged by replace().
bi = bi.assign(building_type=bi['building_type'].replace(replace_dict))

In [7]:
# Give the training columns English snake_case names.
train = train.rename(
    {
        '건물번호': 'building_number',
        '일시': 'date_time',
        '기온(C)': 'temperature',
        '강수량(mm)': 'precipitation',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
        '일조(hr)': 'sunshine',
        '일사(MJ/m2)': 'solar_radiation',
        '전력소비량(kWh)': 'power_consumption',
    },
    axis='columns',
)

In [8]:
# Give the test columns the same English names used for the training frame.
test = test.rename(
    {
        '건물번호': 'building_number',
        '일시': 'date_time',
        '기온(C)': 'temperature',
        '강수량(mm)': 'precipitation',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
    },
    axis='columns',
)

In [9]:
# Remove the row identifier from both frames, plus the two training columns
# that the test rename above does not list.
train = train.drop(columns=['num_date_time', 'sunshine', 'solar_radiation'])
test = test.drop(columns='num_date_time')

In [10]:
# Parse the 'YYYYMMDD HH' timestamps, then derive calendar parts from them.
train = train.assign(
    date_time=pd.to_datetime(train['date_time'], format='%Y%m%d %H')
)

train = train.assign(
    month=train['date_time'].dt.month,
    day=train['date_time'].dt.day,
    hour=train['date_time'].dt.hour,
)

In [11]:
# Parse the 'YYYYMMDD HH' timestamps, then derive calendar parts from them.
test = test.assign(
    date_time=pd.to_datetime(test['date_time'], format='%Y%m%d %H')
)

test = test.assign(
    month=test['date_time'].dt.month,
    day=test['date_time'].dt.day,
    hour=test['date_time'].dt.hour,
)

In [12]:
# Attach the building metadata to every row; the left join keeps all rows
# of train/test even if a building number is missing from `bi`.
train = train.merge(bi, how='left', on='building_number')
test = test.merge(bi, how='left', on='building_number')

In [13]:
train = train.assign(
    day_of_year=train['date_time'].dt.dayofyear,
    day_of_week=train['date_time'].dt.dayofweek,
)

# '-' is replaced by 0 everywhere in the frame (presumably a "not installed"
# placeholder in the equipment columns -- TODO confirm against the raw data).
train = train.replace('-', 0)
# Cast via float first, then truncate to int.
train = (
    train
    .astype({'solar_power': 'float', 'ess': 'float', 'pcs': 'float'})
    .astype({'solar_power': 'int', 'ess': 'int', 'pcs': 'int'})
)
# Collapse every positive capacity to the flag value 1.
train = train.assign(
    solar_power=train['solar_power'].clip(upper=1),
    ess=train['ess'].clip(upper=1),
    pcs=train['pcs'].clip(upper=1),
)

In [14]:
test = test.assign(
    day_of_year=test['date_time'].dt.dayofyear,
    day_of_week=test['date_time'].dt.dayofweek,
)

# '-' is replaced by 0 everywhere in the frame (presumably a "not installed"
# placeholder in the equipment columns -- TODO confirm against the raw data).
test = test.replace('-', 0)
# Cast via float first, then truncate to int.
test = (
    test
    .astype({'solar_power': 'float', 'ess': 'float', 'pcs': 'float'})
    .astype({'solar_power': 'int', 'ess': 'int', 'pcs': 'int'})
)
# Collapse every positive capacity to the flag value 1.
test = test.assign(
    solar_power=test['solar_power'].clip(upper=1),
    ess=test['ess'].clip(upper=1),
    pcs=test['pcs'].clip(upper=1),
)

In [15]:
# The timestamp has been expanded into calendar parts; drop it together with
# the features that are not used for modelling.
train = train.drop(columns=['date_time', 'precipitation', 'total_area', 'cooling_area'])

In [16]:
# Mirror the column removal applied to the training frame.
test = test.drop(columns=['date_time', 'precipitation', 'total_area', 'cooling_area'])

### 건물 유형 별 모델 생성

In [17]:
# Work on a copy of `train` in which day_of_week is binned into a binary
# flag: values 0-4 -> 0, values 5-6 -> 1.
train_bt = train.assign(
    day_of_week=pd.cut(
        train['day_of_week'],
        bins=[0, 4, 6],
        labels=[0, 1],
        include_lowest=True,
    ).astype('int')
)

In [18]:
# Work on a copy of `test` in which day_of_week is binned into a binary
# flag: values 0-4 -> 0, values 5-6 -> 1.
test_bt = test.assign(
    day_of_week=pd.cut(
        test['day_of_week'],
        bins=[0, 4, 6],
        labels=[0, 1],
        include_lowest=True,
    ).astype('int')
)

In [19]:
# Preview the first rows of the prepared training frame.
train_bt.head()

Unnamed: 0,building_number,temperature,windspeed,humidity,power_consumption,month,day,hour,building_type,solar_power,ess,pcs,day_of_year,day_of_week
0,1,18.6,0.9,42.0,1085.28,6,1,0,Other,0,0,0,152,0
1,1,18.0,1.1,45.0,1047.36,6,1,1,Other,0,0,0,152,0
2,1,17.7,1.5,45.0,974.88,6,1,2,Other,0,0,0,152,0
3,1,16.7,1.4,48.0,953.76,6,1,3,Other,0,0,0,152,0
4,1,18.4,2.8,43.0,986.4,6,1,4,Other,0,0,0,152,0


In [20]:
# Count the missing values in each column.
train_bt.isna().sum()

building_number       0
temperature           0
windspeed            19
humidity              9
power_consumption     0
month                 0
day                   0
hour                  0
building_type         0
solar_power           0
ess                   0
pcs                   0
day_of_year           0
day_of_week           0
dtype: int64

In [19]:
# Discard every training row that has a missing value; the index is not reset here.
train_bt = train_bt.dropna()

In [200]:
# One fresh (unfitted) RF/XGB pair per building type.
building_types = dict()
models = dict()
t = [
    'Apartment', 'Commercial', 'Data_Center', 'Department_Store_and_Outlet',
    'Discount_Mart', 'Hospital', 'Hotel_and_Resort', 'Knowledge_Industry_Center',
    'Laboratory', 'Other', 'Public', 'University',
]

for i in t:
    models.update(
        RF=RandomForestRegressor(random_state=41),
        XGB=XGBRegressor(random_state=42),
    )
    # Deep copy so that each building type owns independent estimator objects.
    building_types[i] = copy.deepcopy(models)

In [201]:
# Split train/test by building type. For the test part, remember the original
# row labels so predictions can be written back in the original order.
ds_sep = dict()
datasets = dict()

for i in t:
    train_part = train_bt.loc[train_bt['building_type'] == i].drop(
        columns=['building_number', 'building_type']
    )
    test_part = test_bt.loc[test_bt['building_type'] == i].drop(
        columns=['building_number', 'building_type']
    )
    datasets.update(
        train=train_part.reset_index(drop=True),
        test=test_part.reset_index(drop=True),
        test_index=test_part.index,
    )
    ds_sep[i] = copy.deepcopy(datasets)

In [202]:
# 5-fold cross-validation of the RF and XGB models for every building type.
# metrics_history[type][model] is [MAE, MSE, SMAPE], averaged over the folds.
metrics_history = dict()

for i in t:
    X = np.array(ds_sep[i]['train'].drop('power_consumption', axis=1))
    y = ds_sep[i]['train']['power_consumption']

    kf = KFold(n_splits=5, shuffle=True, random_state=41)

    # One [MAE, MSE, SMAPE] row per fold and model.
    fold_scores = {'RF': [], 'XGB': []}

    for tr_i, ts_i in kf.split(X):
        X_tr, X_ts = X[tr_i], X[ts_i]
        # KFold yields positions, so index the Series positionally.
        y_tr, y_ts = y.iloc[tr_i], y.iloc[ts_i]

        for name in fold_scores:
            estimator = building_types[i][name]
            estimator.fit(X_tr, y_tr)
            # Score each model on its own predictions.
            y_pred = estimator.predict(X_ts)
            fold_scores[name].append([
                mean_absolute_error(y_ts, y_pred),
                mean_squared_error(y_ts, y_pred),
                smape(y_ts, y_pred),
            ])

    # Average over folds instead of keeping only the last fold's scores.
    met = {
        name: np.mean(scores, axis=0).tolist()
        for name, scores in fold_scores.items()
    }
    metrics_history[i] = copy.deepcopy(met)

In [203]:
# Replace the estimators of every building type with fresh, unfitted ones.
for i in t:
    models.update(
        RF=RandomForestRegressor(random_state=41),
        XGB=XGBRegressor(random_state=42),
    )
    building_types[i] = copy.deepcopy(models)

In [204]:
# Fit both models on the full training data of each building type and
# predict that type's test rows.
# prediction[type][model] is the array of predictions in test-row order.
prediction = dict()
for i in t:
    train_part = ds_sep[i]['train']
    feature_cols = train_part.columns.drop('power_consumption')

    X = train_part[feature_cols].to_numpy()
    y = train_part['power_consumption']
    # Select the test columns by name in the training feature order, and give
    # both models the same ndarray input (a missing column raises KeyError
    # instead of silently shifting features).
    X_te = ds_sep[i]['test'][feature_cols].to_numpy()

    pred = dict()
    for name, estimator in building_types[i].items():
        estimator.fit(X, y)
        pred[name] = estimator.predict(X_te)

    prediction[i] = pred

In [205]:
# Display the nested {building_type: {'RF': ..., 'XGB': ...}} prediction arrays.
prediction

{'Apartment': {'RF': array([1101.38037   , 1042.1016    ,  878.07118143, ..., 1300.25811   ,
         1102.062875  , 1175.359225  ]),
  'XGB': array([1354.991   , 1188.9044  ,  937.2575  , ...,  155.77855 ,
         -918.80273 ,  -89.117294], dtype=float32)},
 'Commercial': {'RF': array([1095.87194, 1418.5738 , 1846.63104, ..., 2649.15412, 2618.0308 ,
         2191.3761 ]),
  'XGB': array([ -17.935272, 1311.6565  , 1655.1768  , ..., 3808.6433  ,
         3945.4763  , 3293.0388  ], dtype=float32)},
 'Data_Center': {'RF': array([5396.59332786, 5347.16030095, 5640.47117714, 4552.99365143,
         4458.81328013, 4866.79465143, 5131.299615  , 4617.05337857,
         4901.51561143, 5291.53162571, 4529.85154857, 5094.33012714,
         4337.05709714, 5127.76646714, 5457.61405381, 4632.37489571,
         5323.84146714, 4974.10775143, 4761.05509   , 4719.85879   ,
         4809.1492    , 5253.12808286, 4064.58705429, 4889.26175247,
         4906.36122714, 5091.24174429, 5312.41816571, 5040.177

In [206]:
# One answer slot per test row, filled in from the per-type predictions later.
ans_rf = [0] * len(test_bt)
ans_xgb = [0] * len(test_bt)

In [207]:
# Write each per-type prediction back to the slot of its original test row.
for i in t:
    for j, rf_value, xgb_value in zip(
        ds_sep[i]['test_index'], prediction[i]['RF'], prediction[i]['XGB']
    ):
        ans_rf[j] = rf_value
        ans_xgb[j] = xgb_value

In [208]:
# Round the answers to whole numbers (the result is a NumPy array).
ans_rf = np.around(ans_rf, decimals=0)
ans_xgb = np.around(ans_xgb, decimals=0)

In [209]:
# Load the submission template.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [210]:
# Save the per-building-type RandomForest predictions.
sub = sub.assign(answer=ans_rf)
sub.to_csv('rf_sub_05.csv', index=False)

In [211]:
# Save the per-building-type XGBoost predictions.
sub = sub.assign(answer=ans_xgb)
sub.to_csv('xgb_sub_06.csv', index=False)

제출 결과 RF의 경우 38.132로 지금까지 중 가장 높은 성능을 보임

### Feature Selection 수행

In [None]:
# 5-fold cross-validation of the RF and XGB models for every building type.
# metrics_history[type][model] is [MAE, MSE, SMAPE], averaged over the folds.
metrics_history = dict()

for i in t:
    X = np.array(ds_sep[i]['train'].drop('power_consumption', axis=1))
    y = ds_sep[i]['train']['power_consumption']

    kf = KFold(n_splits=5, shuffle=True, random_state=41)

    # One [MAE, MSE, SMAPE] row per fold and model.
    fold_scores = {'RF': [], 'XGB': []}

    for tr_i, ts_i in kf.split(X):
        X_tr, X_ts = X[tr_i], X[ts_i]
        # KFold yields positions, so index the Series positionally.
        y_tr, y_ts = y.iloc[tr_i], y.iloc[ts_i]

        for name in fold_scores:
            estimator = building_types[i][name]
            estimator.fit(X_tr, y_tr)
            # Score each model on its own predictions.
            y_pred = estimator.predict(X_ts)
            fold_scores[name].append([
                mean_absolute_error(y_ts, y_pred),
                mean_squared_error(y_ts, y_pred),
                smape(y_ts, y_pred),
            ])

    # Average over folds instead of keeping only the last fold's scores.
    met = {
        name: np.mean(scores, axis=0).tolist()
        for name, scores in fold_scores.items()
    }
    metrics_history[i] = copy.deepcopy(met)

In [214]:
# Features/target for a single model trained on all buildings at once
# (building_number is kept as a feature; the building_type label is dropped).
X = train_bt.drop(columns=['building_type', 'power_consumption'])
y = train_bt.loc[:, 'power_consumption']

In [216]:
# Recursive feature elimination down to five features.
# The forest is seeded so the selected feature list is reproducible on re-run.
model = RandomForestRegressor(random_state=41)
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, y)
# Names of the columns RFE kept.
fs = X.columns[fit.support_].tolist()
print(fs)

['building_number', 'hour', 'solar_power', 'day_of_year', 'day_of_week']


In [217]:
# Train on the RFE-selected features only; seeded like the other forests in
# this notebook so the submission can be reproduced.
model = RandomForestRegressor(random_state=41)
model.fit(X[fs], y)

In [219]:
# Test features without the building_type label.
X_test = test_bt.drop(columns=['building_type'])

In [220]:
# Predict using only the selected feature columns.
pred = model.predict(X_test.loc[:, fs])

In [222]:
# Store the predictions in the submission frame.
sub = sub.assign(answer=pred)

In [223]:
# Save the submission produced from the RFE-selected features.
sub.to_csv(path_or_buf='rf_sub_07.csv', index=False)

RFE로 선택한 5개 특성만으로 학습한 단일 RF 모델(rf_sub_07.csv)이 지금까지 중 제일 높은 성능을 보였다.

어째서인지는 모르겠다.

전력 소비량 그래프를 보고 직접 유형을 나눠보는 것도 고려할 것

In [26]:
# Preview the first rows of the prepared training frame.
train_bt.head()

Unnamed: 0,building_number,temperature,windspeed,humidity,power_consumption,month,day,hour,building_type,solar_power,ess,pcs,day_of_year,day_of_week
0,1,18.6,0.9,42.0,1085.28,6,1,0,Other,0,0,0,152,0
1,1,18.0,1.1,45.0,1047.36,6,1,1,Other,0,0,0,152,0
2,1,17.7,1.5,45.0,974.88,6,1,2,Other,0,0,0,152,0
3,1,16.7,1.4,48.0,953.76,6,1,3,Other,0,0,0,152,0
4,1,18.4,2.8,43.0,986.4,6,1,4,Other,0,0,0,152,0


In [34]:
# Independent working copy of the training frame.
train_tmp = train_bt.copy(deep=True)

In [43]:
# Independent working copy of the test frame.
test_tmp = test_bt.copy(deep=True)

In [35]:
# Integer code for each English building-type label.
replace_dict = {
    'Other': 0,
    'Public': 1,
    'University': 2,
    'Data_Center': 3,
    'Department_Store_and_Outlet': 4,
    'Hospital': 5,
    'Commercial': 6,
    'Apartment': 7,
    'Laboratory': 8,
    'Knowledge_Industry_Center': 9,
    'Discount_Mart': 10,
    'Hotel_and_Resort': 11
}

# map() produces a numeric column directly; replace() on an object column
# depends on dtype downcasting that pandas 2.2 deprecates.
train_tmp['building_type'] = train_bt['building_type'].map(replace_dict)
# map() turns unknown labels into NaN, so fail loudly instead of training on them.
assert train_tmp['building_type'].notna().all(), 'unmapped building_type label'

In [44]:
# Integer code for each English building-type label.
replace_dict = {
    'Other': 0,
    'Public': 1,
    'University': 2,
    'Data_Center': 3,
    'Department_Store_and_Outlet': 4,
    'Hospital': 5,
    'Commercial': 6,
    'Apartment': 7,
    'Laboratory': 8,
    'Knowledge_Industry_Center': 9,
    'Discount_Mart': 10,
    'Hotel_and_Resort': 11
}

# map() produces a numeric column directly; replace() on an object column
# depends on dtype downcasting that pandas 2.2 deprecates.
test_tmp['building_type'] = test_bt['building_type'].map(replace_dict)
# map() turns unknown labels into NaN, so fail loudly instead of predicting on them.
assert test_tmp['building_type'].notna().all(), 'unmapped building_type label'

In [36]:
# Display train_tmp after building_type was replaced by its integer code.
train_tmp

Unnamed: 0,building_number,temperature,windspeed,humidity,power_consumption,month,day,hour,building_type,solar_power,ess,pcs,day_of_year,day_of_week
0,1,18.6,0.9,42.0,1085.28,6,1,0,0,0,0,0,152,0
1,1,18.0,1.1,45.0,1047.36,6,1,1,0,0,0,0,152,0
2,1,17.7,1.5,45.0,974.88,6,1,2,0,0,0,0,152,0
3,1,16.7,1.4,48.0,953.76,6,1,3,0,0,0,0,152,0
4,1,18.4,2.8,43.0,986.40,6,1,4,0,0,0,0,152,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.9,86.0,881.04,8,24,19,11,0,0,0,236,0
203996,100,22.4,1.3,86.0,798.96,8,24,20,11,0,0,0,236,0
203997,100,21.3,1.0,92.0,825.12,8,24,21,11,0,0,0,236,0
203998,100,21.0,0.3,94.0,640.08,8,24,22,11,0,0,0,236,0


In [39]:
# Features (including the integer-coded building_type) and target.
X = train_tmp.drop(columns=['power_consumption'])
y = train_tmp.loc[:, 'power_consumption']

In [40]:
# Recursive feature elimination down to five features.
# The forest is seeded so the selected feature list is reproducible on re-run.
model = RandomForestRegressor(random_state=41)
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, y)
# Names of the columns RFE kept.
fs = X.columns[fit.support_].tolist()
print(fs)

['building_number', 'hour', 'building_type', 'day_of_year', 'day_of_week']


In [41]:
# Seeded forest trained on the RFE-selected columns only.
model = RandomForestRegressor(random_state=41)
model.fit(X.loc[:, fs], y)

In [49]:
# Reload the submission template.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [50]:
# Predict on the encoded test frame with the selected features and save.
X_test = test_tmp
pred = model.predict(X_test.loc[:, fs])
sub = sub.assign(answer=pred)
sub.to_csv(path_or_buf='rf_sub_08.csv', index=False)

전력 소비량 그래프를 통한 타입 재지정

In [21]:
# Start again from independent copies of the prepared frames.
train_tmp, test_tmp = train_bt.copy(deep=True), test_bt.copy(deep=True)

In [22]:
# Integer code for each English building-type label.
replace_dict = {
    'Other': 0,
    'Public': 1,
    'University': 2,
    'Data_Center': 3,
    'Department_Store_and_Outlet': 4,
    'Hospital': 5,
    'Commercial': 6,
    'Apartment': 7,
    'Laboratory': 8,
    'Knowledge_Industry_Center': 9,
    'Discount_Mart': 10,
    'Hotel_and_Resort': 11
}

# map() produces a numeric column directly; replace() on an object column
# depends on dtype downcasting that pandas 2.2 deprecates.
train_tmp['building_type'] = train_bt['building_type'].map(replace_dict)
# map() turns unknown labels into NaN, so fail loudly instead of training on them.
assert train_tmp['building_type'].notna().all(), 'unmapped building_type label'

In [23]:
# Integer code for each English building-type label.
replace_dict = {
    'Other': 0,
    'Public': 1,
    'University': 2,
    'Data_Center': 3,
    'Department_Store_and_Outlet': 4,
    'Hospital': 5,
    'Commercial': 6,
    'Apartment': 7,
    'Laboratory': 8,
    'Knowledge_Industry_Center': 9,
    'Discount_Mart': 10,
    'Hotel_and_Resort': 11
}

# map() produces a numeric column directly; replace() on an object column
# depends on dtype downcasting that pandas 2.2 deprecates.
test_tmp['building_type'] = test_bt['building_type'].map(replace_dict)
# map() turns unknown labels into NaN, so fail loudly instead of predicting on them.
assert test_tmp['building_type'].notna().all(), 'unmapped building_type label'

In [24]:
# One-hot encode the building type and the building number.
train_tmp = pd.get_dummies(train_tmp, columns=['building_type', 'building_number'])
test_tmp = pd.get_dummies(test_tmp, columns=['building_type', 'building_number'])

# The two frames are encoded independently, so make the test frame carry
# exactly the training feature columns, in the same order. A dummy column
# that is absent from the test data is added as all zeros; a column unknown
# to the training data is dropped because the model cannot use it.
test_tmp = test_tmp.reindex(
    columns=train_tmp.columns.drop('power_consumption'),
    fill_value=0,
)

In [25]:
# Features (all one-hot columns included) and target.
X = train_tmp.drop(columns=['power_consumption'])
y = train_tmp.loc[:, 'power_consumption']

In [76]:
# Seeded forest trained on every column of X.
model = RandomForestRegressor(random_state=41)
model.fit(X=X, y=y)

In [78]:
# Predict on the one-hot encoded test frame and save the raw predictions.
X_test = test_tmp
pred = model.predict(X_test)
sub = sub.assign(answer=pred)
sub.to_csv(path_or_buf='rf_sub_10.csv', index=False)

In [82]:
# Keep two decimal places in the submitted answers.
sub = sub.assign(answer=np.around(pred, decimals=2))

In [84]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2080.77
1,1_20220825 01,2136.32
2,1_20220825 02,2011.17
3,1_20220825 03,1971.6
4,1_20220825 04,1957.48


In [85]:
# Write the current submission frame to rf_sub_10.csv.
sub.to_csv(path_or_buf='rf_sub_10.csv', index=False)