In [30]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import copy
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
def smape(true, pred):
    """Symmetric mean absolute percentage error (SMAPE), expressed in percent.

    Each element contributes ``2 * |pred - true| / (|pred| + |true|)``; the
    result is the mean of those terms multiplied by 100.

    Parameters
    ----------
    true : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, broadcastable against ``true``.

    Returns
    -------
    float
        SMAPE in percent. Elements where both ``true`` and ``pred`` are zero
        contribute 0 (a perfect prediction) instead of the NaN produced by 0/0.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    numerator = 2.0 * np.abs(pred - true)
    denominator = np.abs(pred) + np.abs(true)

    # The denominator is zero only when pred == true == 0; leave those terms at
    # 0 rather than dividing, so a single 0/0 does not turn the whole mean into NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [4]:
# Load the building metadata and the train/test tables from the working directory.
bi, train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('building_info.csv', 'train.csv', 'test.csv')
)

In [5]:
# Replace the Korean column headers of the building-info table with English names.
bi = bi.rename(
    {
        '건물번호': 'building_number',
        '건물유형': 'building_type',
        '연면적(m2)': 'total_area',
        '냉방면적(m2)': 'cooling_area',
        '태양광용량(kW)': 'solar_power',
        'ESS저장용량(kWh)': 'ess',
        'PCS용량(kW)': 'pcs',
    },
    axis='columns',
)

In [6]:
# Korean building-type label -> English label.
replace_dict = {
    '건물기타': 'Other',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data_Center',
    '백화점및아울렛': 'Department_Store_and_Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Laboratory',
    '지식산업센터': 'Knowledge_Industry_Center',
    '할인마트': 'Discount_Mart',
    '호텔및리조트': 'Hotel_and_Resort',
}

# Values not present in the mapping are left unchanged by `replace`.
bi = bi.assign(
    building_type=lambda frame: frame['building_type'].replace(replace_dict)
)

In [7]:
# Replace the Korean column headers of the training table with English names.
train = train.rename(
    {
        '건물번호': 'building_number',
        '일시': 'date_time',
        '기온(C)': 'temperature',
        '강수량(mm)': 'precipitation',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
        '일조(hr)': 'sunshine',
        '일사(MJ/m2)': 'solar_radiation',
        '전력소비량(kWh)': 'power_consumption',
    },
    axis='columns',
)

In [8]:
# Replace the Korean column headers of the test table with English names.
test = test.rename(
    {
        '건물번호': 'building_number',
        '일시': 'date_time',
        '기온(C)': 'temperature',
        '강수량(mm)': 'precipitation',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
    },
    axis='columns',
)

In [9]:
# Remove the row identifier from both tables, plus the two sun-related
# measurements from the training table.
train = train.drop(columns=['num_date_time', 'sunshine', 'solar_radiation'])
test = test.drop(columns='num_date_time')

In [10]:
# Parse the 'YYYYMMDD HH' timestamps, then expose month / day / hour as columns.
train = train.assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'], format='%Y%m%d %H')
).assign(
    month=lambda frame: frame['date_time'].dt.month,
    day=lambda frame: frame['date_time'].dt.day,
    hour=lambda frame: frame['date_time'].dt.hour,
)

In [11]:
# Parse the 'YYYYMMDD HH' timestamps, then expose month / day / hour as columns.
test = test.assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'], format='%Y%m%d %H')
).assign(
    month=lambda frame: frame['date_time'].dt.month,
    day=lambda frame: frame['date_time'].dt.day,
    hour=lambda frame: frame['date_time'].dt.hour,
)

In [12]:
# Attach the per-building metadata to every row; a left join keeps all
# train/test rows even if a building number has no match in `bi`.
train = train.merge(bi, how='left', on='building_number')
test = test.merge(bi, how='left', on='building_number')

In [13]:
# Calendar features derived from the parsed timestamp.
train = train.assign(
    day_of_year=train['date_time'].dt.dayofyear,
    day_of_week=train['date_time'].dt.dayofweek,
)

# '-' entries become 0 so the capacity columns can be cast to numbers.
train = train.replace('-', 0)

for capacity_col in ('solar_power', 'ess', 'pcs'):
    # Go through float first, then truncate to an integer.
    truncated = train[capacity_col].astype('float').astype('int')
    # Capping at 1 turns every positive integer into 1 and leaves values <= 0
    # untouched, which makes the column a presence flag.
    train[capacity_col] = truncated.clip(upper=1)

In [14]:
# Calendar features derived from the parsed timestamp.
test = test.assign(
    day_of_year=test['date_time'].dt.dayofyear,
    day_of_week=test['date_time'].dt.dayofweek,
)

# '-' entries become 0 so the capacity columns can be cast to numbers.
test = test.replace('-', 0)

for capacity_col in ('solar_power', 'ess', 'pcs'):
    # Go through float first, then truncate to an integer.
    truncated = test[capacity_col].astype('float').astype('int')
    # Capping at 1 turns every positive integer into 1 and leaves values <= 0
    # untouched, which makes the column a presence flag.
    test[capacity_col] = truncated.clip(upper=1)

In [15]:
# Drop the raw timestamp (its parts are already separate columns) together with
# precipitation and the two floor-area columns.
train = train.drop(columns=['date_time', 'precipitation', 'total_area', 'cooling_area'])

In [16]:
# Drop the raw timestamp (its parts are already separate columns) together with
# precipitation and the two floor-area columns.
test = test.drop(columns=['date_time', 'precipitation', 'total_area', 'cooling_area'])

### 건물 유형 별 모델 생성

In [17]:
# Work on a copy of `train` in which day_of_week is collapsed into two bins:
# [0, 4] -> 0 and (4, 6] -> 1 (pandas dayofweek counts Monday as 0).
train_bt = train.assign(
    day_of_week=lambda frame: pd.cut(
        frame['day_of_week'],
        bins=[0, 4, 6],
        labels=[0, 1],
        include_lowest=True,
    ).astype('int')
)

In [18]:
# Work on a copy of `test` in which day_of_week is collapsed into two bins:
# [0, 4] -> 0 and (4, 6] -> 1 (pandas dayofweek counts Monday as 0).
test_bt = test.assign(
    day_of_week=lambda frame: pd.cut(
        frame['day_of_week'],
        bins=[0, 4, 6],
        labels=[0, 1],
        include_lowest=True,
    ).astype('int')
)

In [19]:
# Discard every training row that still contains a missing value.
train_bt = train_bt.dropna()

In [20]:
# Independent working copies, so the encoding steps below leave *_bt untouched.
train_tmp, test_tmp = train_bt.copy(), test_bt.copy()

In [21]:
# Building-type label -> integer code; the code is the label's position below.
replace_dict = {
    label: code
    for code, label in enumerate((
        'Other',
        'Public',
        'University',
        'Data_Center',
        'Department_Store_and_Outlet',
        'Hospital',
        'Commercial',
        'Apartment',
        'Laboratory',
        'Knowledge_Industry_Center',
        'Discount_Mart',
        'Hotel_and_Resort',
    ))
}

# Encode from the untouched *_bt frame and store the result on the working copy.
train_tmp['building_type'] = train_bt['building_type'].replace(replace_dict)

In [22]:
# Building-type label -> integer code; the code is the label's position below.
replace_dict = {
    label: code
    for code, label in enumerate((
        'Other',
        'Public',
        'University',
        'Data_Center',
        'Department_Store_and_Outlet',
        'Hospital',
        'Commercial',
        'Apartment',
        'Laboratory',
        'Knowledge_Industry_Center',
        'Discount_Mart',
        'Hotel_and_Resort',
    ))
}

# Encode from the untouched *_bt frame and store the result on the working copy.
test_tmp['building_type'] = test_bt['building_type'].replace(replace_dict)

In [23]:
# One-hot encode the two categorical identifiers. Train and test are encoded
# independently, so each frame only gets indicator columns for the categories
# it actually contains.
one_hot_columns = ['building_type', 'building_number']
train_tmp, test_tmp = (
    pd.get_dummies(frame, columns=one_hot_columns)
    for frame in (train_tmp, test_tmp)
)

In [24]:
# Feature matrix = every column except the target; target = power consumption.
X, y = (
    train_tmp.drop(columns=['power_consumption']),
    train_tmp['power_consumption'],
)

### Grid Search — random forest fit with fixed hyperparameters

In [25]:
# Fixed hyperparameters for the random forest; random_state pins the seed so
# repeated fits on the same data give the same forest.
rf_params = dict(
    n_estimators=100,
    max_depth=80,
    max_features=30,
    min_samples_leaf=10,
    min_samples_split=2,
    random_state=41,
)
model = RandomForestRegressor(**rf_params)
model.fit(X, y)

In [27]:
# Submission template; its 'answer' column is overwritten with predictions below.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [28]:
# Train and test were one-hot encoded separately, so the test frame is not
# guaranteed to carry the same columns, in the same order, as the training
# matrix. Reindex to X's columns: indicator columns missing from test are
# filled with 0, columns unknown to the model are dropped. When the two frames
# already match this is a no-op.
X_test = test_tmp.reindex(columns=X.columns, fill_value=0)

# Predict with the currently fitted `model` and write the submission file.
pred = model.predict(X_test)
sub['answer'] = pred
sub.to_csv('rf_sub_11.csv', index=False)

In [43]:
# Rebind `model` to an extra-trees regressor (library defaults apart from the
# seed) and fit it on the same training matrix.
model = ExtraTreesRegressor(random_state=41)
model.fit(X=X, y=y)

In [44]:
# Predict on the test features with the most recently fitted `model` and store
# the predictions in the submission frame.
pred = model.predict(X_test)
sub['answer'] = pred
# Mean absolute difference between these predictions and the 'answer' column of `best`.
# NOTE(review): `best` is not defined in any visible cell (execution counts jump
# from 28 to 43), so this line raises NameError on Restart & Run All. Presumably
# it was a previously saved submission loaded in a since-removed cell — TODO
# confirm which file and load it explicitly before this line.
mean_absolute_error(best['answer'], sub['answer'])

66.04642559896669

In [45]:
# Write the extra-trees predictions currently held in `sub`, without the index column.
sub.to_csv(path_or_buf='ex_sub_12.csv', index=False)