In [408]:
# Modules

import numpy as np
import pandas as pd
import math
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns 

# Modeling
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [409]:
# Metric
def SMAPE(actual, pred):
    return np.mean((np.abs(actual-pred))/(np.abs(actual) + np.abs(pred))/2)

In [410]:
# Data
train_df=pd.read_csv('electricity_train.csv')
test_df=pd.read_csv('electricity_test.csv')
submission=pd.read_csv('electricity_sample_submission.csv')

In [411]:
# Rename columns
train_df.columns = ['num','datetime','target','temperature','windspeed','humidity','precipitation','insolation','nelec_cool_flag','solar_flag']
test_df.columns = ['num','datetime','temperature','windspeed','humidity','precipitation','insolation','nelec_cool_flag','solar_flag']

In [413]:
#Create Features

def angryindex(x):
    y = 9/5*x['temperature'] - 0.55*(1-x['humidity'])*(9/5*x['temperature'] - 26) + 32
    return y

def heatindex(x):
    HI = -42.379 + 2.04901523*x['temperature_f'] + 10.14333127*x['humidity'] - .22475541*x['temperature_f']*x['humidity'] - .00683783*x['temperature_f']*x['temperature_f'] - .05481717*x['humidity']*x['humidity'] + .00122874*x['temperature_f']*x['temperature_f']*x['humidity'] + .00085282*x['temperature_f']*x['humidity']*x['humidity']- .00000199*x['temperature_f']*x['temperature_f']*x['humidity']*x['humidity']
    return HI

def windchillindex(x):
    WI = 33 - 0.045*(10.45 + 10*(x['windspeed']**(1/2)) - x['windspeed'] )*(33 - x['temperature'])
    return WI

def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,61,1):
    temp = eda_df[eda_df['num'] == num]
    cdh = CDH(temp['temperature'].values)
    cdhs = np.concatenate([cdhs, cdh])

In [417]:
# Create features related to datetime
eda_df = train_df.copy()
eda_df['datetime'] = pd.to_datetime(eda_df['datetime'])
eda_df['hour'] = eda_df['datetime'].dt.hour
eda_df['weekday'] = eda_df['datetime'].dt.weekday
eda_df['date'] = eda_df['datetime'].dt.date
eda_df['day'] = eda_df['datetime'].dt.day
eda_df['day2'] = eda_df['datetime'].dt.day
eda_df['month'] = eda_df['datetime'].dt.month
eda_df['weekend'] = eda_df['weekday'].isin([5,6]).astype(int)
eda_df['week'] = eda_df['datetime'].dt.isocalendar().week
eda_df['week_1'] = eda_df['week'] - 22
eda_df['squared_week'] = eda_df['week_1'] ** 2
eda_df.loc[eda_df['month'] == 7, 'day'] = eda_df.loc[eda_df['month'] == 7, 'day'] + 30
eda_df.loc[eda_df['month'] == 8, 'day'] = eda_df.loc[eda_df['month'] == 8, 'day'] + 61
eda_df['angryindex'] = eda_df.apply(lambda x: angryindex(x), axis=1)
eda_df['hour_1'] = ((eda_df['week_1']-1)*7 + eda_df['weekday'])*24 + eda_df['hour']+1
eda_df['squared_hour'] = eda_df['hour_1'] ** 2
eda_df['squared_day'] = eda_df['day'] ** 2
eda_df['temperature_f'] = eda_df['temperature']*1.8 + 3.2
eda_df['heatindex'] = eda_df.apply(lambda x: heatindex(x), axis=1)
eda_df['windchillindex'] = eda_df.apply(lambda x: windchillindex(x), axis=1)
eda_df['CDH'] = cdhs
eda_df = eda_df.merge(eda_df.groupby(['num','date'])['CDH'].mean().reset_index().rename(columns = {'CDH':'mean_CDH'}), on = ['num','date'], how = 'left')
eda_df['typoon'] = 0
for i in [5, 10, 11, 41, 42]:
    eda_df.loc[(eda_df['month'] == 8) & (eda_df['day2'] == 10) & (eda_df['num']==i), 'typoon'] = 1
eda_df['817'] = 0
for i in [2,6,7,8,9,13,16,17,18,22,23,25,26,27,33,34,35,37,43,46,47,48,52,53,54,55,56,57,58]:
    eda_df.loc[(eda_df['month'] == 8) & (eda_df['day2'] == 17) & (eda_df['num']==i), '817'] = 1
eda_df = eda_df.loc[(eda_df['817'] != 1)]

In [418]:
# 날짜를 인덱스로
eda_df.set_index('datetime', inplace = True)

In [419]:
#결측치 보간
test_df = test_df.interpolate(method='values')

In [421]:
# Adding features related to datetime (Test Data)
test_df['datetime'] = pd.to_datetime(test_df['datetime'])
test_df['hour'] = test_df['datetime'].dt.hour
test_df['weekday'] = test_df['datetime'].dt.weekday
s_map = { 6:0, 5:1, 0:2, 1:3, 2:4, 4:5, 3:6}
test_df['weekday2'] = test_df['weekday'].map(s_map)
test_df['date'] = test_df['datetime'].dt.date
test_df['day'] = test_df['datetime'].dt.day
test_df['day2'] = test_df['datetime'].dt.day
test_df['month'] = test_df['datetime'].dt.month
test_df['weekend'] = test_df['weekday'].isin([5,6]).astype(int)
test_df['week'] = test_df['datetime'].dt.isocalendar().week
test_df['week_1'] = test_df['week'] - 22
test_df['squared_week'] = test_df['week_1'] ** 2
test_df.loc[test_df['month'] == 7, 'day'] = test_df.loc[test_df['month'] == 7, 'day'] + 30
test_df.loc[test_df['month'] == 8, 'day'] = test_df.loc[test_df['month'] == 8, 'day'] + 61
test_df['angryindex'] = test_df.apply(lambda x: angryindex(x), axis=1)
test_df['hour_1'] = ((test_df['week_1']-1)*7 + test_df['weekday'])*24 + test_df['hour']+1
test_df['squared_hour'] = test_df['hour_1'] ** 2
test_df['squared_day'] = test_df['day'] ** 2
test_df['temperature_f'] = test_df['temperature']*1.8 + 3.2
test_df['heatindex'] = test_df.apply(lambda x: heatindex(x), axis=1)
test_df['windchillindex'] = test_df.apply(lambda x: windchillindex(x), axis=1)
cdhs_test = np.array([])
for num in range(1,61,1):
    temp = test_df[test_df['num'] == num]
    cdh = CDH(temp['temperature'].values)
    cdhs_test = np.concatenate([cdhs_test, cdh])
test_df['CDH'] = cdhs_test
test_df = test_df.merge(test_df.groupby(['num','date'])['CDH'].mean().reset_index().rename(columns = {'CDH':'mean_CDH'}), on = ['num','date'], how = 'left')
test_df['insolation'] = test_df['insolation']/3
for i in [20, 21, 22, 23]:
    test_df.loc[test_df['hour'] == i, 'insolation'] = 0 
test_df['typoon'] = 0
for i in [5, 10, 11, 41, 42]:
    test_df.loc[(test_df['month'] == 8) & (test_df['day2'] == 26) & (test_df['num']==i), 'typoon'] = 1
for i in [5, 10, 11, 41, 42]:
    test_df.loc[(test_df['month'] == 8) & (test_df['day2'] == 27) & (test_df['num']==i), 'typoon'] = 1

In [423]:
#빌딩별 변수 생성
for i in range(1,61):
    globals()['building_' + '{}'.format(i)] = eda_df.loc[eda_df['num'] == i]

In [None]:
# EDA 기반 건물별 이상치 처리

building_1.loc['2020-06-09 12:00:00','target'] = (building_1.loc['2020-06-09 11:00:00', 'target'] + building_1.loc['2020-06-09 13:00:00', 'target']) / 2
building_1.loc['2020-07-14 01:00:00','target'] = (building_1.loc['2020-07-14 00:00:00', 'target'] + building_1.loc['2020-07-14 02:00:00', 'target']) / 2
building_1 = building_1.loc['2020-06-08':]
building_3 = pd.concat([building_3.loc[:'2020-07-14 23:00:00'], building_3.loc['2020-08-05 00:00:00':]])
building_9 = building_9[building_9['target'] > 1050]
building_10 = pd.concat([building_10.loc[:'2020-07-26 23:00:00'], building_10.loc['2020-07-28':]])
building_10 = pd.concat([building_10.loc[:'2020-08-09 23:00:00'], building_10.loc['2020-08-11':]])
building_16.loc['2020-06-27 10:00:00','target'] = (building_16.loc['2020-06-27 09:00:00', 'target'] + building_16.loc['2020-06-27 11:00:00', 'target']) / 2
building_24 = building_24.loc['2020-06-08':]
building_25 = pd.concat([building_25.loc[:'2020-07-23'], building_25.loc['2020-08-03':]])
building_25.loc['2020-07-12 10:00:00','target'] = (building_25.loc['2020-07-12 09:00:00', 'target'] + building_25.loc['2020-07-12 12:00:00', 'target']) / 2
building_25.loc['2020-07-12 11:00:00','target'] = (building_25.loc['2020-07-12 09:00:00', 'target'] + building_25.loc['2020-07-12 12:00:00', 'target']) / 2
building_27 = building_27[building_27['target'] > 0]
building_31.loc['2020-06-11 17:00:00','target'] = (building_31.loc['2020-06-11 16:00:00', 'target'] + building_31.loc['2020-06-11 18:00:00', 'target']) / 2
building_33.loc['2020-06-11 17:00:00','target'] = (building_33.loc['2020-06-11 16:00:00', 'target'] + building_33.loc['2020-06-11 18:00:00', 'target']) / 2
building_36.loc['2020-06-19 06:00:00', 'target'] = building_36.loc['2020-06-19 05:00:00', 'target'] + building_36.loc['2020-06-19 07:00:00', 'target'] 
building_36.loc['2020-08-12 05:00:00', 'target'] = building_36.loc['2020-08-12 04:00:00', 'target'] + building_36.loc['2020-08-12 08:00:00', 'target'] 
building_36.loc['2020-08-12 06:00:00', 'target'] = building_36.loc['2020-08-12 04:00:00', 'target'] + building_36.loc['2020-08-12 08:00:00', 'target'] 
building_36.loc['2020-08-12 07:00:00', 'target'] = building_36.loc['2020-08-12 04:00:00', 'target'] + building_36.loc['2020-08-12 08:00:00', 'target'] 
building_40 = pd.concat([building_40.loc[:'2020-08-02'], building_40.loc['2020-08-04':]])
building_42 = pd.concat([building_42.loc[:'2020-07-12 23:00:00'],building_42.loc['2020-07-14':]])
building_42 = pd.concat([building_42.loc[:'2020-08-09 23:00:00'],building_42.loc['2020-08-11':]])
building_45.loc['2020-07-05 01:00:00','target'] = (building_45.loc['2020-07-05 00:00:00', 'target'] + building_45.loc['2020-07-05 02:00:00', 'target']) / 2
building_52.loc['2020-06-11 18:00:00', 'target'] = (building_52.loc['2020-06-11 17:00:00', 'target'] + building_52.loc['2020-06-11 19:00:00', 'target']) / 2 
building_53 = building_53.loc['2020-06-08':]
building_55.loc['2020-08-08 11:00:00', 'target'] = (building_55.loc['2020-08-08 10:00:00', 'target'] + building_55.loc['2020-06-11 12:00:00', 'target']) / 2 
building_55 = pd.concat([building_55.loc[:'2020-08-02'], building_55.loc['2020-08-09':]])
building_56 = pd.concat([building_56.loc[:'2020-08-02'], building_56.loc['2020-08-09':]])
building_60.loc['2020-07-01 01:00:00', 'target'] = (building_60.loc['2020-07-01 00:00:00', 'target'] + building_60.loc['2020-07-01 02:00:00', 'target']) / 2 

### 모델 학습

In [528]:
#빌딩별 DataFrame 하나로 합치기
train_x = pd.concat([building_1, building_2])
for i in range(3,61):
    train_x = pd.concat([train_x, globals()['building_' + '{}'.format(i)]])

In [529]:
#학습용 DataFrame 전처리
train_x.reset_index(drop = True, inplace = True)
train_y = train_x[['target']]
test_df.reset_index(drop = True, inplace = True)
scaler = StandardScaler() 

#Train df
for i in [i for i in range(1, 61) if i not in [1, 3, 4, 5, 13, 15, 19, 20, 21, 23, 24, 26, 28, 29, 30, 31, 34, 36, 39, 44, 45, 49, 50, 51, 59, 60]]:
    globals()['building_' + '{}'.format(i)].reset_index(drop = True, inplace = True)
    globals()['y_building_' + '{}'.format(i)] = globals()['building_' + '{}'.format(i)][['target']]
    X_num = globals()['building_' + '{}'.format(i)][['temperature','windspeed','humidity','precipitation','insolation','hour','weekday','month','angryindex', 'heatindex', 'windchillindex', 'mean_CDH', 'squared_day', 'squared_hour']]
    scaler.fit(X_num)  
    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(data=X_scaled, index=X_num.index, columns=X_num.columns)
    if i in [1,4,5,9,10,19,20,21,34,36]:
        globals()['scaled_building_' + '{}'.format(i)] = X_scaled
    else:
        X_cat = globals()['building_' + '{}'.format(i)][['weekend']]
        globals()['scaled_building_' + '{}'.format(i)] = pd.concat([X_scaled, X_cat], axis=1)

    X_num = test_df[test_df['num']==i][['temperature','windspeed','humidity','precipitation','insolation','hour','weekday','month','angryindex','heatindex', 'windchillindex', 'mean_CDH', 'squared_day', 'squared_hour']]

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(data=X_scaled, index=X_num.index, columns=X_num.columns)
    if i in [1,4,5,9,10,19,20,21,34,36]:
        globals()['scaled_test_' + '{}'.format(i)] = X_scaled
    else:
        X_cat = test_df[test_df['num']==i][['weekend']]
        globals()['scaled_test_' + '{}'.format(i)] = pd.concat([X_scaled, X_cat], axis=1)
   
#Test df
for i in [1, 3, 4, 5, 13, 15, 19, 20, 21, 23, 24, 26, 28, 29, 30, 31, 34, 36, 39, 44, 45, 49, 50, 51, 59, 60]:
    globals()['building_' + '{}'.format(i)].reset_index(drop = True, inplace = True)
    globals()['y_building_' + '{}'.format(i)] = globals()['building_' + '{}'.format(i)][['target']]
    X_num = globals()['building_' + '{}'.format(i)][['temperature','windspeed','humidity','precipitation','insolation','hour','weekday','month','squared_week','angryindex', 'heatindex', 'windchillindex', 'mean_CDH', 'squared_day', 'squared_hour']]

    scaler.fit(X_num)  
    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(data=X_scaled, index=X_num.index, columns=X_num.columns)
    if i in [1,4,5,9,10,19,20,21,34,36]:
        globals()['scaled_building_' + '{}'.format(i)] = X_scaled
    else:
        X_cat = globals()['building_' + '{}'.format(i)][['weekend']]
        globals()['scaled_building_' + '{}'.format(i)] = pd.concat([X_scaled, X_cat], axis=1)
    X_num = test_df[test_df['num']==i][['temperature','windspeed','humidity','precipitation','insolation','hour','weekday','month','squared_week','angryindex', 'heatindex', 'windchillindex', 'mean_CDH', 'squared_day', 'squared_hour']] 

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(data=X_scaled, index=X_num.index, columns=X_num.columns)
    if i in [1,4,5,9,10,19,20,21,34,36]:
        globals()['scaled_test_' + '{}'.format(i)] = X_scaled
    else:
        X_cat = test_df[test_df['num']==i][['weekend']]
        globals()['scaled_test_' + '{}'.format(i)] = pd.concat([X_scaled, X_cat], axis=1)
    

In [539]:
#모델 학습후 결과 Submission 파일로 저장

final_list=[]
for i in range(1,61):
    train_x2=globals()['scaled_building_' + '{}'.format(i)]
    train_y2=globals()['y_building_' + '{}'.format(i)]
    cross=KFold(n_splits=5, shuffle=True, random_state=42)
    folds=[]
    print(len(train_x2))
    print(len(train_y2))
    for train_idx, valid_idx in cross.split(train_x2, train_y2):
        folds.append((train_idx, valid_idx))
    models={}
    for fold in range(5):
        print(f'===================={fold+1}=======================')
        train_idx, valid_idx=folds[fold]
        X_train=train_x2.iloc[train_idx, :]
        y_train=train_y2.iloc[train_idx, :]
        X_valid=train_x2.iloc[valid_idx, :]
        y_valid=train_y2.iloc[valid_idx, :]
        
        model=LGBMRegressor(n_estimators=1000)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], 
                 early_stopping_rounds=1, verbose=100)
        models[fold]=model

        print(f'================================================\n\n')
    sum_list = [0 for i in range(168)]
    for j in range(5):
        sum_list += models[j].predict(globals()['scaled_test_' + '{}'.format(i)])/5 
    final_list.extend(sum_list)


submission['answer']+=final_list

#제출
submission.to_csv('private_11th.csv', index=False)

1815
1815
Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[67]	training's l2: 137.536	valid_1's l2: 370.019


Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[55]	training's l2: 161.182	valid_1's l2: 401.816


Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[50]	training's l2: 181.465	valid_1's l2: 413.043


Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[84]	training's l2: 105.402	valid_1's l2: 376.003


Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[67]	training's l2: 127.275	valid_1's l2: 485.119


1955
1955
Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[60]	training's l2: 463.898	valid_1's l2: 1060.32


Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[63