In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  
import os
import random

from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split

from datetime import date
import holidays
# US holiday calendar from the `holidays` package.
# NOTE(review): not referenced in the visible cells — confirm whether it is still needed.
us_holidays = holidays.UnitedStates()

# Let pandas display up to 500 rows / 500 columns before truncating output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

## Config

In [2]:
# Location of the input data; point this at wherever the data is stored.
data_path = 'Data'

# Location where the trained models are written; point this at wherever
# you want to store them.
model_path = 'model'

## Read in data

In [3]:
# Build the path with os.path.join: plain string concatenation drops the
# separator ('Data' + 'processed_automl_google.csv' would look for a file
# called 'Dataprocessed_automl_google.csv' in the working directory).
df = pd.read_csv(os.path.join(data_path, 'processed_automl_google.csv'))

#### `using_col` below contains the columns selected by a stepwise selection process

In [4]:
# Columns kept for modelling. 'meter_reading' has to stay first: the split
# cell takes column 0 as the target and the remaining columns as features.
using_col = [
    'meter_reading',
    'building_id',
    'meter',
    'air_temperature',
    'precip_depth_1_hr',
    'wind_speed',
    'precip_depth_1_hr_lag',
    'date_month',
    'date_day',
    'iso_week',
    'iso_weekend',
    'hour',
    'holidays',
]
df = df[using_col]

## Split into training and testing sets

In [5]:
# Random 80/20 row split; the fixed random_state makes it repeatable.
train_df = df.sample(frac=0.8, random_state=11)
test_df = df.drop(train_df.index)

# Column 0 is the target; every remaining column is a feature.
y_train, X_train = train_df.iloc[:, 0], train_df.iloc[:, 1:]
y_test, X_test = test_df.iloc[:, 0], test_df.iloc[:, 1:]

# Drop the names of the intermediate frames now that the split parts exist.
del df, train_df, test_df

# Names of the columns that should be treated as categorical.
cat_variables = [
    'meter', 'site_id', 'primary_use',
    'iso_weekend', 'holidays',
    'wind_direction cat', 'precipitation', 'precipitation sign',
]

# The models are fitted on plain arrays (X.values), so categorical features
# are identified by column position. Names not present in X_train are skipped.
cat_variables_loc = [
    X_train.columns.get_loc(name)
    for name in cat_variables
    if name in X_train.columns
]

## Modeling

In [17]:
%%time

building_list = X_test['building_id'].drop_duplicates()

y_train_list = pd.Series()
y_test_list = pd.Series()

prediction_train = pd.Series()
prediction_test = pd.Series()

for sample in building_list: 
    X_train_b = X_train[X_train['building_id']==sample]
    y_train_b = y_train[X_train_b.index]
    X_test_b = X_test[X_test['building_id']==sample]
    y_test_b = y_test[X_test_b.index]
        
    bst = lgb.LGBMRegressor(n_estimators = 1000, max_depth = 10)
    bst.fit(X_train_b.values, y_train_b.values, categorical_feature=cat_variables_loc)
    bst.booster_.save_model(model_path + '/model{}.txt'.format(sample))
    
    prediction_on_X_train = bst.predict(X_train_b)
    prediction_on_X_train = pd.Series(prediction_on_X_train)
    prediction_on_X_train[prediction_on_X_train<0] = 0

    prediction_on_X_test = bst.predict(X_test_b)
    prediction_on_X_test = pd.Series(prediction_on_X_test)
    prediction_on_X_test[prediction_on_X_test<0] = 0
    
    y_train_list = pd.concat([y_train_list, y_train_b])
    y_test_list = pd.concat([y_test_list, y_test_b])

    prediction_train = pd.concat([prediction_train, prediction_on_X_train])
    prediction_test = pd.concat([prediction_test, prediction_on_X_test])

print('train rmsle: ', np.sqrt(mean_squared_log_error( y_train_list, prediction_train)))
print('test rmsle: ', np.sqrt(mean_squared_log_error( y_test_list, prediction_test)))

print('train rmse: ', np.sqrt(mean_squared_error( y_train_list, prediction_train)))
print('test rmse: ', np.sqrt(mean_squared_error( y_test_list, prediction_test)))

# best results for now
# train rmsle:  0.6661559455495784
# test rmsle:  0.7729813016050944
# train rmse:  9907.157937267504
# test rmse:  25846.30966838011

train rmsle:  0.6661559455495784
test rmsle:  0.7729813016050944
train rmse:  9907.157937267504
test rmse:  25846.30966838011
Wall time: 45min 20s
