In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  
import os

from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

from datetime import date
import holidays
# United States holiday calendar object from the `holidays` package.
# NOTE(review): nothing in the cells visible here reads `us_holidays` — presumably it
# was used upstream to build the 'holidays' feature; confirm before removing.
us_holidays = holidays.UnitedStates()

# Raise pandas' display limits to 500 rows / 500 columns so wide or long
# frames are not elided in cell output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Load the pre-processed training table and carve out a reproducible 80/20 split.
full_df = pd.read_feather('Data/train_joined_processed_processed.feather')

# Feature columns fed to the model (kept in this exact order).
current_vars = [
    'building_id', 'meter', 'site_id', 'primary_use',
    'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
    'precip_depth_1_hr', 'sea_level_pressure',
    'wind_direction', 'month', 'day', 'wind_direction cat',
    'wind_direction cat2',
    'wind_speed_log', 'iso_weekend', 'hour', 'holidays',
]

# 80% of the rows (fixed seed) go to training; the remaining rows form the hold-out set.
train_part = full_df.sample(frac=0.8, random_state=11)
test_part = full_df.drop(train_part.index)

# The target is read positionally from the first column of the frame.
# NOTE(review): presumably this is the meter reading — confirm against the feather schema.
X_train, y_train = train_part[current_vars], train_part.iloc[:, 0]
X_test, y_test = test_part[current_vars], test_part.iloc[:, 0]

# Drop the references to the large intermediate frames; only the X/y pieces are needed below.
del full_df, train_part, test_part

In [3]:
# Split the training rows into two disjoint halves (fixed seed) for 2-fold training:
# half 1 is a 50% random sample, half 2 is every row not drawn into half 1.
X_half_1 = X_train.sample(frac=0.5, random_state=33)
X_half_2 = X_train.drop(index=X_half_1.index)

# Pick the matching targets by index label so features and labels stay aligned.
y_half_1, y_half_2 = y_train.loc[X_half_1.index], y_train.loc[X_half_2.index]

In [4]:
# Columns LightGBM should treat as categorical rather than numeric.
categorical_features = [
    'meter', 'site_id', 'primary_use',
    'iso_weekend', 'holidays',
    'wind_direction cat',
]

# Both halves are wrapped with the same Dataset options; free_raw_data=False keeps
# the raw frames attached to the Dataset objects.
dataset_options = dict(categorical_feature=categorical_features, free_raw_data=False)
d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, **dataset_options)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, **dataset_options)

# Each watchlist lists the half being trained on first and the opposite half second.
watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = watchlist_1[::-1]

# Booster hyper-parameters shared by both fold models.
params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
)

In [5]:
%%time

# Upper bound on boosting iterations for each fold model.
num_boost_round = 3000


def _train_with_early_stopping(train_set, valid_sets, stopping_rounds=200, log_period=200):
    """Train one booster with early stopping and periodic metric logging.

    Parameters
    ----------
    train_set : lgb.Dataset
        Data the booster is fitted on.
    valid_sets : list of lgb.Dataset
        Datasets evaluated during training (the watchlist).
    stopping_rounds : int
        Stop when the validation metric has not improved for this many rounds.
    log_period : int
        Print the evaluation metrics every `log_period` iterations.

    Returns
    -------
    lgb.Booster
        The trained model.
    """
    if hasattr(lgb, "log_evaluation"):
        # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keyword
        # arguments of `lgb.train`; the callback API is the supported replacement.
        control = {
            "callbacks": [
                lgb.early_stopping(stopping_rounds=stopping_rounds),
                lgb.log_evaluation(period=log_period),
            ]
        }
    else:
        # Older LightGBM releases without `log_evaluation`: keep the legacy keywords.
        control = {"early_stopping_rounds": stopping_rounds, "verbose_eval": log_period}

    return lgb.train(params, train_set=train_set,
                     num_boost_round=num_boost_round,
                     valid_sets=valid_sets,
                     **control)


def _averaged_prediction(features):
    """Average both fold models' predictions on `features` and clip negatives to 0.

    Negative predictions are set to 0 because `mean_squared_log_error` rejects
    negative values. Returns a `pd.Series` with a fresh default index.
    """
    blended = (model_half_1.predict(features) + model_half_2.predict(features)) / 2
    return pd.Series(np.clip(blended, 0, None))


# 2-fold training: each model is fitted on one half and validated on the other.
model_half_1 = _train_with_early_stopping(d_half_1, watchlist_1)
model_half_2 = _train_with_early_stopping(d_half_2, watchlist_2)

# Report RMSLE (square root of the mean squared log error) on train and hold-out data.
prediction_on_X_train = _averaged_prediction(X_train)
print('train: ', np.sqrt(mean_squared_log_error(y_train, prediction_on_X_train)))

prediction_on_X_test = _averaged_prediction(X_test)
print('test: ', np.sqrt(mean_squared_log_error(y_test, prediction_on_X_test)))



Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 29652.7	valid_1's rmse: 41513.3
[400]	training's rmse: 24225.3	valid_1's rmse: 38234.9
[600]	training's rmse: 22456.3	valid_1's rmse: 37473.4
[800]	training's rmse: 21655.1	valid_1's rmse: 37222
[1000]	training's rmse: 20661	valid_1's rmse: 36902.9
[1200]	training's rmse: 19846.8	valid_1's rmse: 36667.5
[1400]	training's rmse: 19131.6	valid_1's rmse: 36379.8
[1600]	training's rmse: 18430	valid_1's rmse: 36125.5
[1800]	training's rmse: 17725.4	valid_1's rmse: 35881.5
[2000]	training's rmse: 17309.2	valid_1's rmse: 35791.7
[2200]	training's rmse: 16837	valid_1's rmse: 35641.8
[2400]	training's rmse: 16469.6	valid_1's rmse: 35537.3
[2600]	training's rmse: 16002.8	valid_1's rmse: 35407.6
[2800]	training's rmse: 15711.1	valid_1's rmse: 35314.4
[3000]	training's rmse: 15358.2	valid_1's rmse: 35173.5
Did not meet early stopping. Best iteration is:
[3000]	training's rmse: 15358.2	valid_1's rmse: 35173.5
Traini

## num_boost_round=1000: 2.47
## num_boost_round=3000: 2.2915356857342903