In [None]:
from DataLoader import train_df,test_df
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from model_utils import trim_site,filling_nan_values
from model_utils import percentile_condition
from lightworklow import GBM

train = train_df(build_meta_csv='./heat_data/building_metadata.csv',
                train_csv='./heat_data/train.csv',
                weather_train_csv='./heat_data/weather_train.csv',
                merge=True,
                unmerged=False,
                drop=True,
                col_drop = ['year_built','timestamp','floor_count'],
                axis = 1,
                datetime=True,
                encode_and_scale=True,
                trim_bad_rows = True, 
                fill_weather = False)

Memory usage of properties dataframe is : 616.9464874267578  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  meter_reading
dtype before:  float64
min for this col:  0.0
max for this col:  21904700.0
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  289.1937065124512  MB
This is  46.875006569639226 % of the initial size


In [2]:
train = percentile_condition(train)

Removing outliers based on percentile conditioning
Input data shape: (20216100, 18)
False/True values while removing the outliers:

True     20153452
False       62648
Name: meter_reading, dtype: int64
Final shape after removing outliers: 20216100


In [3]:
train = filling_nan_values(train)

Filling Column:air_temperature
Filling Column:cloud_coverage
Filling Column:dew_temperature
Filling Column:precip_depth_1_hr
Filling Column:sea_level_pressure
Filling Column:wind_direction
Filling Column:wind_speed


In [4]:
labels = train['meter_reading']
train = train.drop(['meter_reading'],axis=1)

In [5]:
test = test_df(test_csv='./heat_data/test.csv',
              weather_test_csv='./heat_data/weather_test.csv',
              merge=True,
              unmerged=False,
              drop=True,
              col_drop = ['row_id','timestamp'],
              axis = 1,
              datetime=True)

Memory usage of properties dataframe is : 30.517654418945312  MB
******************************
Column:  row_id
dtype before:  int64
min for this col:  0
max for this col:  999999
dtype after:  uint32
******************************
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  104
dtype after:  uint8
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  13.351516723632812  MB
This is  43.75014062464844 % of the initial size


In [6]:
test = filling_nan_values(test)

Filling Column:air_temperature
Filling Column:cloud_coverage
Filling Column:dew_temperature
Filling Column:precip_depth_1_hr
Filling Column:sea_level_pressure
Filling Column:wind_direction
Filling Column:wind_speed


In [9]:
models = GBM(train_gbm = True, 
            train_xg = False, 
            train_cat = False,
            train_ng = False,
            test_predict = True, 
            save_model = True,
            save_history = True,
            seed = 100,
            name = 'LightGBM',
            importance = True, 
            stratify = False,
            eval_metric = None, 
            time_series = False,
            prepare_submission = False,
            jsonize = True,
            show_metric_results = True)

parameters = {'boosting_type':'gbdt',
             'objective':'regression',
             'metric':'rmse',
             'learning_rate':0.01,
             'num_leaves':30,
             'subsample':0.4,
             'reg_alpha':0.5,
             'reg_lambda':0.5,
             'verbose_eval':10,
             'early_stopping_rounds':200,
             'num_boost_round':300}

valid_predictions,test_predictions = models.fold_run(src_dir='LightGBM',
                                                     X_train = train, 
                                                     y_train = labels,
                                                     X_test = test,
                                                     n_folds = 3,
                                                     parameters = parameters,
                                                     categorical_features = None)

X_train_shape (20153452, 17)
X_test_shape (15906217, 15)
vaild_predict 20153452
test_predictions 15906217
Train LightGBM
Training until validation scores don't improve for 200 rounds
[10]	training's rmse: 2.07197	valid_1's rmse: 2.23381
[20]	training's rmse: 2.0704	valid_1's rmse: 2.23221
[30]	training's rmse: 2.06897	valid_1's rmse: 2.23091
[40]	training's rmse: 2.06766	valid_1's rmse: 2.22975
[50]	training's rmse: 2.06646	valid_1's rmse: 2.22888
[60]	training's rmse: 2.06534	valid_1's rmse: 2.22818
[70]	training's rmse: 2.06438	valid_1's rmse: 2.22755
[80]	training's rmse: 2.06357	valid_1's rmse: 2.22694


KeyboardInterrupt: 