In [3]:
#Python packages
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
import re
from scipy import stats

#Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ML modeling packages
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [None]:
train = pd.read_csv('train_to_encode.csv')

In [4]:
train = train.iloc[:,1:]
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday,month,day_of_year,air_temperature_f,wind_speed_mph,dew_temp_f,wind_chill,heat_index,feels,precip_cm,radiation,altitude,solar_spec,hourly_roc,random,age
0,0,Electricity,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,1.784,24802456,8.0
1,1,Electricity,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,1.233,61316517,12.0
2,2,Electricity,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.941,63831873,25.0
3,3,Electricity,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.7153,56365151,14.0
4,4,Electricity,2016-01-01 00:00:00,0.0,0,Education,-14465,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.8364,4040317,41.0


In [None]:
train = train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [None]:
'''Since we've already took advantage of the converted meteorologcial features, we can now drop them.
This is also applicable to the "timestamp" feature'''

In [6]:
train.drop(['timestamp', 'air_temperature_f','wind_speed_mph','dew_temp_f','precip_cm'], axis=1, inplace=True)

In [7]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday,month,day_of_year,air_temperature_f,wind_speed_mph,dew_temp_f,wind_chill,heat_index,feels,precip_cm,radiation,altitude,solar_spec,hourly_roc,random,age
0,0,Electricity,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,1.784,24802456,8.0
1,1,Electricity,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,1.233,61316517,12.0
2,2,Electricity,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.941,63831873,25.0
3,3,Electricity,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.7153,56365151,14.0
4,4,Electricity,0.0,0,Education,-14465,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,4,1,1,77.0,0.0,68.0,83.6,77.6,77.0,,1414.0,-51.78,,0.8364,4040317,41.0


In [7]:
train.groupby('site_id').apply(lambda group: group.isna().sum())

building_id           0.000000
meter                 0.000000
meter_reading         0.000000
site_id               0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.599900
floor_count           0.826528
air_temperature       0.004781
cloud_coverage        0.436551
dew_temperature       0.004953
precip_depth_1_hr     0.185447
sea_level_pressure    0.060925
wind_direction        0.071678
wind_speed            0.007107
hour                  0.000000
day                   0.000000
weekday               0.000000
month                 0.000000
day_of_year           0.000000
air_temperature_f     0.000000
wind_speed_mph        0.000000
dew_temp_f            0.000000
wind_chill            0.007107
heat_index            0.000000
feels                 0.000000
precip_cm             0.185447
radiation             0.000000
altitude              0.060925
solar_spec            0.197798
hourly_roc            0.999981
random                0.000000
age     

In [26]:
roc_month = train.groupby(['site_id', 'month'])['air_temperature'].agg(np.mean).reset_index('month').reset_index()

roc_month['dif']=0.00
for i in range(len(roc_month)-1):
    roc_month['dif'][i] = abs((roc_month['air_temperature'][i+1])-(roc_month['air_temperature'][i]))

In [28]:
roc_month

Unnamed: 0,site_id,month,air_temperature_f,dif
0,0,1,61.892413,0.818287
1,0,2,61.074126,9.205478
2,0,3,70.279603,2.101856
3,0,4,72.38146,4.13409
4,0,5,76.51555,4.747847
5,0,6,81.263397,2.13181
6,0,7,83.395206,1.690792
7,0,8,81.704415,1.34277
8,0,9,80.361645,5.105268
9,0,10,75.256377,7.161485


In [21]:
for x in range(len(train['month'].unique())):
    print(train.loc[train['month']==x]['air_temperature'])

Series([], Name: air_temperature, dtype: float64)
0           25.0
1           25.0
2           25.0
3           25.0
4           25.0
5           25.0
6           25.0
7           25.0
8           25.0
9           25.0
10          25.0
11          25.0
12          25.0
13          25.0
14          25.0
15          25.0
16          25.0
17          25.0
18          25.0
19          25.0
20          25.0
21          25.0
22          25.0
23          25.0
24          25.0
25          25.0
26          25.0
27          25.0
28          25.0
29          25.0
30          25.0
31          25.0
32          25.0
33          25.0
34          25.0
35          25.0
36          25.0
37          25.0
38          25.0
39          25.0
40          25.0
41          25.0
42          25.0
43          25.0
44          25.0
45          25.0
46          25.0
47          25.0
48          25.0
49          25.0
50          25.0
51          25.0
52          25.0
53          25.0
54          25.0
55          25.

3203750    20.0
3203751    20.0
3203752    20.0
3203753    20.0
3203754    20.0
3203755    20.0
3203756    20.0
3203757    20.0
3203758    20.0
3203759    20.0
3203760    20.0
3203761    20.0
3203762    20.0
3203763    20.0
3203764    20.0
3203765    20.0
3203766    20.0
3203767    20.0
3203768    20.0
3203769    20.0
3203770    20.0
3203771    20.0
3203772    20.0
3203773    20.0
3203774    20.0
3203775    20.0
3203776    20.0
3203777    20.0
3203778    20.0
3203779    20.0
3203780    20.0
3203781    20.0
3203782    20.0
3203783    20.0
3203784    20.0
3203785    20.0
3203786    20.0
3203787    20.0
3203788    20.0
3203789    20.0
3203790    20.0
3203791    20.0
3203792    20.0
3203793    20.0
3203794    20.0
3203795    20.0
3203796    20.0
3203797    20.0
3203798    20.0
3203799    20.0
3203800    20.0
3203801    20.0
3203802    20.0
3203803    20.0
3203804    20.0
3203805    20.0
3203806    20.0
3203807    20.0
3203808    20.0
3203809    20.0
3203810    20.0
3203811    20.0
3203812 

KeyboardInterrupt: 

In [23]:
for i in range(len(train['site_id'].unique())):
    for x in range(len(train['month'].unique())):
        train.loc[train['site_id']==i,
                  'month_roc']=abs(train.loc[train['month']==x+1]['air_temperature']-train.loc[train['month']==x]['air_temperature'])

In [None]:
# train['month_roc'] = train.apply(lambda x:x.loc
#                                  [(x['month']==roc_month['month']) & (x['site_id']==roc_month['site_id']),
#                                   roc_month['dif']])

In [25]:
train.month_roc.value_counts()

Series([], Name: month_roc, dtype: int64)

In [11]:
roc = train.groupby(['site_id','hour'])['air_temperature'].agg(np.mean).to_frame().reset_index('hour').reset_index()

roc['dif']=0.00
for i in range(len(roc)-1):
    roc['dif'][i] = abs((roc['air_temperature'][i+1])-(roc['air_temperature_f'][i]))
    

In [12]:
train['hour_roc'] = train.apply(lambda x:roc.loc
                                 [(x['hour']==roc['hour']) & (x['site_id']==roc['site_id']),
                                  roc['dif'].astpye(np.float16)])

Unnamed: 0,site_id,hour,air_temperature_f,dif
0,0,0,74.651728,1.783941
1,0,1,72.867786,1.235298
2,0,2,71.632488,0.938892
3,0,3,70.693595,0.71632
4,0,4,69.977275,0.835117
5,0,5,69.142158,0.585373
6,0,6,68.556785,0.637262
7,0,7,67.919524,1.442317
8,0,8,69.361841,0.32881
9,0,9,69.033031,0.305693


### Pipeline for Encoding

In [None]:
from sklearn.pipeline import pipeline
from sklearn import StandardScaler

num_pipeline = Pipeline([
	('imputer', SimpleImputer(strategy='median')),
	('attribs_adder', CombinedAttribAdder()),
	('std_scalar', StandardScaler()),
	])

housing_num = num_pipeline.fit_transform(housing_num)

from sklearn import ColumnTransformer

full_pipeline = ColumnTransformer([
	('num',num_pipeline,num_attribs),
	('cat',OneHotEncoder,cat_attribs),
	])

housing_prepared = full_pipeline.fit_transform(housing)

#### Modeling for feature importance

In [None]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

