In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime as dt
import importlib

import aux_funcs as aux

In [2]:
# Load the three input tables through the project's `aux` helper module.
# Per the cell output, the train and building readers print their load time,
# a memory-reduction summary, the frame's shape and its head as they run.
# NOTE(review): the weather reader presumably logs the same way — its output
# is not visible here, confirm in aux_funcs.
train = aux.read_train_data()
building = aux.read_building_data()
weather_train = aux.read_weather_train_data()

train.csv read in 13.9 s
Mem. usage decreased to 289.19 Mb (53.1% reduction)
(20216100, 11)
   building_id  meter  timestamp  meter_reading  hour  day  year  weeknumber  \
0            0      0 2016-01-01            0.0     0    1  2016          53   
1            1      0 2016-01-01            0.0     0    1  2016          53   
2            2      0 2016-01-01            0.0     0    1  2016          53   
3            3      0 2016-01-01            0.0     0    1  2016          53   
4            4      0 2016-01-01            0.0     0    1  2016          53   

   weekday  month  log_meter_reading  
0        4      1                0.0  
1        4      1                0.0  
2        4      1                0.0  
3        4      1                0.0  
4        4      1                0.0  
building_metadata.csv read in 0.1 s
Mem. usage decreased to  0.03 Mb (60.3% reduction)
(1449, 7)
   site_id  building_id primary_use  square_feet  year_built  floor_count  \
0        0         

In [3]:
# Attach building metadata to every meter reading.
# DataFrame.join(on=...) matches the caller's column against the *index* of the
# other frame, so index the metadata by building_id explicitly instead of
# relying on its positional index happening to equal building_id.
# drop=False keeps the metadata's own building_id column, which still surfaces
# in the result as 'building_idr' (rsuffix), so the column layout is unchanged.
tbw = train.join(
    building.loc[:, ['building_id', 'primary_use', 'site_id', 'square_feet', 'log_square_feet']]
            .set_index('building_id', drop=False),
    on='building_id',
    rsuffix='r',
)

# Add the air temperature observed at the building's site at the reading's timestamp.
# how='inner' is the pandas default, spelled out here because it matters:
# readings without a matching (site_id, timestamp) weather row are dropped.
tbw = pd.merge(
    tbw,
    weather_train.loc[:, ['site_id', 'timestamp', 'air_temperature']],
    on=['site_id', 'timestamp'],
    how='inner',
)
tbw.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,hour,day,year,weeknumber,weekday,month,log_meter_reading,building_idr,primary_use,site_id,square_feet,log_square_feet,air_temperature
0,0,0,2016-01-01,0.0,0,1,2016,53,4,1,0.0,0,Education,0,7432,8.91355,25.0
1,1,0,2016-01-01,0.0,0,1,2016,53,4,1,0.0,1,Education,0,2720,7.908387,25.0
2,2,0,2016-01-01,0.0,0,1,2016,53,4,1,0.0,2,Education,0,5376,8.5897,25.0
3,3,0,2016-01-01,0.0,0,1,2016,53,4,1,0.0,3,Education,0,23685,10.072597,25.0
4,4,0,2016-01-01,0.0,0,1,2016,53,4,1,0.0,4,Education,0,116607,11.666565,25.0


In [4]:
# Count of non-null meter readings for every (primary_use, meter) pair,
# returned as a one-column DataFrame indexed by that pair.
subgroups = (
    tbw
    .groupby(['primary_use', 'meter'])['meter_reading']
    .count()
    .to_frame()
)

In [5]:
# Columns of the joined frame that are fed to the regressors as inputs.
features_list = [
    'weeknumber',
    'weekday',
    'hour',
    'log_square_feet',
    'air_temperature',
]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
# Keys of the subgroups to model: every distinct primary use found in the
# building metadata (np.unique returns them sorted) crossed with meter codes 0-3.
use_list = np.unique(building['primary_use'])
meter_list = np.arange(4)

# Same feature set as the earlier cell, repeated so this cell can be run on its own.
features_list = [
    'weeknumber', 'weekday', 'hour', 'log_square_feet', 'air_temperature',
]

In [10]:
from sklearn.ensemble import AdaBoostRegressor

# Train one AdaBoost regressor per (primary_use, meter) subgroup.
#   out    -- one row per fitted model: [use, meter, n_points, fit_time, R2, mse]
#   adargs -- the fitted models, in the same order as the rows of `out`
# Subgroups without any usable data are skipped, so both lists only contain
# the combinations that were actually fitted.
out = []
adargs = []
for use in use_list:
    for meter in meter_list:
        print('---------------------')
        print('Extracting data for: ', use, meter)
        print('')
        # Rows belonging to this primary use and meter type.
        sub_df = tbw[(tbw['primary_use'] == use) & (tbw['meter'] == meter)]

        # Eliminate points with no temperature. Features and target are
        # selected with the same mask in one step each, and the row count is
        # taken with len() rather than the builtin sum() over the mask, which
        # iterates element by element in Python and is slow on subgroups with
        # millions of rows.
        cond = sub_df['air_temperature'].notna()
        X = sub_df.loc[cond, features_list]
        y = sub_df.loc[cond, 'log_meter_reading']
        n_points = len(y)
        print(n_points, 'datapoints')

        # Nothing to fit for this combination.
        if n_points == 0:
            continue

        print('Train-test split...')
        # Random 75/25 split; fixed seed so the split is repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.75, random_state=42)
        print('X_train shape: ', X_train.shape)
        print('X_test shape: ', X_test.shape)

        # Fit boosted decision trees on the log meter reading and time the fit.
        print('Fitting model...')
        t0 = time()
        adarg = AdaBoostRegressor(
            DecisionTreeRegressor(min_samples_split=20),
            n_estimators=50,
            random_state=42,
        )
        adarg.fit(X_train, y_train)
        fit_time = time() - t0
        print('Fit in {:.1f}s'.format(fit_time))

        print('Validating...')
        # Score on the held-out quarter; both metrics are in log-reading units.
        pred = adarg.predict(X_test)
        R2 = r2_score(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        print('R2 :', R2)
        print('MSE :', mse)

        # Record metrics and keep the model; the two lists stay aligned.
        out.append([use, meter, n_points, fit_time, R2, mse])
        adargs.append(adarg)
        print('Done')
        print('')


---------------------
Extracting data for:  Education 0

4577522 datapoints
Train-test split...
X_train shape:  (3433141, 5)
X_test shape:  (1144381, 5)
Fitting model...
Fit in 615.4s
Validating...
R2 : 0.9867768334048143
MSE : 0.041294528983900194
Done

---------------------
Extracting data for:  Education 1

1812418 datapoints
Train-test split...
X_train shape:  (1359313, 5)
X_test shape:  (453105, 5)
Fitting model...
Fit in 274.8s
Validating...
R2 : 0.9328981832325025
MSE : 0.4534244654010751
Done

---------------------
Extracting data for:  Education 2

1125729 datapoints
Train-test split...
X_train shape:  (844296, 5)
X_test shape:  (281433, 5)
Fitting model...
Fit in 177.4s
Validating...
R2 : 0.8969663975110744
MSE : 0.7516280993223075
Done

---------------------
Extracting data for:  Education 3

606440 datapoints
Train-test split...
X_train shape:  (454830, 5)
X_test shape:  (151610, 5)
Fitting model...
Fit in 81.3s
Validating...
R2 : 0.8966304256976653
MSE : 0.673747867231354


17564 datapoints
Train-test split...
X_train shape:  (13173, 5)
X_test shape:  (4391, 5)
Fitting model...
Fit in 1.3s
Validating...
R2 : 0.9061638301547492
MSE : 0.25626879594576624
Done

---------------------
Extracting data for:  Parking 3

0 datapoints
---------------------
Extracting data for:  Public services 0

1317166 datapoints
Train-test split...
X_train shape:  (987874, 5)
X_test shape:  (329292, 5)
Fitting model...
Fit in 122.5s
Validating...
R2 : 0.9764760284835319
MSE : 0.03866773248576966
Done

---------------------
Extracting data for:  Public services 1

179273 datapoints
Train-test split...
X_train shape:  (134454, 5)
X_test shape:  (44819, 5)
Fitting model...
Fit in 3197.2s
Validating...
R2 : 0.9110092032711512
MSE : 0.521057262438708
Done

---------------------
Extracting data for:  Public services 2

82960 datapoints
Train-test split...
X_train shape:  (62220, 5)
X_test shape:  (20740, 5)
Fitting model...
Fit in 7.2s
Validating...
R2 : 0.9231329778039183
MSE : 0.494

In [11]:
# Collect the per-subgroup result rows gathered in `out` into a table and display it.
res = pd.DataFrame(
    out,
    columns=['primary_use', 'meter', 'n_data', 'fit_time', 'R2', 'mse'],
)
res


Unnamed: 0,primary_use,meter,n_data,fit_time,R2,mse
0,Education,0,4577522,615.408553,0.986777,0.041295
1,Education,1,1812418,274.79665,0.932898,0.453424
2,Education,2,1125729,177.437819,0.896966,0.751628
3,Education,3,606440,81.275391,0.89663,0.673748
4,Entertainment/public assembly,0,1505695,670.698068,0.986562,0.043393
5,Entertainment/public assembly,1,369548,41.187476,0.952958,0.306065
6,Entertainment/public assembly,2,240055,25.415971,0.800967,1.050905
7,Entertainment/public assembly,3,138796,13.776818,0.774447,1.974831
8,Food sales and service,0,43804,3.889749,0.903631,0.06518
9,Food sales and service,1,35112,2.655596,0.955513,0.232047


In [12]:
# Persist the performance table (row index included) next to the notebook.
res.to_csv(path_or_buf='adaboost_perf_01.csv')

In [13]:
import pickle

In [17]:
# Save the list of fitted models.
# Open with 'wb' (truncate), not 'ab' (append): in append mode every re-run of
# this cell adds another pickle stream to the file, and a later pickle.load()
# would silently return only the first, stale list while the file keeps growing.
with open('adaboost_01.pkl', 'wb') as adafile:
    pickle.dump(adargs, adafile)