In [1]:
#Python packages
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
import re
from scipy import stats

from meteocalc import Temp, dew_point, heat_index, wind_chill, feels_like
import pvlib

#Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ML modeling packages
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

## Competition Overview

Significant investments are being made to improve building efficiencies to reduce costs and emissions. The question is, are the improvements working? That’s where you come in. Under pay-for-performance financing, the building owner makes payments based on the difference between their real energy consumption and what they would have used without any retrofits. The latter values have to come from a model. Current methods of estimation are fragmented and do not scale well. Some assume a specific meter type or don’t work with different building types.

In this competition, you’ll develop accurate models of metered building energy usage in the following areas: chilled water, electric, hot water, and steam meters. The data comes from over 1,000 buildings over a three-year timeframe. With better estimates of these energy-saving investments, large scale investors and financial institutions will be more inclined to invest in this area to enable progress in building efficiencies.

In [2]:
# Read train_fe.csv into the working frame `train`.
train = pd.read_csv(filepath_or_buffer='train_fe.csv')
# test = pd.read_csv('test_reduced.csv')

In [3]:
# Replace the stringified-tuple column label with a plain name.
train.rename(columns={"(True, 'hour_roc')": 'hourly_roc'}, inplace=True)

In [4]:
# Remove the companion "(False, 'hour_roc')" column from the frame.
train.drop(columns="(False, 'hour_roc')", inplace=True)

In [5]:
# Drop the first column by position; presumably it is the row index saved by an
# earlier to_csv call (TODO confirm against train_fe.csv).
# NOTE: this cell is not idempotent: each re-run removes one more leading column.
train = train.iloc[:,1:]

In [6]:
# Preview the first five rows.
train.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday_name,weekday,month,day_of_year,air_temperature_f,wind_speed_mph,dew_temp_f,wind_chill,heat_index,feels,precip_cm,radiation,pressure_pas,airmass,altitude,solar_spec,hourly_roc
0,0,Electricity,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,Friday,4,1,1,77.0,0.0,68.0,83.5955,77.596,77.0,,1413.981805,101950.0,101950.0,-51.773303,,1.784073
1,1,Electricity,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,Friday,4,1,1,77.0,0.0,68.0,83.5955,77.596,77.0,,1413.981805,101950.0,101950.0,-51.773303,,1.23342
2,2,Electricity,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,Friday,4,1,1,77.0,0.0,68.0,83.5955,77.596,77.0,,1413.981805,101950.0,101950.0,-51.773303,,0.940957
3,3,Electricity,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,Friday,4,1,1,77.0,0.0,68.0,83.5955,77.596,77.0,,1413.981805,101950.0,101950.0,-51.773303,,0.715433
4,4,Electricity,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,0,1,Friday,4,1,1,77.0,0.0,68.0,83.5955,77.596,77.0,,1413.981805,101950.0,101950.0,-51.773303,,0.836235


In [6]:
'''Variable Description'''
def description(df):
    """Summarise every column of ``df`` in a small overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It may have fewer than three rows (or none).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these fields:
        ``Name`` (column label), ``dtypes`` (column dtype), ``Missing``
        (count of null values), ``Uniques`` (count of distinct non-null
        values) and ``First Value`` / ``Second Value`` / ``Third Value``
        (the values found in the first three rows; NaN where ``df`` has no
        such row).
    """
    # Build the table directly from the column metadata rather than going
    # through reset_index(), which only yields an 'index' column when the
    # columns axis is unnamed.
    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    # Sample the leading rows, tolerating frames shorter than three rows
    # instead of raising IndexError.
    n_rows = len(df)
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        if position < n_rows:
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan
    return summary

In [7]:
# Show the per-column summary of the training frame.
description(df=train)

Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,building_id,int64,0,1449,0,1,2
1,meter,object,0,4,Electricity,Electricity,Electricity
2,timestamp,object,0,8784,2016-01-01 00:00:00,2016-01-01 00:00:00,2016-01-01 00:00:00
3,meter_reading,float64,0,1745619,0,0,0
4,site_id,int64,0,16,0,0,0
5,primary_use,object,0,16,Education,Education,Education
6,square_feet,int64,0,1397,7432,2720,5376
7,year_built,float64,12127645,116,2008,2004,1991
8,floor_count,float64,16709167,18,,,
9,air_temperature,float64,96658,619,25,25,25


In [6]:
# In my experience it is worth introducing a completely random feature to help
# with feature selection: for each model, if a certain feature is consistently
# above or below the random feature, we can select (or discard) it knowing that
# its importance is not due to chance.
# The generator is seeded so that re-running the notebook from a fresh kernel
# reproduces the same column; `high` is exclusive, as with np.random.randint.
train['random'] = np.random.default_rng(42).integers(low=0, high=100000000, size=len(train))

In [7]:
# Building age measured against the newest construction year in the data,
# shifted by one year (the newest building therefore gets age -1).
train['age'] = train['year_built'].max() - train['year_built'].add(1)

In [21]:
# Shrink numeric columns to the smallest dtype that still holds their values.
# A blanket cast to int16/float16 silently corrupts data: int16 tops out at
# 32,767 (square_feet reaches 116,607 and `random` goes up to 1e8, so both
# would wrap around), and float16 overflows to inf above 65,504 (pressure_pas
# is ~101,950) while keeping only about three significant digits.
# pd.to_numeric(downcast=...) checks the actual values before narrowing and
# never goes below float32 for floating-point columns.
for col in train.columns:
    if pd.api.types.is_integer_dtype(train[col]):
        train[col] = pd.to_numeric(train[col], downcast='integer')
    elif pd.api.types.is_float_dtype(train[col]):
        train[col] = pd.to_numeric(train[col], downcast='float')

In [22]:
# Write the prepared frame to disk. The row index is written as the first
# column (to_csv's default index=True).
train.to_csv(path_or_buf='train_to_encode.csv')