In [48]:
import pandas as pd
import numpy as np

import missingno as msno
import seaborn as sns

import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing

### Load Data From CSV File

In [49]:
df = pd.read_csv('prepared_data.csv', sep='\t')
df.head()

Unnamed: 0,OSEBuildingID,BuildingType,Address,City,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,TotalGHGEmissions,GHGEmissionsIntensity,BuildingAge
0,1,NonResidential,405 Olive way,Seattle,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,...,1156514.0,3946027.0,12764.5293,1276453.0,False,Compliant,Low outlier,249.98,2.83,89.0
1,2,NonResidential,724 Pine street,Seattle,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,...,950425.2,3242851.0,51450.81641,5145082.0,False,Compliant,Low outlier,295.86,2.86,20.0
2,3,NonResidential,1900 5th Avenue,Seattle,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,...,14515440.0,49526664.0,14938.0,1493800.0,False,Compliant,Low outlier,2089.28,2.19,47.0
3,5,NonResidential,620 STEWART ST,Seattle,DOWNTOWN,47.61412,-122.33664,1926,1.0,10,...,811525.3,2768924.0,18112.13086,1811213.0,False,Compliant,Low outlier,286.43,4.67,90.0
4,8,NonResidential,401 LENORA ST,Seattle,DOWNTOWN,47.61375,-122.34047,1980,1.0,18,...,1573449.0,5368607.0,88039.98438,8803998.0,False,Compliant,Low outlier,505.01,2.88,36.0


In [50]:
list(df.columns)

['OSEBuildingID',
 'BuildingType',
 'Address',
 'City',
 'Neighborhood',
 'Latitude',
 'Longitude',
 'YearBuilt',
 'NumberofBuildings',
 'NumberofFloors',
 'PropertyGFATotal',
 'PropertyGFAParking',
 'PropertyGFABuilding(s)',
 'ListOfAllPropertyUseTypes',
 'LargestPropertyUseType',
 'LargestPropertyUseTypeGFA',
 'SecondLargestPropertyUseType',
 'SecondLargestPropertyUseTypeGFA',
 'ThirdLargestPropertyUseType',
 'ThirdLargestPropertyUseTypeGFA',
 'YearsENERGYSTARCertified',
 'ENERGYSTARScore',
 'SiteEUI(kBtu/sf)',
 'SiteEUIWN(kBtu/sf)',
 'SourceEUI(kBtu/sf)',
 'SourceEUIWN(kBtu/sf)',
 'SiteEnergyUse(kBtu)',
 'SiteEnergyUseWN(kBtu)',
 'SteamUse(kBtu)',
 'Electricity(kWh)',
 'Electricity(kBtu)',
 'NaturalGas(therms)',
 'NaturalGas(kBtu)',
 'DefaultData',
 'ComplianceStatus',
 'Outlier',
 'TotalGHGEmissions',
 'GHGEmissionsIntensity',
 'BuildingAge']

#  Feature selection/extraction

## One Hot encoder

In [51]:
df['BuildingType'].unique()

array(['NonResidential', 'Nonresidential COS', 'SPS-District K-12',
       'Campus', 'Nonresidential WA'], dtype=object)

In [52]:
df['ComplianceStatus'].unique()

array(['Compliant', 'Error - Correct Default Data', 'Missing Data',
       'Non-Compliant'], dtype=object)

In [53]:
df['BuildingAge'].unique()

array([ 89.,  20.,  47.,  90.,  36.,  17., 112.,  18.,  88.,  94.,  12.,
        86.,  33., 109., 100.,  31.,  55.,  15.,  25.,  61.,  38.,  67.,
        27., 110.,  22.,  24.,  26.,  66., 116.,  62., 105.,  43.,  96.,
       106.,  34., 108.,  57.,  16.,  19.,  54.,  46.,   8.,  68.,  51.,
        87.,   6.,  78.,  30.,  14.,  93.,  13.,  59.,  52.,  75.,  53.,
        10., 101.,  58.,   7.,   5.,  45.,   9.,  65.,  63.,  64.,  56.,
        79.,  50.,  48.,  91.,  92.,  11.,  85.,  44., 102., 103.,  21.,
        35.,  40., 107.,  28.,  37.,  69.,  32.,  60., 104.,  39.,  95.,
        71.,  42.,  41.,  70.,  49.,  29.,  76.,  84.,  23.,  98., 111.,
       114.,  77.,  72.,  99.,  74., 113.,   4.,   3.,  97., 115.,  80.,
        81.,   2.,   1.])

## Convert boolean values to numerical ones

In [54]:
df['DefaultData'].replace(to_replace=[False,True], value=[0,1],inplace=True)

## Feature selection

In [55]:
Features=['NumberofBuildings','NumberofFloors','PropertyGFATotal','PropertyGFAParking','PropertyGFABuilding(s)','ENERGYSTARScore',
         'SiteEUI(kBtu/sf)','SourceEUI(kBtu/sf)','SiteEnergyUse(kBtu)','SteamUse(kBtu)','Electricity(kWh)','NaturalGas(kBtu)',
         'DefaultData','BuildingAge']

X=df[Features]

In [56]:
# Label
y=df[['TotalGHGEmissions']]

# Normalize Data

In [57]:
X= preprocessing.StandardScaler().fit(X).transform(X)

# Prediction

In [58]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=4)
print("number of test samples :", xtest.shape[0])
print("number of training samples:",xtrain.shape[0])

number of test samples : 333
number of training samples: 1330


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
LR = LinearRegression().fit(xtrain,ytrain)
LR

LinearRegression()

In [60]:
yhat = LR.predict(xtest)
yhat

array([[1.19464842e+02],
       [1.08479433e+03],
       [1.28080517e+03],
       [1.70839159e+02],
       [1.99484048e+02],
       [6.27874370e+02],
       [1.72358241e+01],
       [9.58578910e+01],
       [2.16432421e+02],
       [2.46832210e+02],
       [3.25944538e+02],
       [2.13696148e+02],
       [1.70439648e+01],
       [3.30642758e+01],
       [8.79847781e+01],
       [3.24417793e+01],
       [8.03226054e+01],
       [1.80297468e+01],
       [1.60018503e+00],
       [1.46306472e+01],
       [2.52368396e+01],
       [3.94736276e+01],
       [2.03267314e+01],
       [3.30962461e+01],
       [2.43865475e+01],
       [2.87315922e+02],
       [1.83352196e+01],
       [7.27782823e+01],
       [1.41715724e+01],
       [2.25686534e+02],
       [5.11082926e+01],
       [4.65212697e+01],
       [7.81871314e+01],
       [7.04722912e+01],
       [1.63845607e+03],
       [1.72710862e+03],
       [9.08549436e+00],
       [5.85986112e+00],
       [6.40005454e+01],
       [6.03799907e+00],


# Model Evaluation using Test set

In [61]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss