# PART 1: THE SETUP & THE CLEANUP 

####Installing the necessary packages

In [433]:
#!pip install --upgrade pip seaborn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn import preprocessing
import scipy
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

####Importing my data as a CSV file.

In [434]:
data = pd.read_csv("/Users/owner/desktop/ecb_data_2.csv")

####Getting a quick glance at the data

In [435]:
data.head()

Unnamed: 0,year,apn,buillding_address,building_zip,neighborhood,floor_area,sqft_group,floor_area_group,year_built,year_built_group,...,land_use,total_uses,cie,med,mips,retail,pdr,visitor,construction_type,building_description
0,2010,0326/003,135 POWELL ST,94102,Downtown/Civic Center,18990,10-25K,10-20K,1909,1900-1909,...,RETAIL/ENT,24671,0,0,0,24671,0,0,Masonry or concrete,Commercial Stores
1,2010,1401/002,3132 CLEMENT ST,94121,Seacliff,16669,10-25K,10-20K,2002,2000-2009,...,RETAIL/ENT,39708,0,0,0,39708,0,0,Masonry or concrete,Commercial Stores
2,2010,4347A/003,2090 Evan St.,94124,Bayview,119900,50K+,100-200K,1956,1950-1959,...,PDR,4671,0,0,0,0,4671,0,Masonry or concrete,Industrial
3,2010,3780/078,836 BRANNAN ST,94103,South of Market,33549,25-50K,30-40K,2001,2000-2009,...,MIXED,25396,0,0,0,8316,17080,0,0,Commercial Stores
4,2010,3511/093,40 LAFAYETTE ST,94103,South of Market,127900,50K+,100-200K,1934,1930-1939,...,PDR,0,0,0,0,0,0,0,0,Industrial


####I needed to convert a few columns from objects into floats.

####Here I ran into some issues as some features had quotation marks or commas.

####To solve this, I used """map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)""".

In [436]:
data.Site_EUI_kBtu_ft = data.Site_EUI_kBtu_ft.map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)
data.National_Median_Site_EUI_kBtu_ft = data.National_Median_Site_EUI_kBtu_ft.map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)
data.Weather_Normalized_Site_EUI_kBtu_ft = data.Weather_Normalized_Site_EUI_kBtu_ft.map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)
data.Total_GHG_Emissions_MtCO2e = data.Total_GHG_Emissions_MtCO2e.map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)
data.floor_area = data.floor_area.map(lambda x: re.sub('[^0-9\.]', '', x)).astype(float)

In [437]:
#Making sure the data is in the right format.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9320 entries, 0 to 9319
Data columns (total 43 columns):
year                                          9320 non-null int64
apn                                           9320 non-null object
buillding_address                             9320 non-null object
building_zip                                  9320 non-null int64
neighborhood                                  9320 non-null object
floor_area                                    9320 non-null float64
sqft_group                                    9320 non-null object
floor_area_group                              9320 non-null object
year_built                                    8935 non-null float64
year_built_group                              8935 non-null object
benchmark_status                              9320 non-null object
Primary_Property_Type_EPA_Calculated          9320 non-null object
Primary_Property_Type_EPA_Calculated_NA       9310 non-null float64
Primary_Property_Type

####Next, I made a copy of the dataframe so that I could use label encoding to transform my qualitative data into quantitative data, while maintaining the original dataframe.

In [438]:
data_numerical = data.copy(deep = True)

In [439]:
#creating a variable for labeling
label = preprocessing.LabelEncoder()

In [440]:
#transforming the data from categorical to numerical
data_numerical.year = label.fit_transform(data.year)
data_numerical.building_zip = label.fit_transform(data.building_zip)
data_numerical.neighborhood = label.fit_transform(data.neighborhood)
data_numerical.sqft_group = label.fit_transform(data.sqft_group)
data_numerical.floor_area_group = label.fit_transform(data.floor_area_group)
data_numerical.year_built_group = label.fit_transform(data.year_built_group)
data_numerical.benchmark_status = label.fit_transform(data.benchmark_status)
data_numerical.land_use = label.fit_transform(data.land_use)
data_numerical.construction_type = label.fit_transform(data.construction_type)
data_numerical.building_description = label.fit_transform(data.building_description)

In [441]:
#Viewing the transformed data
data.head()

Unnamed: 0,year,apn,buillding_address,building_zip,neighborhood,floor_area,sqft_group,floor_area_group,year_built,year_built_group,...,land_use,total_uses,cie,med,mips,retail,pdr,visitor,construction_type,building_description
0,2010,0326/003,135 POWELL ST,94102,Downtown/Civic Center,18990,10-25K,10-20K,1909,1900-1909,...,RETAIL/ENT,24671,0,0,0,24671,0,0,Masonry or concrete,Commercial Stores
1,2010,1401/002,3132 CLEMENT ST,94121,Seacliff,16669,10-25K,10-20K,2002,2000-2009,...,RETAIL/ENT,39708,0,0,0,39708,0,0,Masonry or concrete,Commercial Stores
2,2010,4347A/003,2090 Evan St.,94124,Bayview,119900,50K+,100-200K,1956,1950-1959,...,PDR,4671,0,0,0,0,4671,0,Masonry or concrete,Industrial
3,2010,3780/078,836 BRANNAN ST,94103,South of Market,33549,25-50K,30-40K,2001,2000-2009,...,MIXED,25396,0,0,0,8316,17080,0,0,Commercial Stores
4,2010,3511/093,40 LAFAYETTE ST,94103,South of Market,127900,50K+,100-200K,1934,1930-1939,...,PDR,0,0,0,0,0,0,0,0,Industrial


In [442]:
#Here, I used a for loop to remove a few columns that were extraneous.
for col in data_numerical:
    if data_numerical[col].dtype == object:
        data_numerical.drop(col, axis =1, inplace= True)

In [443]:
#Viewing the data to make sure the columns were dropped.
data_numerical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9320 entries, 0 to 9319
Data columns (total 33 columns):
year                                          9320 non-null int64
building_zip                                  9320 non-null int64
neighborhood                                  9320 non-null int64
floor_area                                    9320 non-null float64
sqft_group                                    9320 non-null int64
floor_area_group                              9320 non-null int64
year_built                                    8935 non-null float64
year_built_group                              9320 non-null int64
benchmark_status                              9320 non-null int64
Primary_Property_Type_EPA_Calculated_NA       9310 non-null float64
Primary_Property_Type_Self_selected_NA        9320 non-null int64
National_Median_Reference_Property_Type_NA    9320 non-null int64
ENERGY_STAR_Score                             9320 non-null float64
ENERGY_STAR_Score_NA       

####Cross-validation time!

In [444]:
#This is the column that I'm trying to predict.
y = data_numerical.benchmark_status

In [445]:
#All of the other data, excluding the column I want to predict.
X = data_numerical.drop('benchmark_status', axis =1)

In [446]:
# Importing train_test_split and pass split data into a new instance
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [447]:
data.benchmark_status.value_counts()

Did Not Comply    5837
Complied          3188
Exempt             295
dtype: int64

In [448]:
y.value_counts()

1    5837
0    3188
2     295
dtype: int64

##RANDOM FOREST APPROACH

####Manually entering the column labels

In [452]:
header_row = ['year','building_zip','neighborhood','floor_area','sqft_group','floor_area_group','year_built','year_built_group','benchmark_status','Primary_Property_Type_EPA_Calculated_NA','Primary_Property_Type_Self_selected_NA','National_Median_Reference_Property_Type_NA','ENERGY_STAR_Score','ENERGY_STAR_Score_NA', 'Site_EUI_kBtu_ft','Site_EUI_kBtu_ft_NA','National_Median_Site_EUI_kBtu_ft','National_Median_Site_EUI_kBtu_ft_NA','Weather_Normalized_Site_EUI_kBtu_ft','Weather_Normalized_Site_EUI_kBtu_ft_NA','Total_GHG_Emissions_MtCO2e','Total_GHG_Emissions_MtCO2e_NA','stories','land_use','total_uses','cie','med','mips','retail','pdr','visitor','construction_type','building_description']

####Defining the features variables to prep for the random forest algorithm

In [453]:
features = [col for col in header_row if col not in ['benchmark_status']]

#### Selecting features and converting target to boolean for Benchmark Statuses

In [462]:
X = data_numerical[features]
binary_y = data_numerical['benchmark_status'] > 0
categorical_y = data_numerical['benchmark_status']

In [459]:
y = binary_y

# using cross-validation
from sklearn.cross_validation import cross_val_score

dtc = DecisionTreeClassifier(max_depth=5, random_state=1)
scores = cross_val_score(dtc, X, y, cv=10, scoring='accuracy')
print 'Decision tree accuracy: {}'.format(np.mean(scores))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
y = binary_y

# use cross-validation for random forest
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

rfc = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=1)
scores = cross_val_score(rfc, X, y, cv=10, scoring='accuracy')
print 'Random forest accuracy: {}'.format(np.mean(scores))

abc = AdaBoostClassifier(n_estimators=10, random_state=1)
scores = cross_val_score(abc, X, y, cv=10, scoring='accuracy')
print 'AdaBoost accuracy: {}'.format(np.mean(scores))

##NAIVE BAYES APPRAOCH