In [1]:
import pandas as pd
import numpy as np
import os
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Relative location of the raw data directory.
# NOTE(review): not referenced by the other cells visible in this notebook.
path = "../data/"

In [3]:
# Load the per-building feature table for the railbelt grid from the
# CSV produced under ../output/.
building_features = pd.read_csv(
    filepath_or_buffer='../output/data_btu_railbelt.csv',
)

In [4]:
# Keep only complete rows: a row is removed if ANY of its columns is missing
# (not just the btu columns), so the arrays built later contain no NaN.
building_features_nonmissing = building_features.dropna(axis=0, how='any')

In [5]:
# Build the feature matrix X and target vector y as numpy arrays for scikit-learn.

# Columns excluded from the predictors (includes the target 'annual_btu/sqft').
features_dropped = ['annual_btu/sqft', 'annual_btu', 'age', 'osm_id', 'zip_code', 'zip_group']

# Drop the excluded columns once; reuse the result for both the labels and the matrix.
predictors = building_features_nonmissing.drop(columns=features_dropped)
features_kept = predictors.columns

X = np.array(predictors)

# Target as a single-column 2-D array, shape (n_rows, 1).
y = np.array(building_features_nonmissing['annual_btu/sqft']).reshape(-1, 1)

In [6]:
# Standardize every column of X with StandardScaler.
# NOTE(review): the scaler is fit on all rows, before the train/test split done
# in a later cell, so test-set statistics influence the scaling.
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(X)


[[-2.36647622e-01 -1.36319468e+00  6.30103054e-01 ...  6.24785266e-01
  -4.27196025e-01 -6.55471735e-01]
 [ 5.60365535e-01 -1.40887797e+00 -1.82141606e+00 ... -1.83340954e+00
  -8.28775438e-01  4.68359044e-01]
 [-2.49227319e-01 -1.72916194e+00 -1.62415591e+00 ... -1.64795878e+00
   2.05128845e+00  6.74235544e-01]
 ...
 [-1.89331561e-01 -1.43274316e-02  6.30103054e-01 ...  6.24785266e-01
  -4.27196025e-01 -1.82818238e+00]
 [-1.53472145e-01 -4.06869950e-01  6.30103054e-01 ...  6.24785266e-01
  -4.27196025e-01 -1.82818238e+00]
 [-2.72891665e-01 -6.33809416e-01 -2.43415856e-02 ... -1.67554736e-03
   1.67421160e+00 -1.82818238e+00]]


In [7]:
# Standardize the target column with a fresh StandardScaler.
# NOTE(review): this rebinds `scaler`, so the scaler fitted on X is no longer
# reachable afterwards; like X, y is scaled before the train/test split.
scaler = StandardScaler()
y = scaler.fit_transform(y)

print(y)

[[-0.81007095]
 [ 0.79558361]
 [ 0.97646015]
 ...
 [-2.05842429]
 [-2.05842429]
 [-2.05842429]]


In [8]:
# Split into training and testing sets: 70% train / 30% test (test_size=0.3),
# with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [9]:
# Ridge regression with a fixed regularization strength (alpha=1.0):
# fit on the training split, predict the test split, show the coefficients.

ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

coeffs = ridge.coef_
print(coeffs)

[[ 8.56277609e-04 -1.12965618e-02  7.28141101e-02  4.72528207e-02
  -2.98430433e-01 -3.15494522e-01 -2.05989233e-01 -2.25311240e-01
   7.23909447e-02  4.65816615e-02  4.31942986e-01  4.39683824e-01
  -1.51642909e-02 -6.33273400e-02  1.00263184e+00]]


In [10]:
# Ridge regression with 10-fold cross-validation over RidgeCV's default alphas:
# fit on the training split, predict the test split, show the coefficients.

ridgeCV = RidgeCV(cv=10)
ridgeCV.fit(X_train, y_train)
y_pred_ridgeCV = ridgeCV.predict(X_test)

coeffs = ridgeCV.coef_
print(coeffs)

[[ 9.08750543e-04 -1.14528383e-02 -8.93330760e-02  6.58325967e-02
  -1.33542280e+00 -3.39402293e-01  5.29952703e-01 -3.13337419e-01
   2.10510440e-01  1.97239839e-02  8.13943116e-01  5.46500935e-01
   1.77453492e-03 -7.42426385e-02  1.00269230e+00]]


In [11]:
# Decision-tree regressor (seeded for reproducibility): fit on the training
# split and predict the test split.

dt = DecisionTreeRegressor(random_state=3)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

In [12]:

# Test-set mean squared error of each model. y was standardized earlier, so
# these values are in standardized target units.
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_ridgeCV = mean_squared_error(y_test, y_pred_ridgeCV)
mse_dt = mean_squared_error(y_test, y_pred_dt)

# Print in the same order: ridge, ridgeCV, decision tree.
for label, mse in (
    ("MSE model_ridge:", mse_ridge),
    ("MSE model_ridgeCV:", mse_ridgeCV),
    ("MSE model_dt:", mse_dt),
):
    print(label, mse)

MSE model_ridge: 0.006976936540899352
MSE model_ridgeCV: 0.00697622019169604
MSE model_dt: 2.916655167684027e-07


In [13]:
# Tabulate each kept feature next to its coefficient from the fitted ridge model.

# With threshold=0 the selector keeps every feature whose absolute coefficient
# is >= 0, so the support mask does not filter anything out.
imp_feat = SelectFromModel(ridge, prefit=True, threshold=0)
imp_index = imp_feat.get_support()

# ridge.coef_ is 2-D because y was a column vector; row 0 holds the coefficients.
coefficient_table = pd.DataFrame(
    {
        'important predictors': features_kept,
        'coefficients': ridge.coef_[0].tolist(),
    }
)
imp_feature_table = coefficient_table[imp_index]

print(f'There are {len(imp_feature_table)} features')

imp_feature_table

There are 15 features


Unnamed: 0,important predictors,coefficients
0,areasq_ft,0.000856
1,height,-0.011297
2,FD_1981_2010,0.072814
3,TD_1981_2010,0.047253
4,FD_2000,-0.29843
5,TD_2000,-0.315495
6,FD_2010,-0.205989
7,TD_2010,-0.225311
8,FD_80,0.072391
9,TD_80,0.046582


In [14]:
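# Displaying features with their importances from decision tree
# NOTE: the code below is disabled; it uses `sns`, which is not among the
# imports at the top of this notebook (would need `import seaborn as sns`).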

# f = pd.DataFrame({'features' : list(features_kept), 'feature_importance' : list(dt.feature_importances_ )})
# f = f.sort_values(by = 'feature_importance')

# sns.set(rc={'figure.figsize':(12.7,9.27)})

# barplot = sns.barplot(x='features', y = 'feature_importance', data = f)

# ticks = barplot.set_xticklabels(barplot.get_xticklabels(),rotation = 90)