In [268]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [269]:
#Read housing_characteristics csv file in the Table_CSVs folder
df_housing = pd.read_csv("Table_CSVs/housing_characteristics.csv")

#review dataframe
df_housing.head(20)

Unnamed: 0.1,Unnamed: 0,DOEID,TYPEHUQ,YEARMADERANGE,TOTROOMS,WALLTYPE,ROOFTYPE,ADQINSUL,NUMFRIG,EQUIPM,ACEQUIPM_PUB,TOTSQFT_EN,TOTALBTU,TOTALDOL
0,0,100001,2,4,8,1,5,2,2,3,1,2100,144647.71,2656.89
1,1,100002,5,5,3,1,-2,2,1,3,1,590,28034.61,975.0
2,2,100003,5,3,4,1,-2,2,0,2,1,900,30749.71,522.65
3,3,100004,2,5,9,3,5,2,2,3,1,2100,86765.19,2061.77
4,4,100005,5,3,3,7,-2,2,2,3,1,800,59126.93,1463.04
5,5,100006,2,6,8,1,5,1,2,3,1,4520,85400.64,2335.08
6,6,100007,2,2,5,1,5,3,1,3,1,2100,131875.03,2110.5
7,7,100008,5,7,4,3,-2,2,1,4,-2,900,41446.59,1237.05
8,8,100009,5,7,3,7,-2,2,1,5,4,750,14512.02,549.8
9,9,100010,5,5,4,4,-2,2,1,4,1,760,12393.76,625.41


In [270]:
#Drop Unnamed column
df_housing.drop(columns=["DOEID", "Unnamed: 0", 'TOTALDOL'], inplace=True)

df_housing.head()

Unnamed: 0,TYPEHUQ,YEARMADERANGE,TOTROOMS,WALLTYPE,ROOFTYPE,ADQINSUL,NUMFRIG,EQUIPM,ACEQUIPM_PUB,TOTSQFT_EN,TOTALBTU
0,2,4,8,1,5,2,2,3,1,2100,144647.71
1,5,5,3,1,-2,2,1,3,1,590,28034.61
2,5,3,4,1,-2,2,0,2,1,900,30749.71
3,2,5,9,3,5,2,2,3,1,2100,86765.19
4,5,3,3,7,-2,2,2,3,1,800,59126.93


In [271]:
#Split data into features and target variables
X = df_housing.drop(['TOTALBTU'], axis=1)
y = df_housing['TOTALBTU']

In [272]:
#Split the data into training and testing datasets by using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Linear Regression Model

In [273]:
#initialize the model
LR_model = LinearRegression()

#train the model
LR_model.fit(X_train, y_train)

In [274]:
#make prediction
y_pred = LR_model.predict(X_test)

In [275]:
#Evaluate the model using MSE & r2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1463427265.7045362
R-squared: 0.4257262460037563


In [276]:
coefficients = LR_model.coef_
intercept = LR_model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [-5.37710807e+03 -3.68551698e+03  4.34750831e+03  6.71460556e+00
 -2.02995624e+01  2.36815727e+02  7.84592198e+03 -1.30223851e+02
  8.58999559e+02  1.76968404e+01]
Intercept: 37978.744140971925


Optimize the Linear Regression Model

In [277]:
#import modules
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [278]:
#define the model
LR_model = Lasso()

#define hyperparameters
param_grid = {'alpha': [0.01, 0.1, 1, 10]}

#perform grid search
grid_search = GridSearchCV(LR_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

#best hyperparameters
best_params = grid_search.best_params_

print(best_params)


{'alpha': 10}


Use Best Parameters

In [279]:
#use best_params
best_LR_model = Lasso(alpha=best_params['alpha'])

#train
best_LR_model.fit(X_train, y_train)

In [280]:
#evaluate
y_pred = best_LR_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print(mse)

1463421257.2659745


Random Forest

In [281]:
#import Random Forest module
from sklearn.ensemble import RandomForestRegressor

#train model
RF_model= RandomForestRegressor(n_estimators=100, random_state=42)

#fit model
RF_model.fit(X_train, y_train)


In [282]:
#evaluate model
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1362655418.4371266
R-squared: 0.46527083314080875


In [283]:
feature_importance = pd.Series(RF_model.feature_importances_, index=X.columns)
print("Feature Importance:")
print(feature_importance)

Feature Importance:
TYPEHUQ          0.072862
YEARMADERANGE    0.084581
TOTROOMS         0.085815
WALLTYPE         0.054569
ROOFTYPE         0.040222
ADQINSUL         0.035159
NUMFRIG          0.044820
EQUIPM           0.103470
ACEQUIPM_PUB     0.032609
TOTSQFT_EN       0.445895
dtype: float64


Optimizing Random Forest

In [284]:
#import necessary modules
#from sklearn.model_selection import GridSearchCV

In [285]:
#initialize model
#optimized_rf = RandomForestRegressor()

In [286]:
#define hyperparameters
#param_grid = {
    #'n_estimators': [100, 200, 300],
    #'max_depth': [None, 5, 10, 15],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
    #'max_features': ['auto', 'sqrt', 'log2']
#}

In [287]:
#perform hyperparameter tuning
#grid_search = GridSearchCV(estimator=optimized_rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
#grid_search.fit(X_train, y_train)

Deep Neural Network

In [288]:
#import modules
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

In [289]:
#normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [290]:
#define model

DNN_model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(128, activation='relu'), 
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1)
])

In [291]:
#compile model
DNN_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

#train
history = DNN_model.fit(X_scaled, y, batch_size=32, epochs=1, validation_split=0.2)

[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 9721784320.0000 - val_loss: 8339973632.0000


In [292]:
#evaluate
loss = DNN_model.evaluate(X_scaled, y)

print("Mean Squared Error:", loss)


[1m578/578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 8335425536.0000
Mean Squared Error: 8554227712.0


Create Bins aka switch from regression to classification

In [293]:
#create bins for TOTALBTU
labels = ["low", "avg", "high"] 
bins = [-float('inf'), 50000, 100000, float('inf')]

df_housing['TOTALBTU'] = pd.cut(df_housing['TOTALBTU'], bins=bins, labels=labels)

#display df
df_housing.head(20)

Unnamed: 0,TYPEHUQ,YEARMADERANGE,TOTROOMS,WALLTYPE,ROOFTYPE,ADQINSUL,NUMFRIG,EQUIPM,ACEQUIPM_PUB,TOTSQFT_EN,TOTALBTU
0,2,4,8,1,5,2,2,3,1,2100,high
1,5,5,3,1,-2,2,1,3,1,590,low
2,5,3,4,1,-2,2,0,2,1,900,low
3,2,5,9,3,5,2,2,3,1,2100,avg
4,5,3,3,7,-2,2,2,3,1,800,avg
5,2,6,8,1,5,1,2,3,1,4520,avg
6,2,2,5,1,5,3,1,3,1,2100,high
7,5,7,4,3,-2,2,1,4,-2,900,low
8,5,7,3,7,-2,2,1,5,4,750,low
9,5,5,4,4,-2,2,1,4,1,760,low


Random Forest Re-Do

In [294]:
#import modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [295]:
#Split data into features and target variables
X = df_housing.drop(['TOTALBTU'], axis=1)
y = df_housing['TOTALBTU']

In [296]:
#Split the data into training and testing datasets by using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [297]:
#train model
RFC_model = RandomForestClassifier(n_estimators=100, random_state=42)
RFC_model.fit(X_train, y_train)

In [298]:
#evaluate
y_pred =RFC_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.6286486486486487


In [306]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distributions
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Perform Random Search with cross-validation
random_search = RandomizedSearchCV(estimator=RFC_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy')
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters:", best_params_random)
print("Best Score:", best_score_random)

Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}
Best Score: 0.6436201510736826


In [307]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=RFC_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best Score: 0.6441607372834139


In [309]:
#optimize
RFC_optimized_model = RandomForestClassifier(n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', max_depth=10, random_state=42)
RFC_optimized_model.fit(X_train, y_train)

#evaluate
y_pred =RFC_optimized_model.predict(X_test)
opt_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", opt_accuracy)

Accuracy: 0.6559459459459459


In [305]:
#optimize
RFC_optimized_model = RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features='sqrt', max_depth=10, random_state=42)
RFC_optimized_model.fit(X_train, y_train)

#evaluate
y_pred =RFC_optimized_model.predict(X_test)
opt_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", opt_accuracy)

Accuracy: 0.6559459459459459
