In [81]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [93]:
#Load the housing characteristics table from the Table_CSVs folder
housing_csv = Path("Table_CSVs/housing_characteristics.csv")
df_housing = pd.read_csv(housing_csv)

#Preview the first 20 rows to sanity-check the load
df_housing.head(20)

Unnamed: 0.1,Unnamed: 0,DOEID,TYPEHUQ,YEARMADERANGE,TOTROOMS,WALLTYPE,ROOFTYPE,ADQINSUL,NUMFRIG,EQUIPM,ACEQUIPM_PUB,TOTSQFT_EN,TOTALBTU,TOTALDOL
0,0,100001,2,4,8,1,5,2,2,3,1,2100,144647.71,2656.89
1,1,100002,5,5,3,1,-2,2,1,3,1,590,28034.61,975.0
2,2,100003,5,3,4,1,-2,2,0,2,1,900,30749.71,522.65
3,3,100004,2,5,9,3,5,2,2,3,1,2100,86765.19,2061.77
4,4,100005,5,3,3,7,-2,2,2,3,1,800,59126.93,1463.04
5,5,100006,2,6,8,1,5,1,2,3,1,4520,85400.64,2335.08
6,6,100007,2,2,5,1,5,3,1,3,1,2100,131875.03,2110.5
7,7,100008,5,7,4,3,-2,2,1,4,-2,900,41446.59,1237.05
8,8,100009,5,7,3,7,-2,2,1,5,4,750,14512.02,549.8
9,9,100010,5,5,4,4,-2,2,1,4,1,760,12393.76,625.41


In [83]:
#Drop columns that must not be used as predictors:
#  - DOEID: row identifier
#  - TOTALDOL: presumably excluded because cost tracks the TOTALBTU target -- TODO confirm
#  - any "Unnamed: ..." column: leftover index columns written by earlier CSV saves
columns_to_drop = ["DOEID", "TOTALDOL"] + [
    col for col in df_housing.columns if str(col).startswith("Unnamed")
]

#Rebind instead of mutating in place, and ignore already-missing columns,
#so re-running this cell does not raise a KeyError
df_housing = df_housing.drop(columns=columns_to_drop, errors="ignore")

df_housing.head()

Unnamed: 0,TYPEHUQ,YEARMADERANGE,TOTROOMS,WALLTYPE,ROOFTYPE,ADQINSUL,NUMFRIG,EQUIPM,ACEQUIPM_PUB,TOTSQFT_EN,TOTALBTU
0,2,4,8,1,5,2,2,3,1,2100,144647.71
1,5,5,3,1,-2,2,1,3,1,590,28034.61
2,5,3,4,1,-2,2,0,2,1,900,30749.71
3,2,5,9,3,5,2,2,3,1,2100,86765.19
4,5,3,3,7,-2,2,2,3,1,800,59126.93


In [84]:
#Separate the TOTALBTU target from the remaining predictor columns
y = df_housing['TOTALBTU']
X = df_housing.drop(columns=y.name)

In [85]:
#Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Linear Regression Model

In [86]:
#Create a linear regression estimator with default settings
LR_model = LinearRegression()

#Fit it on the training split (the fitted estimator is the cell's displayed output)
LR_model.fit(X=X_train, y=y_train)

In [87]:
#Predict TOTALBTU for the held-out test rows with the fitted linear model
y_pred = LR_model.predict(X=X_test)

In [88]:
#Score the current predictions on the test split: mean squared error and R-squared
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

#Report both metrics
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1463427265.7045362
R-squared: 0.4257262460037563


In [89]:
#Inspect the fitted linear model: one coefficient per feature, plus the intercept
coefficients = LR_model.coef_
intercept = LR_model.intercept_

#Label each coefficient with its feature name (the columns the model was fitted on)
#so the values can be read without counting array positions
print("Coefficients:")
print(pd.Series(coefficients, index=X_train.columns))
print("Intercept:", intercept)

Coefficients: [-5.37710807e+03 -3.68551698e+03  4.34750831e+03  6.71460556e+00
 -2.02995624e+01  2.36815727e+02  7.84592198e+03 -1.30223851e+02
  8.58999559e+02  1.76968404e+01]
Intercept: 37978.744140971925


Random Forest

In [90]:
#RandomForestRegressor is imported with the other modules in the first cell

#instantiate the baseline random forest: 100 trees, seeded so results are reproducible
RF_model = RandomForestRegressor(n_estimators=100, random_state=42)

#fit model on the training split (the fitted estimator is the cell's displayed output)
RF_model.fit(X_train, y_train)


In [91]:
#Predict on the test split with the random forest, then score those predictions.
#NOTE: this rebinds y_pred, mse and r2, replacing the linear-regression values.
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)

Mean Squared Error: 1362655418.4371266
R-squared: 0.46527083314080875


In [94]:
#Pair each random-forest importance score with its feature (column) name
feature_importance = pd.Series(data=RF_model.feature_importances_, index=X.columns)

#Print the heading and the labelled scores on separate lines
print("Feature Importance:", feature_importance, sep="\n")

Feature Importance:
TYPEHUQ          0.072862
YEARMADERANGE    0.084581
TOTROOMS         0.085815
WALLTYPE         0.054569
ROOFTYPE         0.040222
ADQINSUL         0.035159
NUMFRIG          0.044820
EQUIPM           0.103470
ACEQUIPM_PUB     0.032609
TOTSQFT_EN       0.445895
dtype: float64


Optimizing Random Forest

In [112]:
#import necessary modules
from sklearn.model_selection import GridSearchCV

In [113]:
#initialize the base estimator for hyperparameter tuning
#Seeded with the same random_state as the baseline forest so the search is
#reproducible and its scores are comparable across runs
optimized_rf = RandomForestRegressor(random_state=42)

In [114]:
#define the hyperparameter grid searched for the random forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    #'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for regressors it
    #meant "use all features", which is spelled 1.0 and works on older versions too
    'max_features': [1.0, 'sqrt', 'log2']
}

In [115]:
#perform hyperparameter tuning
#Exhaustive 5-fold cross-validated search over every combination in param_grid,
#scored by negative MSE (higher is better). n_jobs=-1 runs the fits on all
#available CPU cores instead of one at a time.
grid_search = GridSearchCV(
    estimator=optimized_rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)