In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [35]:
# Read in CSV file
regional_df = pd.read_csv("Table_CSVs/regional.csv")
regional_df

Unnamed: 0.1,Unnamed: 0,DOEID,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU,TOTALDOL
0,0,100001,WEST,Mountain South,NM,Mixed-Dry,144647.71,2656.89
1,1,100002,SOUTH,West South Central,AR,Mixed-Humid,28034.61,975.00
2,2,100003,WEST,Mountain South,NM,Mixed-Dry,30749.71,522.65
3,3,100004,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19,2061.77
4,4,100005,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93,1463.04
...,...,...,...,...,...,...,...,...
18491,18491,118492,SOUTH,South Atlantic,MD,Mixed-Humid,49930.49,1098.51
18492,18492,118493,NORTHEAST,New England,ME,Very-Cold,222186.04,3613.44
18493,18493,118494,SOUTH,West South Central,TX,Hot-Humid,51593.72,1428.31
18494,18494,118495,SOUTH,South Atlantic,SC,Hot-Humid,63555.21,2224.94


In [36]:
# Create DataFrame with regional information
regional_df = regional_df[['REGIONC', 'DIVISION', 'state_postal', 'BA_climate', 'TOTALBTU']]
regional_df.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU
0,WEST,Mountain South,NM,Mixed-Dry,144647.71
1,SOUTH,West South Central,AR,Mixed-Humid,28034.61
2,WEST,Mountain South,NM,Mixed-Dry,30749.71
3,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93


In [37]:
#Checking nulls
regional_df.isnull().sum()

REGIONC         0
DIVISION        0
state_postal    0
BA_climate      0
TOTALBTU        0
dtype: int64

In [38]:
#Separate the features `X` from the target `y`
y = regional_df['TOTALBTU']
X = regional_df.drop(columns='TOTALBTU')

In [39]:
# Preview the features data
X.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate
0,WEST,Mountain South,NM,Mixed-Dry
1,SOUTH,West South Central,AR,Mixed-Humid
2,WEST,Mountain South,NM,Mixed-Dry
3,SOUTH,South Atlantic,SC,Mixed-Humid
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid


In [40]:
# Preview the first five entries for the target variable
y[:5]

0    144647.71
1     28034.61
2     30749.71
3     86765.19
4     59126.93
Name: TOTALBTU, dtype: float64

In [41]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features data
X.head()

Unnamed: 0,REGIONC_MIDWEST,REGIONC_NORTHEAST,REGIONC_SOUTH,REGIONC_WEST,DIVISION_East North Central,DIVISION_East South Central,DIVISION_Middle Atlantic,DIVISION_Mountain North,DIVISION_Mountain South,DIVISION_New England,...,state_postal_WV,state_postal_WY,BA_climate_Cold,BA_climate_Hot-Dry,BA_climate_Hot-Humid,BA_climate_Marine,BA_climate_Mixed-Dry,BA_climate_Mixed-Humid,BA_climate_Subarctic,BA_climate_Very-Cold
0,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [42]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,  random_state=42)

In [43]:
# Create a model with scikit-learn
model = LinearRegression()

In [44]:
# Fit the data into the model / train the model
model.fit(X_train, y_train)

In [45]:
# Make predictions using the X set
predictions = model.predict(X_test)

In [46]:
#CHECKING THE TRAINING RESULTS
y_train_pred = model.predict(X_train)

r2_training = r2_score(y_train, y_train_pred)
print("R-squared for Training Data:", r2_training)

R-squared for Training Data: 0.13044185039695344


In [47]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.119714202996686.
The r2 is 0.10211291684633839.
The mean squared error is 2406520164.1618495.
The root mean squared error is 49056.29586670655.
The standard deviation is 53204.5566692713.


In [13]:
coefficients = model.coef_
intercept = model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [ 3.90092358e+16  6.42761413e+17  3.00074281e+17 -1.31817086e+17
  3.39274837e+17  7.58148228e+16 -3.80295278e+17  4.11829939e+17
  3.77252534e+17 -4.28439935e+17  4.69385144e+17  2.80482347e+16
  2.33632269e+17 -1.32457208e+17 -2.11032138e+16 -5.94242604e+16
  1.48847770e+17  7.10293961e+16 -2.11032138e+16  3.64519911e+16
  1.02143366e+17 -1.16576723e+16 -1.16576723e+16 -1.16576723e+16
 -1.16576723e+16 -2.11032138e+16  4.38233392e+16  3.64519911e+16
 -6.18192293e+16 -6.18192293e+16  4.38233392e+16 -5.94242604e+16
  1.48847770e+17  1.02143366e+17 -1.16576723e+16  1.02143366e+17
 -6.18192293e+16  4.38233392e+16  4.38233392e+16 -5.94242604e+16
  3.64519911e+16 -1.16576723e+16  4.38233392e+16  4.38233392e+16
  1.02143366e+17  5.39987081e+16  7.10293961e+16  7.10293961e+16
  5.39987081e+16 -6.18192293e+16  1.48847770e+17 -2.11032138e+16
  5.39987081e+16  1.02143366e+17 -1.16576723e+16  4.38233392e+16
 -5.94242604e+16  1.48847770e+17  3.64519911e+16 -1.16576723e+16
  1.0214336

Random Forest

In [14]:
# Create a random forest classifier
RF_model= RandomForestRegressor(n_estimators=100, random_state=42)

In [15]:
# Fitting the model
RF_model.fit(X_train, y_train)

In [16]:
#Making predictions
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 2364802816.8932896
R-squared: 0.10621549867373536


In [17]:
# Checking Feature importance
feature_importance = pd.Series(RF_model.feature_importances_, index=X.columns)
print("Feature Importance:")
# We can sort the features by their importance
sorted(zip(RF_model.feature_importances_, X.columns), reverse=True)

Feature Importance:


[(0.3592202624922513, 'BA_climate_Cold'),
 (0.13729014200474143, 'state_postal_AK'),
 (0.11197371127419131, 'BA_climate_Mixed-Humid'),
 (0.06424360287504426, 'BA_climate_Very-Cold'),
 (0.037925744033753137, 'state_postal_HI'),
 (0.036384783726627686, 'REGIONC_SOUTH'),
 (0.03356074594717283, 'state_postal_NJ'),
 (0.022739978852014214, 'state_postal_CA'),
 (0.01897627028790957, 'DIVISION_South Atlantic'),
 (0.016539956508360425, 'REGIONC_MIDWEST'),
 (0.015783153067096725, 'state_postal_DC'),
 (0.011579575861862451, 'DIVISION_Pacific'),
 (0.009794390486733537, 'state_postal_NY'),
 (0.009582933587806063, 'state_postal_NC'),
 (0.008247942717753018, 'DIVISION_Mountain South'),
 (0.006761614069735749, 'state_postal_MI'),
 (0.006035893294178419, 'state_postal_NV'),
 (0.005803688053395209, 'state_postal_SC'),
 (0.004875309409361774, 'DIVISION_East North Central'),
 (0.003934521839940046, 'DIVISION_West North Central'),
 (0.0039027499196768838, 'DIVISION_Mountain North'),
 (0.0038730316692716267

Optimizing Random Forest

In [18]:
#initialize model
optimized_rf = RandomForestRegressor()

In [19]:
#define hyperparameters tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [21]:
# Instantiate the GridSearchCV object
#This object will systematically test different combinations of hyperparameters using cross-validation to identify the optimal configuration for the model.
grid_search = GridSearchCV(
    estimator=optimized_rf, 
    param_grid=param_grid, 
    cv=5, # Number of cross-validation folds (popular choice)
    scoring='neg_mean_squared_error', # Scoring method for evaluation
    verbose=2 # Verbosity level
    )

# Fit the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None,

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\ProgramData\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\ProgramData\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_paramete

In [23]:
# Print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

Best hyperparameters: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best accuracy score: -2552074636.657777


Using te best parameters

In [25]:
# Instantiate the RandomForestRegressor model with the best hyperparameters
new_optimized_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='log2'
)

In [26]:
# Fit the optimized model to the training data
new_optimized_rf.fit(X_train, y_train)

In [30]:
# Make predictions using the optimized model
y_pred_new = new_optimized_rf.predict(X_test)

In [31]:
# Evaluate the model's performance using the test data
mse = mean_squared_error(y_test, y_pred_new)
r2 = r2_score(y_test, y_pred_new)
print("Mean Squared Error on Test Data:", mse)
print("R-squared:", r2)

Mean Squared Error on Test Data: 2356422286.9080796
R-squared: 0.10938294576918528
