In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the regional table from its CSV export, then display the loaded frame
regional_csv_path = "Table_CSVs/regional.csv"
regional_df = pd.read_csv(regional_csv_path)
regional_df

Unnamed: 0.1,Unnamed: 0,DOEID,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU,TOTALDOL
0,0,100001,WEST,Mountain South,NM,Mixed-Dry,144647.71,2656.89
1,1,100002,SOUTH,West South Central,AR,Mixed-Humid,28034.61,975.00
2,2,100003,WEST,Mountain South,NM,Mixed-Dry,30749.71,522.65
3,3,100004,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19,2061.77
4,4,100005,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93,1463.04
...,...,...,...,...,...,...,...,...
18491,18491,118492,SOUTH,South Atlantic,MD,Mixed-Humid,49930.49,1098.51
18492,18492,118493,NORTHEAST,New England,ME,Very-Cold,222186.04,3613.44
18493,18493,118494,SOUTH,West South Central,TX,Hot-Humid,51593.72,1428.31
18494,18494,118495,SOUTH,South Atlantic,SC,Hot-Humid,63555.21,2224.94


In [4]:
# Narrow the frame to the region/division/state/climate columns plus TOTALBTU,
# then show the first rows of the result
regional_columns = ["REGIONC", "DIVISION", "state_postal", "BA_climate", "TOTALBTU"]
regional_df = regional_df[regional_columns]
regional_df.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU
0,WEST,Mountain South,NM,Mixed-Dry,144647.71
1,SOUTH,West South Central,AR,Mixed-Humid,28034.61
2,WEST,Mountain South,NM,Mixed-Dry,30749.71
3,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93


In [5]:
# Count missing values in each column
regional_df.isna().sum()

REGIONC         0
DIVISION        0
state_postal    0
BA_climate      0
TOTALBTU        0
dtype: int64

In [12]:
# Name of the column the model will predict
target_col = "TOTALBTU"
# Names of the columns used as model inputs
feature_cols = [
    "REGIONC",
    "DIVISION",
    "state_postal",
    "BA_climate",
]

In [14]:
# Separate the model inputs (X) from the value being predicted (y)
X, y = regional_df[feature_cols], regional_df[target_col]

In [15]:
# Show the first five rows of the feature frame
X.iloc[:5]

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate
0,WEST,Mountain South,NM,Mixed-Dry
1,SOUTH,West South Central,AR,Mixed-Humid
2,WEST,Mountain South,NM,Mixed-Dry
3,SOUTH,South Atlantic,SC,Mixed-Humid
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid


In [16]:
# Show the first five values of the target series
y.head()

0    144647.71
1     28034.61
2     30749.71
3     86765.19
4     59126.93
Name: TOTALBTU, dtype: float64

In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state seeds the shuffle
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [18]:
# One-hot encoder for the categorical feature columns; handle_unknown="ignore"
# asks the encoder to ignore unknown categories instead of raising an error
cat_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [19]:
# Route every feature column through the one-hot encoder; columns not listed in
# a transformer are passed through unchanged (remainder='passthrough')
categorical_step = ('cat', cat_transformer, feature_cols)
preprocessor = ColumnTransformer(
    transformers=[categorical_step],
    remainder='passthrough',
)

In [20]:
# Chain the preprocessing step and the random-forest regressor into one estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [21]:
# Fit the whole pipeline (encoder + random forest) on the training split
pipeline.fit(X=X_train, y=y_train)

In [22]:
# Predict the target for the rows the pipeline was fitted on
y_train_pred = pipeline.predict(X=X_train)

In [23]:
# Predict the target for the held-out test rows
y_test_pred = pipeline.predict(X=X_test)

In [24]:
# Score the predictions on each split: R-squared and mean squared error
# Training split
train_r2, train_mse = (
    r2_score(y_train, y_train_pred),
    mean_squared_error(y_train, y_train_pred),
)
# Test split
test_r2, test_mse = (
    r2_score(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
)

In [25]:
# Print each metric next to its label, one per line
metric_report = (
    ("Training R-squared:", train_r2),
    ("Test R-squared:", test_r2),
    ("Training Mean Squared Error:", train_mse),
    ("Test Mean Squared Error:", test_mse),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Training R-squared: 0.12631578846793634
Test R-squared: 0.10983332942014756
Training Mean Squared Error: 2534674785.6071186
Test Mean Squared Error: 2268420187.554845
