In [2]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the regional table from its CSV export and display it
regional_df = pd.read_csv(filepath_or_buffer="Table_CSVs/regional.csv")
regional_df

Unnamed: 0.1,Unnamed: 0,DOEID,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU,TOTALDOL
0,0,100001,WEST,Mountain South,NM,Mixed-Dry,144647.71,2656.89
1,1,100002,SOUTH,West South Central,AR,Mixed-Humid,28034.61,975.00
2,2,100003,WEST,Mountain South,NM,Mixed-Dry,30749.71,522.65
3,3,100004,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19,2061.77
4,4,100005,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93,1463.04
...,...,...,...,...,...,...,...,...
18491,18491,118492,SOUTH,South Atlantic,MD,Mixed-Humid,49930.49,1098.51
18492,18492,118493,NORTHEAST,New England,ME,Very-Cold,222186.04,3613.44
18493,18493,118494,SOUTH,West South Central,TX,Hot-Humid,51593.72,1428.31
18494,18494,118495,SOUTH,South Atlantic,SC,Hot-Humid,63555.21,2224.94


In [4]:
# Keep only the four location/climate columns plus the TOTALBTU column
regional_df = regional_df.loc[
    :, ['REGIONC', 'DIVISION', 'state_postal', 'BA_climate', 'TOTALBTU']
]
regional_df.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU
0,WEST,Mountain South,NM,Mixed-Dry,144647.71
1,SOUTH,West South Central,AR,Mixed-Humid,28034.61
2,WEST,Mountain South,NM,Mixed-Dry,30749.71
3,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93


In [5]:
# Count missing values per column
regional_df.isna().sum()

REGIONC         0
DIVISION        0
state_postal    0
BA_climate      0
TOTALBTU        0
dtype: int64

In [6]:
# Target vector is TOTALBTU; the feature matrix is every remaining column
y = regional_df['TOTALBTU']
X = regional_df.drop(columns=['TOTALBTU'])

In [7]:
# Show the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate
0,WEST,Mountain South,NM,Mixed-Dry
1,SOUTH,West South Central,AR,Mixed-Humid
2,WEST,Mountain South,NM,Mixed-Dry
3,SOUTH,South Atlantic,SC,Mixed-Humid
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid


In [9]:
# Show the first five target values (positional slice)
y.iloc[:5]

0    144647.71
1     28034.61
2     30749.71
3     86765.19
4     59126.93
Name: TOTALBTU, dtype: float64

In [13]:
# Every column of X is categorical, so list all column positions 0..n-1
cat_features = list(range(len(X.columns)))

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Configure the CatBoost regressor (keyword arguments grouped by role)
model = CatBoostRegressor(
    # objective
    loss_function='RMSE',
    # boosting budget and tree shape
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    # columns (by position) to treat as categorical
    cat_features=cat_features,
    # fixed seed; verbose=200 logs progress every 200 iterations
    random_seed=42,
    verbose=200,
)

In [16]:
# Train the regressor on the training split
model.fit(X=X_train, y=y_train)

0:	learn: 53432.9341497	total: 175ms	remaining: 2m 55s
200:	learn: 50190.0033311	total: 6.4s	remaining: 25.4s
400:	learn: 49805.3311753	total: 12.9s	remaining: 19.3s
600:	learn: 49498.4169711	total: 19.3s	remaining: 12.8s
800:	learn: 49211.8439213	total: 25.6s	remaining: 6.35s
999:	learn: 48990.1598638	total: 31.9s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x28f0b6af820>

In [17]:
# Predict the target for the held-out test rows
y_pred = model.predict(data=X_test)

In [18]:
# Score the fitted model on the training split, then on the test split
train_r2, test_r2 = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [19]:
# Report both scores, one "label value" line each (same text as two separate prints)
print("\n".join(
    " ".join((label, str(score)))
    for label, score in (
        ("Training R-squared:", train_r2),
        ("Test R-squared:", test_r2),
    )
))

Training R-squared: 0.12250990784063331
Test R-squared: 0.11186680602851973
