In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Read in CSV file
regional_df = pd.read_csv("Table_CSVs/regional.csv")
regional_df

Unnamed: 0.1,Unnamed: 0,DOEID,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU,TOTALDOL
0,0,100001,WEST,Mountain South,NM,Mixed-Dry,144647.71,2656.89
1,1,100002,SOUTH,West South Central,AR,Mixed-Humid,28034.61,975.00
2,2,100003,WEST,Mountain South,NM,Mixed-Dry,30749.71,522.65
3,3,100004,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19,2061.77
4,4,100005,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93,1463.04
...,...,...,...,...,...,...,...,...
18491,18491,118492,SOUTH,South Atlantic,MD,Mixed-Humid,49930.49,1098.51
18492,18492,118493,NORTHEAST,New England,ME,Very-Cold,222186.04,3613.44
18493,18493,118494,SOUTH,West South Central,TX,Hot-Humid,51593.72,1428.31
18494,18494,118495,SOUTH,South Atlantic,SC,Hot-Humid,63555.21,2224.94


In [3]:
# Create DataFrame with regional information
regional_df = regional_df[['REGIONC', 'DIVISION', 'state_postal', 'BA_climate', 'TOTALBTU']]
regional_df.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate,TOTALBTU
0,WEST,Mountain South,NM,Mixed-Dry,144647.71
1,SOUTH,West South Central,AR,Mixed-Humid,28034.61
2,WEST,Mountain South,NM,Mixed-Dry,30749.71
3,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93


In [4]:
#Checking nulls
regional_df.isnull().sum()

REGIONC         0
DIVISION        0
state_postal    0
BA_climate      0
TOTALBTU        0
dtype: int64

In [6]:
# Encode the categorical columns as 'category' types (LightGBM needs this)
for cat_col in ['REGIONC', 'DIVISION', 'state_postal', 'BA_climate']:
    regional_df[cat_col] = regional_df[cat_col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regional_df[cat_col] = regional_df[cat_col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regional_df[cat_col] = regional_df[cat_col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regional_df[cat_col] = regional_df[cat_col].astype('category')
A value is tryin

In [7]:
# Split the data into the feature matrix and the target vector
X = regional_df.drop('TOTALBTU', axis=1)
y = regional_df['TOTALBTU']

In [8]:
# Preview the features data
X.head()

Unnamed: 0,REGIONC,DIVISION,state_postal,BA_climate
0,WEST,Mountain South,NM,Mixed-Dry
1,SOUTH,West South Central,AR,Mixed-Humid
2,WEST,Mountain South,NM,Mixed-Dry
3,SOUTH,South Atlantic,SC,Mixed-Humid
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid


In [9]:
# Preview the first five entries for the target variable
y[:5]

0    144647.71
1     28034.61
2     30749.71
3     86765.19
4     59126.93
Name: TOTALBTU, dtype: float64

In [15]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=42)

In [16]:
# Create an LGBM dataset for training
train_data = lgb.Dataset(X_train, label=y_train)

In [17]:
# Define the parameters
parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

In [18]:
# Train the model
gbm = lgb.train(parameters, train_data, num_boost_round=500)

In [19]:
# Make predictions
y_train_pred = gbm.predict(X_train)
y_test_pred = gbm.predict(X_test)

In [20]:
# Evaluate the model performance
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [21]:
print("Training R-squared:", train_r2)
print("Test R-squared:", test_r2)

Training R-squared: 0.1262659760412148
Test R-squared: 0.11176486825412113
