In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the modelling table; the relative path is resolved against the
# kernel's current working directory.
csv_path = '../../Final_table.csv'
data = pd.read_csv(csv_path)

# Performing Decision Tree Regression
### Splitting dataframe into train and test datasets

In [4]:
# Feature columns are picked by position: 1-23 and 25-28
# (positions 0 and 24 are deliberately left out of the feature matrix).
feature_positions = list(range(1, 24)) + list(range(25, 29))
feature_var = data.iloc[:, feature_positions]

# Regression target, selected by column name.
target_var = data['DPL_historical_da']

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_var,
    target_var,
    test_size=0.2,
    random_state=156,
)

### Running regression tree model with depth of 10

In [5]:
# Fit a regression tree capped at depth 10; random_state is fixed so the
# fitted tree is reproducible.
model = tree.DecisionTreeRegressor(max_depth=10, random_state=156).fit(X_train, y_train)

# Render the fitted tree as indented text rules labelled with the column names.
model_text = tree.export_text(model, feature_names=X_train.columns.tolist())
print(model_text)

|--- APS_forecast <= 7171.33
|   |--- Henry Hub Natural Gas Spot Price (Dollars per Million Btu) <= 5.27
|   |   |--- APS_forecast <= 6438.88
|   |   |   |--- Henry Hub Natural Gas Spot Price (Dollars per Million Btu) <= 4.03
|   |   |   |   |--- OTHER_wind <= 17.73
|   |   |   |   |   |--- APS_forecast <= 5616.60
|   |   |   |   |   |   |--- APS_forecast <= 4801.83
|   |   |   |   |   |   |   |--- forecast_gen_outage_mw_other <= 19529.79
|   |   |   |   |   |   |   |   |--- APS_forecast <= 4284.82
|   |   |   |   |   |   |   |   |   |--- AEP_forecast <= 10711.62
|   |   |   |   |   |   |   |   |   |   |--- value: [9.99]
|   |   |   |   |   |   |   |   |   |--- AEP_forecast >  10711.62
|   |   |   |   |   |   |   |   |   |   |--- value: [12.69]
|   |   |   |   |   |   |   |   |--- APS_forecast >  4284.82
|   |   |   |   |   |   |   |   |   |--- Henry Hub Natural Gas Spot Price (Dollars per Million Btu) <= 3.19
|   |   |   |   |   |   |   |   |   |   |--- value: [15.77]
|   |   |   |   

### Determining importance of feature columns

In [6]:
# Pair every training column with the importance score of the fitted tree.
fi = model.feature_importances_
names = X_train.columns
importance_dict = {name: score for name, score in zip(names, fi)}

# Header line, then one "<feature>: <importance>" line per column,
# in the same order as the training columns.
print("Feature Importance:")
for feature in importance_dict:
    importance = importance_dict[feature]
    print(f"{feature}: {importance}")

Feature Importance:
MIDATL_solar: 0.020052096775953942
OTHER_solar: 0.0027742791512936583
RFC_solar: 0.010602146044949079
RTO_solar: 0.0007881240531504536
SOUTH_solar: 0.0047906310177377185
WEST_solar: 0.0009595765017299727
MIDATL_wind: 0.0028324502383230586
OTHER_wind: 0.0143876364827465
RFC_wind: 0.0015033732114928683
RTO_wind: 0.0032465586841967818
SOUTH_wind: 0.0059438570790188595
WEST_wind: 0.002914779883208047
AEP_forecast: 0.007589905661115137
APS_forecast: 0.40076061078819236
ATSI_forecast: 0.0008498970712578761
COMED_forecast: 0.0066281501461989195
DAY_forecast: 0.0029998306142857934
DEOK_forecast: 0.006229860852297359
DOM_forecast: 0.12226952690492078
DUQ_forecast: 0.005726389485705696
EKPC_forecast: 0.011381762245808065
MIDATL_forecast: 0.0608994444902208
RTO_forecast: 0.011238052340797463
forecast_gen_outage_mw_rto: 0.00591923448969646
forecast_gen_outage_mw_west: 0.04718174646619918
forecast_gen_outage_mw_other: 0.02619271184503344
Henry Hub Natural Gas Spot Price (Dollars

### MSE and R^2

In [7]:
# Score the current `model` on the held-out rows and report MSE followed by R^2.
preds = model.predict(X_test)
holdout_mse = mean_squared_error(y_test, preds)
holdout_r2 = r2_score(y_test, preds)
print(holdout_mse, holdout_r2)

541.1199405750033 0.7967122859329586


### Determining Optimal Depth between 1 and 29

In [8]:
# Sweep max_depth over 1..29, recording train and test MSE for each depth.
# NOTE(review): the depth is selected on the same test split that is used to
# report performance, so the reported minimum is optimistically biased;
# consider a separate validation split or cross-validation.
mse = {'k':[], 'train_mse':[], 'test_mse':[]}
for k in range(1,30):
    # '\r' keeps the progress message on a single, continuously updated line.
    print("Fit with max_depth:", k, end='\r', flush=True)

    # Use a loop-local estimator so the notebook-level `model` fitted in an
    # earlier cell is not silently replaced by the last (depth-29) candidate.
    candidate = tree.DecisionTreeRegressor(max_depth=k, random_state=156)
    candidate.fit(X_train, y_train)
    preds_train = candidate.predict(X_train)
    preds_test = candidate.predict(X_test)

    mse['k'].append(k)
    mse['train_mse'].append(mean_squared_error(y_train, preds_train))
    mse['test_mse'].append(mean_squared_error(y_test, preds_test))

# list.index returns the first match, i.e. the shallowest depth that reaches
# the minimum test MSE.
best_test_mse = min(mse['test_mse'])
idx = mse['test_mse'].index(best_test_mse)
print('Depth of the model yielding minimum test MSE is:', mse['k'][idx])
print('Optimized model has MSE:', best_test_mse)

Depth of the model yielding minimum test MSE is: 25
Optimized model has MSE: 480.4415659857369


### Running regression tree model with optimal depth of 25

In [9]:
# Refit the regression tree at depth 25, the depth reported by the sweep.
model = tree.DecisionTreeRegressor(max_depth=25, random_state=156).fit(X_train, y_train)

# Text rendering of the fitted tree, labelled with the training column names.
model_text = tree.export_text(model, feature_names=X_train.columns.tolist())
print(model_text)

|--- APS_forecast <= 7171.33
|   |--- Henry Hub Natural Gas Spot Price (Dollars per Million Btu) <= 5.27
|   |   |--- APS_forecast <= 6438.88
|   |   |   |--- Henry Hub Natural Gas Spot Price (Dollars per Million Btu) <= 4.03
|   |   |   |   |--- OTHER_wind <= 17.73
|   |   |   |   |   |--- APS_forecast <= 5616.60
|   |   |   |   |   |   |--- APS_forecast <= 4801.83
|   |   |   |   |   |   |   |--- forecast_gen_outage_mw_other <= 19529.79
|   |   |   |   |   |   |   |   |--- APS_forecast <= 4284.82
|   |   |   |   |   |   |   |   |   |--- AEP_forecast <= 10711.62
|   |   |   |   |   |   |   |   |   |   |--- forecast_gen_outage_mw_rto <= 37317.17
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 14
|   |   |   |   |   |   |   |   |   |   |--- forecast_gen_outage_mw_rto >  37317.17
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 5
|   |   |   |   |   |   |   |   |   |--- AEP_forecast >  10711.62
|   |   |   |   |   |   |   |   |   |   |

### Determining importance of feature columns

In [10]:
# Map each training column to its importance score in the refitted tree.
fi = model.feature_importances_
names = X_train.columns
importance_dict = {name: score for name, score in zip(names, fi)}

# Header line, then one "<feature>: <importance>" line per column,
# in the same order as the training columns.
print("Feature Importance:")
for feature in importance_dict:
    importance = importance_dict[feature]
    print(f"{feature}: {importance}")

Feature Importance:
MIDATL_solar: 0.01978557948714338
OTHER_solar: 0.0029890639151678084
RFC_solar: 0.008409791404208585
RTO_solar: 0.00439953364992183
SOUTH_solar: 0.001481978795290021
WEST_solar: 0.0038378087870487034
MIDATL_wind: 0.005808190420483075
OTHER_wind: 0.015518603355378491
RFC_wind: 0.0010067279902418905
RTO_wind: 0.004703023262917588
SOUTH_wind: 0.006228160020047712
WEST_wind: 0.004899377791278393
AEP_forecast: 0.008848115365789102
APS_forecast: 0.3794323764719937
ATSI_forecast: 0.0027221074073245245
COMED_forecast: 0.009860867113802555
DAY_forecast: 0.00447059931674008
DEOK_forecast: 0.011896694334454264
DOM_forecast: 0.12140648559731942
DUQ_forecast: 0.004076729871855333
EKPC_forecast: 0.014054384390838793
MIDATL_forecast: 0.06245524758031954
RTO_forecast: 0.01306315376340437
forecast_gen_outage_mw_rto: 0.011799581432673149
forecast_gen_outage_mw_west: 0.04841671471388981
forecast_gen_outage_mw_other: 0.021402810114841138
Henry Hub Natural Gas Spot Price (Dollars per Mi