In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter the cell output.
# NOTE(review): this blanket filter would also hide any convergence or
# deprecation warnings raised by the models below; consider narrowing it.
warnings.filterwarnings('ignore')


### Reading the merged dataframe

In [None]:
# Load the merged table; the path is relative to the kernel's working
# directory (two levels up), so the notebook must be run from its own folder.
data = pd.read_csv('../../Final_table.csv')
# Preview the first rows to sanity-check the columns that were read.
data.head()

In [None]:
# Pairwise relationship grid over the columns of `data` (can be slow on wide frames).
sns.pairplot(data=data)

In [None]:
# Correlation matrix of every column except the first (positional column 0),
# drawn as an annotated heatmap.
corr_matrix = data.iloc[:, 1:].corr()
sns.heatmap(corr_matrix, annot=True)

### Splitting the dataframe into test and training sets

In [None]:
# Predictors are positional columns 1-23 and 25-28; columns 0 and 24 are left out.
feature_positions = [*range(1, 24), *range(25, 29)]
feature_var = data.iloc[:, feature_positions]
target_var = data['DPL_historical_da']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_var,
    target_var,
    test_size=0.2,
    random_state=156,
)

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selector that keeps the five features with the highest F-statistic.
model_fr = SelectKBest(k=5, score_func=f_regression)

# fit() hands back the selector itself, so data_new_fr is the fitted model_fr.
data_new_fr = model_fr.fit(X_train, y_train)

print("the F-test method computes the F-statistic for each feature, measuring the linear dependency between the feature and the target")
selected_features = data_new_fr.get_feature_names_out()
print("f_regression: ", selected_features)


### Observing the details between variance and linear dependency

In [None]:
# Per-feature p-values and scores from the fitted selector.
# NOTE(review): this cell originally read `data_new_fc`, which is not defined
# anywhere in the notebook and raised NameError on a fresh kernel (presumably
# a selector from a removed cell). It now reads the f_regression selector that
# is actually fitted above; restore the intended selector if a different
# scoring function was meant here.
df_fc = pd.DataFrame({'column':X_train.columns, 'p':data_new_fr.pvalues_, 'score':data_new_fr.scores_})
df_fc

In [None]:
# Tabulate the F-statistic and p-value that f_regression assigned to each training column.
df_fr = pd.DataFrame(
    {
        'column': X_train.columns,
        'p': data_new_fr.pvalues_,
        'score': data_new_fr.scores_,
    }
)
df_fr

# Lasso Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import Lasso

# L1-regularised linear model; alpha=0.1 is the penalty strength.
model_la = Lasso(alpha=0.1)

model_la.fit(X_train, y_train)

# Evaluate on the held-out split. sklearn metrics expect (y_true, y_pred):
# MSE is symmetric, but MAPE divides by |y_true|, so the order matters.
predict = model_la.predict(X_test)
print('r2 ', r2_score(y_test, predict))
print('mse: ', metrics.mean_squared_error(y_test, predict))
print('mape: ', metrics.mean_absolute_percentage_error(y_test, predict))

In [None]:
import matplotlib.pyplot as plt

# Plot predictions and ground truth against the row position in the test split.
index = range(len(X_test))

ax = plt.gca()
ax.scatter(index, predict, color='blue', label='Predicted')
ax.scatter(index, y_test, color='red', label='True')
ax.set_title('Lasso True Values vs. Predicted Values')
ax.set_xlabel('Index of X_test')
ax.set_ylabel('Values')
ax.legend()
ax.grid(True)
plt.show()


In [None]:

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# One entry per panel: (values, bar colour, title, x-axis label).
panels = [
    (predict, 'blue', 'Lasso Histogram of Predicted Values', 'Predicted Values'),
    (y_test, 'red', 'Lasso Histogram of test Values', 'test'),
    (y_train, 'red', 'Lasso Histogram of train Values', 'train'),
]

for ax, (values, colour, title, xlabel) in zip(axs, panels):
    ax.hist(values, bins=10, color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()

plt.show()




# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; alpha=0.1 is the penalty strength.
model_rid = Ridge(alpha=0.1)
model_rid.fit(X_train, y_train)

# Evaluate on the held-out split. sklearn metrics expect (y_true, y_pred):
# MAPE divides by |y_true|, so swapping the arguments changes the result.
predict_rid = model_rid.predict(X_test)
print('r2 ', r2_score(y_test, predict_rid))
print('mse: ', metrics.mean_squared_error(y_test, predict_rid))
print('mape: ', metrics.mean_absolute_percentage_error(y_test, predict_rid))


In [None]:
# Plot Ridge predictions and ground truth against the row position in the test split.
index = range(len(X_test))

plt.scatter(index, predict_rid, color='blue', label='Predicted')
plt.scatter(index, y_test, color='red', label='True')
# Title previously read 'RidgeTrue ...' (missing space).
plt.title('Ridge True Values vs. Predicted Values')
plt.xlabel('Index of X_test')
plt.ylabel('Values')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# One entry per panel: (values, bar colour, title, x-axis label).
# The test/train panels were previously titled "Lasso ..." (copy-paste from the
# Lasso section); they belong to the Ridge figure and are labelled as such.
panels = [
    (predict_rid, 'blue', 'Ridge Histogram of Predicted Values', 'Predicted Values'),
    (y_test, 'red', 'Ridge Histogram of test Values', 'test'),
    (y_train, 'red', 'Ridge Histogram of train Values', 'train'),
]

for ax, (values, colour, title, xlabel) in zip(axs, panels):
    ax.hist(values, bins=10, color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()

plt.show()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Positional indices (within X_train / X_test) of the five-feature subset.
# NOTE(review): presumably the features picked by SelectKBest above — confirm.
subset_cols = [13, 18, 21, 22, 26]

# Full model on every feature, and a reduced model on the subset only.
lm = LinearRegression()
lm_sub = LinearRegression()
lm.fit(X_train, y_train)
lm_sub.fit(X_train.iloc[:, subset_cols], y_train)

predict_lm = lm.predict(X_test)
predict_lm_sub = lm_sub.predict(X_test.iloc[:, subset_cols])

# Report both metrics on the held-out split so they are comparable with each
# other and with the Lasso/Ridge cells (MSE was previously computed on the
# training split while R2 used the test split).
print('r2 ', r2_score(y_test, predict_lm))
print('mse: ', metrics.mean_squared_error(y_test, predict_lm))
print('r2 sub ', r2_score(y_test, predict_lm_sub))
print('mse sub: ', metrics.mean_squared_error(y_test, predict_lm_sub))

In [None]:

fig, axs = plt.subplots(5, 1, figsize=(8, 20))

# One entry per panel: (column in X_train, scatter colour, short label used
# for the title and the x-axis).
regression_panels = [
    ('APS_forecast', 'r', 'APS_forecast'),
    ('DOM_forecast', 'g', 'DOM_forecast'),
    ('MIDATL_forecast', 'purple', 'MIDATL_forecast'),
    ('RTO_forecast', 'y', 'RTO_forecast'),
    ('Henry Hub Natural Gas Spot Price (Dollars per Million Btu)', 'orange', 'Henry Hub Price'),
]

# First-order (straight-line) fit of the training target against each feature.
for ax, (column, colour, label) in zip(axs, regression_panels):
    sns.regplot(
        x=X_train[column],
        y=y_train,
        order=1,
        ci=None,
        scatter_kws={'color': colour, 's': 9},
        ax=ax,
    )
    ax.set_title('Regression Plot for ' + label)
    ax.set_xlabel(label)
    ax.set_ylabel('y_train')

plt.tight_layout()

plt.show()


# Polynomial

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate

from mlxtend.evaluate import bias_variance_decomp

In [None]:

# Bias/variance decomposition of the five-feature linear model over 200
# bootstrap rounds, scored with squared-error loss on the test split.
# bias_variance_decomp calls fit() on the estimator it receives for every
# bootstrap sample, so a fresh LinearRegression (same default settings as
# lm_sub) is passed instead of lm_sub itself; otherwise lm_sub would be left
# fitted on the last bootstrap sample rather than on the full training split.
subset_cols = [13, 18, 21, 22, 26]

mse, bias, var = bias_variance_decomp(LinearRegression(),
                                      X_train.iloc[:, subset_cols].to_numpy(), y_train.to_numpy(),
                                      X_test.iloc[:, subset_cols].to_numpy(), y_test.to_numpy(),
                                      loss='mse', num_rounds=200, random_seed=100)

In [None]:
# Sweep polynomial degree 1-7 on the five-feature subset and record, per degree,
# the bias/variance decomposition, a 10-fold CV error and the test-split R2.
MSEs   = []
Biases = []
Vars   = []
poly_models = []

# Positional indices (within X_train / X_test) of the five-feature subset.
subset_cols = [13, 18, 21, 22, 26]
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()

for i in range(1, 8):
    poly         = PolynomialFeatures(degree=i, include_bias=False)
    X_train_poly = poly.fit_transform(X_train.iloc[:, subset_cols])
    # Only transform the test split: the expansion is fitted on training data.
    X_test_poly  = poly.transform(X_test.iloc[:, subset_cols])

    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    poly_models.append(model)

    # bias_variance_decomp refits the estimator it is given on every bootstrap
    # sample, so hand it a throw-away estimator; passing `model` would leave the
    # stored model (and the R2 below) based on the last bootstrap sample
    # instead of the full training split.
    mse, bias, var = bias_variance_decomp(LinearRegression(),
                                           X_train_poly, y_train_arr, X_test_poly, y_test_arr,
                                           loss='mse', num_rounds=200, random_seed=100)

    MSEs.append(mse)
    Biases.append(bias)
    Vars.append(var)

    # 10-fold CV on the training split. Scored with MSE so that `mse_avg` is
    # what its name says (it was previously computed from the MAE scorer).
    cv_results = cross_validate(model, X_train_poly, y_train, cv=10, scoring='neg_mean_squared_error')
    mse_avg = -cv_results['test_score'].mean()

    y_pred = model.predict(X_test_poly)
    r2 = r2_score(y_test, y_pred)

    # Built-in round() works for both NumPy scalars and plain Python floats
    # (r2_score may return either, depending on the scikit-learn version).
    print('Degree: %s, MSE: %s, Bias: %s, Var: %s, Cross-Validation: %s, R2_score %s ' % (
        i, round(mse, 2), round(bias, 2), round(var, 2), round(mse_avg, 2), round(r2, 2)))


In [None]:

# Plot the three per-degree series collected in the sweep against their
# 1-based position in the lists.
x_values = range(1, len(MSEs) + 1)

for series, series_label in ((MSEs, 'MSE'), (Biases, 'Bias'), (Vars, 'Var')):
    plt.plot(x_values, series, label=series_label)

plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Plot of Three Arrays')
plt.legend()

plt.show()

# Cross Validation

In [None]:
# 10-fold cross-validation of each model on the full feature/target frames.
# Scored with MSE so the values match the "MSE" labels printed below (the MAE
# scorer was used previously).
subset_cols = [13, 18, 21, 22, 26]
cv_kwargs = dict(cv=10, scoring='neg_mean_squared_error')

cv_results_la = cross_validate(model_la, feature_var, target_var, **cv_kwargs)
cv_results_rid = cross_validate(model_rid, feature_var, target_var, **cv_kwargs)
# The subset model must see only its five columns; on all features it would
# simply duplicate the full linear model.
cv_results_lm_sub = cross_validate(lm_sub, feature_var.iloc[:, subset_cols], target_var, **cv_kwargs)
cv_results_lm = cross_validate(lm, feature_var, target_var, **cv_kwargs)

# Scorers are negated losses, so flip the sign back to a positive error.
mse_avg_la = -cv_results_la['test_score'].mean()
mse_avg_rid = -cv_results_rid['test_score'].mean()
mse_avg_lm_sub = -cv_results_lm_sub['test_score'].mean()
mse_avg_lm = -cv_results_lm['test_score'].mean()

# Test-split predictions of each model, as computed in the cells where the
# models were fitted. (Previously all four came from the last polynomial
# `model`, so every R2 was identical and unrelated to the model named.)
y_pred_la = predict
y_pred_rid = predict_rid
y_pred_lm_sub = predict_lm_sub
y_pred_lm = predict_lm

r2_la = r2_score(y_test, y_pred_la)
r2_rid = r2_score(y_test, y_pred_rid)
r2_lm_sub = r2_score(y_test, y_pred_lm_sub)
r2_lm = r2_score(y_test, y_pred_lm)

print('Lasso MSE (10-Fold CV): ', mse_avg_la)
print('Ridge MSE (10-Fold CV): ', mse_avg_rid)
print('Linear Subset MSE (10-Fold CV): ', mse_avg_lm_sub)
print('Linear MSE (10-Fold CV): ', mse_avg_lm)

print('Lasso R2 (test split): ', r2_la)
print('Ridge R2 (test split): ', r2_rid)
print('Linear Subset R2 (test split): ', r2_lm_sub)
print('Linear R2 (test split): ', r2_lm)

