In [24]:
from catboost import CatBoostRegressor, Pool

import pandas as pd
from sklearn.model_selection import KFold
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

df = pd.read_csv('X_dropped_DE.csv').drop(['ID',"DAY_ID"], axis=1)
labels=pd.read_csv('Y_dropped_DE.csv').drop('ID', axis=1)


df_FR = pd.read_csv('X_dropped_FR.csv').drop(['ID',"DAY_ID"], axis=1)
labels_FR=pd.read_csv('Y_dropped_FR.csv').drop('ID', axis=1)

df.columns

Index(['DE_CONSUMPTION', 'DE_FR_EXCHANGE', 'DE_NET_EXPORT', 'DE_GAS',
       'DE_COAL', 'DE_HYDRO', 'DE_NUCLEAR', 'DE_SOLAR', 'DE_WINDPOW',
       'DE_LIGNITE', 'DE_RESIDUAL_LOAD', 'DE_RAIN', 'DE_WIND', 'DE_TEMP',
       'GAS_RET', 'COAL_RET', 'CARBON_RET'],
      dtype='object')

In [74]:

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

lr = LinearRegression()



X_train_clean = df
Y_train_clean = labels

lr.fit(X_train_clean, Y_train_clean)

output_train = lr.predict(X_train_clean)

# print (output_train[:,1])
def metric_train(output):
    if not isinstance(output, list):
        return  100 *spearmanr(output, Y_train_clean["TARGET"]).correlation
    else:
        return  [100 *spearmanr(output[:,0], Y_train_clean["TARGET"]).correlation , 100 *spearmanr(output[:,1], Y_train_clean["Rank"]).correlation ]

print('Spearman correlation for the train set', ( metric_train(output_train)))


X2 = sm.add_constant(X_train_clean)
est = sm.OLS(Y_train_clean["TARGET"], X2)
est2 = est.fit()
print(est2.summary())

Spearman correlation for the train set [[100.          94.51046899  41.26156366]
 [ 94.51046899 100.          42.60874859]
 [ 41.26156366  42.60874859 100.        ]]
                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     4.155
Date:                Sat, 29 Jul 2023   Prob (F-statistic):           5.81e-08
Time:                        18:21:19   Log-Likelihood:                -845.69
No. Observations:                 596   AIC:                             1727.
Df Residuals:                     578   BIC:                             1806.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                       coef    std err      

In [28]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.5)
# Fit the ridge regressor
ridge.fit(X_train_clean, Y_train_clean)
output_train = ridge.predict(X_train_clean)
print('Spearman correlation for the train set', ( metric_train(output_train)))

Spearman correlation for the train set [41.381457021810775, 42.52897460790931]


In [68]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.001,max_iter=100000)
# Fit the ridge regressor
lasso.fit(X_train_clean, Y_train_clean)
output_train = lasso.predict(X_train_clean)
print('Spearman correlation for the train set', ( metric_train(output_train)))
included_features = [[feature,coef] for feature,coef in zip(X_train_clean.columns,lasso.coef_[1]) ]
print(included_features)

Spearman correlation for the train set [41.363715615113335, 42.60399865233334]
[['DE_CONSUMPTION', 14.395335702921487], ['DE_FR_EXCHANGE', 108.00423196351038], ['DE_NET_EXPORT', -50.4816326117934], ['DE_GAS', -58.13944541592827], ['DE_COAL', -137.27186466884032], ['DE_HYDRO', 41.97159425595621], ['DE_NUCLEAR', -67.45189307531568], ['DE_SOLAR', -36.26044719538377], ['DE_WINDPOW', -39.51064612508928], ['DE_LIGNITE', -120.58908559817978], ['DE_RESIDUAL_LOAD', 371.11297865256114], ['DE_RAIN', 20.30606101194104], ['DE_WIND', 10.456432002280122], ['DE_TEMP', 13.675122317052145], ['GAS_RET', -17.694134152734364], ['COAL_RET', -2.5936878206695813], ['CARBON_RET', -0.9756681528975524]]


In [51]:
X_train_clean.isnull().sum()

DE_CONSUMPTION      0
DE_FR_EXCHANGE      0
DE_NET_EXPORT       0
DE_GAS              0
DE_COAL             0
DE_HYDRO            0
DE_NUCLEAR          0
DE_SOLAR            0
DE_WINDPOW          0
DE_LIGNITE          0
DE_RESIDUAL_LOAD    0
DE_RAIN             0
DE_WIND             0
DE_TEMP             0
GAS_RET             0
COAL_RET            0
CARBON_RET          0
dtype: int64

In [81]:
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor()
# Fit the ridge regressor
DT.fit(X_train_clean, Y_train_clean["TARGET"])
output_train = DT.predict(X_train_clean)
# print(output_train)
print('Spearman correlation for the train set', ( metric_train(output_train)))

Spearman correlation for the train set 100.0


In [80]:
from sklearn.ensemble import RandomForestRegressor
DF = RandomForestRegressor()
# Fit the ridge regressor
DF.fit(X_train_clean, Y_train_clean["TARGET"])
output_train = DF.predict(X_train_clean)
# print(output_train)
print('Spearman correlation for the train set', ( metric_train(output_train)))

Spearman correlation for the train set 93.94370725341274
