In [851]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LinearRegression, BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from bayes_opt import BayesianOptimization
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers

In [684]:
results_matrix=pd.DataFrame(index=["SR","MSE"])

## Reading data in/define necessary functions

In [685]:
X_train_raw=pd.read_csv("X_train.csv")
y_train_raw=pd.read_csv("y_train.csv")
X_test_raw=pd.read_csv("X_test.csv")

X_train_raw=X_train_raw.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)


#lets drop the redundant stuff
X_train,X_test,y_train,y_test=train_test_split(X_train_raw,y_train_raw,test_size=0.2,random_state=13)

y_train=y_train["TARGET"]
y_test=y_test["TARGET"]

In [817]:
def split_into_country(df,y_df,country: str):
    
    y_df=y_df[df["COUNTRY"]==country]
    df=df[df["COUNTRY"]==country]
    df=df.drop(["ID","DAY_ID","COUNTRY"],axis=1)
    df.fillna(df.median(),inplace=True)
    
    return df,y_df

def ridgeregression(df,y_df):
    
    lrcv=RidgeCV(store_cv_values=True)
    lrcv.fit(df,y_df)
    #lr=Ridge(alpha=lrcv.alpha_)
    lr=Ridge(alpha=10)
    lr.fit(df,y_df)
    
    return lr

def SVRegression(df,y_df,C=10,epsilon=0.01):
    
    model=SVR(kernel="linear",C=C,epsilon=epsilon)
    model.fit(df,y_df["TARGET"])
    
    return model

def SVRegressionP(df,y_df,degree,C=10,epsilon=0.01):
    
    model=SVR(kernel="poly",degree=degree,C=C,epsilon=epsilon)
    model.fit(df,y_df["TARGET"])
    
    return model

def SVR_error_de(C,epsilon):
    
    df=de_scale
    y_df=y_de_scale
    model=SVR(kernel="linear",C=C,epsilon=epsilon)
    model.fit(df,y_df["TARGET"])
    y_pred=model.predict(df)
    mse=mean_squared_error(y_df["TARGET"],y_pred)
    
    return -mse

def SVR_error_fr(C,epsilon):
    
    df=fr_scale
    y_df=y_fr_scale
    model=SVR(kernel="linear",C=C,epsilon=epsilon)
    model.fit(df,y_df["TARGET"])
    y_pred=model.predict(df)
    mse=mean_squared_error(y_df["TARGET"],y_pred)
    
    return -mse
     
def postprocess(df_fr,df_de,x,y):
    
    fr=0
    de=0
    out=[]

    for i in range (y.shape[0]):
        if x['COUNTRY'].values[i]=='FR':
            out.append([df_fr[fr]])
            fr = fr + 1
        else:
            out.append([df_de[de]])
            de = de + 1

    return np.array(out)

def z_scale(data):
    
    data=pd.DataFrame(data)
    scaler=StandardScaler()
    normalized_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

    return normalized_data

def lasso(df,y_df):
    #lasso_cv=LassoCV(cv=10,max_iter=10000)
    #lasso_cv.fit(df,y_df)
    lasso=Lasso(alpha=0.05)
    lasso.fit(df,y_df)
    relevant_vars=df.columns[lasso.coef_!=0]

    return list(relevant_vars)

def Gboost(df,y_df):
    gbr=GradientBoostingRegressor(n_estimators=100,learning_rate=0.1,max_depth=3)
    gbr.fit(df,y_df["TARGET"])
    
    return gbr

def spearman(output, y):

    return spearmanr(output, y).correlation

def spear_loss(y_true,y_pred):
    rho=tf.py_function(lambda yt, yp: spearmanr(yt, yp).correlation, [y_true,y_pred], [tf.float32])
    return -rho[0]

    

## Benchmark

In [687]:
lr=LinearRegression()

X_train_clean = X_train.drop(['COUNTRY'], axis=1).fillna(0)
Y_train_clean = y_train

lr.fit(X_train_clean, Y_train_clean)
output_train = lr.predict(X_train_clean)

print('Spearman correlation for the train set: {:.1f}%'.format(100 * spearman(output_train,y_train)))

Spearman correlation for the train set: 29.0%


In [688]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = lr.predict(X_test_clean)
mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])

print('Spearman correlation for the train set: {:.1f}%'.format(100 * spearman(lr.predict(X_test_clean),y_test)))
print("MSE: {:.3f}".format(mse_test))

results_matrix["Benchmark"]=[100*spearman(lr.predict(X_test_clean),y_test),mse_test]

Spearman correlation for the train set: 21.6%
MSE: 0.806


## Split the dataset into countries

In [689]:
#training set
de_train,y_de_train=split_into_country(X_train,y_train,"DE")
fr_train,y_fr_train=split_into_country(X_train,y_train,"FR")

#test set
de_test,y_de_test=split_into_country(X_test,y_test,"DE")
fr_test,y_fr_test=split_into_country(X_test,y_test,"FR")

## Ridge regression testing

In [820]:
#fit the model
de_train_ridge=ridgeregression(de_train,y_de_train)
fr_train_ridge=ridgeregression(fr_train,y_fr_train)

#make predictions on test set
fr_train_pred=fr_train_ridge.predict(fr_train)
de_train_pred=de_train_ridge.predict(de_train)

train_out=postprocess(fr_train_pred,de_train_pred,X_train,y_train)

print('Spearman correlation for the train set using RidgeRegression divided into countries: {:.1f}%'.format(100 *spearman(train_out, y_train) ))

Spearman correlation for the train set using RidgeRegression divided into countries: 37.0%


In [821]:
#use the model to predict form the test set
fr_test_pred=fr_train_ridge.predict(fr_test)
de_test_pred=de_train_ridge.predict(de_test)

test_out=postprocess(fr_test_pred,de_test_pred,X_test,y_test)

print('Spearman correlation for the test set using RidgeRegression divided into countries: {:.1f}%'.format(100 *spearman(test_out, y_test) ))



Spearman correlation for the test set using RidgeRegression divided into countries: 25.1%


In [822]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["Ridge"]=[100*spearman(test_out,y_test),mse_test]

## SVR Linear

In [693]:
#lets normalise for SVM

#train set
de_train_s=z_scale(de_train)
y_de_train_s=z_scale(y_de_train)
fr_train_s=z_scale(fr_train)
y_fr_train_s=z_scale(y_fr_train)

#test set
de_test_s=z_scale(de_test)
y_de_test_s=z_scale(y_de_test)
fr_test_s=z_scale(fr_test)
y_fr_test_s=z_scale(y_fr_test)

In [694]:
#fit the model
de_svr=SVRegression(de_train_s,y_de_train_s)
fr_svr=SVRegression(fr_train_s,y_fr_train_s)

#make predictions on train set
de_svr_pred=de_svr.predict(de_train_s)
fr_svr_pred=de_svr.predict(fr_train_s)

train_out_svr=postprocess(fr_svr_pred,de_svr_pred,X_train,y_train)
print('Spearman correlation for the train set with SVR split country: {:.1f}%'.format(100*spearman(train_out_svr, y_train) ))

Spearman correlation for the train set with SVR split country: 22.8%


In [695]:
#make predictions on the test set
de_svr_test=de_svr.predict(de_test_s)
fr_svr_test=fr_svr.predict(fr_test_s)

test_out_svr=postprocess(fr_svr_test,de_svr_test,X_test,y_test)
print('Spearman correlation for the test set with SVR split country: {:.1f}%'.format(100*spearman(test_out_svr, y_test) ))

Spearman correlation for the test set with SVR split country: 19.8%


In [696]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = test_out_svr

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR-L"]=[100*spearman(test_out_svr,y_test),mse_test]

## SVR Polynomial

In [697]:
#fit the model
de_svr_p=SVRegressionP(de_train_s,y_de_train_s,degree=3)
fr_svr_p=SVRegressionP(fr_train_s,y_fr_train_s,degree=3)

#make predictions on train set
de_svr_pred_p=de_svr_p.predict(de_train_s)
fr_svr_pred_p=de_svr_p.predict(fr_train_s)

train_out_svr_p=postprocess(fr_svr_pred_p,de_svr_pred_p,X_train,y_train)
print('Spearman correlation for the train set with SVR split country: {:.1f}%'.format(100*spearman(train_out_svr_p, y_train) ))

Spearman correlation for the train set with SVR split country: 50.8%


In [698]:
#make predictions on the test set
de_svr_test_p=de_svr_p.predict(de_test_s)
fr_svr_test_p=fr_svr_p.predict(fr_test_s)

test_out_svr_p=postprocess(fr_svr_test_p,de_svr_test_p,X_test,y_test)
print('Spearman correlation for the test set with SVR split country: {:.1f}%'.format(100*spearman(test_out_svr_p, y_test) ))

Spearman correlation for the test set with SVR split country: 5.0%


In [699]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = test_out_svr_p

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR-P"]=[100*spearman(test_out_svr_p,y_test),mse_test]

## Feature Selection-L

In [700]:
de_relevant_vars=lasso(de_train_s,y_de_train_s["TARGET"])
fr_relevant_vars=lasso(fr_train_s,y_fr_train_s["TARGET"])

selected_de=de_train_s[de_relevant_vars]
selected_fr=fr_train_s[fr_relevant_vars]

selected_de_test=de_test_s[de_relevant_vars]
selected_fr_test=fr_test_s[fr_relevant_vars]

print("DE",de_relevant_vars)
print("FR",fr_relevant_vars)

DE ['DE_NET_EXPORT', 'FR_WINDPOW', 'DE_RESIDUAL_LOAD']
FR ['DE_NET_EXPORT', 'DE_HYDRO', 'FR_HYDRO', 'FR_WINDPOW', 'DE_TEMP', 'GAS_RET', 'CARBON_RET']


In [701]:
#fit the model
de_svr_sel=SVRegression(selected_de,y_de_train_s,C=best_C,epsilon=best_epsilon)
fr_svr_sel=SVRegression(selected_fr,y_fr_train_s,C=best_C,epsilon=best_epsilon)

#make predictions on train set
selected_de_pred=de_svr_sel.predict(selected_de)
selected_fr_pred=fr_svr_sel.predict(selected_fr)

select_svr=postprocess(selected_fr_pred,selected_de_pred,X_train,y_train)
print('Spearman correlation for the train set with SVR and selected: {:.1f}%'.format(100*spearman(select_svr, y_train) ))

Spearman correlation for the train set with SVR and selected: 30.4%


In [702]:
#make predictions on the test set
selected_de_test_pred=de_svr_sel.predict(selected_de_test)
selected_fr_test_pred=fr_svr_sel.predict(selected_fr_test)

select_svr_test=postprocess(selected_fr_test_pred,selected_de_test_pred,X_test,y_test)
print('Spearman correlation for the test set with SVR and selected: {:.1f}%'.format(100*spearman(select_svr_test, y_test)))

Spearman correlation for the test set with SVR and selected: 26.1%


In [703]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = select_svr_test

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR_sel_L"]=[100*spearman(select_svr_test,y_test),mse_test]

## Feature Selection-P

In [704]:
#fit the model
de_svr_sel_p=SVRegressionP(selected_de,y_de_train_s,C=best_C,epsilon=best_epsilon,degree=3)
fr_svr_sel_p=SVRegressionP(selected_fr,y_fr_train_s,C=best_C,epsilon=best_epsilon,degree=3)

#make predictions on train set
selected_de_pred_p=de_svr_sel_p.predict(selected_de)
selected_fr_pred_p=fr_svr_sel_p.predict(selected_fr)

select_svr_p=postprocess(selected_fr_pred_p,selected_de_pred_p,X_train,y_train)
print('Spearman correlation for the train set with SVR and selected: {:.1f}%'.format(100*spearman(select_svr_p, y_train) ))

Spearman correlation for the train set with SVR and selected: 30.0%


In [705]:
#make predictions on the test set
selected_de_test_pred_p=de_svr_sel_p.predict(selected_de_test)
selected_fr_test_pred_p=fr_svr_sel_p.predict(selected_fr_test)

select_svr_test_p=postprocess(selected_fr_test_pred_p,selected_de_test_pred_p,X_test,y_test)
print('Spearman correlation for the test set with SVR and selected: {:.1f}%'.format(100*spearman(select_svr_test_p, y_test)))

Spearman correlation for the test set with SVR and selected: 16.1%


In [706]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = select_svr_test_p

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR_sel_P"]=[100*spearman(select_svr_test_p,y_test),mse_test]

## Gradient boosting

In [707]:
#fit the model
gbr_de=Gboost(de_train_s,y_de_train_s)
gbr_fr=Gboost(fr_train_s,y_fr_train_s)

#make predictions on the train set
gbr_de_pred=gbr_de.predict(de_train_s)
gbr_fr_pred=gbr_fr.predict(fr_train_s)

gbr_out=postprocess(gbr_fr_pred,gbr_de_pred,X_train,y_train)
print('Spearman correlation for the test set with GBR: {:.1f}%'.format(100*spearman(gbr_out, y_train)))

Spearman correlation for the test set with GBR: 74.9%


In [708]:
#make predictions on the test set

gbr_de_test=gbr_de.predict(de_test_s)
gbr_fr_test=gbr_fr.predict(fr_test_s)

gbr_test_out=postprocess(gbr_fr_test,gbr_de_test,X_test,y_test)
print('Spearman correlation for the test set with GBR: {:.1f}%'.format(100*spearman(gbr_test_out, y_test)))

Spearman correlation for the test set with GBR: 11.4%


In [709]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = gbr_test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["Grad Boost"]=[100*spearman(gbr_test_out,y_test),mse_test]

## Neural Network

In [710]:
de_nn_cols=[i for i,x in enumerate(de_train_ridge.coef_*100>5) if x==True]
fr_nn_cols=[i for i,x in enumerate(fr_train_ridge.coef_*100>5) if x==True]

de_nn_train=de_train_s.iloc[:,de_nn_cols]
fr_nn_train=fr_train_s.iloc[:,fr_nn_cols]


# Create a simple neural network model
de_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(de_nn_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Single output for regression
])

# Compile the model
de_model.compile(loss="mse", optimizer='sgd')

# Train the model
de_model.fit(de_nn_train, y_de_train, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 

<keras.src.callbacks.History at 0x23f229101d0>

In [711]:
# Create a simple neural network model
fr_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(fr_nn_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Single output for regression
])

# Compile the model
fr_model.compile(loss="mse", optimizer='sgd')

# Train the model
fr_model.fit(fr_nn_train, y_fr_train, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 

<keras.src.callbacks.History at 0x23f3afbfd90>

In [712]:
de_nn_test=de_test_s.iloc[:,de_nn_cols]
fr_nn_test=fr_test_s.iloc[:,fr_nn_cols]

de_nn_pred=de_model.predict(de_nn_test)
fr_nn_pred=fr_model.predict(fr_nn_test)



In [713]:
nn_test_out=postprocess(fr_nn_pred,de_nn_pred,X_test,y_test)
nn_test_out=nn_test_out.reshape(nn_test_out.shape[0],-1)
print('Spearman correlation for the test set with NN: {:.1f}%'.format(100*spearman(nn_test_out, y_test)))

Spearman correlation for the test set with NN: 10.9%


In [714]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = nn_test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["NN"]=[100*spearman(nn_test_out,y_test),mse_test]

## Poly with selected features

In [715]:
lr_cv=LinearRegression()
scores_de=cross_val_score(lr_cv,de_nn_train,y_de_train,scoring="r2",cv=5)
scores_fr=cross_val_score(lr_cv,fr_nn_train,y_fr_train,scoring="r2",cv=5)

poly_de=PolynomialFeatures(degree=2, include_bias=False)
poly_fr=PolynomialFeatures(degree=2, include_bias=False)

poly_feat_de=poly_de.fit_transform(de_nn_train)
poly_feat_fr=poly_fr.fit_transform(fr_nn_train)
poly_de_out=poly_de.transform(de_nn_train)
poly_fr_out=poly_fr.transform(fr_nn_train)
poly_de_test_out=poly_de.transform(de_nn_test)
poly_fr_test_out=poly_fr.transform(fr_nn_test)

lr_poly_de=LinearRegression()
lr_poly_fr=LinearRegression()

#fit the model
lr_poly_de.fit(poly_feat_de,y_de_train)
lr_poly_fr.fit(poly_feat_fr,y_fr_train)

#predict on train set
lr_poly_de_pred=lr_poly_de.predict(poly_de_out)
lr_poly_fr_pred=lr_poly_fr.predict(poly_fr_out)


poly_train_out=postprocess(lr_poly_fr_pred,lr_poly_de_pred,X_train,y_train)
print('Spearman correlation for the train set with poly features: {:.1f}%'.format(100*spearman(poly_train_out, y_train)))

Spearman correlation for the train set with poly features: 27.5%


In [716]:
#predict on test set
lr_poly_de_test=lr_poly_de.predict(poly_de_test_out)
lr_poly_fr_test=lr_poly_fr.predict(poly_fr_test_out)

poly_test_out=postprocess(lr_poly_fr_test,lr_poly_de_test,X_test,y_test)
print('Spearman correlation for the test set with poly features: {:.1f}%'.format(100*spearman(poly_test_out, y_test)))

Spearman correlation for the test set with poly features: 14.2%


In [717]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = poly_test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["Poly"]=[100*spearman(poly_test_out,y_test),mse_test]

## Removing low correlation features from train set

In [813]:
threshold=0.06

corr_de=de_train_s.corrwith(y_de_train)
de_train_c=de_train_s.loc[:,abs(corr_de)>threshold]
de_test_c=de_test_s.loc[:,abs(corr_de)>threshold]
corr_fr=fr_train_s.corrwith(y_fr_train)
fr_train_c=fr_train_s.loc[:,abs(corr_fr)>threshold]
fr_test_c=fr_test_s.loc[:,abs(corr_fr)>threshold]

de_test_c

Unnamed: 0,DE_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,DE_GAS,DE_COAL,DE_HYDRO,DE_WINDPOW,FR_WINDPOW,DE_LIGNITE,DE_RESIDUAL_LOAD,DE_WIND,FR_WIND
173,-0.990367,0.198628,-0.112689,0.746947,-1.231858,0.955491,0.354309,-0.373501,-0.994481,-1.106939,-0.397712,-0.611540
448,-1.257929,-0.904792,-1.128043,0.336427,-0.248216,-0.093396,-1.012889,-0.485072,0.431948,-0.036186,-0.145876,-0.001253
1109,1.247370,-0.238350,1.957018,-0.828563,0.535648,-0.724273,1.229340,2.253539,1.534373,0.052753,-0.433753,0.259285
1311,0.757594,0.384985,0.503730,0.583682,0.508438,-0.934737,0.260839,-0.081582,1.454654,0.746415,-0.714662,-1.082914
643,-1.064563,0.322065,-0.809799,0.990596,-1.449589,0.575730,-0.669549,-0.353574,-0.741543,-0.414415,-0.624999,-0.571376
...,...,...,...,...,...,...,...,...,...,...,...,...
1484,0.014438,1.130875,-0.992926,-0.564654,-0.634773,1.935940,0.159720,-0.640959,-0.894973,-0.136310,-0.321932,-0.889216
366,-1.100194,-0.154824,-0.787350,1.016932,-1.560615,0.063379,-0.764226,-0.842038,-1.019175,-0.438511,-1.015912,-1.311008
523,-0.261258,-1.716863,1.131331,-1.406768,0.542796,0.519813,0.601356,-0.563063,0.530200,-0.871749,0.840039,0.276001
1062,0.722508,-0.083001,1.433830,-1.298742,-0.428195,-0.013122,1.645208,2.137466,0.455033,-0.850250,-0.358423,0.056160


In [814]:
#fit the model
de_svr_c=SVRegression(de_train_c,y_de_train_s,C=best_C,epsilon=best_epsilon)
fr_svr_c=SVRegression(fr_train_c,y_fr_train_s,C=best_C,epsilon=best_epsilon)

#make predictions on train set
de_svr_pred_c=de_svr_c.predict(de_train_c)
fr_svr_pred_c=fr_svr_c.predict(fr_train_c)

train_out_svr_c=postprocess(fr_svr_pred_c,de_svr_pred_c,X_train,y_train)
print('Spearman correlation for the train set with SVR split country: {:.1f}%'.format(100*spearman(train_out_svr_c, y_train) ))

Spearman correlation for the train set with SVR split country: 32.7%


In [815]:
#make predictions on the test set
de_svr_test_c=de_svr_c.predict(de_test_c)
fr_svr_test_c=fr_svr_c.predict(fr_test_c)

test_out_svr_c=postprocess(fr_svr_test_c,de_svr_test_c,X_test,y_test)
print('Spearman correlation for the test set with SVR split country: {:.1f}%'.format(100*spearman(test_out_svr_c, y_test) ))

Spearman correlation for the test set with SVR split country: 27.3%


In [816]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = test_out_svr_c

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR-C"]=[100*spearman(test_out_svr_c,y_test),mse_test]

## Manual Selection

In [800]:
de_cols=["FR_DE_EXCHANGE"]
fr_cols=[]
for i in list(de_train_s.columns):
    if i[0:2]=="DE":
        de_cols.append(i)
    elif i[0:2]=="FR":
        fr_cols.append(i)
    else:
        pass

de_train_m=de_train_s[de_cols]
fr_train_m=fr_train_s[de_cols]
de_test_m=de_test_s[de_cols]
fr_test_m=fr_test_s[de_cols]

In [801]:
#fit the model
de_svr_m=SVRegression(de_train_m,y_de_train_s,C=best_C,epsilon=best_epsilon)
fr_svr_m=SVRegression(fr_train_m,y_fr_train_s,C=best_C,epsilon=best_epsilon)

#make predictions on train set
de_svr_pred_m=de_svr_m.predict(de_train_m)
fr_svr_pred_m=fr_svr_m.predict(fr_train_m)

train_out_svr_m=postprocess(fr_svr_pred_m,de_svr_pred_m,X_train,y_train)
print('Spearman correlation for the train set with SVR split country: {:.1f}%'.format(100*spearman(train_out_svr_m, y_train)))

Spearman correlation for the train set with SVR split country: 29.8%


In [802]:
#make predictions on the test set
de_svr_test_m=de_svr_m.predict(de_test_m)
fr_svr_test_m=fr_svr_m.predict(fr_test_m)

test_out_svr_m=postprocess(fr_svr_test_m,de_svr_test_m,X_test,y_test)
print('Spearman correlation for the test set with SVR split country: {:.1f}%'.format(100*spearman(test_out_svr_m, y_test) ))

Spearman correlation for the test set with SVR split country: 23.1%


In [803]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = test_out_svr_m

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["SVR-M"]=[100*spearman(test_out_svr_m,y_test),mse_test]

## Neutral Network with selection

In [783]:
de_nn_train=de_train_s.loc[:,de_cols]
fr_nn_train=fr_train_s.loc[:,fr_cols]


# Create a simple neural network model
de_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(de_nn_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Single output for regression
])

# Compile the model
de_model.compile(loss="mse", optimizer='sgd')

# Train the model
de_model.fit(de_nn_train, y_de_train, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 

<keras.src.callbacks.History at 0x23f361cbf90>

In [784]:
# Create a simple neural network model
fr_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(fr_nn_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Single output for regression
])

# Compile the model
fr_model.compile(loss="mse", optimizer='sgd')

# Train the model
fr_model.fit(fr_nn_train, y_fr_train, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 

<keras.src.callbacks.History at 0x23f3794bf90>

In [785]:
de_nn_test=de_test_s.loc[:,de_cols]
fr_nn_test=fr_test_s.loc[:,fr_cols]

de_nn_pred=de_model.predict(de_nn_test)
fr_nn_pred=fr_model.predict(fr_nn_test)



In [786]:
nn_test_out=postprocess(fr_nn_pred,de_nn_pred,X_test,y_test)
nn_test_out=nn_test_out.reshape(nn_test_out.shape[0],-1)
print('Spearman correlation for the test set with NN: {:.1f}%'.format(100*spearman(nn_test_out, y_test)))

Spearman correlation for the test set with NN: 14.2%


In [787]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = nn_test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["NN-m"]=[100*spearman(nn_test_out,y_test),mse_test]

## KNN Regression

In [846]:
knn_de = KNeighborsRegressor(n_neighbors=11)
knn_fr = KNeighborsRegressor(n_neighbors=11)

#fit the model
knn_de.fit(de_train_c,y_de_train_s)
knn_fr.fit(fr_train_c,y_fr_train_s)

#make predictions on train set
knn_de_pred=knn_de.predict(de_train_c)
knn_fr_pred=knn_fr.predict(fr_train_c)

knn_train_out=postprocess(knn_fr_pred,knn_de_pred,X_train,y_train)
knn_train_out=knn_train_out.reshape(knn_train_out.shape[0],-1)
print('Spearman correlation for the train set with KNN: {:.1f}%'.format(100*spearman(knn_train_out, y_train)))

Spearman correlation for the train set with KNN: 38.3%


In [847]:
knn_de_test=knn_de.predict(de_test_c)
knn_fr_test=knn_fr.predict(fr_test_c)
knn_test_out=postprocess(knn_fr_test,knn_de_test,X_test,y_test)
knn_test_out=knn_test_out.reshape(knn_test_out.shape[0],-1)
print('Spearman correlation for the test set with KNN: {:.1f}%'.format(100*spearman(knn_test_out, y_test)))

Spearman correlation for the test set with KNN: 15.6%


In [848]:
X_test_clean = X_test.drop(['COUNTRY'], axis=1).fillna(0)

Y_test_submission = X_test[['ID']].copy()
Y_test_submission['TARGET'] = knn_test_out

mse_test=mean_squared_error(y_test,Y_test_submission["TARGET"])
mse_test

results_matrix["KNN"]=[100*spearman(knn_test_out,y_test),mse_test]

## Bayesian Regression

In [856]:
br_de = BayesianRidge(max_iter=300,tol=0.001,alpha_1=1e-6,lambda_1=1e-6)
br_fr = BayesianRidge(max_iter=300,tol=0.001,alpha_1=1e-6,lambda_1=1e-6)

#fit the model
br_de.fit(de_train_c,y_de_train_s["TARGET"])
br_fr.fit(fr_train_c,y_fr_train_s["TARGET"])

#predict on train set
br_de_pred=br_de.predict(de_train_c)
br_fr_pred=br_fr.predict(fr_train_c)

br_train_out=postprocess(br_fr_pred,br_de_pred,X_train,y_train)
br_train_out=br_train_out.reshape(br_train_out.shape[0],-1)
print('Spearman correlation for the train set with Bayesian: {:.1f}%'.format(100*spearman(br_train_out, y_train)))

Spearman correlation for the train set with Bayesian: 31.3%


In [857]:
#predict on test set
br_de_test=br_de.predict(de_test_c)
br_fr_test=br_fr.predict(fr_test_c)

br_test_out=postprocess(br_fr_test,br_de_test,X_test,y_test)
br_test_out=br_test_out.reshape(br_test_out.shape[0],-1)
print('Spearman correlation for the train set with Bayesian: {:.1f}%'.format(100*spearman(br_test_out, y_test)))

Spearman correlation for the train set with Bayesian: 25.3%


In [849]:
results_matrix=results_matrix.round(2)
results_matrix

Unnamed: 0,Benchmark,Ridge,SVR-L,SVR-P,SVR_sel_L,SVR_sel_P,Grad Boost,NN,Poly,SVR-C,SVR-M,SVR-MM,NN-m,KNN
SR,21.56,25.11,19.84,5.0,26.15,16.14,11.42,10.94,14.16,27.29,23.12,20.5,14.23,15.58
MSE,0.81,0.79,0.8,1.49,0.8,0.91,0.93,0.89,0.8,0.8,0.81,0.8,1.14,0.85
