In [1]:
import pandas as pd
from scipy.stats import spearmanr
import numpy as np
from utils.Dataset_modification import *


X_train = pd.read_csv(datasets+'X_train.csv') #datasets is defined in utils
Y_train=pd.read_csv(datasets+'Y_train.csv')

X_train_DE,Y_train_DE,X_train_FR,Y_train_FR=seperate_data_by_countries(X_train,Y_train)
del X_train,Y_train


  X_train_DE = X_train[X_train['COUNTRY']==1].loc[:, X_train.columns.str.contains('^(ID|DAY_ID|DE|GAS|COAL|CARBON)')]
  X_train_FR = X_train[X_train['COUNTRY']==0].loc[:, X_train.columns.str.contains('^(ID|DAY_ID|FR|GAS|COAL|CARBON)')]


In [2]:
def evaluation(output,df):
    term=output[0]
    if not isinstance(term, np.ndarray):
        return  100 *spearmanr(output, df["TARGET"]).correlation
    else:
        return  [100 *spearmanr(output[:,i], df["TARGET"]).correlation for i in range(len(term)) ]

In [3]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
lr = LinearRegression()
lr.fit(X_train_FR, Y_train_FR)
print('Spearman correlation for the French train set', ( evaluation(lr.predict(X_train_FR),Y_train_FR)))
X2 = sm.add_constant(X_train_FR)
est = sm.OLS(Y_train_FR["TARGET"], X2)
est2 = est.fit()
print(est2.summary(),"\n")
lr = LinearRegression()
lr.fit(X_train_DE, Y_train_DE)
print('\n Spearman correlation for the Deutsch train set', ( evaluation(lr.predict(X_train_DE),Y_train_DE)))
X2 = sm.add_constant(X_train_DE)
est = sm.OLS(Y_train_DE["TARGET"], X2)
est2 = est.fit()
print(est2.summary())

Spearman correlation for the French train set [19.299993481034463, 24.845637966522727, 23.84146208964189]
                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1.520
Date:                Sat, 05 Aug 2023   Prob (F-statistic):             0.0858
Time:                        16:56:26   Log-Likelihood:                -1214.6
No. Observations:                 851   AIC:                             2463.
Df Residuals:                     834   BIC:                             2544.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
-------------------

In [39]:
from sklearn.model_selection import train_test_split
def average_stats(df,labels,regressor,columns=['TARGET','Rank'],iterations=20,depth=None,group=False,linear=False,custom_group=None):
    valid_list=[]
    train_list=[] 

    
    for i in range(iterations):
        if group :
            columns+=['Rank_group']
        if linear :
            depth=None
            _,X_test,_, y_test = train_test_split(df, labels, test_size=0.25)
            X_train,Y_train =df,labels
        else:
            X_train, X_test, Y_train, y_test = train_test_split(df, labels, test_size=0.25)
            Y_train['Rank']= Y_train['TARGET'].rank()
            if custom_group is not None:
                Y_train['Rank_group'+str(custom_group)]=pd.qcut(Y_train.Rank,custom_group).cat.codes
                columns='Rank_group'+str(custom_group)
        
                
        if depth is None:
            DF = regressor()
        else:            
            DF = regressor(max_depth=depth)
        #   Fit the ridge regressor
        
        DF.fit(X_train, Y_train[columns])
        output_train = DF.predict(X_test)
        # print(output_train)
        validation=evaluation(output_train,y_test)
        training=evaluation(DF.predict(X_train),Y_train)
        valid_list+=[validation]
        train_list+=[training]
    print(columns)
    if isinstance(columns,list):
        for i,column in enumerate(columns):
            print( f"average {column} evaluation score at ", sum([row[i] for row in valid_list])/len(valid_list) , "training at ", sum([row[i] for row in train_list])/len(train_list) )
    else:
        print( f"average {columns} evaluation score at ", sum(valid_list)/len(valid_list) , "training at :" ,  sum(train_list)/len(train_list) )
    
    print("\n")

In [5]:
from sklearn.ensemble import RandomForestRegressor

average_stats(X_train_FR,Y_train_FR,RandomForestRegressor,depth=3)
average_stats(X_train_FR,Y_train_FR,RandomForestRegressor,columns='Rank',depth=3)
average_stats(X_train_FR,Y_train_FR,RandomForestRegressor,columns='TARGET',depth=3)

average TARGET evaluation score at  17.85260609065711 training at  53.82022568268769
average Rank evaluation score at  20.023107841630267 training at  50.04896455932142


average Rank evaluation score at  17.202484829344378 training at : 50.430705492453846


average TARGET evaluation score at  10.630210263444384 training at : 32.87176932921054




In [6]:
average_stats(X_train_DE,Y_train_DE,RandomForestRegressor,depth=3)
average_stats(X_train_DE,Y_train_DE,RandomForestRegressor,columns='Rank',depth=4)
average_stats(X_train_DE,Y_train_DE,RandomForestRegressor,columns='TARGET',depth=3)

average TARGET evaluation score at  34.22718517399047 training at  62.35478460695815
average Rank evaluation score at  35.078339797009605 training at  59.80129633196492


average Rank evaluation score at  34.828691464329026 training at : 71.71266385255014


average TARGET evaluation score at  24.927559706627523 training at : 48.71229455568933




In [23]:
average_stats(X_train_DE,Y_train_DE,LinearRegression,linear=True)

average_stats(X_train_FR,Y_train_FR,LinearRegression,linear=True)


average TARGET evaluation score at  39.573785848998604 training at  41.92243892176021
average Rank evaluation score at  40.83160758647242 training at  42.890619678566196


average TARGET evaluation score at  19.139199982486524 training at  19.299993481034456
average Rank evaluation score at  24.49062407274294 training at  24.845637966522716




In [40]:
from mord import OrdinalRidge
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print("##################  DE")
    average_stats(X_train_DE,Y_train_DE,OrdinalRidge,linear=True,group=True)
    
    print("##################  FR")
    average_stats(X_train_FR,Y_train_FR,OrdinalRidge,linear=True,group=True)
    


##################  DE
average TARGET evaluation score at  24.973635041433795 training at  22.889492618136938
average Rank evaluation score at  45.13037868293989 training at  42.84383703531719
average Rank_group evaluation score at  39.52151515674127 training at  36.83642992951844


IndexError: list index out of range

In [21]:
from sklearn.linear_model import LogisticRegression

average_stats(X_train_DE,Y_train_DE,LogisticRegression,columns='Rank_group',custom_group=2)

print("##################  FR")
average_stats(X_train_FR,Y_train_FR,LogisticRegression,columns='Rank_group',custom_group=2)



average Rank_group_2 evaluation score at  29.245032019594827 training at : 35.55958255971934


##################  FR
average Rank_group_2 evaluation score at  14.434407496364155 training at : 21.72433225801675




In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier