In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR

## Data Preprocessing

Splitting the data into train and test sets

In [109]:
X_train_raw=pd.read_csv("X_train.csv")
y_train_raw=pd.read_csv("y_train.csv")
X_test_raw=pd.read_csv("X_test.csv")

X_train_raw=X_train_raw.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)


#lets drop the redundant stuff
X_train,X_test,y_train,y_test=train_test_split(X_train_raw,y_train_raw,test_size=0.2,random_state=13)

y_train=y_train["TARGET"]
y_test=y_test["TARGET"]

### Functions

In [113]:
def split_into_country(df,y_df,country: str):
    
    y_df=y_df[df["COUNTRY"]==country]
    df=df[df["COUNTRY"]==country]
    df=df.drop(["ID","DAY_ID","COUNTRY"],axis=1)
    df.fillna(df.median(),inplace=True)
    
    return df,y_df

def SVRegression(df,y_df,C=10,epsilon=0.01):
    
    model=SVR(kernel="linear",C=C,epsilon=epsilon)
    model.fit(df,y_df["TARGET"])
    
    return model
   
def postprocess(df_fr,df_de,x,y):
    
    fr=0
    de=0
    out=[]

    for i in range (y.shape[0]):
        if x['COUNTRY'].values[i]=='FR':
            out.append([df_fr[fr]])
            fr = fr + 1
        else:
            out.append([df_de[de]])
            de = de + 1

    return np.array(out)

def z_scale(data):
    
    data=pd.DataFrame(data)
    scaler=StandardScaler()
    normalized_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

    return normalized_data

def spearman(output, y):

    return spearmanr(output, y).correlation

## Split data by country

In [115]:
#training set
de_train,y_de_train=split_into_country(X_train,y_train,"DE")
fr_train,y_fr_train=split_into_country(X_train,y_train,"FR")

de_train_s=z_scale(de_train)
y_de_train_s=z_scale(y_de_train)
fr_train_s=z_scale(fr_train)
y_fr_train_s=z_scale(y_fr_train)

#test set
de_test,y_de_test=split_into_country(X_test,y_test,"DE")
fr_test,y_fr_test=split_into_country(X_test,y_test,"FR")

de_test_s=z_scale(de_test)
y_de_test_s=z_scale(y_de_test)
fr_test_s=z_scale(fr_test)
y_fr_test_s=z_scale(y_fr_test)

## Take the top ten features ranked by correlation

Then use Support Vector Regression (SVR) to predict answer

In [129]:
corr_de=de_train_s.corrwith(y_de_train)
corr_fr=fr_train_s.corrwith(y_fr_train)
corr_de=abs(corr_de).sort_values(ascending=False)[:10]
corr_fr=abs(corr_fr).sort_values(ascending=False)[:10]

de_train_c=de_train_s[corr_de.index]
de_test_c=de_test_s[corr_de.index]
fr_train_c=fr_train_s[corr_fr.index]
fr_test_c=fr_test_s[corr_fr.index]

#fit the model
C=20
epsilon=0.511

de_svr_c=SVRegression(de_train_c,y_de_train_s,C=C,epsilon=epsilon)
fr_svr_c=SVRegression(fr_train_c,y_fr_train_s,C=C,epsilon=epsilon)

#make predictions on test set
de_svr_pred_c=de_svr_c.predict(de_test_c)
fr_svr_pred_c=fr_svr_c.predict(fr_test_c)

test_out_svr_c=postprocess(fr_svr_pred_c,de_svr_pred_c,X_test,y_test)
print('Spearman correlation for the test set with SVR split country: {:.1f}%'.format(100*spearman(test_out_svr_c, y_test) ))

Spearman correlation for the test set with SVR split country: 27.8%


In [131]:
X_test_raw_de=X_test_raw[X_test_raw["COUNTRY"]=="DE"]
X_test_raw_de=X_test_raw_de.drop(["ID","DAY_ID","COUNTRY"],axis=1)
X_test_raw_de.fillna(X_test_raw_de.median(),inplace=True)

X_test_raw_fr=X_test_raw[X_test_raw["COUNTRY"]=="FR"]
X_test_raw_fr=X_test_raw_fr.drop(["ID","DAY_ID","COUNTRY"],axis=1)
X_test_raw_fr.fillna(X_test_raw_fr.median(),inplace=True)

X_test_raw_de=X_test_raw_de[corr_de.index]
X_test_raw_fr=X_test_raw_fr[corr_fr.index]

de_raw_test=de_svr_c.predict(X_test_raw_de)
fr_raw_test=fr_svr_c.predict(X_test_raw_fr)

raw_test_out=postprocess(fr_raw_test,de_raw_test,X_test_raw,X_test_raw)

In [128]:
X_test_clean = X_test_raw.drop(['COUNTRY'], axis=1)
Y_test_submission = X_test_raw[['ID']].copy()
Y_test_submission['TARGET'] = raw_test_out
Y_test_submission.to_csv('GA_qrt_submission.csv', index=False)