In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import spearmanr

In [15]:
# Load the training features/targets and the test features.
raw_X_train = pd.read_csv('./x_train.csv')
raw_y_train = pd.read_csv('./y_train.csv')
raw_X_test = pd.read_csv('./x_test.csv')

# Attach the target to the features; rows are matched on the shared 'ID' column.
raw_train = pd.merge(raw_X_train, raw_y_train, on='ID')

# Order the rows by DAY_ID (the only sort key used here).
raw_train = raw_train.sort_values(['DAY_ID'])

# Fill missing values with the per-column mean.
# numeric_only=True is required: the frame also holds the string column 'COUNTRY',
# and DataFrame.mean() raises a TypeError on non-numeric columns in pandas >= 2.0.
# NOTE(review): the means are computed on the full training frame, i.e. before the
# later train/validation split, so validation rows influence the fill values.
filled_train = raw_train.fillna(raw_train.mean(numeric_only=True))

# check if there are still NaN values
print('The number of rows containing NaN values in filled_train is: {}'.format(filled_train.isnull().any(axis=1).sum()))

# check duplicates in the data
print('The number of duplicates in the data is: {}'.format(filled_train.duplicated().sum()))

The number of rows containing NaN values in filled_train is: 0
The number of duplicates in the data is: 0


In [16]:
def _country_subset(frame, country, foreign_prefix):
    """Return the rows of `frame` whose 'COUNTRY' equals `country`.

    The identifier columns ('ID', 'COUNTRY', 'DAY_ID') and every column whose
    name starts with `foreign_prefix` are removed; the remaining columns keep
    their original order.
    """
    rows = frame.loc[frame['COUNTRY'] == country]
    foreign_cols = [col for col in rows.columns if col.startswith(foreign_prefix)]
    return rows.drop(columns=['ID', 'COUNTRY', 'DAY_ID'] + foreign_cols)


# One frame per country; each one loses the columns prefixed with the other country's code.
raw_train_DE = _country_subset(filled_train, 'DE', 'FR')
raw_train_FR = _country_subset(filled_train, 'FR', 'DE')

# Report the resulting shapes.
print('The shape of the raw_train_DE data is: {}'.format(raw_train_DE.shape))
print('The shape of the raw_train_FR data is: {}'.format(raw_train_FR.shape))

The shape of the raw_train_DE data is: (643, 19)
The shape of the raw_train_FR data is: (851, 18)


In [17]:
# Hold out 20% of each country's rows for validation.
from sklearn.model_selection import train_test_split

# Same split settings for both countries; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}

train_FR, val_FR = train_test_split(raw_train_FR, **split_options)
train_DE, val_DE = train_test_split(raw_train_DE, **split_options)

# Report the size of every partition.
print('The shape of the train_FR data is: {}'.format(train_FR.shape))
print('The shape of the val_FR data is: {}'.format(val_FR.shape))

print('The shape of the train_DE data is: {}'.format(train_DE.shape))
print('The shape of the val_DE data is: {}'.format(val_DE.shape))

The shape of the train_FR data is: (680, 18)
The shape of the val_FR data is: (171, 18)
The shape of the train_DE data is: (514, 19)
The shape of the val_DE data is: (129, 19)


In [18]:
# Turn the FR target into a sign label: strictly positive values become 1, everything else -1.
train_FR_cl = train_FR.copy()
train_FR_cl['TARGET'] = np.where(train_FR_cl['TARGET'] > 0, 1, -1)

val_FR_cl = val_FR.copy()
val_FR_cl['TARGET'] = np.where(val_FR_cl['TARGET'] > 0, 1, -1)

X_train_FR_cl = train_FR_cl.drop('TARGET', axis=1)
y_train_FR_cl = train_FR_cl['TARGET']

X_val_FR_cl = val_FR_cl.drop('TARGET', axis=1)
y_val_FR_cl = val_FR_cl['TARGET']

# Build Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Candidate forest sizes tried by the search below.
n_estimators_grid = range(50, 171, 3)

# Start from -inf and the first candidate so that `best_n` is always a valid
# forest size, even when every validation score is negative or NaN
# (starting from 0 would leave best_n == 0 in that case).
best_score = float('-inf')
best_n = n_estimators_grid[0]
for n_trees in n_estimators_grid:

    rf_model_FR = RandomForestClassifier(n_estimators=n_trees, random_state=42)  # hyperparameters are tuned manually

    rf_model_FR.fit(X_train_FR_cl, y_train_FR_cl)

    preds_FR_cl = rf_model_FR.predict(X_val_FR_cl)

    # evaluate the model using the Spearman correlation coefficient
    score = spearmanr(preds_FR_cl, y_val_FR_cl).correlation * 100
    print("Score on validation for {} estimators : {}".format(n_trees, score))
    # spearmanr yields NaN when one input is constant; never let NaN win the comparison.
    if not np.isnan(score) and score > best_score:
        best_score = score
        best_n = n_trees

    # evaluate based on the training set
    train_preds_cl = rf_model_FR.predict(X_train_FR_cl)
    train_score = spearmanr(train_preds_cl, y_train_FR_cl).correlation * 100
    print("Score on train for {} estimators : {}".format(n_trees, train_score))

print("Best score on validation is {} for {} estimators".format(best_score, best_n))

# `best_n` / `best_score` are reassigned by the DE search cell, so keep
# country-specific copies of the FR result.
best_n_FR = best_n
best_score_FR = best_score



Score on validation for 50 estimators : 19.08401338457116
Score on train for 50 estimators : 100.0
Score on validation for 53 estimators : 22.638109654563852
Score on train for 53 estimators : 100.0
Score on validation for 56 estimators : 17.912760554700462
Score on train for 56 estimators : 100.0
Score on validation for 59 estimators : 19.258144712821114
Score on train for 59 estimators : 100.0
Score on validation for 62 estimators : 16.74497848582345
Score on train for 62 estimators : 100.0
Score on validation for 65 estimators : 16.81296845976107
Score on train for 65 estimators : 100.0
Score on validation for 68 estimators : 19.09974108539237
Score on train for 68 estimators : 100.0
Score on validation for 71 estimators : 25.006938788866705
Score on train for 71 estimators : 100.0
Score on validation for 74 estimators : 19.09974108539237
Score on train for 74 estimators : 100.0
Score on validation for 77 estimators : 16.74497848582345
Score on train for 77 estimators : 100.0
Score 

In [19]:
# Turn the DE target into a sign label: strictly positive values become 1, everything else -1.
train_DE_cl = train_DE.copy()
train_DE_cl['TARGET'] = np.where(train_DE_cl['TARGET'] > 0, 1, -1)

val_DE_cl = val_DE.copy()
val_DE_cl['TARGET'] = np.where(val_DE_cl['TARGET'] > 0, 1, -1)

X_train_DE_cl = train_DE_cl.drop('TARGET', axis=1)
y_train_DE_cl = train_DE_cl['TARGET']

X_val_DE_cl = val_DE_cl.drop('TARGET', axis=1)
y_val_DE_cl = val_DE_cl['TARGET']

# Build Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Candidate forest sizes tried by the search below.
n_estimators_grid = range(50, 171, 3)

# Start from -inf and the first candidate so that `best_n` is always a valid
# forest size, even when every validation score is negative or NaN
# (starting from 0 would leave best_n == 0 in that case).
best_score = float('-inf')
best_n = n_estimators_grid[0]
for n_trees in n_estimators_grid:

    rf_model_DE = RandomForestClassifier(n_estimators=n_trees, random_state=42)  # hyperparameters are tuned manually

    rf_model_DE.fit(X_train_DE_cl, y_train_DE_cl)

    preds_DE_cl = rf_model_DE.predict(X_val_DE_cl)

    # evaluate the model using the Spearman correlation coefficient
    score = spearmanr(preds_DE_cl, y_val_DE_cl).correlation * 100
    print("Score on validation for {} estimators : {}".format(n_trees, score))
    # spearmanr yields NaN when one input is constant; never let NaN win the comparison.
    if not np.isnan(score) and score > best_score:
        best_score = score
        best_n = n_trees

    # evaluate based on the training set
    train_preds_cl = rf_model_DE.predict(X_train_DE_cl)
    train_score = spearmanr(train_preds_cl, y_train_DE_cl).correlation * 100
    print("Score on train for {} estimators : {}".format(n_trees, train_score))

print("Best score on validation is {} for {} estimators".format(best_score, best_n))

# Keep country-specific copies of the DE result next to the shared `best_n` / `best_score`.
best_n_DE = best_n
best_score_DE = best_score

Score on validation for 50 estimators : 16.03182256509161
Score on train for 50 estimators : 100.0
Score on validation for 53 estimators : 19.28727619847363
Score on train for 53 estimators : 100.0
Score on validation for 56 estimators : 17.658400179005678
Score on train for 56 estimators : 100.0
Score on validation for 59 estimators : 17.814173613850176
Score on train for 59 estimators : 100.0
Score on validation for 62 estimators : 14.708729835219536
Score on train for 62 estimators : 100.0
Score on validation for 65 estimators : 19.633828302714363
Score on train for 65 estimators : 100.0
Score on validation for 68 estimators : 22.556595508348128
Score on train for 68 estimators : 100.0
Score on validation for 71 estimators : 22.741514009524366
Score on train for 71 estimators : 100.0
Score on validation for 74 estimators : 22.556595508348128
Score on train for 74 estimators : 100.0
Score on validation for 77 estimators : 19.633828302714363
Score on train for 77 estimators : 100.0
Sc

In [20]:
# Rebuild the classification training sets from the complete per-country frames
# (raw_train_FR / raw_train_DE) instead of the train split, with the target
# mapped to a sign label: strictly positive -> 1, otherwise -1.
# This overwrites the X_train_*_cl / y_train_*_cl names set by the search cells.
train_FR_cl = raw_train_FR.assign(TARGET=np.where(raw_train_FR['TARGET'] > 0, 1, -1))
y_train_FR_cl = train_FR_cl['TARGET']
X_train_FR_cl = train_FR_cl.drop(columns='TARGET')

train_DE_cl = raw_train_DE.assign(TARGET=np.where(raw_train_DE['TARGET'] > 0, 1, -1))
y_train_DE_cl = train_DE_cl['TARGET']
X_train_DE_cl = train_DE_cl.drop(columns='TARGET')

In [21]:
# Refit one forest per country on the current X_train_*_cl / y_train_*_cl data.
# NOTE(review): `best_n` is shared by both search cells, and the DE search runs
# last, so the FR model below is built with the forest size selected for DE.
# A per-country value is probably intended here; confirm and adjust.
best_rf_FR = RandomForestClassifier(n_estimators=best_n, random_state=42).fit(
    X_train_FR_cl, y_train_FR_cl
)
best_rf_DE = RandomForestClassifier(n_estimators=best_n, random_state=42).fit(
    X_train_DE_cl, y_train_DE_cl
)

# Show the fitted DE estimator as the cell output.
best_rf_DE

In [22]:
# read the test data
raw_X_test = pd.read_csv('./x_test.csv')

# Fill missing values with the per-column mean.
# numeric_only=True is required: 'COUNTRY' is a string column and
# DataFrame.mean() raises a TypeError on non-numeric columns in pandas >= 2.0.
filled_test = raw_X_test.fillna(raw_X_test.mean(numeric_only=True))

# Route every test row to the model of its own country. Without this split each
# row would be scored by both models and every ID would be written twice.
test_rows_FR = filled_test[filled_test['COUNTRY'] == 'FR']
test_rows_DE = filled_test[filled_test['COUNTRY'] == 'DE']

# FR features: drop the columns whose name starts with 'DE', then the identifier columns.
filled_test_FR = test_rows_FR.drop(test_rows_FR.columns[test_rows_FR.columns.str.startswith('DE')], axis=1)
filled_test_FR = filled_test_FR.drop(['ID', 'COUNTRY', 'DAY_ID'], axis=1)

# DE features: drop the columns whose name starts with 'FR', then the identifier columns.
filled_test_DE = test_rows_DE.drop(test_rows_DE.columns[test_rows_DE.columns.str.startswith('FR')], axis=1)
filled_test_DE = filled_test_DE.drop(['ID', 'COUNTRY', 'DAY_ID'], axis=1)

# predict the test data
preds_FR = best_rf_FR.predict(filled_test_FR)
preds_DE = best_rf_DE.predict(filled_test_DE)

# Pair each prediction with the ID of the row it was computed for.
output_FR = pd.DataFrame({'ID': test_rows_FR['ID'], 'TARGET': preds_FR})
output_DE = pd.DataFrame({'ID': test_rows_DE['ID'], 'TARGET': preds_DE})

# Stack both countries and restore the row order of the test file
# (the original row index is carried along by the ID series).
output = pd.concat([output_FR, output_DE], axis=0).sort_index()

# Sanity checks: exactly one prediction per test row.
assert output['ID'].is_unique, 'each test ID must appear exactly once in the submission'
assert len(output) == len(raw_X_test), "some test rows have a COUNTRY other than 'FR'/'DE'"

# save the dataframe as a csv file
output.to_csv('./trading_new_hands1.csv', index=False)

