In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# General
import pandas as pd
import numpy as np
import sys
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Preprocessing
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Machine Learning
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
import lightgbm as lightgbm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# Project-local modules
sys.path.append('C:/Users/hperrin/Desktop/Numerai/Algo')
from NeuralNetworks import *

def diff(t_a, t_b):
    """Format the elapsed time between two datetimes as ``'<h>h <m>m <s>s'``.

    Parameters
    ----------
    t_a : datetime.datetime
        End of the interval.
    t_b : datetime.datetime
        Start of the interval.

    Returns
    -------
    str
        ``t_a - t_b`` split into hours, minutes and whole seconds. Hours are
        cumulative: an interval of one day and one hour is reported as
        ``'25h 0m 0s'`` rather than having its day component dropped.
        Sub-second precision is discarded. When ``t_a`` is earlier than
        ``t_b`` every non-zero component is negative.
    """
    elapsed = t_a - t_b
    # timedelta stores the sign in ``days`` and keeps ``seconds`` non-negative,
    # so this is the whole number of elapsed seconds (floored).
    total_seconds = elapsed.days * 86400 + elapsed.seconds
    sign = -1 if total_seconds < 0 else 1
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{h}h {m}m {s}s'.format(h=sign * hours, m=sign * minutes, s=sign * seconds)

# Scorer wrapping log loss for model selection. Log loss is computed from
# predicted probabilities and lower is better, so the scorer must request
# predict_proba output and negate the value (scikit-learn maximises scores).
# NOTE(review): `needs_proba` is the keyword of the scikit-learn generation this
# notebook targets (it still imports sklearn.grid_search); newer releases spell
# it `response_method='predict_proba'` -- confirm against the installed version.
score = make_scorer(log_loss, greater_is_better = False, needs_proba = True)



In [6]:
# Columns removed from every frame before modelling.
NON_FEATURE_COLUMNS = ['id', 'era', 'data_type', 'target']

Xtrain = pd.read_csv("C:/Users/hperrin/Desktop/Numerai/w80/numerai_datasets/numerai_training_data.csv")
tournament = pd.read_csv("C:/Users/hperrin/Desktop/Numerai/w80/numerai_datasets/numerai_tournament_data.csv")

# Row identifiers of the full tournament file (written into the submission later).
ids = tournament['id']

# Only the tournament rows flagged 'validation' are used as the held-out test set.
validation = tournament[tournament['data_type'] == 'validation']

Ytrain = Xtrain['target']
Ytest = validation['target']

# Build new frames instead of dropping in place: the validation frame is a
# filtered slice of the tournament frame, and in-place edits on a slice (or on
# an alias of the frame it was cut from) are fragile and order-dependent.
Xtrain = Xtrain.drop(NON_FEATURE_COLUMNS, axis = 1)
Xtest = validation.drop(NON_FEATURE_COLUMNS, axis = 1)
real_data = tournament.drop(NON_FEATURE_COLUMNS, axis = 1)
del tournament, validation

print(Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape, real_data.shape)


def build_metafeatures(frame, reference):
    """Compute per-row summary statistics ("meta-features") of a feature frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric feature columns; one output row is produced per input row and
        the output keeps ``frame``'s index.
    reference : pandas.Series
        One value per column of ``frame``, in the same column order. It is
        compared positionally, as the original row-wise ``euclidean`` call did.

    Returns
    -------
    pandas.DataFrame
        Columns ``'variance'`` (actually the row-wise standard deviation; the
        name is kept because later cells look it up), ``'mean'`` (row-wise
        mean) and ``'distance'`` (Euclidean distance from the row to
        ``reference``).
    """
    meta = pd.DataFrame(index = frame.index)
    meta['variance'] = frame.std(axis = 1)
    meta['mean'] = frame.mean(axis = 1)
    # Vectorised replacement for a Python-level apply() over every row.
    meta['distance'] = np.linalg.norm(frame.values - reference.values, axis = 1)
    return meta


# Distance to the mean individual: the column-wise mean of the training rows.
mean_indiv = Xtrain.mean(axis = 0)

metafeature_train = build_metafeatures(Xtrain, mean_indiv)
metafeature_test = build_metafeatures(Xtest, mean_indiv)
metafeature_final = build_metafeatures(real_data, mean_indiv)

(535713, 50) (535713,) (73865, 50) (73865,) (348689, 50)


In [3]:
# Value handed to the estimators' n_jobs argument.
n_cores = -1

# Three independent, initially empty frames that collect the first-stage
# predictions for the training, validation and full tournament rows.
first_stage_train, first_stage_test, first_stage_final = (pd.DataFrame() for _ in range(3))

In [4]:
# Six shallow ExtraTrees configurations: three regularisation levels (the same
# value is used for min_samples_split and min_samples_leaf), each paired with
# both split criteria. Order: entropy then gini within each level.
models = [ExtraTreesClassifier(criterion = split_criterion,
                               n_estimators = 100,
                               max_depth = 3,
                               bootstrap = True,
                               min_samples_split = min_samples,
                               min_samples_leaf = min_samples,
                               n_jobs = n_cores)
          for min_samples in (10, 500, 2000)
          for split_criterion in ('entropy', 'gini')]

# 'ExtraTrees1' ... 'ExtraTrees6', matching the order of `models`.
model_names = ['ExtraTrees{}'.format(position) for position in range(1, len(models) + 1)]

In [5]:
# Number of random feature subsets drawn per model, and the size of each subset.
bagging_steps = 8
n_feature = 20

features = list(Xtrain.columns)
meta_names = ['variance', 'mean', 'distance']

for name, model in zip(model_names, models):

    print('\n---------------------------------------------')
    print('>> Processing {}\n'.format(name))

    for step in range(bagging_steps):

        step_start = datetime.now()
        print("Step {}".format(step+1), end = '...')

        # Draw a random subset of the raw features (shuffles `features` in place;
        # no seed is set in this cell, so subsets differ between runs).
        np.random.shuffle(features)
        selected = features[:n_feature]

        # Explicit copies: new columns are added below and must not be written
        # onto a slice of the source frames.
        train = Xtrain[selected].copy()
        test = Xtest[selected].copy()
        final = real_data[selected].copy()

        # Add every selected feature weighted by each row-level meta-feature.
        for feature in selected:
            for meta in meta_names:
                weighted_name = '{}_{}'.format(feature, meta)
                train[weighted_name] = train[feature] * metafeature_train[meta]
                test[weighted_name] = test[feature] * metafeature_test[meta]
                final[weighted_name] = final[feature] * metafeature_final[meta]

        # The same estimator object is refitted at every step.
        model.fit(train, Ytrain)

        # Store the probability of the second class for each data set.
        # NOTE(review): the training-set predictions are made on the rows the
        # model was just fitted on, so they are in-sample; out-of-fold
        # predictions would give the second stage less optimistic inputs.
        column = '{}_prediction_{}'.format(name, step+1)
        first_stage_train[column] = model.predict_proba(train)[:,1]
        first_stage_test[column] = model.predict_proba(test)[:,1]
        first_stage_final[column] = model.predict_proba(final)[:,1]

        print('done in {}'.format(diff(datetime.now(), step_start)))
        print('log loss : {}\n'.format(log_loss(Ytest, first_stage_test[column])))


---------------------------------------------
>> Processing ExtraTrees1

Step 1...done in 0h 0m 19s
log loss : 0.6926960021613164

Step 2...done in 0h 0m 19s
log loss : 0.6927608024834063

Step 3...done in 0h 0m 19s
log loss : 0.6926353219442417

Step 4...done in 0h 0m 19s
log loss : 0.6927974337007831

Step 5...done in 0h 0m 19s
log loss : 0.692675707216713

Step 6...done in 0h 0m 19s
log loss : 0.6926875145936503

Step 7...done in 0h 0m 19s
log loss : 0.6927901372496886

Step 8...done in 0h 0m 18s
log loss : 0.6928138150983699


---------------------------------------------
>> Processing ExtraTrees2

Step 1...done in 0h 0m 19s
log loss : 0.6927737572754838

Step 2...done in 0h 0m 19s
log loss : 0.6926904243359732

Step 3...done in 0h 0m 19s
log loss : 0.6926466453275585

Step 4...done in 0h 0m 19s
log loss : 0.6926649243774308

Step 5...done in 0h 0m 19s
log loss : 0.692673541160151

Step 6...done in 0h 0m 19s
log loss : 0.6927819494971073

Step 7...done in 0h 0m 19s
log loss : 0.69

In [6]:
# Single linear first-stage model: logistic loss with an elastic-net penalty.
sgd_settings = dict(loss = 'log',
                    penalty = 'elasticnet',
                    learning_rate = 'optimal',
                    n_jobs = n_cores)

models = [SGDClassifier(**sgd_settings)]
model_names = ['SGDClassifier']

In [7]:
# Number of random feature subsets drawn per model, and the size of each subset.
bagging_steps = 30
n_feature = 15

features = list(Xtrain.columns)
meta_names = ['variance', 'mean', 'distance']

for name, model in zip(model_names, models):

    print('\n---------------------------------------------')
    print('>> Processing {}\n'.format(name))

    for step in range(bagging_steps):

        step_start = datetime.now()
        print("Step {}".format(step+1), end = '...')

        # Draw a random subset of the raw features (shuffles `features` in place;
        # no seed is set in this cell, so subsets differ between runs).
        np.random.shuffle(features)
        selected = features[:n_feature]

        # Explicit copies: new columns are added below and must not be written
        # onto a slice of the source frames.
        train = Xtrain[selected].copy()
        test = Xtest[selected].copy()
        final = real_data[selected].copy()

        # Add every selected feature weighted by each row-level meta-feature.
        for feature in selected:
            for meta in meta_names:
                weighted_name = '{}_{}'.format(feature, meta)
                train[weighted_name] = train[feature] * metafeature_train[meta]
                test[weighted_name] = test[feature] * metafeature_test[meta]
                final[weighted_name] = final[feature] * metafeature_final[meta]

        # The same estimator object is refitted at every step.
        model.fit(train, Ytrain)

        # Store the probability of the second class for each data set.
        # NOTE(review): the training-set predictions are made on the rows the
        # model was just fitted on, so they are in-sample; out-of-fold
        # predictions would give the second stage less optimistic inputs.
        column = '{}_prediction_{}'.format(name, step+1)
        first_stage_train[column] = model.predict_proba(train)[:,1]
        first_stage_test[column] = model.predict_proba(test)[:,1]
        first_stage_final[column] = model.predict_proba(final)[:,1]

        print('done in {}'.format(diff(datetime.now(), step_start)))
        print('log loss : {}\n'.format(log_loss(Ytest, first_stage_test[column])))


---------------------------------------------
>> Processing SGDClassifier

Step 1...done in 0h 0m 8s
log loss : 0.6937208467964969

Step 2...done in 0h 0m 8s
log loss : 0.6964528878648224

Step 3...done in 0h 0m 8s
log loss : 0.6967101867526921

Step 4...done in 0h 0m 8s
log loss : 0.692345115600741

Step 5...done in 0h 0m 8s
log loss : 0.692742063962092

Step 6...done in 0h 0m 8s
log loss : 0.6956599242569506

Step 7...done in 0h 0m 8s
log loss : 0.6954561903829398

Step 8...done in 0h 0m 9s
log loss : 0.6962481959235597

Step 9...done in 0h 0m 9s
log loss : 0.6925834231379141

Step 10...done in 0h 0m 10s
log loss : 0.6925822783686636

Step 11...done in 0h 0m 8s
log loss : 0.6937710717176553

Step 12...done in 0h 0m 8s
log loss : 0.6932572174189645

Step 13...done in 0h 0m 8s
log loss : 0.6956014075355812

Step 14...done in 0h 0m 8s
log loss : 0.6927080504237146

Step 15...done in 0h 0m 8s
log loss : 0.692588559144487

Step 16...done in 0h 0m 8s
log loss : 0.6963068910008052

Step 17

In [17]:
# Persist the three first-stage prediction tables (row index omitted).
for stage_frame, csv_path in [
        (first_stage_train, 'C:/Users/hperrin/Desktop/Numerai/w80/first_stage_train.csv'),
        (first_stage_test, 'C:/Users/hperrin/Desktop/Numerai/w80/first_stage_test.csv'),
        (first_stage_final, 'C:/Users/hperrin/Desktop/Numerai/w80/first_stage_final.csv')]:
    pd.DataFrame(stage_frame).to_csv(csv_path, index = False)

In [8]:
# Reload the saved first-stage prediction tables from disk.
first_stage_train, first_stage_test, first_stage_final = [
    pd.read_csv(csv_path)
    for csv_path in ('C:/Users/hperrin/Desktop/Numerai/w80/first_stage_train.csv',
                     'C:/Users/hperrin/Desktop/Numerai/w80/first_stage_test.csv',
                     'C:/Users/hperrin/Desktop/Numerai/w80/first_stage_final.csv')]

In [9]:
# # Adding metafeature weight
# for feature in first_stage_train.columns:
#     for meta in ['variance', 'mean', 'distance']:
#         first_stage_train['{}_{}'.format(feature, meta)] = first_stage_train[feature] * metafeature_train['{}'.format(meta)]
#         first_stage_test['{}_{}'.format(feature, meta)] = first_stage_test[feature] * metafeature_test['{}'.format(meta)]
#         first_stage_final['{}_{}'.format(feature, meta)] = first_stage_final[feature] * metafeature_final['{}'.format(meta)]

In [10]:
def reformat(labels, num_labels):
    """One-hot encode a 1-D collection of integer class labels.

    Parameters
    ----------
    labels : array-like
        Class indices in ``range(num_labels)``. NumPy arrays, lists and pandas
        Series are accepted; a Series is read positionally (its index is
        ignored).
    num_labels : int
        Number of classes, i.e. the width of the encoded output.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(labels), num_labels)`` holding 1.0 in
        the column equal to each label and 0.0 elsewhere.
    """
    # Coerce first: `[:, None]` is NumPy indexing and is not supported on a
    # pandas Series or a plain list.
    label_column = np.asarray(labels)[:, None]
    return (np.arange(num_labels) == label_column).astype(np.float32)

# Convert the first-stage tables to float32 arrays and one-hot encode the
# targets. np.asarray replaces DataFrame.as_matrix(), which newer pandas
# releases no longer provide, and also accepts an input that is already an array.
first_stage_train = np.asarray(first_stage_train, dtype = np.float32)
training_label = reformat(np.asarray(Ytrain), 2)

first_stage_test = np.asarray(first_stage_test, dtype = np.float32)
testing_label = reformat(np.asarray(Ytest), 2)

first_stage_final = np.asarray(first_stage_final, dtype = np.float32)

# Fit the scaler on the training rows only, then apply the same
# transformation to all three sets.
scaler = StandardScaler().fit(first_stage_train)

# Scale the train set
first_stage_train = scaler.transform(first_stage_train)

# Scale the test set
first_stage_test = scaler.transform(first_stage_test)

# Scale the final data
first_stage_final = scaler.transform(first_stage_final)

In [24]:
# Settings for NeuralNetworkClassifier, which comes from the project-local
# NeuralNetworks module (not visible in this notebook).
nn_settings = dict(layers = [20],
                   activation = 'tanh',
                   num_steps = 20000,
                   display_step = 1000,
                   learning_rate = 0.001,
                   L2Regression = 0.05,
                   dropout = 0.4,
                   learning_rate_decay = 1,
                   batch_size = 1000,
                   verbose = None)

model = NeuralNetworkClassifier(**nn_settings)

# Disabled alternative that also passes the validation set and a validation fraction:
# model.fit(first_stage_train,
#           training_label,
#           first_stage_test,
#           testing_label,
#           validation = 0.2)

# Train on the scaled first-stage predictions and hand over the tournament rows.
model.fit(first_stage_train, training_label, final_data = first_stage_final)

# predict() is called without arguments; presumably it scores the final_data
# given to fit() -- TODO confirm against NeuralNetworks.
NN_pred = model.predict()


-------------------- PROCESSING LEARNING --------------------

Step : 0   Minibatch loss : 1.7021031379699707
Step : 1000   Minibatch loss : 1.502763032913208
Step : 2000   Minibatch loss : 1.3883020877838135
Step : 3000   Minibatch loss : 1.2990739345550537
Step : 4000   Minibatch loss : 1.230311393737793
Step : 5000   Minibatch loss : 1.1590685844421387
Step : 6000   Minibatch loss : 1.1208832263946533
Step : 7000   Minibatch loss : 1.089595913887024
Step : 8000   Minibatch loss : 1.0394099950790405
Step : 9000   Minibatch loss : 1.0046164989471436
Step : 10000   Minibatch loss : 0.9802024364471436
Step : 11000   Minibatch loss : 0.9498657584190369
Step : 12000   Minibatch loss : 0.9265963435173035
Step : 13000   Minibatch loss : 0.9036453366279602
Step : 14000   Minibatch loss : 0.8730961680412292
Step : 15000   Minibatch loss : 0.8591055274009705
Step : 16000   Minibatch loss : 0.8438348174095154
Step : 17000   Minibatch loss : 0.8274432420730591
Step : 18000   Minibatch loss : 0.

In [25]:
# Wrap the scaled arrays back into frames; each frame gets a fresh 0..n-1 index.
first_stage_train = pd.DataFrame(first_stage_train)
first_stage_test = pd.DataFrame(first_stage_test)
first_stage_final = pd.DataFrame(first_stage_final)

# Snapshot the prediction columns before any weighted column is appended.
prediction_columns = list(first_stage_train.columns)

# Adding metafeature weight.
# The meta-feature frames still carry the row labels of the original CSV rows
# (for the validation subset these need not be 0..n-1), so the product is taken
# positionally on the underlying arrays. An index-aligned multiply could
# silently yield NaN or mismatched rows, whereas a length mismatch now raises.
for feature in prediction_columns:
    for meta in ['variance', 'mean', 'distance']:
        weighted_name = '{}_{}'.format(feature, meta)
        first_stage_train[weighted_name] = first_stage_train[feature].values * metafeature_train[meta].values
        first_stage_test[weighted_name] = first_stage_test[feature].values * metafeature_test[meta].values
        first_stage_final[weighted_name] = first_stage_final[feature].values * metafeature_final[meta].values

In [26]:
# Second-stage gradient-boosted model trained on the weighted first-stage predictions.
xgb = XGBClassifier(max_depth=3,
                    n_estimators=100,
                    learning_rate=0.05)
xgb.fit(first_stage_train, Ytrain)

# Probability of the second class on the validation rows, scored with log loss.
validation_proba = xgb.predict_proba(first_stage_test)
xgb_pred = validation_proba[:,1]

print('log loss : {}\n'.format(log_loss(Ytest, xgb_pred)))

log loss : 0.6924954783078869



In [27]:
# Probability of the second class for every tournament row. This rebinds
# xgb_pred, which previously held the validation-set predictions.
final_proba = xgb.predict_proba(first_stage_final)
xgb_pred = final_proba[:,1]

In [28]:
# Equal-weight blend of the neural-network and XGBoost predictions.
blended_probability = (NN_pred + xgb_pred) / 2

# One row per tournament id, in the order of `ids`.
submission = pd.DataFrame({'id': ids})
submission['probability'] = blended_probability

submission.to_csv('C:/Users/hperrin/Desktop/Numerai/w80/5th_submit.csv', index = False)