## INITIALIZATION

### Libraries

In [1]:
# General
import pandas as pd
import numpy as np
import sys
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine Learning
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# Project-local modules
sys.path.append('../Algo')
from NeuralNetworks import *



In [2]:
def diff(t_a, t_b):
    """Format the time elapsed from ``t_b`` to ``t_a`` as ``'{h}h {m}m {s}s'``.

    Parameters
    ----------
    t_a, t_b : datetime.datetime
        End and start instants (``t_a - t_b`` must be a ``timedelta``).

    Returns
    -------
    str
        Whole hours, minutes and seconds. Hours are not wrapped at 24, so a
        gap longer than a day is reported in full (e.g. ``'25h 2m 3s'``).
        Microseconds are ignored. When ``t_a`` is earlier than ``t_b`` every
        non-zero component carries a minus sign.
    """
    delta = t_a - t_b
    # timedelta normalises to (days, seconds, microseconds); fold the days back
    # in so long runs are not truncated to their sub-day remainder.
    total = delta.days * 86400 + delta.seconds
    sign = -1 if total < 0 else 1
    hours, remainder = divmod(abs(total), 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{h}h {m}m {s}s'.format(h=sign * hours, m=sign * minutes, s=sign * seconds)

### Score

In [3]:
# Log loss is an error computed from class probabilities, so the scorer must
# (a) receive predict_proba output rather than hard labels and (b) be negated,
# because model selection keeps the candidate with the HIGHEST score.
# On scikit-learn >= 1.4 the probability switch is spelled
# `response_method='predict_proba'` instead of `needs_proba=True`.
score = make_scorer(score_func = log_loss, greater_is_better = False, needs_proba = True)

## DATA

### Import

In [4]:
# Read the training file into Xtrain and the tournament file into Xtest.
Xtrain, Xtest = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../w79/numerai_datasets/numerai_training_data.csv",
        "../w79/numerai_datasets/numerai_tournament_data.csv",
    )
)

In [5]:
# Stack the tournament rows flagged 'validation' underneath the training rows.
# Xtrain is rebound here, so running this cell twice appends those rows twice.
Xtrain = pd.concat(
    [Xtrain, Xtest.loc[Xtest['data_type'] == 'validation']],
    axis = 0,
)

In [6]:
# (rows, columns) of the combined training frame and of the tournament frame.
(Xtrain.shape, Xtest.shape)

((609578, 54), (348831, 54))

In [7]:
# Keep a handle on the full tournament frame and its ids before Xtest is rebound.
real_data, ids = Xtest, Xtest['id']

# Split the combined frame: 'validation' rows become Xtest, the rest stay in Xtrain.
is_validation = Xtrain['data_type'] == 'validation'
Xtest = Xtrain[is_validation]
Xtrain = Xtrain[~is_validation]

In [8]:
# Pull the labels out before the label column is removed from the features.
Ytrain = Xtrain['target']
Ytest = Xtest['target']

# Columns that are not fed to the models.
non_feature_columns = ['id', 'era', 'data_type', 'target']

# Rebind instead of dropping in place: Xtrain and Xtest are boolean-mask
# selections of a larger frame, and mutating such a selection in place makes
# pandas emit SettingWithCopyWarning.
Xtrain = Xtrain.drop(non_feature_columns, axis = 1)
Xtest = Xtest.drop(non_feature_columns, axis = 1)
real_data = real_data.drop(non_feature_columns, axis = 1)

Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape, real_data.shape

((535713, 50), (535713,), (73865, 50), (73865,), (348831, 50))

## MODELS

### Extratrees

In [16]:
# Display name of each first-stage model, in the same order as `models`.
model_names = ["ExtraTrees"]

In [17]:
# Base estimators to tune; the grid search later varies further hyper-parameters.
models = [
    ExtraTreesClassifier(
        n_estimators = 100,
        criterion = 'entropy',
        bootstrap = True,
        n_jobs = -1,
    ),
]

In [23]:
# One search grid per entry of `models`, matched by position.
parameters_to_tune = [
    dict(
        min_samples_split = [10, 200, 500, 750, 1000, 2000],
        min_samples_leaf = [10, 200, 500, 750, 1000, 2000],
        max_depth = [2, 3, 4],
        max_features = [10, 25, 40, 50],
    ),
]

#### Tuning parameters and getting predictions

In [24]:
# Number of bagging rounds per model, and number of feature columns used in each round.
bagging_steps, n_feature = 1, 50

# Plain list of the training frame's column names.
features = list(Xtrain.columns)

In [25]:
# Empty containers that will receive one prediction column per model and bagging round.
first_stage_train, first_stage_test, first_stage_final = (
    pd.DataFrame() for _ in range(3)
)

In [None]:
# For every first-stage model: run `bagging_steps` rounds of grid search on the
# training frame and store column 1 of predict_proba for the training,
# validation and tournament frames as one new column per round.
for name, model, parameters in zip(model_names, models, parameters_to_tune):

    time1 = datetime.now()
    print('\n---------------------------------------------')
    print('>> Processing {}\n'.format(name))

    for step in range(bagging_steps):

        time2 = datetime.now()
        print("Step {}".format(step+1), end = '...')

        # Label shared by the three prediction frames for this round.
        column = '{}_prediction_{}'.format(name, step+1)

        # Creating data: shuffle the feature list in place, keep the first n_feature names
        np.random.shuffle(features)
        selected = features[:n_feature]
        train = Xtrain[selected]
        test = Xtest[selected]
        final = real_data[selected]

        # Tuning
        gscv = GridSearchCV(model, parameters, scoring = score, n_jobs = -1)
        gscv.fit(train, Ytrain)

        # Saving best predictions
        # NOTE(review): the training-frame predictions come from an estimator
        # fitted on those same rows (in-sample); if they feed a second-stage
        # model, consider out-of-fold predictions instead.
        first_stage_train[column] = gscv.predict_proba(train)[:,1]
        first_stage_test[column] = gscv.predict_proba(test)[:,1]
        first_stage_final[column] = gscv.predict_proba(final)[:,1]

        print('done in {}'.format(diff(datetime.now(),time2)))
        print('log loss : {}\n'.format(log_loss(Ytest, first_stage_test[column])))

    # Total wall-clock time for this model across all bagging rounds.
    print('{} finished in {}'.format(name, diff(datetime.now(), time1)))


---------------------------------------------
>> Processing ExtraTrees

Step 1...

### XGBoost

### Neural Networks

In [12]:
def reformat(labels, num_labels):
    """One-hot encode a 1-D sequence of class labels.

    Parameters
    ----------
    labels : array-like
        Class ids; each value is compared against ``0 .. num_labels - 1``.
        Lists, NumPy arrays and pandas Series are accepted.
    num_labels : int
        Number of classes, i.e. the width of the encoded output.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(labels), num_labels)`` holding 1.0 in
        the column equal to the label and 0.0 elsewhere.
    """
    # Convert first: `[:, None]` is NumPy indexing, and recent pandas versions
    # reject it when applied directly to a Series.
    labels = np.asarray(labels)
    return (np.arange(num_labels) == labels[:, None]).astype(np.float32)

# Turn the first-stage prediction frames into float32 arrays and one-hot encode
# the labels. `.values` replaces `.as_matrix()`, which pandas removed in 1.0;
# the labels are passed as plain arrays so the encoding does not depend on
# pandas indexing semantics.
# These names are rebound in place, so this cell is meant to run once per session.
first_stage_train = first_stage_train.values.astype(np.float32)
Ytrain = reformat(np.asarray(Ytrain), 2)

first_stage_test = first_stage_test.values.astype(np.float32)
Ytest = reformat(np.asarray(Ytest), 2)

first_stage_final = first_stage_final.values.astype(np.float32)

print('Training set :', first_stage_train.shape, Ytrain.shape)
print('Final set :', first_stage_final.shape)
print('Test set :', first_stage_test.shape, Ytest.shape)

Training set : (535713, 30) (535713, 2)
Final set : (348831, 30)
Test set : (73865, 30) (73865, 2)


In [13]:
# Fit the scaler on the training predictions only ...
scaler = StandardScaler()
scaler.fit(first_stage_train)

# ... then apply that same fitted transform to the train, test and final sets.
first_stage_train, first_stage_test, first_stage_final = (
    scaler.transform(block)
    for block in (first_stage_train, first_stage_test, first_stage_final)
)

In [14]:
# Settings for the second-stage network (NeuralNetworkClassifier is presumably
# provided by the star-import from NeuralNetworks; verify).
nn_settings = dict(
    layers = [20],
    num_steps = 150000,
    display_step = 5000,
    learning_rate = 0.001,
    L2Regression = 0.05,
    dropout = 0.2,
    learning_rate_decay = 0.9,
    batch_size = 500,
    verbose = None,
)
model = NeuralNetworkClassifier(**nn_settings)

# Train on the scaled first-stage predictions, passing the test split alongside.
model.fit(first_stage_train, Ytrain, first_stage_test, Ytest, validation = 0.2)