## INITIALIZATION

###  Libraries

In [1]:
# General
import pandas as pd
import numpy as np
import sys
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine Learning
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# # Perso
sys.path.append('../Algo')
from NeuralNetworks import *



In [2]:
def diff(t_a, t_b):
    t_diff = relativedelta(t_a, t_b)
    return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)

### Score

In [3]:
score = make_scorer(score_func = log_loss)

## DATA

### Import

In [4]:
Xtrain = pd.read_csv("../w79/numerai_datasets/numerai_training_data.csv")
Xtest = pd.read_csv("../w79/numerai_datasets/numerai_tournament_data.csv")

In [5]:
Xtrain = pd.concat([Xtrain, Xtest[Xtest['data_type'] == 'validation']], axis = 0)

In [6]:
Xtrain.shape, Xtest.shape

((609578, 54), (348831, 54))

In [7]:
real_data = Xtest
ids = Xtest['id']

Xtest = Xtrain[Xtrain['data_type'] == 'validation']
Xtrain = Xtrain[Xtrain['data_type'] != 'validation']

In [8]:
Ytrain = Xtrain['target']
Ytest = Xtest['target']

Xtrain.drop(['id', 'era', 'data_type', 'target'], inplace = True, axis = 1)
Xtest.drop(['id', 'era', 'data_type', 'target'], inplace = True, axis = 1)
real_data.drop(['id', 'era', 'data_type', 'target'], inplace = True, axis = 1)

Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape, real_data.shape

((535713, 50), (535713,), (73865, 50), (73865,), (348831, 50))

## Model

### First layer

#### Defining models and parameters

In [None]:
n_cores = 2 

In [9]:
model_names = ['ExtraTrees', 
               'XGBoost', 
               'SGDC']

In [10]:
models = [ExtraTreesClassifier(n_jobs = n_cores, 
                               criterion = 'entropy', 
                               max_depth = 4, 
                               n_estimators = 100, 
                               bootstrap = True),
          
          XGBClassifier(learning_rate = 0.5, 
                        max_depth = 3, 
                        n_estimators = 75,
                        nthread = n_cores),
          
          SGDClassifier(loss = 'log', 
                        penalty = 'elasticnet', 
                        learning_rate = 'optimal',
                        n_jobs = n_cores)]

In [11]:
parameters_to_tune = [{'min_samples_split' : [200,1000],                            # ExtraTreesClassifier
                       'min_samples_leaf' : [200,1000]},
                    
                      {'subsample' : [0.5, 0.75, 1]},                                   # XGBoostClassifier

                      {'alpha' : [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1],            # SGDClassifier
                       'l1_ratio' : [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]}]

#### Tuning parameters and getting predictions

In [12]:
bagging_steps = 10
n_feature = 20

features = [name for name in Xtrain.columns]

In [13]:
first_stage_train = pd.DataFrame()
first_stage_test = pd.DataFrame()
first_stage_final = pd.DataFrame()

In [14]:
for name, model, parameters in zip(model_names, models, parameters_to_tune):
    
    time1 = datetime.now()
    print('\n---------------------------------------------')
    print('>> Processing {}\n'.format(name))
    
    for step in range(bagging_steps):
        
        time2 = datetime.now()
        print("Step {}".format(step+1), end = '...')
        
        # Creating data
        np.random.shuffle(features)
        train = Xtrain[features[:n_feature]]
        test = Xtest[features[:n_feature]]
        final = real_data[features[:n_feature]]
        
        # Tuning
        gscv = GridSearchCV(model, parameters, scoring = score, n_jobs = n_cores)
        gscv.fit(train, Ytrain)

        # Saving best predictions
        first_stage_train['{}_prediction_{}'.format(name, step+1)] = gscv.predict_proba(train)[:,1]
        first_stage_test['{}_prediction_{}'.format(name, step+1)] = gscv.predict_proba(test)[:,1]
        first_stage_final['{}_prediction_{}'.format(name, step+1)] = gscv.predict_proba(final)[:,1]

        print('done in {}'.format(diff(datetime.now(),time2)))
        print('log loss : {}\n'.format(log_loss(Ytest, first_stage_test['{}_prediction_{}'.format(name,step+1)])))


---------------------------------------------
>> Processing ExtraTrees

Step 1...done in 0h 1m 37s
log loss : 0.6928227973886574

Step 2...done in 0h 1m 37s
log loss : 0.6927354370871803

Step 3...done in 0h 1m 41s
log loss : 0.6926697392843302

Step 4...done in 0h 1m 44s
log loss : 0.6928330244160793

Step 5...done in 0h 1m 48s
log loss : 0.6927236465158352

Step 6...done in 0h 1m 46s
log loss : 0.6927539761659607

Step 7...done in 0h 2m 2s
log loss : 0.6928067547989717

Step 8...done in 0h 1m 46s
log loss : 0.6926585238728056

Step 9...done in 0h 1m 37s
log loss : 0.6927144925483121

Step 10...done in 0h 1m 33s
log loss : 0.6927338631086211


---------------------------------------------
>> Processing XGBoost

Step 1...done in 0h 4m 12s
log loss : 0.6936589846549777

Step 2...done in 0h 4m 33s
log loss : 0.6940892488871884

Step 3...done in 0h 4m 50s
log loss : 0.693791661117142

Step 4...done in 0h 3m 55s
log loss : 0.6937455820688694

Step 5...done in 0h 4m 16s
log loss : 0.693720

#### Saving results

In [None]:
first_stage_train.to_csv('../w79/first_stage_train.csv')
first_stage_test.to_csv('../w79/first_stage_test.csv')
first_stage_final.to_csv('../w79/first_stage_final_data.csv')

### Second Layer

#### Import data

In [9]:
first_stage_train = pd.read_csv("../w79/first_stage_train.csv")
first_stage_test = pd.read_csv("../w79/first_stage_test.csv")
first_stage_final = pd.read_csv('../w79/first_stage_final_data.csv')

In [10]:
first_stage_train.drop(['Unnamed: 0'], inplace=True, axis=1)
first_stage_test.drop(['Unnamed: 0'], inplace=True, axis=1)
first_stage_final.drop(['Unnamed: 0'], inplace=True, axis=1)

In [11]:
first_stage_train.shape, first_stage_test.shape, first_stage_final.shape

((535713, 30), (73865, 30), (348831, 30))

#### Neural Network

There only are numerical features. Since tensorflow only accept numpy matrix we have to tranform our data. Furthermore we have to reformat the target shape and standardize our data !

In [12]:
def reformat(labels, num_labels):
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return labels

first_stage_train = first_stage_train.as_matrix().astype(np.float32)
Ytrain = reformat(Ytrain, 2)

first_stage_test = first_stage_test.as_matrix().astype(np.float32)
Ytest = reformat(Ytest, 2)

first_stage_final = first_stage_final.as_matrix().astype(np.float32)

print('Training set :', first_stage_train.shape, Ytrain.shape)
print('Final set :', first_stage_final.shape)
print('Test set :', first_stage_test.shape, Ytest.shape)

Training set : (535713, 30) (535713, 2)
Final set : (348831, 30)
Test set : (73865, 30) (73865, 2)


In [13]:
# Define the scaler
scaler = StandardScaler().fit(first_stage_train)

# Scale the train set
first_stage_train = scaler.transform(first_stage_train)

# Scale the test set
first_stage_test = scaler.transform(first_stage_test)

# Scale the final data
first_stage_final = scaler.transform(first_stage_final)

In [14]:
# DEFINITION
model = NeuralNetworkClassifier(layers = [20],
                                num_steps = 150000,
                                display_step = 5000,
                                learning_rate = 0.001,
                                L2Regression = 0.05,
                                dropout = 0.2,
                                learning_rate_decay = 0.9,
                                batch_size = 500,
                                verbose = None)

In [71]:
# model.fit(first_stage_train, Ytrain, first_stage_test, Ytest, validation=0.2)


-------------------- PROCESSING LEARNING --------------------

Step : 0   Minibatch loss : 2.357624053955078   Validation loss : 1.0758034455943422
Step : 5000   Minibatch loss : 1.1798838376998901   Validation loss : 0.6905245721927364
Step : 10000   Minibatch loss : 0.9808145761489868   Validation loss : 0.6906055666636814
Step : 15000   Minibatch loss : 0.8695586323738098   Validation loss : 0.6907181092841693
Step : 20000   Minibatch loss : 0.8008138537406921   Validation loss : 0.6907582694836306
Step : 25000   Minibatch loss : 0.7517625689506531   Validation loss : 0.6906569838973363
Step : 30000   Minibatch loss : 0.7272824645042419   Validation loss : 0.6905339117561108
Step : 35000   Minibatch loss : 0.7146121859550476   Validation loss : 0.6903906456924687
Step : 40000   Minibatch loss : 0.7037126421928406   Validation loss : 0.6902316691825545
Step : 45000   Minibatch loss : 0.7003671526908875   Validation loss : 0.6901135874748848
Step : 50000   Minibatch loss : 0.69919669

In [15]:
training_data = np.concatenate([first_stage_train,first_stage_test], axis = 0)
training_label = np.concatenate([Ytrain, Ytest], axis = 0)

training_data.shape, training_label.shape

((609578, 30), (609578, 2))

In [16]:
# REAL MODEL
model = NeuralNetworkClassifier(layers = [20],
                                num_steps = 150000,
                                display_step = 5000,
                                learning_rate = 0.001,
                                L2Regression = 0.05,
                                dropout = 0.2,
                                learning_rate_decay = 0.9,
                                batch_size = 500,
                                verbose = None)

model.fit(training_data,
          training_label,
          validation = None,
          final_data = first_stage_final)


-------------------- PROCESSING LEARNING --------------------

Step : 0   Minibatch loss : 2.2011709213256836
Step : 5000   Minibatch loss : 1.194794774055481
Step : 10000   Minibatch loss : 0.9961936473846436
Step : 15000   Minibatch loss : 0.8722558617591858
Step : 20000   Minibatch loss : 0.804010808467865
Step : 25000   Minibatch loss : 0.7625634074211121
Step : 30000   Minibatch loss : 0.7293069362640381
Step : 35000   Minibatch loss : 0.7159773707389832
Step : 40000   Minibatch loss : 0.7078022956848145
Step : 45000   Minibatch loss : 0.7028517723083496
Step : 50000   Minibatch loss : 0.6976909637451172
Step : 55000   Minibatch loss : 0.6936199069023132
Step : 60000   Minibatch loss : 0.6927656531333923
Step : 65000   Minibatch loss : 0.6912948489189148
Step : 70000   Minibatch loss : 0.6908606290817261
Step : 75000   Minibatch loss : 0.6940179467201233
Step : 80000   Minibatch loss : 0.6925798654556274
Step : 85000   Minibatch loss : 0.6908078789710999
Step : 90000   Minibatch 

In [17]:
final_prediction = model.predict()

In [18]:
nn_submit = pd.DataFrame()
nn_submit['id'] = ids
nn_submit['probability'] = final_prediction

nn_submit.to_csv('../w79/4th_submit.csv', index = False)

#### Feature Weighted Linear Model

In [36]:
# Create meta-features
first_stage_train = pd.DataFrame(first_stage_train)
first_stage_train['meta1'] = Xtrain.std(axis = 1)

first_stage_test = pd.DataFrame(first_stage_test)
first_stage_test['meta1'] = Xtest.std(axis = 1)

In [37]:
linear_model = LinearRegression()
linear_model.fit(first_stage_train, Ytrain)

predicted = linear_model.predict(first_stage_test)

print("Log loss : {}".format(log_loss(Ytest, predicted)))

Log loss : 0.70031755802259
