In [1]:
"""
Run the model-search algorithm to find the best-fitting model.
"""

#=========================================================================================================
#================================ 0. MODULE

# Numerai class
from numerai import Numerai

# Machine Learning models
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier


#=========================================================================================================
#================================ 1. DATA


# Create the stacking pipeline object and load the data set into it.
# NOTE(review): 109 is presumably the Numerai tournament week/round (the
# commented-out submit() call at the end of this script uses week=109), and
# stageNumber=1 presumably relates to the stacking stages used in add_model()
# below -- confirm both against the Numerai class.
stacking = Numerai(stageNumber=1)
stacking.load_data(109)


#=========================================================================================================
#================================ 2. FEATURE ENGINEERING


# metafeature = ['variance', 'mean', 'distance']
# stages = [2, 0, 2]
# stacking.add_metafeature(metafeature, stages)


#=========================================================================================================
#================================ 3. MODEL ARCHITECTURE


# Worker count handed to every estimator (n_jobs) and to fit_tune();
# -1 is the scikit-learn convention for "use all available cores".
nCores = -1

# Names under which the models are registered, in the same order as `models`.
# The 'XGBoost' entry is currently disabled.
modelNames = [
    'ExtraTrees1',
    'ExtraTrees2',
    # 'XGBoost',
    'SGDC',
    'Lightgbm',
]

# Settings shared by the two ExtraTrees variants; they differ only in the
# split criterion.
extraTreesSettings = dict(n_jobs=nCores,
                          max_depth=3,
                          n_estimators=50,
                          bootstrap=True)

# Un-fitted base estimators, in the same order as `modelNames`.
models = [
    # ExtraTrees1: entropy criterion
    ExtraTreesClassifier(criterion='entropy', **extraTreesSettings),

    # ExtraTrees2: gini criterion
    ExtraTreesClassifier(criterion='gini', **extraTreesSettings),

    # XGBoost (disabled)
    # XGBClassifier(learning_rate=0.5,
    #               max_depth=3,
    #               n_estimators=75,
    #               nthread=nCores),

    # SGDC: logistic loss with elastic-net penalty; tol=None together with
    # max_iter=5 makes the solver run exactly five epochs.
    SGDClassifier(loss='log',
                  penalty='elasticnet',
                  learning_rate='optimal',
                  max_iter=5,
                  tol=None,
                  n_jobs=nCores),

    # Lightgbm: binary objective
    LGBMClassifier(objective='binary',
                   max_depth=3,
                   n_estimators=100,
                   n_jobs=nCores),
]

# Hyper-parameter grids, one dict per entry of `models` (same order).
# Each key must match the estimator's parameter name exactly: a misspelt key
# (e.g. one with a stray trailing space) does not tune the intended parameter.
# NOTE(review): the grids are presumably searched inside Numerai.fit_tune() --
# confirm how the keys are forwarded to the estimators.
parameters = [
    # ExtraTreesClassifier (entropy)
    {'min_samples_split': [200, 1000],
     'min_samples_leaf': [200, 1000]},

    # ExtraTreesClassifier (gini)
    {'min_samples_split': [200, 1000],
     'min_samples_leaf': [200, 1000]},

    # XGBoostClassifier (disabled)
    # {'subsample': [0.5, 0.75, 1]},

    # SGDClassifier
    {'alpha': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1],
     'l1_ratio': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]},

    # Lightgbm
    # NOTE(review): 'min_samples_leaf' is a LightGBM alias of
    # 'min_data_in_leaf'; the scikit-learn wrapper's own name for it is
    # 'min_child_samples' -- confirm the alias takes effect as intended.
    {# 'n_estimators': [50, 100, 200],
     'num_leaves': [15, 40, 100, 500],    # was 'num_leaves ' (trailing space)
     'min_samples_leaf': [200, 1000],
     'reg_lambda': [0.001, 0.01, 0.05, 0.1, 0.5, 1]},
]

# Per-model settings, aligned index-by-index with `modelNames`.

# NOTE(review): presumably the number of input features each model is given;
# None presumably means "no restriction" -- confirm against Numerai.add_model.
nFeatures = [
    15,    # ExtraTrees1
    15,    # ExtraTrees2
    35,    # SGDC
    None,  # Lightgbm
]

# Number of bagging steps per model (the run log prints one "Step k" per step).
baggingSteps = [
    5,  # ExtraTrees1
    5,  # ExtraTrees2
    3,  # SGDC
    1,  # Lightgbm
]

# Stacking stage each model belongs to.
stages = [
    1,  # ExtraTrees1
    1,  # ExtraTrees2
    1,  # SGDC
    2,  # Lightgbm
]


# Register every model with the stacking pipeline.  The six configuration
# lists are parallel (entry i of each describes model i), so they must all
# have the same length; zip() would otherwise silently drop trailing models,
# which is easy to trigger when an entry is commented out in only some lists.
modelSettings = (modelNames, models, parameters, baggingSteps, nFeatures, stages)
if len({len(setting) for setting in modelSettings}) != 1:
    raise ValueError('Model configuration lists must all have the same length, got lengths: '
                     + ', '.join(str(len(setting)) for setting in modelSettings))

# The loop variables have their own names so that `parameters`, `baggingSteps`
# and `nFeatures` still refer to the full lists after the loop has run.
for name, model, paramGrid, nBaggingSteps, nSelectedFeatures, stage in zip(*modelSettings):
    stacking.add_model(name, model, paramGrid, nBaggingSteps, nSelectedFeatures, stage)


#=========================================================================================================
#================================ 4. TRAINING MODEL


# Train and tune all registered models, passing nCores as the worker count.
# NOTE(review): the recorded run below aborts while processing 'SGDC' with
# "Model n_features_ is 15 and input n_features is 35".  That matches the
# differing nFeatures entries (15 for the ExtraTrees models, 35 for SGDC), so
# the cause is presumably in how Numerai selects features per model/bagging
# step -- investigate inside the Numerai class.
stacking.fit_tune(nCores)


#=========================================================================================================
#================================ 5. PREDICTION


# stacking.submit(submissionNumber=1, week=109)


---------------------------------------------
>> Loading data...done

Xtrain1: (236167, 50) 
Ytrain1: (236167,) 
Xtrain2: (157446, 50) 
Ytrain2: (157446,) 
Xvalid: (46362, 50) 
Yvalid: (46362,) 
Submit data: (243281, 50)


---------------------------------------------
>> Processing first stage

>> Processing ExtraTrees1

Step 1...done in 0h 0m 23s
log loss : 0.6929679442851254

Step 2...done in 0h 0m 22s
log loss : 0.692999930076504

Step 3...done in 0h 0m 22s
log loss : 0.6929279930732397

Step 4...done in 0h 0m 32s
log loss : 0.693029221113938

Step 5...done in 0h 0m 26s
log loss : 0.6930742962848866

>> Processing ExtraTrees2

Step 1...done in 0h 0m 27s
log loss : 0.6930435393797845

Step 2...done in 0h 0m 26s
log loss : 0.6931049820367886

Step 3...done in 0h 0m 26s
log loss : 0.693055987103797

Step 4...done in 0h 0m 26s
log loss : 0.6930150590111661

Step 5...done in 0h 0m 28s
log loss : 0.6930585546100606

>> Processing SGDC

Step 1...done in 0h 1m 12s
log loss : 0.693147189433

ValueError: Number of features of the model must match the input. Model n_features_ is 15 and input n_features is 35 