In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score
from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge

import pandas as pd

import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor

import lightgbm as lgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import LeakyReLU
from keras.layers import BatchNormalization

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df_card = pd.read_csv('data/variable_classification2.csv')
df_card.head()

Unnamed: 0,Variable,Guth Classification,Strict Classification,Unnamed: 3,Notes,Unnamed: 5,Type
0,id,4.0,4.0,,,,
1,HRMONTH,4.0,4.0,,,,
2,HRYEAR4,4.0,4.0,,,,
3,HURESPLI,3.0,4.0,,too many variables,,
4,HUFINAL,2.0,4.0,,,,


In [3]:
sig = df_card.index[df_card['Guth Classification'].isin([1, 2, 3])].tolist()

train_data = np.genfromtxt('data/train_2008.csv', delimiter=',', skip_header=0)

X = train_data[1:, sig]
y = train_data[1:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    stratify=y, random_state=42)
# X_train[X_train < 0] = -1
# X_test[X_test < 0] = -1

pred_data = np.genfromtxt('data/test_2008.csv', delimiter=',', skip_header=0)
X_pred = pred_data[1:, sig]
# X_pred[X_pred < 0] = -1

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_pred.shape)

(48500, 111) (48500,) (16167, 111) (16167,) (16000, 111)


In [4]:
print(np.sum(y_train) / len(y_train), np.sum(y_test)/len(y_test))

0.25538144329896906 0.2553967959423517


In [5]:
print(y_train[:10], y_test[:10])

[0. 0. 1. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]


In [6]:
sig_rf = df_card.index[df_card['Guth Classification'].isin([1, 2])].tolist()

X_rf = train_data[1:, sig_rf]

X_train_rf, X_test_rf, y_train, y_test = train_test_split(X_rf, y, test_size=0.25, 
                                                          stratify=y, random_state=42)

X_pred_rf = pred_data[1:, sig_rf]

In [7]:
print(y_train[:10], y_test[:10], X_train_rf.shape)

[0. 0. 1. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 1. 0. 0. 0.] (48500, 54)


In [8]:
# Optional
# Standardization. Fit on training set only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_pred = scaler.transform(X_pred)

In [9]:
scaler = StandardScaler()
scaler.fit(X_train_rf)

X_train_rf = scaler.transform(X_train_rf)
X_test_rf = scaler.transform(X_test_rf)
X_pred_rf = scaler.transform(X_pred_rf)

In [10]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

scaler.fit(X_rf)
X_rf = scaler.transform(X_rf)

In [11]:
# Helper function for cross_validation
n_folds = 5

def auc_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    score = np.average(cross_val_score(model, X_train, y_train, scoring="roc_auc", cv = kf))
    return(score)

**Ridge Regression**

In [26]:
ridge1 = RidgeCV(alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 3, 10], cv=5, scoring='roc_auc')
ridge1.fit(X_train, y_train)

RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 3, 10], cv=5, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring='roc_auc',
    store_cv_values=False)

In [27]:
print(roc_auc_score(y_test, ridge1.predict(X_test)), ridge1.alpha_)

0.7683312925647523 0.001


In [28]:
ridge2 = RidgeCV(alphas=[3e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1], cv=5, scoring='roc_auc')
ridge2.fit(X_train, y_train)

RidgeCV(alphas=[0.03, 0.05, 0.1, 0.2, 0.3, 0.5], cv=5, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring='roc_auc',
    store_cv_values=False)

In [29]:
print(roc_auc_score(y_test, ridge2.predict(X_test)), ridge2.alpha_)

0.7683264971297816 0.5


**Lasso Regression**

In [30]:
lasso1 = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

auc_cv(lasso1)

0.7676877913876192

In [31]:
lasso1.fit(X_train, y_train)
roc_auc_score(y_test, lasso1.predict(X_test))

0.7658938472081319

**Elastic Net Regression**

In [None]:
# Not good

**Gradient Boosting**

In [32]:
GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

auc_cv(GBoost)

0.7872970394405379

In [33]:
GBoost.fit(X_train, y_train)
roc_auc_score(y_test, GBoost.predict(X_test))

0.7903717336917991

**XGBoost**

In [34]:
xgb1 = XGBRegressor(learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=400)

In [35]:
auc_cv(xgb1)

0.7898283162334581

In [37]:
xgb1.fit(X_train, y_train)
roc_auc_score(y_test, xgb1.predict(X_test))

KeyboardInterrupt: 

In [160]:
xgb2 = XGBRegressor(learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=600)

In [154]:
auc_cv(xgb2)

0.7880072647794094

In [157]:
xgb2.fit(X_train, y_train)
roc_auc_score(y_test, xgb2.predict(X_test))

0.7936397902967398

In [174]:
xgb3 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)

In [172]:
auc_cv(xgb3)

0.7918243559134321

In [None]:
xgb3.fit(X_train, y_train)
roc_auc_score(y_test, xgb3.predict(X_test))

In [164]:
xgb4 = XGBRegressor(learning_rate=0.01,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=1000)

In [166]:
auc_cv(xgb4)

0.7915313055479716

In [165]:
xgb4.fit(X_train, y_train)
roc_auc_score(y_test, xgb4.predict(X_test))

0.7949095644530191

**LGBM**

In [22]:
params = {'objective': 'regression', 'num_leaves': 35,
          'learning_rate': 0.01, 'n_estimators': 4500,
          'max_bin': 100, 'bagging_fraction': 0.8,
          'bagging_freq': 5, 'feature_fraction': 0.5,
          'feature_fraction_seed': 9, 'bagging_seed': 9,
          'min_data_in_leaf': 3, 'min_sum_hessian_in_leaf': 11}

lgbm1 = lgb.LGBMRegressor(**params)

In [33]:
lgbm1.fit(X_train, y_train)
lgb_pred = np.expm1(lgbm1.predict(X_test))
print(roc_auc_score(y_test, lgb_pred))

0.7966988849510256


In [14]:
lgbm1.booster_.feature_importance()

array([2397,  547,  195, 1658, 1023,  353,  621,  660, 6802,    0, 3358,
       2046, 4050, 1649, 1069,  621, 2313, 7825, 6790, 5017, 2162,  650,
        668, 3099, 2670, 2602,  725, 9464,  202, 1937, 1197,  340,    0,
       5365, 2619, 1042,  380, 1307,  725,  314, 1068,    0, 1592, 2426,
       2335,  279, 1922, 1043,  735,  951,  213,  428,  471,  327,  641,
        558,  296,  274, 2767,  537,  613,  456, 3489,   40, 1658,   74,
        145, 1135, 1318, 1045, 1238,   72,  416,  755, 5016, 4349, 1254,
        307, 1285, 1367, 1301,  553,  277,  731,  583,  397,  470,  316,
       2750, 1413,  186,   79,  367,  609,  128,  298, 1569, 2063,  593,
       1416,  898,  538,  403,  837,  289,  249,  551,  484,  273,  579,
        443])

**Random Forest**

In [108]:
parameters_rfc = { 
    'n_estimators': [1500],
    'max_depth': [10, 15, 20, 25],
    'min_samples_leaf': [1, 2]
}

rfc_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42,
                                                               n_jobs=-1),
                               param_grid=parameters_rfc,
                               cv=5, 
                               scoring='roc_auc',
                               return_train_score=True)

rfc_grid_search.fit(X_train_rf, y_train)

print('Best score: {} '.format(rfc_grid_search.best_score_))
print('\n#### Best params ####\n')
print(rfc_grid_search.best_params_)
print(rfc_grid_search.grid_scores_)

Best score: 0.7828925464872372 

#### Best params ####

{'max_depth': 20, 'min_samples_leaf': 2, 'n_estimators': 1500}
[mean: 0.77500, std: 0.00163, params: {'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 1500}, mean: 0.77505, std: 0.00142, params: {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 1500}, mean: 0.78203, std: 0.00194, params: {'max_depth': 15, 'min_samples_leaf': 1, 'n_estimators': 1500}, mean: 0.78180, std: 0.00159, params: {'max_depth': 15, 'min_samples_leaf': 2, 'n_estimators': 1500}, mean: 0.78278, std: 0.00229, params: {'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 1500}, mean: 0.78289, std: 0.00188, params: {'max_depth': 20, 'min_samples_leaf': 2, 'n_estimators': 1500}, mean: 0.78160, std: 0.00222, params: {'max_depth': 25, 'min_samples_leaf': 1, 'n_estimators': 1500}, mean: 0.78285, std: 0.00178, params: {'max_depth': 25, 'min_samples_leaf': 2, 'n_estimators': 1500}]




In [218]:
rf1 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf1.fit(X_train_rf, y_train)
roc_auc_score(y_test, rf1.predict_proba(X_test_rf)[:, 1])

0.7882801177236

In [219]:
roc_auc_score(y_train, rf1.predict_proba(X_train_rf)[:, 1])

0.997551066624777

In [216]:
rf2 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf2.fit(X_train_rf, y_train)
roc_auc_score(y_test, rf2.predict_proba(X_test_rf)[:, 1])

0.7883685597046344

In [217]:
roc_auc_score(y_train, rf2.predict_proba(X_train_rf)[:, 1])

0.9981069788771318

In [192]:
rf3 = RandomForestClassifier(n_estimators=500, max_depth=30, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf3.fit(X_train_rf, y_train)
roc_auc_score(y_test, rf3.predict_proba(X_test_rf)[:, 1])

0.7870892090281155

In [193]:
roc_auc_score(y_train, rf3.predict_proba(X_train_rf)[:, 1])

0.9964203535691708

**Neural Network**

In [359]:
# Neural Network
from keras import optimizers

model = Sequential()
model.add(Dense(200, input_dim=X_train.shape[1]))
model.add(LeakyReLU(alpha=0.05))
model.add(Dropout(0.2))
          
model.add(Dense(400))
model.add(LeakyReLU(alpha=0.05))
model.add(Dropout(0.2))

model.add(Dense(600))
model.add(LeakyReLU(alpha=0.05))
model.add(Dropout(0.2))

model.add(Dense(400))
model.add(LeakyReLU(alpha=0.05))
model.add(Dropout(0.2))

model.add(Dense(100))
model.add(LeakyReLU(alpha=0.05))

model.add(Dense(1))
model.add(Activation("sigmoid"))

model.summary()

adam = optimizers.Adam(lr=0.0001, decay=0.000001)
model.compile(loss="binary_crossentropy", optimizer=adam, metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=64, epochs=20,
                    validation_data=(X_test, y_test))

probs = model.predict(X_test)
print(roc_auc_score(y_test, probs))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_72 (Dense)             (None, 200)               22400     
_________________________________________________________________
leaky_re_lu_16 (LeakyReLU)   (None, 200)               0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_73 (Dense)             (None, 400)               80400     
_________________________________________________________________
leaky_re_lu_17 (LeakyReLU)   (None, 400)               0         
_________________________________________________________________
dropout_18 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 600)               240600    
__________

**Stacking with a Meta model**

In [270]:
cv = StratifiedKFold(n_splits=5, shuffle=True)
i = 0

rf0_pred = np.empty_like(y_train)
rf1_pred = np.empty_like(y_train)
xgb0_pred = np.empty_like(y_train)
xgb1_pred = np.empty_like(y_train)
lgbm0_pred = np.empty_like(y_train)
lgbm1_pred = np.empty_like(y_train)
GBoost0_pred = np.empty_like(y_train)

for train, val in cv.split(X_train, y_train):
    i += 1
    print('Fold %d' % i)
    
    rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                                criterion='entropy', n_jobs=-1)
    rf0.fit(X_train_rf[train], y_train[train])
    rf0_pred[val] = rf0.predict_proba(X_train_rf[val])[:, 1]
    
    rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                                criterion='entropy', n_jobs=-1)
    rf1.fit(X_train_rf[train], y_train[train])
    rf1_pred[val] = rf1.predict_proba(X_train_rf[val])[:, 1]

    xgb0 = XGBRegressor(learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=6,
                    min_child_weight=5,
                    n_estimators=300)
    xgb0.fit(X_train[train], y_train[train])
    xgb0_pred[val] = xgb0.predict(X_train[val])
    
    xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
    xgb1.fit(X_train[train], y_train[train])
    xgb1_pred[val] = xgb1.predict(X_train[val])
    
    lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                              learning_rate=0.03, n_estimators=2000,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
    lgbm0.fit(X_train[train], y_train[train])
    lgbm0_pred[val] = np.expm1(lgbm0.predict(X_train[val]))
    
    lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.01, n_estimators=3000,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
    lgbm1.fit(X_train[train], y_train[train])
    lgbm1_pred[val] = np.expm1(lgbm1.predict(X_train[val]))
    
    GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10, 
                                       loss='huber', random_state =5)
    GBoost0.fit(X_train[train], y_train[train])
    GBoost0_pred[val] = GBoost0.predict(X_train[val])
    
    

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [271]:
X_train_stack = np.stack((rf0_pred, rf1_pred, xgb0_pred, xgb1_pred, 
                          lgbm0_pred, lgbm1_pred, GBoost0_pred), axis=-1)

In [272]:
lasso0 = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=1))
lasso0.fit(X_train_stack, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [273]:
rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf0.fit(X_train_rf, y_train)
rf0_test_pred = rf0.predict_proba(X_test_rf)[:, 1]

rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf1.fit(X_train_rf, y_train)
rf1_test_pred = rf1.predict_proba(X_test_rf)[:, 1]

xgb0 = XGBRegressor(learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                objective= 'binary:logistic',
                seed=27,
                gamma=0.2,
                max_depth=6,
                min_child_weight=5,
                n_estimators=300)
xgb0.fit(X_train, y_train)
xgb0_test_pred = xgb0.predict(X_test)

xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
xgb1.fit(X_train, y_train)
xgb1_test_pred = xgb1.predict(X_test)

lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                          learning_rate=0.03, n_estimators=2000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
lgbm0.fit(X_train, y_train)
lgbm0_test_pred = np.expm1(lgbm0.predict(X_test))

lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                          learning_rate=0.01, n_estimators=3000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
lgbm1.fit(X_train, y_train)
lgbm1_test_pred = np.expm1(lgbm1.predict(X_test))

GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost0.fit(X_train, y_train)
GBoost0_test_pred = GBoost0.predict(X_test)


In [274]:
X_test_stack = np.stack((rf0_test_pred, rf1_test_pred, xgb0_test_pred, 
                         xgb1_test_pred, lgbm0_test_pred, lgbm1_test_pred, GBoost0_test_pred), axis=-1)

print(roc_auc_score(y_train, lasso0.predict(X_train_stack)))
print(roc_auc_score(y_test, lasso0.predict(X_test_stack)))

0.7956432051683117
0.79820457145253


In [275]:
ridge1 = RidgeCV(alphas=[1e-2, 1e-1, 1], cv=5, scoring='roc_auc')
ridge1.fit(X_train_stack, y_train)
print(roc_auc_score(y_train, ridge1.predict(X_train_stack)), ridge1.alpha_)
print(roc_auc_score(y_test, ridge1.predict(X_test_stack)))

0.7956268361341461 1
0.798283577744505


In [276]:
print(rf1_test_pred[:10], xgb0_test_pred[:10], xgb1_test_pred[:10], 
      lgbm0_test_pred[:10], GBoost0_test_pred[:10])

[0.05427664 0.51473095 0.43966818 0.15080389 0.21254166 0.76821544
 0.50763219 0.22587325 0.12323257 0.16563487] [0.02700327 0.45207655 0.40617937 0.20551553 0.21792324 0.8051366
 0.48895347 0.1553283  0.0484005  0.1284436 ] [0.02777799 0.35300177 0.4384511  0.18335803 0.19940907 0.829139
 0.5471693  0.12741223 0.06295323 0.13642126] [-0.01913085  0.57280493  0.55488735  0.15451071  0.26361533  1.14150328
  0.61747328  0.1452715   0.06962452  0.18799581] [-0.00747068  0.39173927  0.31021518  0.11178373  0.20924724  0.76607215
  0.51257632  0.13307725  0.02298572  0.1407127 ]


In [279]:
lasso1 = LassoCV(alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 3, 5, 10], cv=5)
lasso1.fit(X_train_stack, y_train)
print(roc_auc_score(y_train, lasso1.predict(X_train_stack)), lasso1.alpha_)
print(roc_auc_score(y_test, lasso1.predict(X_test_stack)))

0.7956517742079124 0.0001
0.7982117739614496


In [277]:
ridge1.coef_

array([ 0.14142824,  0.22483611,  0.06444748,  0.35203983,  0.05046318,
        0.1752807 , -0.05374918])

In [278]:
lasso1.coef_

array([0.        , 0.30326831, 0.18259294, 0.35509964, 0.13831325,
       0.        ])

In [None]:
# print(roc_auc_score(y_train, rf0.predict_proba(X_train_rf)[:, 1]))

**Submission**

In [382]:
cv = StratifiedKFold(n_splits=5, shuffle=True)
i = 0

rf0_pred = np.empty_like(y)
rf1_pred = np.empty_like(y)
xgb0_pred = np.empty_like(y)
xgb1_pred = np.empty_like(y)
lgbm0_pred = np.empty_like(y)
lgbm1_pred = np.empty_like(y)
lgbm2_pred = np.empty_like(y)
GBoost0_pred = np.empty_like(y)

for train, val in cv.split(X, y):
    i += 1
    print('Fold %d' % i)
    
    rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                                criterion='entropy', n_jobs=-1)
    rf0.fit(X_rf[train], y[train])
    rf0_pred[val] = rf0.predict_proba(X_rf[val])[:, 1]
    
    rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
    rf1.fit(X_rf[train], y[train])
    rf1_pred[val] = rf1.predict_proba(X_rf[val])[:, 1]

    xgb0 = XGBRegressor(learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=6,
                    min_child_weight=5,
                    n_estimators=300)
    xgb0.fit(X[train], y[train])
    xgb0_pred[val] = xgb0.predict(X[val])
    
    xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
    xgb1.fit(X[train], y[train])
    xgb1_pred[val] = xgb1.predict(X[val])
    
    lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                          learning_rate=0.03, n_estimators=2000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
    lgbm0.fit(X[train], y[train])
    lgbm0_pred[val] = np.expm1(lgbm0.predict(X[val]))
    
    lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                          learning_rate=0.01, n_estimators=3000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
    lgbm1.fit(X[train], y[train])
    lgbm1_pred[val] = np.expm1(lgbm1.predict(X[val]))
    
    lgbm2 = lgb.LGBMRegressor(objective='regression', num_leaves=35,
                              learning_rate=0.005, n_estimators=4500,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.5,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
    lgbm2.fit(X[train], y[train])
    lgbm2_pred[val] = np.expm1(lgbm2.predict(X[val]))

    GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10, 
                                       loss='huber', random_state =5)
    GBoost0.fit(X[train], y[train])
    GBoost0_pred[val] = GBoost0.predict(X[val])


Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [383]:
X_stack = np.stack((rf0_pred, rf1_pred, xgb0_pred, xgb1_pred, lgbm0_pred, 
                    lgbm1_pred, lgbm2_pred, GBoost0_pred), axis=-1)

In [384]:
lasso0 = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=1))
lasso0.fit(X_stack, y)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [385]:
print(roc_auc_score(y, lasso0.predict(X_stack)))

0.7970043507397631


In [386]:
rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf0.fit(X_rf, y)
rf0_final_pred = rf0.predict_proba(X_pred_rf)[:, 1]

rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf1.fit(X_rf, y)
rf1_final_pred = rf1.predict_proba(X_pred_rf)[:, 1]

xgb0 = XGBRegressor(learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                objective= 'binary:logistic',
                seed=27,
                gamma=0.2,
                max_depth=6,
                min_child_weight=5,
                n_estimators=300)
xgb0.fit(X, y)
xgb0_final_pred = xgb0.predict(X_pred)

xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
xgb1.fit(X, y)
xgb1_final_pred = xgb1.predict(X_pred)

lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                      learning_rate=0.03, n_estimators=2000,
                      max_bin = 100, bagging_fraction = 0.8,
                      bagging_freq = 5, feature_fraction = 0.2319,
                      feature_fraction_seed=9, bagging_seed=9,
                      min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
lgbm0.fit(X, y)
lgbm0_final_pred = np.expm1(lgbm0.predict(X_pred))

lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                          learning_rate=0.01, n_estimators=3000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
lgbm1.fit(X, y)
lgbm1_final_pred = np.expm1(lgbm1.predict(X_pred))

lgbm2 = lgb.LGBMRegressor(objective='regression', num_leaves=35,
                              learning_rate=0.005, n_estimators=4500,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.5,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
lgbm2.fit(X, y)
lgbm2_final_pred = np.expm1(lgbm1.predict(X_pred))

GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost0.fit(X, y)
GBoost0_final_pred = GBoost0.predict(X_pred)


In [387]:
X_pred_stack = np.stack((rf0_final_pred, rf1_final_pred, xgb0_final_pred,
                         xgb1_final_pred, lgbm0_final_pred, lgbm1_final_pred,
                         lgbm2_final_pred, GBoost0_final_pred), axis=-1)

In [388]:
ridge1 = RidgeCV(alphas=[1e-4, 1e-3, 1e-2, 1e-1, 5e-1, 1, 1.5, 2, 3, 5, 10, 30, 50, 100], cv=5, scoring='roc_auc')
ridge1.fit(X_stack, y)
print(roc_auc_score(y, ridge1.predict(X_stack)), ridge1.alpha_)

0.7970776389450361 50


In [389]:
y_pred = ridge1.predict(X_pred_stack)

In [390]:
y_pred[:20]

array([0.24330442, 0.07374529, 0.12534674, 0.36491771, 0.2451156 ,
       0.05909378, 0.48324117, 0.74069621, 0.00399904, 0.14946549,
       0.05429185, 0.63167903, 0.25631653, 0.20353574, 0.26606331,
       0.18893726, 0.31325271, 0.05807429, 0.22187097, 0.32127934])

In [391]:
y_pred = (y_pred - min(y_pred))/(max(y_pred) - min(y_pred))
print(y_pred[:20])

[0.21670188 0.07251642 0.11639598 0.32011635 0.21824202 0.06005744
 0.4207333  0.63966148 0.01320735 0.13690548 0.05597409 0.54695817
 0.22776679 0.18288438 0.23605501 0.17047049 0.27618276 0.05919051
 0.19847583 0.28300824]


In [392]:
ridge1.coef_

array([ 0.17862603,  0.18255607,  0.11484109,  0.22747291,  0.0687245 ,
        0.21593595, -0.00134703, -0.0587778 ])

In [397]:
# Save File
np.savetxt("result.csv", np.dstack((np.arange(y_pred.size), y_pred))[0],"%d,%f",
           delimiter=' ', header="id,target", comments='')

In [393]:
print(roc_auc_score(y, rf0_pred), roc_auc_score(y, rf1_pred), roc_auc_score(y, xgb0_pred), 
      roc_auc_score(y, xgb1_pred), roc_auc_score(y, lgbm0_pred), roc_auc_score(y, lgbm1_pred), 
      roc_auc_score(y, lgbm2_pred), roc_auc_score(y, GBoost0_pred))

0.7871781712084707 0.7869574024520294 0.7923117030956115 0.7934608161550388 0.7933315026686358 0.7953673947123844 0.7945907655830209 0.7886963685537729


In [396]:
from keras import optimizers

model = Sequential()
model.add(Dense(5, input_dim=X_stack.shape[1]))
model.add(Activation('relu'))
          
model.add(Dense(10))
model.add(Activation('relu'))

model.add(Dense(2))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation("sigmoid"))

model.summary()

adam = optimizers.Adam(lr=0.0001, decay=0.00001)
model.compile(loss="binary_crossentropy", optimizer=adam, metrics=['accuracy'])

history = model.fit(X_stack, y, batch_size=64, epochs=10)

probs = model.predict(X_stack)
print(roc_auc_score(y, probs))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_86 (Dense)             (None, 5)                 45        
_________________________________________________________________
activation_65 (Activation)   (None, 5)                 0         
_________________________________________________________________
dense_87 (Dense)             (None, 10)                60        
_________________________________________________________________
activation_66 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_88 (Dense)             (None, 2)                 22        
_________________________________________________________________
activation_67 (Activation)   (None, 2)                 0         
_________________________________________________________________
dense_89 (Dense)             (None, 1)                 3         
__________

In [308]:
y_pred

array([0.21934843, 0.07102345, 0.11797089, ..., 0.18478357, 0.05475999,
       0.13407531])