In [83]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score
from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge

import pandas as pd

import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor

import lightgbm as lgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import BatchNormalization

In [84]:
df_card = pd.read_csv('data/variable_classification2.csv')
df_card.head()

Unnamed: 0,Variable,Guth Classification,Strict Classification,Unnamed: 3,Notes,Unnamed: 5,Type
0,id,4.0,4.0,,,,
1,HRMONTH,4.0,4.0,,,,
2,HRYEAR4,4.0,4.0,,,,
3,HURESPLI,3.0,4.0,,too many variables,,
4,HUFINAL,2.0,4.0,,,,


In [85]:
sig = df_card.index[df_card['Guth Classification'].isin([1, 2, 3])].tolist()

train_data = np.genfromtxt('data/train_2008.csv', delimiter=',', skip_header=0)

X = train_data[1:, sig]
y = train_data[1:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    stratify=y, random_state=42)
# X_train[X_train < 0] = -1
# X_test[X_test < 0] = -1



In [86]:
pred_data = np.genfromtxt('data/test_2012.csv', delimiter=',', skip_header=0)
X_pred = pred_data[1:, sig]
# X_pred[X_pred < 0] = -1

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_pred.shape)

(48500, 111) (48500,) (16167, 111) (16167,) (82820, 111)


In [87]:
print(np.sum(y_train) / len(y_train), np.sum(y_test)/len(y_test))

0.25538144329896906 0.2553967959423517


In [88]:
print(y_train[:10], y_test[:10])

[0. 0. 1. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]


In [89]:
sig_rf = df_card.index[df_card['Guth Classification'].isin([1, 2])].tolist()

X_rf = train_data[1:, sig_rf]

X_train_rf, X_test_rf, y_train, y_test = train_test_split(X_rf, y, test_size=0.25, 
                                                          stratify=y, random_state=42)


In [90]:
X_pred_rf = pred_data[1:, sig_rf]

In [91]:
print(y_train[:10], y_test[:10], X_train_rf.shape)

[0. 0. 1. 0. 0. 1. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 1. 0. 0. 0.] (48500, 54)


In [92]:
# Optional
# Standardization. Fit on training set only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_pred = scaler.transform(X_pred)

In [93]:
scaler = StandardScaler()
scaler.fit(X_train_rf)

X_train_rf = scaler.transform(X_train_rf)
X_test_rf = scaler.transform(X_test_rf)
X_pred_rf = scaler.transform(X_pred_rf)

In [94]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

scaler.fit(X_rf)
X_rf = scaler.transform(X_rf)

In [95]:
# Helper function for cross_validation
n_folds = 5

def auc_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    score = np.average(cross_val_score(model, X_train, y_train, scoring="roc_auc", cv = kf))
    return(score)

In [96]:
cv = StratifiedKFold(n_splits=5, shuffle=True)
i = 0

rf0_pred = np.empty_like(y)
rf1_pred = np.empty_like(y)
xgb0_pred = np.empty_like(y)
xgb1_pred = np.empty_like(y)
lgbm0_pred = np.empty_like(y)
lgbm1_pred = np.empty_like(y)
lgbm2_pred = np.empty_like(y)
GBoost0_pred = np.empty_like(y)

for train, val in cv.split(X, y):
    i += 1
    print('Fold %d' % i)
    
    rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                                criterion='entropy', n_jobs=-1)
    rf0.fit(X_rf[train], y[train])
    rf0_pred[val] = rf0.predict_proba(X_rf[val])[:, 1]
    
    rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
    rf1.fit(X_rf[train], y[train])
    rf1_pred[val] = rf1.predict_proba(X_rf[val])[:, 1]

    xgb0 = XGBRegressor(learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=6,
                    min_child_weight=5,
                    n_estimators=300)
    xgb0.fit(X[train], y[train])
    xgb0_pred[val] = xgb0.predict(X[val])
    
    xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
    xgb1.fit(X[train], y[train])
    xgb1_pred[val] = xgb1.predict(X[val])
    
    lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                          learning_rate=0.03, n_estimators=2000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
    lgbm0.fit(X[train], y[train])
    lgbm0_pred[val] = np.expm1(lgbm0.predict(X[val]))
    
    lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                          learning_rate=0.01, n_estimators=3000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
    lgbm1.fit(X[train], y[train])
    lgbm1_pred[val] = np.expm1(lgbm1.predict(X[val]))
    
    lgbm2 = lgb.LGBMRegressor(objective='regression', num_leaves=35,
                              learning_rate=0.005, n_estimators=4500,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.5,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
    lgbm2.fit(X[train], y[train])
    lgbm2_pred[val] = np.expm1(lgbm2.predict(X[val]))

    GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10, 
                                       loss='huber', random_state =5)
    GBoost0.fit(X[train], y[train])
    GBoost0_pred[val] = GBoost0.predict(X[val])
    

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [97]:
X_stack = np.stack((rf0_pred, rf1_pred, xgb0_pred, xgb1_pred, lgbm0_pred, 
                    lgbm1_pred, lgbm2_pred, GBoost0_pred), axis=-1)

In [98]:
lasso0 = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=1))
lasso0.fit(X_stack, y)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [99]:
print(roc_auc_score(y, lasso0.predict(X_stack)))

0.7980748381462537


In [100]:
rf0 = RandomForestClassifier(n_estimators=1500, max_depth=30, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf0.fit(X_rf, y)
rf0_final_pred = rf0.predict_proba(X_pred_rf)[:, 1]

rf1 = RandomForestClassifier(n_estimators=1200, max_depth=35, min_samples_leaf=2, 
                            criterion='entropy', n_jobs=-1)
rf1.fit(X_rf, y)
rf1_final_pred = rf1.predict_proba(X_pred_rf)[:, 1]

xgb0 = XGBRegressor(learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                objective= 'binary:logistic',
                seed=27,
                gamma=0.2,
                max_depth=6,
                min_child_weight=5,
                n_estimators=300)
xgb0.fit(X, y)
xgb0_final_pred = xgb0.predict(X_pred)

xgb1 = XGBRegressor(learning_rate=0.02,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    seed=27,
                    gamma=0.2,
                    max_depth=7,
                    min_child_weight=5,
                    n_estimators=800)
xgb1.fit(X, y)
xgb1_final_pred = xgb1.predict(X_pred)

lgbm0 = lgb.LGBMRegressor(objective='regression', num_leaves=20,
                      learning_rate=0.03, n_estimators=2000,
                      max_bin = 100, bagging_fraction = 0.8,
                      bagging_freq = 5, feature_fraction = 0.2319,
                      feature_fraction_seed=9, bagging_seed=9,
                      min_data_in_leaf = 6, min_sum_hessian_in_leaf = 11)
lgbm0.fit(X, y)
lgbm0_final_pred = np.expm1(lgbm0.predict(X_pred))

lgbm1 = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                          learning_rate=0.01, n_estimators=3000,
                          max_bin = 100, bagging_fraction = 0.8,
                          bagging_freq = 5, feature_fraction = 0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
lgbm1.fit(X, y)
lgbm1_final_pred = np.expm1(lgbm1.predict(X_pred))

lgbm2 = lgb.LGBMRegressor(objective='regression', num_leaves=35,
                              learning_rate=0.005, n_estimators=4500,
                              max_bin = 100, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.5,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf = 3, min_sum_hessian_in_leaf = 11)
lgbm2.fit(X, y)
lgbm2_final_pred = np.expm1(lgbm1.predict(X_pred))

GBoost0 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost0.fit(X, y)
GBoost0_final_pred = GBoost0.predict(X_pred)


In [101]:
X_pred_stack = np.stack((rf0_final_pred, rf1_final_pred, xgb0_final_pred,
                         xgb1_final_pred, lgbm0_final_pred, lgbm1_final_pred,
                         lgbm2_final_pred, GBoost0_final_pred), axis=-1)

In [102]:
ridge1 = RidgeCV(alphas=[1e-4, 1e-3, 1e-2, 1e-1, 5e-1, 1, 1.5, 2, 3, 5, 10, 50, 100], cv=5, scoring='roc_auc')
ridge1.fit(X_stack, y)
print(roc_auc_score(y, ridge1.predict(X_stack)), ridge1.alpha_)

0.7981263892516769 10


In [103]:
y_pred = ridge1.predict(X_pred_stack)

print(y_pred[:20])

y_pred = (y_pred - min(y_pred))/(max(y_pred) - min(y_pred))
print(y_pred[:20])

[0.04502395 0.15171326 0.16486158 0.19580485 0.17682159 0.5891934
 0.56545021 0.69771015 0.06698721 0.29701103 0.20514805 0.17485968
 0.32389987 0.10602648 0.56161191 0.25607545 0.19698244 0.10385295
 0.0727334  0.00952961]
[0.05031955 0.14026639 0.15135138 0.1774388  0.16143454 0.50909393
 0.48907669 0.60058142 0.06883618 0.26276296 0.1853158  0.1597805
 0.28543221 0.10174912 0.48584073 0.2282513  0.1784316  0.09991667
 0.07368063 0.02039525]


In [104]:
ridge1.coef_

array([ 0.08904679,  0.29989335,  0.14070377,  0.30691918,  0.10213684,
        0.23775983, -0.08542721, -0.14392006])

In [105]:
# Save File
np.savetxt("result_2012_2.csv", np.dstack((np.arange(y_pred.size), y_pred))[0],"%d,%f",
           delimiter=' ', header="id,target", comments='')