In [1]:
from __future__ import print_function
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

from keras.models import Sequential
from keras.layers import Input
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop, Adam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.datasets import mnist
from keras.utils import np_utils

import pandas as pd
import numpy as np

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

In [3]:
def gini(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    return normalized_gini

In [4]:
from sklearn.metrics import f1_score, roc_auc_score
import keras.callbacks as kc

class Metrics(kc.Callback):
    def on_epoch_end(self, batch, logs={}):
        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]
        self.ginis=(2*roc_auc_score(targ, predict))-1
        return
metrics = Metrics()

In [39]:
def data():
    non_imp = ['ps_ind_12_bin','ps_ind_13_bin','ps_ind_18_bin','ps_car_10_cat','ps_ind_11_bin','ps_ind_10_bin','ps_ind_14']
    
    train = (pd.read_csv('../data/train.csv', na_values=999)
              .fillna(value=999))
    unwanted = list((set(train.columns[train.columns.str.startswith('ps_calc_')])|set(non_imp)))
    train.drop(unwanted, axis=1, inplace=True)
            
    test  = (pd.read_csv('../data/test.csv', na_values=999)
              .fillna(value=999)
              .drop(unwanted, axis=1)) 
            
    X = train.drop(['id', 'target'], axis=1).values
    y = train.target.values
    test_id = test.id.values
    test = test.drop('id', axis=1)

    
    f_dicts ={}

    features_dict_data_type = {}
    for k in ['cat', 'bin', 'con']:
        features_dict_data_type[k]=[]
        for i in train.columns.tolist()[2:]:
            j = i.split('_')
            if len(j) == 3:
                j.append('con')
            if j[3]==k:
                features_dict_data_type[k].append(i)
    f_dicts['type'] = features_dict_data_type

    train_cat = np.array(train[f_dicts['type']['cat']])
    train_cat[:,:] = np.add(train_cat[:,:],np.ones((train_cat.shape[0],train_cat.shape[1])))
    test_cat  = np.array(test[f_dicts['type']['cat']])
    test_cat[:,:] =  np.add(test_cat[:,:],np.ones((test_cat.shape[0],train_cat.shape[1])))

    OH = OneHotEncoder()
    OH.fit(np.array(list(train_cat) + list(test_cat)))
    train_cat = OH.transform(train_cat).toarray()
    test_cat = OH.transform(test_cat).toarray()

    train_con = np.array(train[f_dicts['type']['con']])
    test_con  = np.array(test[f_dicts['type']['con']])
    RS = StandardScaler()
    RS.fit(list(train_con) + list(test_con))
    train_con = RS.transform(train_con)
    train_con = RS.transform(train_con)

    train_bin = np.array(train[f_dicts['type']['bin']])
    test_bin = np.array(test[f_dicts['type']['bin']])


    X = np.hstack((train_cat,train_con,train_bin))

    X_test = np.hstack((test_cat,test_con,test_bin))
    
    #smote = SMOTE(random_state=0)
    #X_resampled, y_resampled = smote.fit_sample(X, y)
    X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.25, shuffle=True,  random_state=np.random.randint(1000), stratify=y)

    return X_train, X_vali, y_train, y_vali, X_test, test_id

In [13]:
class Metrics(kc.Callback):
        def on_epoch_end(self, batch, logs={}):
            predict = np.asarray(self.model.predict(self.validation_data[0]))
            targ = self.validation_data[1]
            self.ginis=(2*roc_auc_score(targ, predict))-1
            return

In [7]:
def model():
    class Metrics(kc.Callback):
        def on_epoch_end(self, batch, logs={}):
            predict = np.asarray(self.model.predict(self.validation_data[0]))
            targ = self.validation_data[1]
            self.ginis=(2*roc_auc_score(targ, predict))-1
            return
    metrics = Metrics()
    model = Sequential()
    model.add(Dense({{choice([128, 256, 512])}}, input_dim=198, kernel_initializer='glorot_uniform',bias_initializer='zeros', activation={{choice(['relu', 'tanh', 'sigmoid'])}}))#{{choice(['relu', 'tanh', 'linear'])}}
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([128, 256, 512])}}, kernel_initializer='glorot_uniform',bias_initializer='zeros', activation={{choice(['relu', 'tanh', 'sigmoid'])}}))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([128, 256, 512, 1024])}}, kernel_initializer='glorot_uniform',bias_initializer='zeros', activation={{choice(['relu', 'tanh', 'sigmoid'])}}))
    model.add(Dense(1,activation={{choice(['tanh', 'sigmoid'])}}, kernel_initializer='glorot_uniform'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['acc'])
    
    ES = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1, mode='auto')
    model.fit(X_train, y_train, batch_size={{choice([64, 128])}}, epochs= 20,validation_data=(X_vali,y_vali), callbacks=[ES,metrics])#, class_weight = class_weight
    score, acc = model.evaluate(X_vali, y_vali, verbose=0)
    print('Test accuracy: ', metrics.ginis)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [47]:
X_train, X_vali, y_train, y_vali, X_test, test_id = data()

In [9]:
best_run, best_model = optim.minimize(model=model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=50,
                                      trials=Trials(),
                                      notebook_name='dlsearch')

>>> Imports:
#coding=utf-8

from __future__ import print_function

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import Input
except:
    pass

try:
    from keras.layers.core import Dense, Dropout, Activation
except:
    pass

try:
    from keras.optimizers import RMSprop, Adam, SGD
except:
    pass

try:
    from keras.callbacks import EarlyStopping, ModelCheckpoint
except:
    pass

try:
    from keras.datasets import mnist
except:
    pass

try:
    from keras.utils import np_utils
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from keras.wrappers.scikit_learn import KerasRegressor
except:
    pass

try:
    from sklearn.model_selection import cross_val_score, train_tes

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00009: early stopping
Test accuracy:  0.943658360694
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy:  0.904296219351
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 00015: early stopping
Test accuracy:  0.926902090049
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 00008: early stopping
Test accuracy:  0.915416939279
Train on 860277 samples, validate 

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00011: early stopping
Test accuracy:  0.917552754742
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 00010: early stopping
Test accuracy:  0.912656641464
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 00017: early stopping
Test accuracy:  0.631170792094
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00012: early stopping
Test accuracy:  0.957619678642
Train on 860277 sa

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 00016: early stopping
Test accuracy:  0.953302401761
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00011: early stopping
Test accuracy:  0.958324591905
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 00017: early stopping
Test accuracy:  0.955822154497
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00012: early stopping
Test accuracy:  0.959281751225
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/

Epoch 19/20
Epoch 20/20
Test accuracy:  0.945912075544
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 00008: early stopping
Test accuracy:  0.945780484875
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00009: early stopping
Test accuracy:  0.930108234844
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00011: early stopping
Test accuracy:  0.942527148279
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 00010: early stopping
Test accuracy:  0.932331891197
Train on 860277 samples, validate

Epoch 00010: early stopping
Test accuracy:  0.951870175287
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy:  0.723423101834
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00007: early stopping
Test accuracy:  0.92595193648
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy:  0.952028079041
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/

Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy:  0.922667031828
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy:  0.961829972834
Train on 860277 samples, validate on 286759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00007: early stopping
Test accuracy:  0.953034973057


[{'class_name': 'Dense',
  'config': {'activation': 'sigmoid',
   'activity_regularizer': None,
   'batch_input_shape': (None, 198),
   'bias_constraint': None,
   'bias_initializer': {'class_name': 'Zeros', 'config': {}},
   'bias_regularizer': None,
   'dtype': 'float32',
   'kernel_constraint': None,
   'kernel_initializer': {'class_name': 'VarianceScaling',
    'config': {'distribution': 'uniform',
     'mode': 'fan_avg',
     'scale': 1.0,
     'seed': None}},
   'kernel_regularizer': None,
   'name': 'dense_53',
   'trainable': True,
   'units': 512,
   'use_bias': True}},
 {'class_name': 'Dropout',
  'config': {'name': 'dropout_27',
   'rate': 0.10466253116015645,
   'trainable': True}},
 {'class_name': 'Dense',
  'config': {'activation': 'tanh',
   'activity_regularizer': None,
   'bias_constraint': None,
   'bias_initializer': {'class_name': 'Zeros', 'config': {}},
   'bias_regularizer': None,
   'kernel_constraint': None,
   'kernel_initializer': {'class_name': 'VarianceScali

In [46]:
kfold = 1
skf = StratifiedKFold(n_splits=kfold, random_state=42)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.

In [42]:
X = np.vstack((X_train,X_vali))

In [43]:
y = np.hstack((y_train,y_vali))

In [44]:
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

In [30]:
NUM_EPOCHS = 20
BATCH_SIZE = 100

sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    #class_weight = {1 : (len(y_train) - sum(y_train))/y_train.shape[0], 0: sum(y_train)/y_train.shape[0]}
    ES = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1, mode='auto')
    history = best_model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=(X_valid,y_valid), callbacks=[ES,metrics])#, class_weight = class_weight
    print('gini metric:', metrics.ginis)
    print()
    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test =best_model.predict_proba(X_test)[:,0]
    sub['target'] += p_test/kfold

[Fold 1/6]
Train on 955862 samples, validate on 191174 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00011: early stopping
gini metric: 0.983480862536

[Fold 1/6 Prediciton:]
Train on 955862 samples, validate on 191174 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 00002: early stopping
gini metric: 0.994438236991

[Fold 2/6 Prediciton:]
Train on 955864 samples, validate on 191172 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 00002: early stopping
gini metric: 0.995542596779

[Fold 3/6 Prediciton:]
Train on 955864 samples, validate on 191172 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00003: early stopping
gini metric: 0.995436624736

[Fold 4/6 Prediciton:]
Train on 955864 samples, validate on 191172 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 00002: early stopping
gini metric: 0.996399488461

[Fold 5/6 Prediciton:]
Train on 955864 samples, validate on 191172 samples

In [31]:
import datetime as dt

In [49]:
ES = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1, mode='auto')
class_weight = {1 : (len(y_train) - sum(y_train))/y_train.shape[0], 0: sum(y_train)/y_train.shape[0]}
history = best_model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=(X_valid,y_valid), callbacks=[ES,metrics], class_weight = class_weight)#, class_weight = class_weight
print('gini metric:', metrics.ginis)
print()
# Predict on our test data
sub['target'] = best_model.predict_proba(X_test)[:,0]

Train on 446409 samples, validate on 191172 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00004: early stopping
gini metric: 0.994863668361


In [50]:
filename = 'DL_model_'+str(dt.datetime.now()).replace(' ','_').replace(':','').replace('.','')
sub.id = sub.id.astype('Int32')
sub.to_csv('../output/'+filename+'.csv', index=False)
#test = np.load('../output/params.npy').item()