In [22]:
from __future__ import print_function

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import BatchNormalization 
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping
from keras.datasets import mnist
from keras.utils import np_utils

import pandas as pd
import numpy as np
import datetime as dt
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from keras.utils.np_utils import to_categorical

In [2]:
from __future__ import print_function
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

In [4]:
def gini(solution, submission):
    """Return the unnormalized Gini score of ``submission`` against ``solution``.

    Rows are ranked by predicted value, highest first; tied predictions keep
    their original order (lower index first). The score is the sum, over all
    ranks, of the gap between the cumulative share of ``solution`` captured so
    far and the share a uniform ordering would have captured.

    Args:
        solution: sequence of ground-truth values (must support ``len``).
        submission: sequence of predicted scores aligned with ``solution``.

    Returns:
        The summed difference between the cumulative-share curve and the
        diagonal ``rank / row_count``.

    Raises:
        IndexError: if there are no rows to rank.
        ZeroDivisionError: if the values of ``solution`` sum to zero.
    """
    # Sort key: prediction first, then negated index so that reverse=True
    # still places the earlier row first among ties.
    ranked = sorted(
        zip(solution, submission, range(len(solution))),
        key=lambda row: (row[1], -row[2]),
        reverse=True
    )
    row_count = len(ranked)
    total_positive = float(sum(row[0] for row in ranked))

    # Running total of ground-truth values met while walking down the ranking.
    captured = [ranked[0][0]]
    for row in ranked[1:]:
        captured.append(captured[-1] + row[0])

    return sum(
        float(found) / total_positive - float(rank) / float(row_count)
        for rank, found in enumerate(captured, 1)
    )

def normalized_gini(solution, submission):
    """Return the Gini of ``submission`` scaled by the Gini of a perfect ranking.

    The perfect ranking is obtained by scoring ``solution`` against itself.

    Args:
        solution: sequence of ground-truth values.
        submission: sequence of predicted scores aligned with ``solution``.

    Returns:
        ``gini(solution, submission) / gini(solution, solution)``.
    """
    achieved = gini(solution, submission)
    best_possible = gini(solution, solution)
    return achieved / best_possible

In [5]:
from sklearn.metrics import f1_score, roc_auc_score
import keras.callbacks as kc

class Metrics(kc.Callback):
    """Keras callback that records the validation Gini after each epoch.

    At the end of every epoch the model scores ``self.validation_data[0]``,
    the scores are compared with ``self.validation_data[1]`` through
    ``roc_auc_score``, and ``2 * AUC - 1`` is stored in ``self.ginis``.
    Only the value for the most recent epoch is kept.
    """

    def on_epoch_end(self, batch, logs=None):
        """Compute the validation Gini for the epoch that just finished.

        Args:
            batch: index supplied by Keras; unused. The parameter keeps its
                original name, although for this hook it is presumably the
                epoch index.
            logs: metrics dict supplied by Keras; unused. Defaults to ``None``
                instead of a shared mutable ``{}``.
        """
        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]
        # Gini = 2 * AUC - 1; overwritten on every epoch.
        self.ginis = (2 * roc_auc_score(targ, predict)) - 1


# Module-level instance intended to be passed in a `callbacks=[...]` list.
metrics = Metrics()

In [6]:
def prepare_data():
    """Load the train/test CSVs and return oversampled training data plus the encoded test set.

    Steps:
      1. Read ``../data/train.csv`` and ``../data/test.csv`` and drop every
         ``ps_calc_*`` column together with the columns listed in ``non_imp``.
      2. Group the remaining feature names by type with ``make_dicts``.
      3. One-hot encode the categorical columns (encoder fitted on train + test).
      4. Standardize the continuous columns (scaler fitted on train + test) and
         apply the scaler once to train and once to test.
      5. Pass the binary columns through unchanged.
      6. Oversample the training rows with SMOTE.

    Returns:
        tuple: ``(X_resampled, y_resampled, X_test, test_id)`` where the first
        two are the SMOTE output for the training data, ``X_test`` is the
        encoded test matrix and ``test_id`` holds the test ``id`` column values.
    """
    non_imp = ['ps_ind_12_bin','ps_ind_13_bin','ps_ind_18_bin','ps_car_10_cat','ps_ind_11_bin','ps_ind_10_bin','ps_ind_14']

    train = (pd.read_csv('../data/train.csv', na_values=999)
               .fillna(value=999))
    unwanted = list(set(train.columns[train.columns.str.startswith('ps_calc_')]) | set(non_imp))
    train.drop(unwanted, axis=1, inplace=True)

    test = (pd.read_csv('../data/test.csv', na_values=999)
              .fillna(value=999)
              .drop(unwanted, axis=1))

    y = train.target.values
    test_id = test.id.values
    test = test.drop('id', axis=1)

    # make_dicts ignores the first two columns of the frame it receives, so it
    # is given the training frame that still starts with id/target.
    f_dicts = make_dicts(train)
    cat_cols = f_dicts['type']['cat']
    con_cols = f_dicts['type']['con']
    bin_cols = f_dicts['type']['bin']

    # Categorical features: shift every code up by one, then one-hot encode.
    # NOTE(review): the shift presumably turns a negative "missing" code into a
    # non-negative category for the encoder -- confirm against the data.
    train_cat = np.array(train[cat_cols]) + 1
    test_cat = np.array(test[cat_cols]) + 1

    encoder = OneHotEncoder()
    encoder.fit(np.vstack((train_cat, test_cat)))
    train_cat = encoder.transform(train_cat).toarray()
    test_cat = encoder.transform(test_cat).toarray()

    # Continuous features: fit one scaler on train + test, then transform each
    # set exactly once (previously train was transformed twice and test never).
    train_con = np.array(train[con_cols])
    test_con = np.array(test[con_cols])
    scaler = StandardScaler()
    scaler.fit(np.vstack((train_con, test_con)))
    train_con = scaler.transform(train_con)
    test_con = scaler.transform(test_con)

    # Binary features are used as-is.
    train_bin = np.array(train[bin_cols])
    test_bin = np.array(test[bin_cols])

    X = np.hstack((train_cat, train_con, train_bin))
    X_test = np.hstack((test_cat, test_con, test_bin))

    # NOTE(review): oversampling happens before any cross-validation split, so
    # synthetic rows can end up in a different fold than the rows they were
    # generated from -- consider resampling inside each training fold instead.
    smote = SMOTE(random_state=0)
    X_resampled, y_resampled = smote.fit_sample(X, y)

    return X_resampled, y_resampled, X_test, test_id

In [7]:
def make_dicts(df):
    """Group feature column names by label, by data type, and by both.

    The first two columns of ``df`` are skipped. Every remaining name is split
    on ``'_'``: the second piece is the label (``ind``/``reg``/``car``/``calc``)
    and the fourth piece is the data type (``cat``/``bin``); a name with exactly
    three pieces is treated as type ``con``.

    Args:
        df: object exposing ``columns.tolist()`` that yields column-name strings.

    Returns:
        dict with three entries:
          * ``'combo'``: ``'<label>_<type>'`` -> list of column names,
          * ``'type'``: ``'<type>'`` -> list of column names,
          * ``'label'``: ``'<label>'`` -> list of column names.

    Raises:
        IndexError: if a considered column name has fewer than three pieces.
    """
    labels = ['ind', 'reg', 'car', 'calc']
    kinds = ['cat', 'bin', 'con']

    # Parse each feature name once into (name, label, kind).
    parsed = []
    for name in df.columns.tolist()[2:]:
        pieces = name.split('_')
        if len(pieces) == 3:
            pieces.append('con')
        parsed.append((name, pieces[1], pieces[3]))

    by_combo = {
        label + '_' + kind: [name for name, lab, knd in parsed
                             if lab == label and knd == kind]
        for label in labels
        for kind in kinds
    }
    by_type = {
        kind: [name for name, _, knd in parsed if knd == kind]
        for kind in kinds
    }
    by_label = {
        label: [name for name, lab, _ in parsed if lab == label]
        for label in labels
    }

    return {'combo': by_combo, 'type': by_type, 'label': by_label}

In [8]:
def data():
    """Load the raw train/test CSVs with low-value columns removed.

    Reads ``../data/train.csv`` and ``../data/test.csv``, drops every
    ``ps_calc_*`` column plus the columns listed in ``non_imp``, and separates
    the target and id columns from the feature values. No encoding, scaling or
    resampling is applied.

    Returns:
        tuple: ``(train_values, y, test_values, test_id)`` -- the training
        feature array, the training ``target`` values, the test feature array
        and the test ``id`` values.
    """
    non_imp = ['ps_ind_12_bin','ps_ind_13_bin','ps_ind_18_bin','ps_car_10_cat','ps_ind_11_bin','ps_ind_10_bin','ps_ind_14']

    raw_train = pd.read_csv('../data/train.csv', na_values=999).fillna(value=999)

    # Columns to discard: every ps_calc_* feature plus the hand-picked list.
    calc_mask = raw_train.columns.str.startswith('ps_calc_')
    unwanted = list(set(raw_train.columns[calc_mask]) | set(non_imp))

    train_kept = raw_train.drop(unwanted, axis=1)
    test_kept = (pd.read_csv('../data/test.csv', na_values=999)
                   .fillna(value=999)
                   .drop(unwanted, axis=1))

    y = train_kept.target.values
    train_features = train_kept.drop(['id', 'target'], axis=1)

    test_id = test_kept.id.values
    test_features = test_kept.drop('id', axis=1)

    return train_features.values, y, test_features.values, test_id

In [9]:
def baseline_model(input_dim=198):
    """Build and compile the baseline fully-connected binary classifier.

    Architecture: Dense(256, sigmoid) -> Dense(256, sigmoid) -> Dropout(0.5)
    -> Dense(512, sigmoid) -> Dense(1) -> sigmoid activation.

    Args:
        input_dim: number of input features fed to the first layer. Defaults
            to 198, the value that used to be hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss,
        ``Adam(lr=0.02)`` and the ``acc`` metric.
    """
    model = Sequential([
        Dense(256, input_dim=input_dim, kernel_initializer='glorot_uniform',
              bias_initializer='zeros', activation='sigmoid'),
        Dense(256, kernel_initializer='glorot_uniform', activation='sigmoid'),
        Dropout(0.5),
        Dense(512, kernel_initializer='glorot_uniform', activation='sigmoid'),
        Dense(1),
        Activation('sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.02), metrics=['acc'])
    return model

In [27]:
def optimized_model(input_dim=198):
    """Build and compile the deeper, batch-normalized binary classifier.

    Architecture: four hidden stages, each Dense -> BatchNormalization ->
    Dropout, with (units, activation, dropout rate) of (512, sigmoid, 0.1),
    (256, tanh, 0.01), (128, relu, 0.1) and (256, tanh, 0.5); followed by
    Dense(128, relu) -> Dense(1) -> sigmoid activation.

    Args:
        input_dim: number of input features fed to the first layer. Defaults
            to 198, the value that used to be hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss,
        ``Adam(lr=0.02)`` and the ``acc`` metric.
    """
    model = Sequential([
        Dense(512, input_dim=input_dim, kernel_initializer='glorot_uniform',
              bias_initializer='zeros', activation='sigmoid'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(256, kernel_initializer='glorot_uniform', activation='tanh'),
        BatchNormalization(),
        Dropout(0.01),
        Dense(128, kernel_initializer='glorot_uniform', activation='relu'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(256, kernel_initializer='glorot_uniform', activation='tanh'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, kernel_initializer='glorot_uniform', activation='relu'),
        Dense(1),
        Activation('sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.02), metrics=['acc'])
    return model

In [28]:
# Instantiate and compile the network defined by optimized_model().
model = optimized_model()

In [29]:
# Build the feature matrices: X / y are the resampled training data returned by
# prepare_data(); X_test / test_id are the encoded test rows and their ids.
X, y, X_test, test_id = prepare_data()

In [None]:
# Stratified K-fold training: one network is fitted per fold and the test-set
# predictions of all folds are averaged into `sub['target']`.
kfold = 6
# shuffle=True is needed for random_state to have any effect on the splits.
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

NUM_EPOCHS = 20
BATCH_SIZE = 64

sub = pd.DataFrame()
sub['id'] = test_id
# Float accumulator: the per-fold contributions are probabilities, so the
# column must not inherit the integer dtype of the ids.
sub['target'] = np.zeros(len(test_id), dtype=float)

# `test_index` holds the rows of the current validation fold.
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]

    # Start every fold from a freshly initialised network; reusing one model
    # would validate on rows its weights were already trained on.
    model = optimized_model()
    ES = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1, mode='auto')
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                        validation_data=(X_valid, y_valid), callbacks=[ES, metrics])
    # Validation Gini recorded by the Metrics callback for the last epoch run.
    print(metrics.ginis)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on the test data and add this fold's share of the average.
    p_test = model.predict_proba(X_test)[:, 0]
    sub['target'] += p_test / kfold

[Fold 1/6]
Train on 955862 samples, validate on 191174 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00003: early stopping
-0.0004184424303
[Fold 1/6 Prediciton:]

In [None]:
# Write the averaged predictions to a timestamped CSV under ../output/.
# strftime always emits the six microsecond digits, so the file name keeps a
# fixed YYYY-MM-DD_HHMMSSffffff layout even when the microsecond field is 0.
filename = 'DL_model_' + dt.datetime.now().strftime('%Y-%m-%d_%H%M%S%f')
# Plain NumPy int32 for the id column, independent of how a given pandas/numpy
# version interprets the capitalised 'Int32' alias.
sub['id'] = sub['id'].astype('int32')
sub.to_csv('../output/' + filename + '.csv', index=False)