In [None]:
import warnings, gc
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding, MaxPooling1D, Concatenate
from keras.layers import SpatialDropout1D, Reshape, BatchNormalization
from keras.layers import Input
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.metrics import AUC
from keras import backend as K

In [None]:
def get_emb_size(nunique):
    """Return the embedding width for a feature with ``nunique`` distinct codes.

    The width is half of ``nunique + 1`` rounded up, capped at 50, and is
    always returned as a plain Python ``int``.
    """
    half_cardinality = np.ceil((nunique + 1) / 2)
    if half_cardinality < 50:
        return int(half_cardinality)
    # Anything that is not strictly below the cap collapses to the cap itself.
    return 50

def do_le(df, cols, single_emb=True):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with integer codes.
    cols : list of str
        Names of the columns to encode.
    single_emb : bool, default True
        When true, each column's codes are shifted by the total number of
        distinct values in the columns before it, so that codes from
        different columns do not collide.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in cols:
        as_category = df[col].astype('category')
        df[col] = LabelEncoder().fit_transform(as_category)

    if not single_emb:
        return df

    cardinalities = [df[col].nunique() for col in cols]
    # Offset of column k = number of distinct codes in columns 0..k-1.
    offsets = np.cumsum([0] + cardinalities[:-1])
    for col, shift in zip(cols, offsets):
        df[col] = df[col] + shift
    return df

In [None]:
def plot_embs(emb_weights_pd):
    """Draw a labelled 3-D scatter plot of embedding weights.

    Parameters
    ----------
    emb_weights_pd : pandas.DataFrame
        Column 0 supplies the text label of each row; columns 1, 2 and 3
        supply the x, y and z coordinates. Any further columns are ignored.
    """
    # `.as_matrix()` was removed in pandas 1.0; `.values` works on every version.
    m = emb_weights_pd.iloc[:, 1:].values
    labels = emb_weights_pd.iloc[:, 0].values

    fig = plt.figure(figsize=(20,10))
    # `Axes3D(fig)` no longer attaches itself to the figure on recent
    # matplotlib, so request the 3-D projection through the figure instead.
    ax = fig.add_subplot(111, projection='3d')

    # One scatter call for all points instead of one call per row.
    ax.scatter(m[:, 0], m[:, 1], m[:, 2], color='b')
    for (x, y, z), label in zip(m[:, :3], labels):
        ax.text(x, y, z, '%s'%(str(label)), size=20, zorder=1, color='k')

    ax.set_xlabel('Embedding 1')
    ax.set_ylabel('Embedding 2')
    ax.set_zlabel('Embedding 3')
    plt.show()
    
def plot_history(histort):
    """Plot train/validation accuracy and loss curves, one figure each.

    Parameters
    ----------
    histort : object with a ``history`` dict attribute
        Typically the object returned by ``model.fit``. The dict must contain
        ``'loss'`` / ``'val_loss'`` and either ``'acc'`` / ``'val_acc'`` or
        ``'accuracy'`` / ``'val_accuracy'``. (The parameter name is kept as-is
        for backward compatibility.)
    """
    # Read from the argument; the previous body read a global named `history`.
    logs = histort.history
    # The accuracy key is 'acc' or 'accuracy' depending on the Keras version.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'

    for key, label in ((acc_key, 'accuracy'), ('loss', 'loss')):
        plt.plot(logs[key])
        plt.plot(logs['val_' + key])
        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        plt.legend(['train','val'], loc='upper left')
        plt.show()

In [None]:
# Read the train and test CSVs, stack them into one frame `d` (train rows
# first) and remember how many rows belong to train so they can be split
# apart again after encoding.
path = '../input/cat-in-the-dat/'
dtr, dts = (pd.read_csv(path + fname) for fname in ("train.csv", "test.csv"))
train_set = len(dtr)
d = pd.concat([dtr, dts], sort=False)
# The individual frames are no longer needed once they are combined.
del dtr, dts
gc.collect()

In [None]:
# Every column except the row id and the label is treated as a categorical feature.
cat_feats = [name for name in d.columns if name != 'id' and name != 'target']
# Encode each feature independently (no cross-column offsets).
d = do_le(d, cat_feats, single_emb=False)
# The first `train_set` rows came from train.csv, the remainder from test.csv.
train, test = d.iloc[:train_set], d.iloc[train_set:]

In [None]:
# Split the encoded frames into feature matrices and labels.
# `.as_matrix()` was removed in pandas 1.0 and the positional `axis` argument
# of `drop` was removed in pandas 2.0, so use `drop(columns=...)` and `.values`.
X = train.drop(columns=['id','target']).values
y = train.target
X_test = test.drop(columns=['id','target']).values
# Only the ids of the test rows are needed from here on; copy so that later
# column assignments do not write into a slice of `d`.
test = test[['id']].copy()

In [None]:
# Feature-major copies of the matrices: row k holds feature k for all samples,
# which is the layout expected when feeding one array per model input.
# The original built these by reshaping every sample row in a Python loop and
# concatenating; that is exactly a transpose, so do it directly.
X2 = np.ascontiguousarray(X.T)
X_test2 = np.ascontiguousarray(X_test.T)
X2.shape, X_test2.shape

In [None]:
def create_model_emb(df, cols):
    """Build a Sequential binary classifier with one shared embedding table.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded feature frame. The embedding vocabulary size is the sum of
        the per-column distinct counts and the input length is the number of
        columns.
    cols : list
        Not used by the function; kept for interface compatibility.

    Returns
    -------
    keras.models.Sequential
        Uncompiled model: Embedding -> Flatten -> Dropout -> two 64-unit ReLU
        layers (each followed by Dropout) -> single sigmoid unit.
    """
    # NOTE(review): a shared table presumably expects codes made disjoint
    # across columns (see do_le(single_emb=True)) -- confirm with the caller.
    vocab_size = df.nunique().sum()
    seq_len = df.shape[1]

    layers = [
        Embedding(vocab_size, get_emb_size(vocab_size),
                  input_length=seq_len, name="embedding"),
        Flatten(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

def create_model_N_emb(df):
    """Build a functional binary classifier with one embedding per feature.

    Parameters
    ----------
    df : iterable of array-like
        Iterated once; each item is taken to be the integer codes of one
        feature. The largest code of an item fixes that feature's embedding
        vocabulary (``max + 1``) and, through ``get_emb_size``, its width.

    Returns
    -------
    keras.models.Model
        Uncompiled model with one ``(1,)``-shaped input per feature and a
        single sigmoid output.
    """
    feature_inputs = []
    feature_embeddings = []
    for codes in df:
        top_code = np.max(codes)
        width = get_emb_size(top_code)
        inp = Input(shape=(1,))
        emb = Embedding(top_code + 1, width, input_length=1)(inp)
        emb = SpatialDropout1D(0.3)(emb)
        feature_inputs.append(inp)
        feature_embeddings.append(emb)

    # Merge all per-feature embeddings into one flat, normalised vector.
    x = Concatenate()(feature_embeddings)
    x = Flatten()(x)
    x = BatchNormalization()(x)

    # Two identical hidden blocks: Dense(256, relu) -> Dropout -> BatchNorm.
    for _ in range(2):
        x = Dense(256, activation="relu")(x)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)

    y = Dense(1, activation='sigmoid')(x)
    return Model(inputs=feature_inputs, outputs=y)


In [None]:
# Stratified K-fold training of the per-feature-embedding network.
# Produces `oof` (out-of-fold predictions for every training row) and
# `test_preds` (the SUM of the per-fold test predictions; divide by NSPLITS
# to obtain the average).
NSPLITS = 20
SEED = 5555
BATCH_SIZE = 1024
EPOCHS = 10

oof = np.zeros((X.shape[0]))
test_preds = np.zeros(X_test.shape[0])
skf = StratifiedKFold(n_splits=NSPLITS, random_state=SEED, shuffle=True)

# Index labels by position: `tr` / `val` are positional indices from the splitter.
y_arr = np.asarray(y)
# The model has one input per feature, so the test set must be passed as a
# list of per-feature arrays, just like the train / validation folds.
X_test_inputs = list(X_test2)
# Size every embedding from train AND test codes, so a code that only occurs
# in the test set can never index past the end of its embedding table.
all_codes = np.concatenate([X2, X_test2], axis=1)

for i, (tr, val) in enumerate(skf.split(X, y_arr)):
    X_tr = [c[tr] for c in X2]
    X_val = [c[val] for c in X2]
    y_tr, y_val = y_arr[tr], y_arr[val]

    model = create_model_N_emb(all_codes)

    # The callbacks below monitor `val_auc`, so an AUC metric with that exact
    # name has to be compiled in; with only 'accuracy' the monitored quantity
    # is missing and neither callback can act.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', AUC(name='auc')])

    es = EarlyStopping(monitor='val_auc',
                       min_delta=0.001,
                       patience=2,
                       verbose=1,
                       mode='max',
                       baseline=None,
                       restore_best_weights=True)

    # NOTE(review): patience here (3) is larger than the early-stopping
    # patience (2), so training stops before the learning rate is reduced.
    rlr = ReduceLROnPlateau(monitor='val_auc',
                            factor=0.5,
                            patience=3,
                            min_lr=1e-6,
                            mode='max',
                            verbose=1)

    model.fit(X_tr, y_tr,
              validation_data=(X_val, y_val),
              verbose=1,
              callbacks=[es, rlr],
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              shuffle=True)

    oof_pred = model.predict(X_val)
    test_pred = model.predict(X_test_inputs)

    oof[val] = oof_pred.ravel()
    test_preds += test_pred.ravel()

    print('-'*40)
    print('Fold {}:\t {:.5f}'.format(i, roc_auc_score(y_val, oof_pred)))
    print('-'*40)
    # Free the graph / layer-name state before building the next fold's model.
    K.clear_session()

In [None]:
# ROC AUC of the pooled out-of-fold predictions against the training labels.
print(f"Overall AUC={roc_auc_score(y, oof)}")

In [None]:
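# Recorded result: overall out-of-fold AUC = 0.7956392905855455 with the "many embeddings" setup (presumably create_model_N_emb, one embedding per feature).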

In [None]:
# Average the per-fold test predictions and write the submission file.
# `test_preds` holds the sum over folds; dividing into a new variable (rather
# than in place) keeps this cell safe to re-run.
test_pred_mean = test_preds / NSPLITS
# `test` keeps the ids in the same row order as the prediction array, so the
# submission can be assembled directly without relying on an externally
# defined `sub` frame.
sub = pd.DataFrame({'id': test['id'].values, 'target': test_pred_mean})
sub.to_csv('nn.csv', index=False)
sub.tail()