In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import category_encoders
from sklearn.preprocessing import LabelEncoder

In [None]:
class DoubleValidationEncoderNumerical:
    """Out-of-fold wrapper around a supervised encoder.

    For every split produced by ``folds`` the wrapped ``encoder`` is fitted on
    the training part and used to encode the held-out part, so each training
    row is encoded by an encoder that never saw its target. One fitted encoder
    per fold is kept and their outputs are averaged in :meth:`transform`.

    Parameters
    ----------
    cols : list
        Columns of ``X`` that are passed to the encoder.
    encoder : object
        Object exposing ``fit_transform(X, y)`` and ``transform(X)`` that
        returns a ``pd.DataFrame``.
    folds : object
        Splitter exposing ``split(X, y)`` yielding ``(train_idx, val_idx)``.
    """

    def __init__(self, cols, encoder, folds):
        self.cols = cols
        self.encoder = encoder
        self.encoders_dict = {}  # fold number -> fitted encoder snapshot
        self.folds = folds

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """Fit one encoder per fold and return the out-of-fold encoding.

        Returns a frame with one row per row of ``X`` (default integer index)
        and the columns produced by the encoder.

        Raises
        ------
        ValueError
            If ``folds.split`` yields no splits.
        """
        from copy import deepcopy

        X = X.reset_index(drop=True)
        # Accept both arrays and Series; positions must line up with X.
        y = pd.Series(y).reset_index(drop=True)

        self.encoders_dict = {}
        cols_representation = None
        out_columns = None
        for n_fold, (train_idx, val_idx) in enumerate(self.folds.split(X, y)):
            X_train = X.iloc[train_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx].reset_index(drop=True)
            # Reset the target index too so it matches X_train's index.
            y_train = y.iloc[train_idx].reset_index(drop=True)

            self.encoder.fit_transform(X_train[self.cols], y_train)

            # Encode only the columns the encoder was fitted on.
            val_t = self.encoder.transform(X_val[self.cols])
            val_t = val_t.fillna(np.mean(y_train))
            if cols_representation is None:
                cols_representation = np.zeros((X.shape[0], val_t.shape[1]))
                out_columns = val_t.columns

            # Snapshot the fitted state; storing self.encoder itself would make
            # every fold entry point at the encoder fitted on the last fold.
            self.encoders_dict[n_fold] = deepcopy(self.encoder)
            cols_representation[val_idx, :] += val_t.values

        if cols_representation is None:
            raise ValueError('folds.split(X, y) produced no splits.')
        return pd.DataFrame(cols_representation, columns=out_columns)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Encode ``X`` with every fold encoder and average the results.

        Raises
        ------
        ValueError
            If called before :meth:`fit_transform`.
        """
        if not self.encoders_dict:
            raise ValueError('fit_transform must be called before transform.')

        X = X.reset_index(drop=True)
        n_encoders = len(self.encoders_dict)
        cols_representation = None
        out_columns = None
        for encoder in self.encoders_dict.values():
            test_tr = encoder.transform(X[self.cols])

            if cols_representation is None:
                cols_representation = np.zeros(test_tr.shape)
                out_columns = test_tr.columns
            cols_representation = cols_representation + test_tr.values / n_encoders
        return pd.DataFrame(cols_representation, columns=out_columns)
    
def process_data(data: pd.DataFrame):
    """Encode the ordinal, binary and cyclical columns of ``data``.

    The frame is modified in place and also returned. Steps:

    * ``ord_5`` is split into its first and second character
      (``ord_5_1`` / ``ord_5_2``).
    * ``ord_1`` .. ``ord_4`` are replaced by integer ranks via fixed mappings.
    * ``ord_5`` is replaced by the rank of its value among the sorted distinct
      values found in *this* frame, stored as float.
    * ``bin_3`` / ``bin_4`` become 1 for ``'T'`` / ``'Y'`` and 0 otherwise.
    * ``day`` (period 7) and ``month`` (period 12) get ``_sin`` / ``_cos``
      columns.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``ord_1`` .. ``ord_5``, ``bin_3``, ``bin_4``, ``day`` and
        ``month``.

    Returns
    -------
    pd.DataFrame
        The same frame with the columns above added / overwritten.
    """
    # The .str accessor propagates missing values instead of raising on them.
    data['ord_5_1'] = data['ord_5'].str[0]
    data['ord_5_2'] = data['ord_5'].str[1]

    mapper_ord_1 = {'Novice': 1,
                    'Contributor': 2,
                    'Expert': 3,
                    'Master': 4,
                    'Grandmaster': 5}

    mapper_ord_2 = {'Freezing': 1,
                    'Cold': 2,
                    'Warm': 3,
                    'Hot': 4,
                    'Boiling Hot': 5,
                    'Lava Hot': 6}

    mapper_ord_3 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8,
                    'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15}

    mapper_ord_4 = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8,
                    'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15,
                    'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22,
                    'W': 23, 'X': 24, 'Y': 25, 'Z': 26}

    ordinal_mappers = {'ord_1': mapper_ord_1, 'ord_2': mapper_ord_2,
                       'ord_3': mapper_ord_3, 'ord_4': mapper_ord_4}
    for col, mapper in ordinal_mappers.items():
        data[col] = data[col].replace(mapper)

    # Rank of each ord_5 value among the sorted distinct non-missing values.
    # NOTE(review): the ranks are derived from this frame only, so two frames
    # with different value sets get different codes — confirm callers expect that.
    ord_5_levels = sorted(data['ord_5'].dropna().unique())
    ord_5_codes = {level: code for code, level in enumerate(ord_5_levels)}
    # Plain column assignment replaces the column, so the float dtype is kept;
    # writing through `.loc[:, col]` may keep the existing object dtype.
    data['ord_5'] = data['ord_5'].map(ord_5_codes).astype(float)

    data['bin_3'] = (data['bin_3'] == 'T').astype(int)
    data['bin_4'] = (data['bin_4'] == 'Y').astype(int)

    def date_cyc_enc(df, col, max_vals):
        """Add sine/cosine projections of ``col`` on a circle of period ``max_vals``."""
        angle = 2 * np.pi * df[col] / max_vals
        df[col + '_sin'] = np.sin(angle)
        df[col + '_cos'] = np.cos(angle)
        return df

    data = date_cyc_enc(data, 'day', 7)
    data = date_cyc_enc(data, 'month', 12)

    return data

In [None]:
def do_le(X, X_test, cols):
    """Label-encode ``cols`` with one encoder per column fitted on train + test.

    Values are cast to ``str`` before encoding. Returns two new frames (train,
    test) whose columns are named ``<col>_le``, plus the list of those names.
    """
    encoded_train, encoded_test = pd.DataFrame(), pd.DataFrame()
    new_cols = []
    for column in cols:
        encoded_name = column + '_le'
        new_cols.append(encoded_name)

        train_values = list(X[column].astype(str).values)
        test_values = list(X_test[column].astype(str).values)

        # Fit on the union so every value seen in either frame has a code.
        encoder = LabelEncoder().fit(train_values + test_values)
        encoded_train[encoded_name] = encoder.transform(train_values)
        encoded_test[encoded_name] = encoder.transform(test_values)
    return encoded_train, encoded_test, new_cols

def do_ohe(X, X_test, cols):
    """One-hot encode ``cols`` and return ``(train_dummies, test_dummies)``.

    The test frame is aligned to the training frame's dummy columns:
    categories that appear only in the test data are dropped and categories
    missing from the test data are added as all-zero columns, so both frames
    always share the same columns in the same order.
    """
    X_tr = pd.get_dummies(X[cols])
    X_te = pd.get_dummies(X_test[cols]).reindex(columns=X_tr.columns, fill_value=0)
    return X_tr, X_te

def do_hash(X, X_test, cols, preffix=None, n_feats=None):
    """Hash-encode ``cols`` into integer buckets.

    Each value is converted to ``str``, hashed with CRC32 and reduced modulo
    the number of distinct training values of that column. CRC32 is used
    instead of the builtin ``hash`` because string hashes from the builtin are
    salted per interpreter process, which makes the features change between
    runs.

    Parameters
    ----------
    X, X_test : pd.DataFrame
        Train and test frames containing ``cols``.
    cols : iterable of str
        Columns to encode.
    preffix, n_feats :
        Must be provided (``ValueError`` otherwise); they are validated but
        not otherwise used by this function.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame, list)
        Encoded train frame, encoded test frame and the new column names
        (``<col>_hash``).
    """
    import zlib

    if preffix is None:
        raise ValueError('preffix should be set.')
    if n_feats is None:
        raise ValueError('n_feats should be set.')
    X_tr = pd.DataFrame()
    X_te = pd.DataFrame()
    new_cols = []

    def _bucket(value, n_buckets):
        # Deterministic across processes, unlike hash(str).
        return zlib.crc32(str(value).encode('utf-8')) % n_buckets

    for c in cols:
        c_name = c + '_hash'
        new_cols.append(c_name)
        # Guard against a column without any non-missing value (modulo by zero).
        size = max(X[c].nunique(), 1)
        X_tr[c_name] = X[c].apply(_bucket, args=(size,))
        X_te[c_name] = X_test[c].apply(_bucket, args=(size,))

    return X_tr, X_te, new_cols

def do_bin(X, X_test, cols):
    """Binary-encode ``cols`` with an encoder fitted on the training frame.

    Returns the encoded train frame, the encoded test frame and the list of
    columns produced for the train frame.
    """
    encoder = category_encoders.BinaryEncoder(cols=cols)
    encoder.fit(X[cols])
    encoded_train, encoded_test = (
        encoder.transform(frame[cols]) for frame in (X, X_test)
    )
    return encoded_train, encoded_test, list(encoded_train.columns)

def do_freq(X, X_test, cols):
    """Frequency-encode ``cols`` using value counts over train + test.

    For each column, every value is replaced by the number of times it occurs
    in the concatenation of ``X`` and ``X_test``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame, list)
        Encoded train frame, encoded test frame and the new column names
        (``<col>_freq``).
    """
    X_tr = pd.DataFrame()
    X_te = pd.DataFrame()
    new_cols = []
    for c in cols:
        c_name = c + '_freq'
        new_cols.append(c_name)
        counts = pd.concat([X[c], X_test[c]]).value_counts().to_dict()
        # Map the *input* columns; the output frames start out empty.
        X_tr[c_name] = X[c].map(counts)
        X_te[c_name] = X_test[c].map(counts)
    return X_tr, X_te, new_cols

In [None]:
# Read the train set, test set and sample submission from the input directory.
path = '/kaggle/input/cat-in-the-dat/'
train, test, sub = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# cat_feats = [c for c in train.columns if c not in ['id','target']]
# train = process_data(train)
# test = process_data(test)

In [None]:
# Separate features from identifiers/target. `columns=` is used because newer
# pandas releases no longer accept the axis as a positional argument to `drop`.
X = train.drop(columns=['id', 'target'])
y = train.target
X_test = test.drop(columns=['id'])
# Keep only the ids; copy so that a prediction column can be added later
# without writing into a slice of the original frame.
test = test[['id']].copy()

In [None]:
# lc_nom = [f'nom_{i}'for i in range(0,5)]
# hc_nom = [f'nom_{i}'for i in range(5,10)]
# lc_ord = [f'ord_{i}'for i in range(0,3)]
# hc_ord = [f'ord_{i}'for i in range(3,6)]

# # BINARY ENCODER for HIGH cardinality ORDINAL columns
# for_bin = hc_ord
# X_bin, X_test_bin, bin_cols = do_bin(X, X_test, for_bin)

# # OHE for LOW cardinality NOMINAL and ORDINAL columns
# for_ohe = lc_nom
# X_ohe, X_test_ohe = do_ohe(X, X_test, for_ohe)

# # HASHING for HIGH cardinality NOMINAL and ORDINAL columns
# for_hash = hc_nom
# n_feats = X[for_hash].nunique().sum()
# X_hash, X_test_hash, hash_cols = do_hash(X,X_test,for_hash,'_nom', n_feats=n_feats)

# # LABEL ENCODER for LOW cardinality ORDINAL columns
# for_le = lc_ord + ['ord_5_1', 'ord_5_2']
# X_le, X_test_le, le_cols = do_le(X, X_test, for_le)

# # # FREQUENCY ENCODER
# # for_freq = []
# # X_freq, X_test_freq, freq_cols = do_freq(X, X_test, for_freq)

# # del X_freq, X_test_freq
# # gc.collect()

# X.drop(
#     list(
#         for_bin+
#         for_le+
#         for_ohe+
#         for_hash
#         )
#     ,1, inplace=True)
# X_test.drop(
#    list(
#        for_bin+
#         for_le+
#         for_ohe+
#         for_hash)
#     ,1, inplace=True)

In [None]:
# X = pd.concat([X, 
#                X_bin,
#                X_le,
#                X_ohe,
#                X_hash,
#               ],1)
# del  X_bin, X_le, X_ohe, X_hash
# gc.collect()

# X_test = pd.concat([X_test, 
#                     X_test_bin,
#                     X_test_le,
#                     X_test_ohe,
#                     X_test_hash,
#                    ],1)
# del  X_test_bin, X_test_le, X_test_ohe, X_test_hash
# gc.collect()
# print(X.shape,X_test.shape)

In [None]:
import keras
import tensorflow as tf
import keras.backend as K

from keras.models import Model
from keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Concatenate, Embedding, Flatten
from keras.optimizers import Adam, Nadam
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from  keras.regularizers import l2

In [None]:
def get_dae(X):
    """Build and compile a dense autoencoder sized to the columns of ``X``.

    Architecture: input -> Dense(512) -> Dense(256) -> Dense(512) -> Dropout(0.2)
    -> linear Dense with as many units as input features. Compiled with the
    'adam' optimizer and mean-squared-error loss.
    """
    n_features = X.shape[1]
    inp = Input((n_features,))

    hidden = inp
    for units in (512, 256, 512):
        hidden = Dense(units, activation='relu')(hidden)
    hidden = Dropout(.2)(hidden)
    reconstruction = Dense(n_features, activation='linear')(hidden)

    model = Model(inputs=inp, outputs=reconstruction)
    model.compile(optimizer='adam', loss='mse')
    return model

def add_noise(arr, p):
    """Return a copy of ``arr`` with "swap noise" applied column by column.

    For every column, ``round(n_rows * p)`` row positions are drawn (with
    replacement) and overwritten with values sampled from that same column of
    the input. The input array is left untouched so that callers can keep it
    as the clean reconstruction target.

    Parameters
    ----------
    arr : np.ndarray
        2-D array of shape ``(n_rows, n_cols)``.
    p : float
        Fraction of rows to overwrite in each column.

    Returns
    -------
    np.ndarray
        New array with the same shape as ``arr``.
    """
    source = np.asarray(arr)
    noisy = source.copy()
    n, m = noisy.shape
    swap_n = round(n * p)
    for i in range(m):
        swap_idx = np.random.choice(n, size=swap_n)
        # Sampling is already random, so no prior permutation is needed.
        noisy[swap_idx, i] = np.random.choice(source[:, i], size=swap_n)
    return noisy

def data_gen(X, swap_rate, batch_size):
    """Endlessly yield ``(noisy_batch, clean_batch)`` pairs for autoencoder training.

    On every iteration the row order is reshuffled and the first ``batch_size``
    rows are taken as the clean batch; the noisy batch is produced by
    ``add_noise`` with rate ``swap_rate``.
    """
    idxs = np.arange(X.shape[0])
    while True:
        np.random.shuffle(idxs)
        X_orig = X[idxs[:batch_size]]
        # Hand add_noise its own copy: it writes into the array it receives,
        # which would otherwise make the clean target identical to the noisy
        # input.
        X_noisy = add_noise(X_orig.copy(), swap_rate)
        yield X_noisy, X_orig

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Label-encode every feature column, then rescale with a scaler fitted on the
# training matrix only. After this cell X and X_test are numpy arrays.
X, X_test, _ = do_le(X, X_test, list(X.columns))
mm = MinMaxScaler().fit(X)
X = mm.transform(X)
X_test = mm.transform(X_test)

In [None]:
# Train the autoencoder on noisy/clean batches drawn from the training matrix.
batch_size = 2**11
epochs = 1
gen = data_gen(X, .15, batch_size)
dae = get_dae(X)
# NOTE(review): `fit_generator` is deprecated in newer Keras releases in favour
# of `fit` — confirm against the installed version.
dae.fit_generator(
    generator=gen,
    steps_per_epoch=X.shape[0] // batch_size,
    epochs=epochs,
    use_multiprocessing=True,
    verbose=1,
)

In [None]:
# Freeze the autoencoder weights and recompile so the flag takes effect,
# then print the layer summary.
dae.trainable = False
dae.compile(loss='mse', optimizer='adam')
dae.summary()

In [None]:
def auc(y_true, y_pred):
    """Keras metric: ROC AUC computed via ``roc_auc_score`` inside ``tf.py_function``.

    Falls back to 0.5 when ``roc_auc_score`` raises ``ValueError`` (for
    example when a batch contains a single class). Any other exception is
    allowed to propagate instead of being silently turned into 0.5.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

def get_nn(X, dae):
    """Build and compile a two-input binary classifier on top of ``dae``.

    Branch 1 concatenates the outputs of ``dae.layers[1]``, ``dae.layers[2]``
    and ``dae.layers[3]`` and feeds them through Dense(500/200/100) blocks,
    each followed by Dropout(0.2). Branch 2 is a new input of width
    ``X.shape[1]`` passed through an Embedding, flattened, and followed by two
    Dense(64) + Dropout(0.2) blocks. Both branches are concatenated into a
    single sigmoid unit.

    The returned model takes ``[dae.input, embedding_input]`` and is compiled
    with binary cross-entropy, the 'adam' optimizer and the ``auc`` metric.
    """
    # Branch 1: activations taken from the autoencoder.
    # presumably layers 1-3 are its hidden Dense layers — verify against get_dae
    dae_features = Concatenate()([dae.layers[k].output for k in (1, 2, 3)])
    head = dae_features
    for units in (500, 200, 100):
        head = Dense(units, activation='relu')(head)
        head = Dropout(0.2)(head)

    # Branch 2: embedding over the raw input values.
    # NOTE(review): the vocabulary size is the number of distinct values in the
    # whole matrix and the embedding expects integer ids — confirm X holds such
    # ids at this point.
    nunique = len(np.unique(X))
    emb_size = int(min(50, nunique // 2))
    max_len = X.shape[1]
    inp_z = Input(shape=(max_len,))
    emb = Embedding(nunique + 1, emb_size, input_length=max_len)(inp_z)
    emb = Flatten()(emb)
    emb = Dropout(0.2)(emb)
    for _ in range(2):
        emb = Dense(2**6, activation='relu')(emb)
        emb = Dropout(0.2)(emb)

    merged = Concatenate()([head, emb])
    prob = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[dae.input, inp_z], outputs=prob)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[auc])

    return model

def get_callbacks():
    """Return ``[EarlyStopping, ReduceLROnPlateau]``, both maximising ``val_auc``.

    NOTE(review): early stopping uses patience 2 while the learning-rate
    reduction uses patience 3, so the reduction may never get a chance to
    trigger — confirm this is intended.
    """
    shared = dict(monitor='val_auc', mode='max', verbose=1)

    early_stop = EarlyStopping(min_delta=0.001,
                               patience=2,
                               baseline=None,
                               restore_best_weights=True,
                               **shared)

    reduce_lr = ReduceLROnPlateau(factor=0.5,
                                  patience=3,
                                  min_lr=1e-6,
                                  **shared)
    return [early_stop, reduce_lr]

In [None]:
# NOTE: a class with this name is also defined in an earlier cell; this later
# definition is the one in effect once the notebook has run top to bottom.
class DoubleValidationEncoderNumerical:
    """Out-of-fold wrapper around a supervised encoder.

    For every split produced by ``folds`` the wrapped ``encoder`` is fitted on
    the training part and used to encode the held-out part, so each training
    row is encoded by an encoder that never saw its target. One fitted encoder
    per fold is kept and their outputs are averaged in :meth:`transform`.

    Parameters
    ----------
    cols : list
        Columns of ``X`` that are passed to the encoder.
    encoder : object
        Object exposing ``fit_transform(X, y)`` and ``transform(X)`` that
        returns a ``pd.DataFrame``.
    folds : object
        Splitter exposing ``split(X, y)`` yielding ``(train_idx, val_idx)``.
    """

    def __init__(self, cols, encoder, folds):
        self.cols = cols
        self.encoder = encoder
        self.encoders_dict = {}  # fold number -> fitted encoder snapshot
        self.folds = folds

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """Fit one encoder per fold and return the out-of-fold encoding.

        Returns a frame with one row per row of ``X`` (default integer index)
        and the columns produced by the encoder.

        Raises
        ------
        ValueError
            If ``folds.split`` yields no splits.
        """
        from copy import deepcopy

        X = X.reset_index(drop=True)
        # Accept both arrays and Series; positions must line up with X.
        y = pd.Series(y).reset_index(drop=True)

        self.encoders_dict = {}
        cols_representation = None
        out_columns = None
        for n_fold, (train_idx, val_idx) in enumerate(self.folds.split(X, y)):
            X_train = X.iloc[train_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx].reset_index(drop=True)
            # Reset the target index too so it matches X_train's index.
            y_train = y.iloc[train_idx].reset_index(drop=True)

            self.encoder.fit_transform(X_train[self.cols], y_train)

            # Encode only the columns the encoder was fitted on.
            val_t = self.encoder.transform(X_val[self.cols])
            val_t = val_t.fillna(np.mean(y_train))
            if cols_representation is None:
                cols_representation = np.zeros((X.shape[0], val_t.shape[1]))
                out_columns = val_t.columns

            # Snapshot the fitted state; storing self.encoder itself would make
            # every fold entry point at the encoder fitted on the last fold.
            self.encoders_dict[n_fold] = deepcopy(self.encoder)
            cols_representation[val_idx, :] += val_t.values

        if cols_representation is None:
            raise ValueError('folds.split(X, y) produced no splits.')
        return pd.DataFrame(cols_representation, columns=out_columns)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Encode ``X`` with every fold encoder and average the results.

        Raises
        ------
        ValueError
            If called before :meth:`fit_transform`.
        """
        if not self.encoders_dict:
            raise ValueError('fit_transform must be called before transform.')

        X = X.reset_index(drop=True)
        n_encoders = len(self.encoders_dict)
        cols_representation = None
        out_columns = None
        for encoder in self.encoders_dict.values():
            test_tr = encoder.transform(X[self.cols])

            if cols_representation is None:
                cols_representation = np.zeros(test_tr.shape)
                out_columns = test_tr.columns
            cols_representation = cols_representation + test_tr.values / n_encoders
        return pd.DataFrame(cols_representation, columns=out_columns)

In [None]:
# Cross-validated training of the classifier on top of the frozen autoencoder.
BATCH_SIZE = 2**11  # same batch size as the autoencoder cell; `2*11` would be 22
EPOCHS = 10
N_SPLITS = 5
SEED = 555

preds = np.zeros(X_test.shape[0])  # summed test predictions over folds
oof = np.zeros(X.shape[0])         # out-of-fold predictions for the train rows

skf = StratifiedKFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)
cat_feats = list(train.columns)
cat_feats.remove('id')
cat_feats.remove('target')

for i, (tr, val) in enumerate(skf.split(X, y)):
    print(f'Fold #{i}')
    X_tr, X_val = X[tr], X[val]
    # The splitter yields positions, so index the target positionally.
    y_tr, y_val = y.iloc[tr], y.iloc[val]

    model = get_nn(X, dae)
    # The model has two inputs (autoencoder input and embedding input); the
    # same matrix is fed to both, for training, validation and prediction.
    model.fit(x=[X_tr, X_tr], y=y_tr,
              validation_data=([X_val, X_val], y_val),
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              verbose=1,
              callbacks=get_callbacks(),
              shuffle=True)

    train_pred = model.predict([X_tr, X_tr])[:, 0]
    oof_pred = model.predict([X_val, X_val])[:, 0]
    pred = model.predict([X_test, X_test])[:, 0]

    oof[val] = oof_pred.ravel()
    preds += pred

    print('-'*40)
    # Argument order matches the labels: train score first, validation second.
    print('Fold {}:\t train {:.5f} val {:.5f}'.format(i, roc_auc_score(y_tr, train_pred), roc_auc_score(y_val, oof_pred)))
    print('-'*40)

In [None]:
# Average the summed fold predictions and attach them to the test ids.
# `assign` builds a new frame instead of writing into a possibly sliced one.
test = test.assign(pred=preds / N_SPLITS)
sub['target'] = pd.merge(sub, test, on='id')['pred']
# index=False keeps the row index out of the file, so it contains only the
# columns of `sub`.
sub.to_csv('nn.csv', index=False)
sub.tail()