# TDT05 Project - Short Report

Student: Eivind Lie Andreassen

Student ID: 767767

Email: eiviland@stud.ntnu.no

Challenge ID: 2

Challenge Name: Santander Customer Transaction Prediction

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style('dark')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

from tqdm import tqdm_notebook as tqdm
import tensorflow as tf

## Load the data
We set two flags: one for whether we are running locally or on Kaggle, and one for whether we want the reduced or the full dataset.

In [2]:
IS_LOCAL = True  # True when running on a local machine, False when running on Kaggle
USE_REDUCED = True  # True to load the smaller dataset instead of the full one

# (train, test) CSV locations. The position of each pair is 2*IS_LOCAL + USE_REDUCED:
# 0 = Kaggle/full, 1 = Kaggle/reduced, 2 = local/full, 3 = local/reduced.
_DATASET_PATHS = (
    ('../input/santander-customer-transaction-prediction/train.csv',
     '../input/santander-customer-transaction-prediction/test.csv'),
    ('../input/santandersmall/train_small.csv',
     '../input/santandersmall/test_small_with_targets.csv'),
    ('train.csv',
     'test.csv'),
    ('train_small.csv',
     'test_small.csv'),
)

data_index = 2 * int(IS_LOCAL) + int(USE_REDUCED)
train_path, test_path = _DATASET_PATHS[data_index]

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Every training column except the label and the row identifier is a model feature.
non_feature_columns = ('target', 'ID_code')
features = [column for column in train_df.columns if column not in non_feature_columns]

# A test file without labels gets a placeholder target of -1 before the frames are stacked.
if 'target' not in test_df:
    test_df['target'] = -1

# Train rows first, test rows second; the original row labels of both frames are kept.
all_df = pd.concat([train_df, test_df], sort=False)

## Pre-Processing and Feature Engineering

### Removing fake test samples

In [21]:
# One row per test sample and one column per feature. A cell is set to 1 when the
# sample's value for that feature occurs exactly once in the whole test set.
unique_count = np.zeros((test_df.shape[0], len(features)))

for column_pos, feature in tqdm(enumerate(features), total=len(features)):
    _, first_rows, occurrences = np.unique(test_df[feature], return_index=True, return_counts=True)
    # For a value seen only once, its "first occurrence" row is its only row.
    singleton_rows = first_rows[occurrences == 1]
    unique_count[singleton_rows, column_pos] += 1

# A sample with at least one unique value is treated as real; one with none as synthetic.
unique_per_sample = np.sum(unique_count, axis=1)
real_sample_indices = np.flatnonzero(unique_per_sample > 0)
synthetic_sample_indices = np.flatnonzero(unique_per_sample == 0)
print('Real:', len(real_sample_indices))
print('Synthetic:', len(synthetic_sample_indices))

del unique_count

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Real: 70065
Synthetic: 0


### Calculate counts

In [22]:
# Training rows plus the test rows that were flagged as real.
all_real_df = pd.concat([train_df, test_df.iloc[real_sample_indices, :]], sort=False)

for feature in tqdm(features):
    real_series = all_real_df[feature]

    # Occurrence table built from the real samples only (value -> number of rows holding it).
    counts = real_series.value_counts()

    # Look every row's value up in that table; values absent from the table map to NaN.
    full_series = all_df[feature]
    all_df[f'{feature}_count'] = full_series.map(counts)

del all_real_df, real_series, full_series

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




### Statistical Properties

In [23]:
# Row-wise summary statistics over the raw feature columns. The snapshot is taken
# once so the statistic columns added below are never fed back into later statistics.
raw_feature_values = all_df[features]
for statistic in ('sum', 'mean', 'min', 'max', 'std', 'median', 'skew', 'kurt'):
    all_df[statistic] = getattr(raw_feature_values, statistic)(axis=1)

# 'sum' is deliberately left out of the model inputs: it is a constant multiple of
# 'mean', so after normalization the two columns would carry the same values.
statistical_features = ['mean', 'min', 'max', 'std', 'median', 'skew', 'kurt']

### Normalization

In [24]:
def _rescale_column(column, scaler):
    """Replace all_df[column] with scaler.fit_transform applied to that single column."""
    all_df[column] = scaler.fit_transform(all_df[column].values.reshape(-1, 1))


# Raw features and row statistics are standardized; count columns are min-max scaled.
# Each scaler is fitted on the combined train + test rows held in all_df.
for feature in tqdm(features + statistical_features):
    if feature in features:
        _rescale_column(feature, StandardScaler())
        _rescale_column(f'{feature}_count', MinMaxScaler())
    if feature in statistical_features:
        _rescale_column(feature, StandardScaler())

HBox(children=(IntProgress(value=0, max=207), HTML(value='')))




### Update feature list

In [25]:
# Extend the model inputs in place: first the "<feature>_count" companion of every raw
# feature (in the same order), then the row-wise statistics.
# NOTE: running this cell twice appends the names a second time.
features.extend([f'{name}_count' for name in features])
features += statistical_features

### Splitting datasets back up

In [26]:
# all_df was built as train rows followed by test rows, so a positional cut at the
# original training row count separates them again.
n_train_rows = train_df.shape[0]
train_df, test_df = all_df.iloc[:n_train_rows, :], all_df.iloc[n_train_rows:, :]

del all_df

## Model

In [32]:
def get_regularized_cnn_model():
    """Build the (uncompiled) Keras classifier used for every fold.

    The input width is taken from the module-level ``features`` list at call time.
    Architecture, in order:
      * reshape the flat input to (n_features, 1),
      * two kernel-size-1 Conv1D layers (32 and 64 filters, ELU), each followed by
        batch normalization,
      * flatten,
      * four ReLU dense layers (1024, 512, 256, 128 units), each followed by batch
        normalization; the first two carry L2 kernel penalties of 0.005 and 0.001,
      * a single sigmoid output unit.

    Returns:
        A ``tf.keras.models.Sequential`` instance.
    """
    layers = tf.keras.layers
    n_inputs = len(features)

    stack = [layers.Reshape((n_inputs, 1), input_shape=(n_inputs,))]

    for n_filters in (32, 64):
        stack.append(layers.Conv1D(n_filters, 1, activation='elu'))
        stack.append(layers.BatchNormalization())

    stack.append(layers.Flatten())

    # (units, L2 penalty) per dense layer; None means no kernel regularizer.
    for n_units, l2_penalty in ((1024, 0.005), (512, 0.001), (256, None), (128, None)):
        regularizer = None if l2_penalty is None else tf.keras.regularizers.l2(l2_penalty)
        stack.append(layers.Dense(n_units, activation='relu', kernel_regularizer=regularizer))
        stack.append(layers.BatchNormalization())

    stack.append(layers.Dense(1, activation='sigmoid'))

    return tf.keras.models.Sequential(stack)

In [33]:
# Cross-validation and training schedule.
N_SPLITS = 5                   # number of stratified folds
BATCH_SIZE = 256               # samples per gradient step
EPOCHS = 100                   # upper bound on epochs per fold
EARLY_STOPPING_PATIENCE = 8    # epochs without improvement before a fold stops

# Settings handed to model.compile().
LOSS = 'binary_crossentropy'
METRICS = [tf.keras.metrics.AUC()]
OPTIMIZER = tf.keras.optimizers.Nadam()

# Zero-argument factory called once per fold to obtain a new model.
model_fn = get_regularized_cnn_model

In [34]:
def _fresh_optimizer():
    """Return an optimizer for one fold that shares no state with other folds.

    OPTIMIZER is a single module-level instance. Passing that same object to
    compile() for every fold's new model would carry its iteration counter and
    per-variable state from fold to fold, so each fold gets a copy rebuilt from
    the optimizer's configuration instead. A string identifier is returned as is.
    """
    if isinstance(OPTIMIZER, str):
        return OPTIMIZER
    return type(OPTIMIZER).from_config(OPTIMIZER.get_config())


kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

histories = []
# Per-sample prediction accumulators. They hold SUMS over folds and are not averaged
# here: each training row is a validation row in exactly one fold, a training row in
# the remaining folds, and every test row is predicted once per fold.
oof_preds_sum = np.zeros((train_df.shape[0],))
train_preds_sum = np.zeros((train_df.shape[0],))
test_preds_sum = np.zeros((test_df.shape[0],))

# Materialize the arrays once. kfold.split() yields positional indices, so rows are
# selected by position from these arrays rather than by index label.
X_all = train_df[features].values
targets = train_df['target'].values
X_test = test_df[features].values

for fold_num, (train_index, val_index) in tqdm(enumerate(kfold.split(X_all, targets)), total=N_SPLITS):
    print(f'Fold {fold_num+1}/{N_SPLITS}:')

    X_train = X_all[train_index]
    y_train = targets[train_index].reshape(-1, 1)
    X_val = X_all[val_index]
    y_val = targets[val_index].reshape(-1, 1)

    model = model_fn()
    model.compile(optimizer=_fresh_optimizer(), loss=LOSS, metrics=METRICS)

    # Assumes the AUC metric supplied through METRICS is reported as 'auc', so that
    # its validation value is logged as 'val_auc'.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
    )

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping_callback],
    )
    histories.append(history)

    print(f'Creating predictions for fold {fold_num + 1}/{N_SPLITS}')
    val_preds = model.predict(X_val)
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    oof_preds_sum[val_index] += val_preds[:, 0]
    train_preds_sum[train_index] += train_preds[:, 0]
    test_preds_sum += test_preds[:, 0]

    val_auc = roc_auc_score(y_val, val_preds)
    print(f'Fold validation AUC: {val_auc}')
    print()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Fold 1/5:
Train on 56045 samples, validate on 14012 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Creating predictions for fold 1/5
Fold validation AUC: 0.8845454774346373

Fold 2/5:
Train on 56045 samples, validate on 14012 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Creating predictions for fold 2/5
Fold validation AUC: 0.8851989172055574

Fold 3/5:
Train on 56046 samples, validate on 14011 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Creating predictions for fold 3/5
Fold validation AUC: 0.8907000084781553

Fold 4/5:
Train on 

Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Creating predictions for fold 5/5
Fold validation AUC: 0.8911392665816883




### Create submission

In [35]:
# test_preds_sum holds the SUM of the per-fold test predictions (one model per fold,
# N_SPLITS folds), so it ranges over [0, N_SPLITS]. Dividing by the fold count turns
# it into the mean predicted probability in [0, 1]; the ordering of the rows, and
# therefore any rank-based score such as AUC, is unaffected.
mean_test_preds = test_preds_sum / N_SPLITS

sub = pd.DataFrame({'ID_code': test_df['ID_code'], 'target': mean_test_preds})
sub.to_csv('submission.csv', index=False)

In [36]:
from IPython.display import FileLink
# Left as the cell's final bare expression so the notebook displays the resulting
# FileLink object (a link to the submission file written by the previous cell).
FileLink('submission.csv')