# Tabular Playground Series - Nov 21

This month, our data consists of 100 feature variables and a binary target variable, so this is a binary classification task. In the last two Kaggle Playgrounds I participated in (August and October), I focused on making predictions using ensembling techniques (bagging and boosting). More specifically, I explored Random Forest, XGBoost, LightGBM and HistGBM.

For this month, I wanted to take a different approach and use neural networks instead. In this notebook, I will use TensorFlow (Keras) to build and test different models.

## Imports 

Let's import the libraries we will be using throughout the notebook.

In [None]:
# Data Import on Kaggle
import os
import time
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Importing processing libraries
import numpy as np
import pandas as pd

# Importing Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Importing libraries for the metrics
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

# Keras Imports
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# sklearn imports for analysis
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats import randint

In [None]:
# Read the train and test tables and drop the 'id' column from each so that
# it is not carried along as a model feature.
data = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/train.csv'
).drop(columns='id')
test_data = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/test.csv'
).drop(columns='id')

## Memory Reduction

If you don't have any issues with memory, you can go ahead and skip this step. 
Here, we look at how much memory the current data and each individual feature consume, and then try to reduce that footprint.

There are several other methods to save RAM - you can refer to this article on [14 tips to save RAM memory](https://www.kaggle.com/pavansanagapati/14-simple-tips-to-save-ram-memory-for-1-gb-dataset). 

In [None]:
# Per-column memory footprint of the training frame, converted from bytes
# by dividing by 1024 ** 2; deep=True asks pandas to introspect object
# columns as well.
memory_usage = data.memory_usage(deep=True).div(1024 ** 2)
total_usage = memory_usage.sum()
print('memory usage of features: \n', memory_usage.head(7))
print('memory usage sum: ', total_usage)

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of *df* to the narrowest dtype that fits.

    Every column whose dtype is one of the signed-integer or float NumPy
    dtypes listed in ``numerics`` is cast to the smallest dtype of the same
    kind whose representable range contains the column's minimum and
    maximum. All other columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting size and the percentage saved.

    Returns:
        The same DataFrame object that was passed in.

    Note:
        Only the value *range* is checked. Casting floats down to
        ``float16``/``float32`` keeps the magnitude but reduces precision.
    """
    numerics = ("int8", "int16", "int32", "int64", "float16", "float32", "float64")
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith("int"):
            # Bounds are inclusive: a column spanning exactly -128..127 fits
            # in int8. If min/max are NaN (empty column) nothing matches and
            # the column keeps its dtype.
            target = next(
                (
                    t for t in int_types
                    if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max
                ),
                None,
            )
        else:
            # Anything that fits neither float16 nor float32 (including
            # all-NaN or infinite extrema) falls back to float64.
            target = next(
                (
                    t for t in float_types
                    if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max
                ),
                np.float64,
            )

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Shrink both frames; the summary line is printed for each one because
# `verbose` defaults to True.
data = reduce_memory_usage(df=data)
test_data = reduce_memory_usage(df=test_data)

## Data Prep

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the features from the target column.
X = data.drop('target', axis=1)
y = data.target

# Fit the scaler on the training features only, then reuse the *same* fitted
# mean and variance for the test set. Fitting a second scaler on the test
# data would standardise train and test with different statistics, so the
# network would see test inputs on a different scale from the one it was
# trained on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.values)
X = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)

# Columns are matched by position, as in the rest of the notebook.
tt = scaler.transform(test_data.values)

In [None]:
# scaled_data = data.copy()
# features = data.columns
# scale = MinMaxScaler()
# scaled_data[features]=scale.fit_transform(scaled_data[features])
# scaled_data[features]= scale.transform(scaled_data[features])  

# test_data_scaled = test_data.copy()
# test_data_scaled[features.drop('target')]=scale.fit_transform(test_data_scaled[features.drop('target')])
# test_data_scaled[features.drop('target')]= scale.transform(test_data_scaled[features.drop('target')])  

# X = scaled_data.drop('target', axis=1)
# y = scaled_data.target
# tt = test_data_scaled.values

In [None]:
# X = data.drop('target', axis=1)
# y = data.target

# tt = test_data.values

In [None]:
# Feed-forward binary classifier: four swish-activated hidden layers
# (128, 128, 128, 64 units) followed by a single sigmoid output unit.
# Optional Dropout(0.2) layers between the hidden layers are currently
# disabled.
hidden_layers = [
    Dense(units, activation=tf.nn.swish) for units in (128, 128, 128, 64)
]
model = Sequential(hidden_layers + [Dense(1, activation=tf.nn.sigmoid)])

In [None]:
from sklearn.model_selection import StratifiedKFold


def _build_model():
    """Return a new, untrained network with the notebook's architecture.

    Four swish-activated hidden layers (128, 128, 128, 64 units) followed by
    a single sigmoid output unit.
    """
    return Sequential([
        Dense(128, activation=tf.nn.swish),
        Dense(128, activation=tf.nn.swish),
        Dense(128, activation=tf.nn.swish),
        Dense(64, activation=tf.nn.swish),
        Dense(1, activation=tf.nn.sigmoid),
    ])


# Stop once val_loss has not improved for 20 epochs and roll back to the
# best weights seen.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, verbose=0,
    mode='min', restore_best_weights=True)

# Multiply the learning rate by 0.2 after 7 epochs without val_loss
# improvement.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=7, verbose=0,
    mode='min')

# Running average of the test predictions over all folds.
test_predictions_nn = np.zeros(test_data.shape[0])

scores_folds = {}
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)
scores_folds['NN'] = []
counter = 1


for fold, (trn_ind, val_ind) in enumerate(kf.split(X, y)):
    print(f'Training fold {fold + 1}')
    # X_test / y_test hold this fold's validation split.
    X_train, X_test = X.iloc[trn_ind], X.iloc[val_ind]
    y_train, y_test = y.iloc[trn_ind], y.iloc[val_ind]
    print('CV {}/{}'.format(counter, n_folds))

    # Start every fold from newly initialised weights. Re-compiling one
    # shared model does not reset its weights, so later folds would be
    # validated on rows the network had already been trained on, inflating
    # the fold scores.
    model = _build_model()
    model.compile(optimizer='sgd',
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    model.fit(X_train,
              y_train,
              epochs=100,
              batch_size=1500,
              validation_data=(X_test, y_test),
              callbacks=[es, plateau],
              validation_batch_size=len(y_test),
              shuffle=True,
              verbose=1)

    # predict() returns shape (n, 1); flatten to a 1-D vector of scores.
    preds = model.predict(X_test).ravel()
    score = round(roc_auc_score(y_test, preds), 5)
    print('Fold {} {}: {}'.format(counter, 'NN', score))
    scores_folds['NN'].append(score)

    # Each fold contributes 1/n_folds of the final test prediction.
    test_predictions_nn += model.predict(tt).ravel() / n_folds
    counter += 1

In [None]:
# Put the fold-averaged test predictions into the sample submission's
# 'target' column and write the file without the index.
sub = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
sub = sub.assign(target=test_predictions_nn)
sub.to_csv("nn_submission11.csv", index=False)