In [None]:
%load_ext nb_black
%load_ext autoreload

%autoreload 2

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np

np.random.seed(0)


import os
from requests import get
from pathlib import Path
import gc

from matplotlib import pyplot as plt
from zipfile import ZipFile

%matplotlib inline

In [None]:
from tensorflow.keras.utils import plot_model

In [None]:
from thc_net.classifier import ThcNetClassifier
from thc_net.utils import download, plot_history
from thc_net.input_utils import prepare_input_data

# Bank marketing: loading data

In [None]:
# Bank marketing dataset: download the zip archive, unpack it and read the
# "full" CSV into `train`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
dataset_name = "bank-additional-full"

# The data folder lives next to the notebook folder (parent of the working dir).
data_dir = Path.cwd().parent / "data"
out_zip = data_dir / "bank-additional.zip"
out = data_dir / "bank-additional" / f"{dataset_name}.csv"

download(url, out_zip)
# Unpack next to the archive so the CSV lands exactly where `out` points.
# Extracting to the absolute "/data" wrote the files to a location that
# `out` never refers to.
with ZipFile(out_zip, "r") as zipObj:
    zipObj.extractall(out_zip.parent)

target = "y"
to_remove = []
train = pd.read_csv(out, sep=";", low_memory=False)

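# Census income: loading data

Note: this cell reassigns `train`, `target` and `dataset_name`, so it replaces the Bank marketing dataset loaded above. Run only the loading cell of the dataset you want to use.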

In [None]:
# Census income (UCI "adult") dataset: download the raw file and read it into `train`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = "census-income"
out = Path.cwd().parent / "data" / f"{dataset_name}.csv"

download(url, out)

# adult.data has no header row. Without header=None pandas turns the first
# record into column names (which is why the target used to be " <=50K") and
# that sample is lost. Column names follow the dataset's adult.names file.
census_columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
target = "income"
to_remove = []
# Values keep their leading space (the file is ", "-separated); only the header handling changes.
train = pd.read_csv(
    out, sep=",", header=None, names=census_columns, low_memory=False
)

# Load data and split

In [None]:
# Draw a random 80/10/10 train/valid/test label for every row, unless the
# frame already carries a "Set" column.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        size=(train.shape[0],),
        p=[0.8, 0.1, 0.1],
    )

# Index labels of the rows belonging to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

In [None]:
# Features: everything except the label and the helper "Set" column.
X = train.drop(columns=[target, "Set"])
# Labels as a plain array (still raw values; encoded later).
Y = train[target].values

In [None]:
# Cardinality threshold: a column whose (distinct values / row count) is below
# this value is treated as categorical when cat_idxs is computed.
ratio = 0.005

In [None]:
# A column is categorical when it has object dtype or when its share of
# distinct values is below `ratio`.
n_unique = X.nunique()
ratios = (n_unique / X.shape[0]) < ratio
is_categorical = (ratios | (X.dtypes == "object")).to_numpy()

# Positional indices of the categorical columns within X.
cat_idxs = np.argwhere(is_categorical).ravel()

# Size per categorical column: distinct values + number of missing entries + 1.
# cat_idxs holds positions, not labels, so use .iloc: plain [] on a
# string-labelled Series relies on deprecated positional fallback.
n_missing = X.isnull().sum()
cat_dims = n_unique.iloc[cat_idxs].values + n_missing.iloc[cat_idxs].values + 1

# Free the intermediates and the raw frame; only X / Y are needed from here on.
del n_unique, ratios, is_categorical, n_missing, train

In [None]:
# Run a garbage-collection pass after the raw frame was deleted in the previous cell.
gc.collect()

# Training

In [None]:
# Learn the label -> integer mapping, then replace Y with the encoded labels.
# The fitted encoder is kept in `tg_enc`.
tg_enc = LabelEncoder().fit(Y)
Y = tg_enc.transform(Y)

In [None]:
# Materialise the feature matrix once: each `X.values` call builds a fresh
# array from the DataFrame, so calling it per split tripled that work.
X_values = X.values

# Slice features and encoded labels with the index sets computed earlier.
X_train, y_train = X_values[train_indices], Y[train_indices]
X_valid, y_valid = X_values[valid_indices], Y[valid_indices]
X_test, y_test = X_values[test_indices], Y[test_indices]

# The three splits are copies (fancy indexing), so the full matrix can go.
del X_values

In [None]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Prepare the training split first; this call also returns the encoders.
X_train_prep, encoders = prepare_input_data(X_train, cat_idxs)

# Reuse those encoders for the other splits (fit=False), discarding the
# second return value.
X_valid_prep, _ = prepare_input_data(
    X_valid,
    cat_idxs,
    encoders=encoders,
    fit=False,
)
X_test_prep, _ = prepare_input_data(
    X_test,
    cat_idxs,
    encoders=encoders,
    fit=False,
)

# Network parameters

In [None]:
# Metric names passed to both ThcNetClassifier instances through their `metrics` argument.
metrics = ["AUC"]

In [None]:
%%time

model = ThcNetClassifier(
    n_layer=3,
    mul_input=8, 
    metrics=metrics,
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=20,
    patience=30
)

history = model.fit(
        X=X_train_prep, 
        y=y_train,
        X_valid=X_valid_prep,
        y_valid=y_valid,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
%%time

model_snn = ThcNetClassifier(
    n_layer=3,
    mul_input=8, 
    metrics=metrics,
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=20,
    patience=30,
    use_snn=True,
    noise=None
)

history_snn = model_snn.fit(
        X=X_train_prep, 
        y=y_train,
        X_valid=X_valid_prep,
        y_valid=y_valid,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
# Show the summary of the network held by the first classifier.
model.network.summary()

In [None]:
# Draw the architecture of the first classifier's network.
# plot_model expects a Keras model, so pass `model.network` (the object whose
# summary() is shown in this notebook) rather than the ThcNetClassifier wrapper.
plot_model(
    model.network,
    # to_file="model.png",
    show_shapes=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
)

In [None]:
# Show the summary of the network held by the SNN classifier.
model_snn.network.summary()

### Predictions

In [None]:
# Plot the history object returned by model.fit (first classifier).
plot_history(history)

In [None]:
# Plot the history object returned by model_snn.fit (SNN classifier).
plot_history(history_snn)

In [None]:
# Score the first classifier on the held-out test split: ROC AUC computed
# from column 1 of the predicted probabilities.
y_pred = model.predict_proba(X_test_prep)
test_auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# Score the SNN classifier on the held-out test split: ROC AUC computed
# from column 1 of the predicted probabilities.
y_pred = model_snn.predict_proba(X_test_prep)
test_auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")