In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, mean_absolute_error

import pandas as pd
import numpy as np
# Seed NumPy's global RNG so the np.random.choice-based train/valid/test
# assignment performed later in this notebook is reproducible across runs.
np.random.seed(0)


import os
import wget
from pathlib import Path
import gc
from zipfile import ZipFile

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
from thc_net.regressor import ThcNetRegressor
from thc_net.classifier import ThcNetClassifier

from thc_net.input_utils import prepare_input_data, detect_cat

# Download wine-quality dataset

In [None]:
# Source CSV: UCI red-wine quality data (read below with sep=";").
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
dataset_name = 'wine-quality'
# Local cache location: a `data/` directory that sits next to the current
# working directory. Built with pathlib instead of splitting the cwd string on
# "/", which fails to locate the parent directory when the OS uses a different
# path separator (e.g. Windows backslashes).
out = Path.cwd().parent / 'data' / f'{dataset_name}.csv'

In [None]:
# Make sure the cache directory exists before checking for / writing the file.
out.parent.mkdir(parents=True, exist_ok=True)

# Fetch the CSV only when no local copy is present.
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

# Load data and split

In [None]:
# Read the semicolon-delimited CSV cached at `out`.
train = pd.read_csv(out, sep=";", low_memory=False)
target = 'quality'   # column to predict
to_remove = []       # extra feature columns to drop (none for this dataset)

# Tag every row with a split label (80/10/10) unless the file already has one.
if "Set" not in train.columns:
    n_rows = train.shape[0]
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        size=(n_rows,),
        p=[.8, .1, .1],
    )

# Index labels of the rows belonging to each split.
split_labels = train["Set"]
train_indices = train.loc[split_labels == "train"].index
valid_indices = train.loc[split_labels == "valid"].index
test_indices = train.loc[split_labels == "test"].index

In [None]:
# Feature frame: everything except the split label, the target and any
# explicitly excluded columns.
X = train.drop(columns=['Set', target, *to_remove])
# Target values, shifted down by one.
# NOTE(review): the reason for the -1 offset is not visible here — confirm.
Y = train[target].values - 1

In [None]:
# Threshold forwarded to detect_cat() in the next cell.
# NOTE(review): presumably the unique-values / row-count ratio below which a
# column is treated as categorical — confirm against thc_net.input_utils.
ratio = 0.005

In [None]:
# Let thc_net decide which columns of X are categorical. The two results are
# later handed to the models as `cat_idxs` and `cat_emb_dims`.
cat_idxs, cat_dims = detect_cat(X, ratio=ratio)

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

# Training

In [None]:
# Encode each distinct target value as a class id for the classifiers. The
# fitted encoder is kept so predicted ids can be mapped back to target values.
tg_enc = LabelEncoder().fit(Y)
Y_clf = tg_enc.transform(Y)

In [None]:
# Materialise the feature matrix once, then slice features, regression targets
# and encoded class targets for each split.
features = X.values

# Training rows
X_train, y_train, y_train_clf = features[train_indices], Y[train_indices], Y_clf[train_indices]

# Validation rows
X_valid, y_valid, y_valid_clf = features[valid_indices], Y[valid_indices], Y_clf[valid_indices]

# Test rows
X_test, y_test, y_test_clf = features[test_indices], Y[test_indices], Y_clf[test_indices]

In [None]:
# Sanity check: display the (rows, columns) shape of the training features.
X_train.shape

In [None]:
# Convert the raw feature arrays into the input format expected by the models.
# The training call returns `encoders`, which are then passed back in (with
# fit=False) for the validation and test splits.
# NOTE(review): presumably fit=False means the encoders are applied without
# being re-fitted — confirm against thc_net.input_utils.prepare_input_data.
X_train_prep, encoders = prepare_input_data(X_train, cat_idxs)
X_valid_prep, _ = prepare_input_data(X_valid, cat_idxs, encoders=encoders, fit=False)
X_test_prep, _ = prepare_input_data(X_test, cat_idxs, encoders=encoders, fit=False)

# Network parameters

In [None]:
%%time

model = ThcNetRegressor(
    n_layer=3,
    mul_input=8, 
    #metrics=['AUC'],
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=10,
    patience=20
)

history = model.fit(
        X=X_train_prep, 
        y=y_train,
        X_valid=X_valid_prep,
        y_valid=y_valid,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
%%time

model_clf = ThcNetClassifier(
    n_layer=3,
    mul_input=8, 
    #metrics=['AUC'],
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=10,
    patience=20
)

history_clf = model_clf.fit(
        X=X_train_prep, 
        y=y_train_clf,
        X_valid=X_valid_prep,
        y_valid=y_valid_clf,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
%%time

model_snn_clf = ThcNetClassifier(
    n_layer=3,
    mul_input=8, 
    #metrics=['AUC'],
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=10,
    patience=20,
    use_snn=True,
    noise=None
)

history_snn_clf = model_snn_clf.fit(
        X=X_train_prep, 
        y=y_train_clf,
        X_valid=X_valid_prep,
        y_valid=y_valid_clf,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
%%time

model_snn = ThcNetRegressor(
    n_layer=3,
    mul_input=8, 
    #metrics=['AUC'],
    cat_idxs=cat_idxs,
    cat_emb_dims=cat_dims,
    dropout=0.05,
    normalize=False,
    max_emb=10,
    patience=20,
    use_snn=True,
    noise=None
)

history_snn = model_snn.fit(
        X=X_train_prep, 
        y=y_train,
        X_valid=X_valid_prep,
        y_valid=y_valid,
        batch_size=1024,
        epochs=10000,
        verbose=2,
)

In [None]:
# Show the summary of the network wrapped by the baseline regressor.
model.network.summary()

In [None]:
# Show the summary of the network wrapped by the baseline classifier.
model_clf.network.summary()

In [None]:
# Show the summary of the network wrapped by the use_snn=True regressor.
model_snn.network.summary()

In [None]:
# Show the summary of the network wrapped by the use_snn=True classifier.
model_snn_clf.network.summary()

### Loss curves and predictions

In [None]:
import matplotlib.pyplot as plt

# Loss curves of the baseline regressor (`history`).
# 'val_loss' is measured on the validation split passed to fit(), not on the
# test split, so the second curve is labelled 'Validation' rather than 'Test'.
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Loss curves of the baseline classifier (`history_clf`).
# 'val_loss' is measured on the validation split passed to fit(), not on the
# test split, so the second curve is labelled 'Validation' rather than 'Test'.
plt.plot(history_clf.history['loss'], label='Train')
plt.plot(history_clf.history['val_loss'], label='Validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Loss curves of the use_snn=True regressor (`history_snn`).
# 'val_loss' is measured on the validation split passed to fit(), not on the
# test split, so the second curve is labelled 'Validation' rather than 'Test'.
plt.plot(history_snn.history['loss'], label='Train')
plt.plot(history_snn.history['val_loss'], label='Validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Loss curves of the use_snn=True classifier (`history_snn_clf`).
# 'val_loss' is measured on the validation split passed to fit(), not on the
# test split, so the second curve is labelled 'Validation' rather than 'Test'.
plt.plot(history_snn_clf.history['loss'], label='Train')
plt.plot(history_snn_clf.history['val_loss'], label='Validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Score the baseline regressor on the held-out test split.
# NOTE(review): predict_proba is called on a regressor; presumably it returns
# the raw continuous predictions — confirm against the thc_net API.
y_pred = model.predict_proba(X_test_prep)
# Despite its name, `test_auc` holds a mean absolute error here.
test_auc = mean_absolute_error(y_test, y_pred)
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# Score the baseline classifier on the test split: take the most probable
# class per row, map class ids back to target values, then compute the MAE.
class_probs = model_clf.predict_proba(X_test_prep)
predicted_ids = np.argmax(class_probs, axis=1)
y_pred = tg_enc.inverse_transform(predicted_ids)
# Despite its name, `test_auc` holds a mean absolute error here.
test_auc = mean_absolute_error(y_test, y_pred)
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# Score the use_snn=True regressor on the held-out test split.
# NOTE(review): predict_proba is called on a regressor; presumably it returns
# the raw continuous predictions — confirm against the thc_net API.
y_pred = model_snn.predict_proba(X_test_prep)
# Despite its name, `test_auc` holds a mean absolute error here.
test_auc = mean_absolute_error(y_test, y_pred)
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# Score the use_snn=True classifier on the test split: take the most probable
# class per row, map class ids back to target values, then compute the MAE.
class_probs = model_snn_clf.predict_proba(X_test_prep)
predicted_ids = np.argmax(class_probs, axis=1)
y_pred = tg_enc.inverse_transform(predicted_ids)
# Despite its name, `test_auc` holds a mean absolute error here.
test_auc = mean_absolute_error(y_test, y_pred)
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")