In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../../code')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm.auto import tqdm

from models import ModifiedMLPClassifier
from utils import LRAP, perfection

%matplotlib inline

SEED = 42

NUM_FEATURES = 5000
NUM_CLASSES = 3993


def _read_matrix(path, num_columns):
    """Load a CSV into a DataFrame whose columns are named 0 .. num_columns - 1."""
    return pd.read_csv(path, names=range(num_columns))


# Training split: feature matrix and label matrix
X_train = _read_matrix("../../data/expanded/train_features.csv", NUM_FEATURES)
y_train = _read_matrix("../../data/expanded/train_labels.csv", NUM_CLASSES)

# Validation split: same layout as the training split
X_valid = _read_matrix("../../data/expanded/valid_features.csv", NUM_FEATURES)
y_valid = _read_matrix("../../data/expanded/valid_labels.csv", NUM_CLASSES)

print(f"{X_train.shape[0]} examples in training set, and {X_valid.shape[0]} in validation.")

13959 examples in training set, and 1552 in validation.


## Pre-processing data

In [2]:
# Optional log1p pass over the raw features; it runs before whichever
# scaling option below is selected, so it can be combined with any of them.
LOG = False

# Only the first enabled option takes effect, checked in this order:
# STANDARDIZE, then NORMALIZE, then PCA_COMPONENTS.
STANDARDIZE = True     # True -> StandardScaler()
NORMALIZE = False      # True -> MinMaxScaler()
PCA_COMPONENTS = None  # not None -> centre, PCA(n_components=PCA_COMPONENTS), then rescale


# ----- Everything below applies the options chosen above ----- #
X_train_transformed = X_train.to_numpy()
X_valid_transformed = X_valid.to_numpy()
if LOG:
    X_train_transformed = np.log1p(X_train_transformed)
    X_valid_transformed = np.log1p(X_valid_transformed)

# `func` ends up as the ordered list of transformers to apply, or None for no scaling
if STANDARDIZE:
    func = [StandardScaler()]
elif NORMALIZE:
    func = [MinMaxScaler()]
elif PCA_COMPONENTS is None:
    func = None
else:
    func = [
        StandardScaler(with_std=False),
        PCA(n_components=PCA_COMPONENTS, random_state=SEED),
        StandardScaler(with_mean=False),
    ]

# Each transformer is fitted on the training features only, then applied to both
# splits, so the validation data never influences the fitted parameters.
for f in (func or []):
    f = f.fit(X_train_transformed)
    X_train_transformed = f.transform(X_train_transformed)
    X_valid_transformed = f.transform(X_valid_transformed)

## Model training

While training the multi-layer perceptron, the training loss and the validation score are reported after each iteration (or epoch).

In [3]:
# Hyperparameters for the classifier, gathered in one place so they are easy to tweak.
# NOTE(review): custom_validation_data is an argument of the project's
# ModifiedMLPClassifier; presumably it supplies the split used for the
# per-iteration "Validation score" lines — confirm in models.py.
mlp_params = dict(
    hidden_layer_sizes=(256,),
    activation="relu",
    max_iter=100,
    alpha=0.1,
    early_stopping=True,
    verbose=True,
    random_state=SEED,
    custom_validation_data=(X_valid_transformed, y_valid),
)

# Construct first, then fit, so `mlp` is already bound if fitting is interrupted
mlp = ModifiedMLPClassifier(**mlp_params)
mlp = mlp.fit(X_train_transformed, y_train)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Iteration 1, training loss = 364.64399666
Validation score: 0.060534
Iteration 2, training loss = 48.41153234
Validation score: 0.097176
Iteration 3, training loss = 39.32583581
Validation score: 0.138767
Iteration 4, training loss = 34.96982248
Validation score: 0.176535
Iteration 5, training loss = 32.14490112
Validation score: 0.208537
Iteration 6, training loss = 29.97408445
Validation score: 0.234303
Iteration 7, training loss = 28.15885611
Validation score: 0.263545
Iteration 8, training loss = 26.59237683
Validation score: 0.287220
Iteration 9, training loss = 25.19528590
Validation score: 0.308971
Iteration 10, training loss = 23.95072599
Validation score: 0.326858
Iteration 11, training loss = 22.81576999
Validation score: 0.346083
Iteration 12, training loss = 21.75109183
Validation score: 0.364869
Iteration 13, training loss = 20.78110786
Validation score: 0.374543
Iteration 14, training loss = 19.86857619
Validation score: 0.392631
Iteration 15, training loss = 19.01677641


## Evaluation

In [9]:
# Score the fitted model on the data it was trained on; labels are handed over as a
# plain NumPy array rather than a DataFrame.
# NOTE(review): the printed label calls this LRAP — confirm that is what
# ModifiedMLPClassifier.score computes.
y_train_array = y_train.to_numpy()
score_train = mlp.score(X_train_transformed, y_train_array)
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.9312


In [10]:
# Score the fitted model on the held-out validation split; labels are handed over as a
# plain NumPy array rather than a DataFrame.
# NOTE(review): the printed label calls this LRAP — confirm that is what
# ModifiedMLPClassifier.score computes.
y_valid_array = y_valid.to_numpy()
score_valid = mlp.score(X_valid_transformed, y_valid_array)
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5329


In [None]:
# Predict the whole validation split inside this cell so it runs on a fresh kernel:
# `predictions_valid` was not defined by any earlier cell.
# NOTE(review): assumes utils.perfection(y_true, y_pred) expects the output of
# mlp.predict as its second argument — confirm against utils.py.
predictions_valid = mlp.predict(X_valid_transformed)
print(f"{perfection(y_valid, predictions_valid) * 100:.3f}% of examples perfectly predicted")

In [32]:
# Pick one validation example (by row position) to compare prediction and truth.
sample_idx = 117

# Double brackets keep the selection 2-D (a single-row batch); [0] unwraps the
# single prediction that comes back.
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]

# Use positional .iloc so the label row is guaranteed to be the same row as the
# positionally indexed feature row above, whatever index y_valid carries.
actual = y_valid.iloc[sample_idx].to_numpy()

In [33]:
# Positions of the non-zero entries in the true label row of the chosen validation sample
np.nonzero(actual)

(array([ 141,  465, 1778, 1779, 1837, 2378], dtype=int64),)

In [34]:
# Positions of the non-zero entries in the model's prediction for the same sample
np.nonzero(sample_pred)

(array([ 465, 1779, 1837, 2378], dtype=int64),)