In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../../code')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm.auto import tqdm

from models import ModifiedMLPClassifier
from utils import LRAP, perfection

%matplotlib inline

SEED = 42

# Number of columns expected in the feature and label CSV files, respectively.
NUM_FEATURES = 5000
NUM_CLASSES = 3993


def _read_matrix(path, num_columns):
    """Load a CSV file into a DataFrame whose columns are named 0..num_columns-1."""
    return pd.read_csv(path, names=range(num_columns))


# Training split
X_train = _read_matrix("../../data/expanded/train_features.csv", NUM_FEATURES)
y_train = _read_matrix("../../data/expanded/train_labels.csv", NUM_CLASSES)

# Validation split
X_valid = _read_matrix("../../data/expanded/valid_features.csv", NUM_FEATURES)
y_valid = _read_matrix("../../data/expanded/valid_labels.csv", NUM_CLASSES)

print(f"{X_train.shape[0]} examples in training set, and {X_valid.shape[0]} in validation.")

13959 examples in training set, and 1552 in validation.


## Pre-processing data

In [2]:
# Toggle for a log(1 + x) transformation of the features.
# Meant to be combined with one of the scaling options below.
LOG = True

STANDARDIZE = True     # True -> zero-mean / unit-variance scaling
NORMALIZE = False      # True -> min-max scaling
PCA_COMPONENTS = None  # Number of PCA components, or None to skip PCA (output is standardized)


# ----- Everything below applies the options chosen above ----- #
X_train_transformed = X_train.to_numpy()
X_valid_transformed = X_valid.to_numpy()
if LOG:
    X_train_transformed = np.log1p(X_train_transformed)
    X_valid_transformed = np.log1p(X_valid_transformed)

# Only the first enabled option is used: STANDARDIZE, then NORMALIZE, then PCA.
if STANDARDIZE:
    func = [StandardScaler()]
elif NORMALIZE:
    func = [MinMaxScaler()]
elif PCA_COMPONENTS is not None:
    # Centre the data, project onto the principal components, then rescale
    # each component to unit variance.
    func = [
        StandardScaler(with_std=False),
        PCA(n_components=PCA_COMPONENTS, random_state=SEED),
        StandardScaler(with_mean=False),
    ]
else:
    func = None

# Each step is fitted on the training data only and then applied to both splits.
for f in func or ():
    f.fit(X_train_transformed)
    X_train_transformed = f.transform(X_train_transformed)
    X_valid_transformed = f.transform(X_valid_transformed)

## Model training

While training the multi-layer perceptron, the training loss and validation LRAP are reported in each iteration.

In [3]:
# Hyper-parameters for the classifier, gathered in one place so they are easy to tweak.
# NOTE(review): names mirror scikit-learn's MLPClassifier; `custom_validation_data`
# is specific to ModifiedMLPClassifier -- presumably the data used for the
# per-iteration validation score; confirm in models.py.
mlp_params = {
    "hidden_layer_sizes": (512,),
    "activation": "relu",
    "max_iter": 100,
    "batch_size": 800,
    "alpha": 0.2,
    "early_stopping": True,
    "verbose": True,
    "random_state": SEED,
    "custom_validation_data": (X_valid_transformed, y_valid),
}

mlp = ModifiedMLPClassifier(**mlp_params)
mlp = mlp.fit(X_train_transformed, y_train)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Iteration 1, loss = 677.75142669
Validation score: 0.048437
Iteration 2, loss = 82.54380872
Validation score: 0.067134
Iteration 3, loss = 63.97587383
Validation score: 0.107428
Iteration 4, loss = 48.84972903
Validation score: 0.149965
Iteration 5, loss = 40.15930350
Validation score: 0.189502
Iteration 6, loss = 35.17388524
Validation score: 0.220418
Iteration 7, loss = 32.04771993
Validation score: 0.249521
Iteration 8, loss = 29.85018503
Validation score: 0.272278
Iteration 9, loss = 28.15857928
Validation score: 0.297621
Iteration 10, loss = 26.71161681
Validation score: 0.319205
Iteration 11, loss = 25.45974353
Validation score: 0.337590
Iteration 12, loss = 24.32663647
Validation score: 0.358750
Iteration 13, loss = 23.30106971
Validation score: 0.372639
Iteration 14, loss = 22.34775572
Validation score: 0.387328
Iteration 15, loss = 21.47797844
Validation score: 0.402208
Iteration 16, loss = 20.66359537
Validation score: 0.411926
Iteration 17, loss = 19.89646758
Validation scor

## Evaluation

In [4]:
# Score the fitted model on the data it was trained on.
y_train_array = y_train.to_numpy()
score_train = mlp.score(X_train_transformed, y_train_array)
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.9692


In [5]:
# Score the fitted model on the held-out validation data.
y_valid_array = y_valid.to_numpy()
score_valid = mlp.score(X_valid_transformed, y_valid_array)
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5839


In [24]:
# Predict the labels of every validation example inside this cell, so that it
# does not depend on a `predictions_valid` variable left over from an earlier
# kernel session (no other cell in the notebook defines it).
predictions_valid = mlp.predict(X_valid_transformed)
print(f"{perfection(y_valid, predictions_valid) * 100:.3f}% of examples perfectly predicted")

3.479% of examples perfectly predicted


In [6]:
# Inspect a single validation example: its predicted and its true label vector.
sample_idx = 123
# Double brackets keep the 2-D shape (one row) expected by `predict`.
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]
# `X_valid_transformed` is indexed by position, so use positional `.iloc` here
# too; the two then refer to the same row regardless of `y_valid`'s index labels.
actual = y_valid.iloc[sample_idx].to_numpy()

In [7]:
# Positions of the non-zero entries in the sample's true label vector.
np.nonzero(actual)

(array([342, 418, 637, 682, 906, 992], dtype=int64),)

In [8]:
# Positions of the non-zero entries in the sample's predicted label vector.
np.nonzero(sample_pred)

(array([128, 358, 637, 682, 992], dtype=int64),)