In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../../code')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm.auto import tqdm

from models import ModifiedMLPClassifier
from utils import LRAP, perfection

%matplotlib inline

SEED = 42

NUM_FEATURES = 5000
NUM_CLASSES = 3993

X_train = pd.read_csv("../../data/expanded/train_features.csv", names=range(NUM_FEATURES))
y_train = pd.read_csv("../../data/expanded/train_labels.csv", names=range(NUM_CLASSES))

X_valid = pd.read_csv("../../data/expanded/valid_features.csv", names=range(NUM_FEATURES))
y_valid = pd.read_csv("../../data/expanded/valid_labels.csv", names=range(NUM_CLASSES))

print(f"{X_train.shape[0]} examples in training set, and {X_valid.shape[0]} in validation.")

13959 examples in training set, and 1552 in validation.


## Pre-processing data

In [2]:
# Set True to apply log transformation to the features
# Intended to be used in combination with one of the other techniques below
LOG = True

STANDARDIZE = True     # Set True to standardize, False otherwise
NORMALIZE = False      # Set True to normalize, False otherwise
PCA_COMPONENTS = None  # Set to number of components for PCA (automatically standardized)


# ----- Code below applies the transformations as specified ----- #
func = []
if LOG:
    X_train_transformed = np.log1p(X_train.to_numpy())
    X_valid_transformed = np.log1p(X_valid.to_numpy())
else:
    X_train_transformed = X_train.to_numpy()
    X_valid_transformed = X_valid.to_numpy()

if STANDARDIZE:
    func = [StandardScaler()]
elif NORMALIZE:
    func = [MinMaxScaler()]
elif PCA_COMPONENTS is not None:
    func = [StandardScaler(with_std=False)]
    func.append(PCA(n_components=PCA_COMPONENTS, random_state=SEED))
    func.append(StandardScaler(with_mean=False))
else:
    func = None

if func is not None:
    for f in func:
        f = f.fit(X_train_transformed)
        X_train_transformed = f.transform(X_train_transformed)
        X_valid_transformed = f.transform(X_valid_transformed)

## Model training

While training the multi-layer perceptron, the training loss and validation LRAP are reported in each iteration.

In [None]:
mlp = ModifiedMLPClassifier(
    hidden_layer_sizes=(256,),
    activation="relu",
    max_iter=1500,
    batch_size=800,
    learning_rate_init=0.0001,
    alpha=0.2,
    early_stopping=True,
    verbose=True,
    random_state=SEED,
    custom_validation_data=(X_valid_transformed, y_valid)
)
mlp = mlp.fit(X_train_transformed, y_train)

HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))

Iteration 1, loss = 2579.64053454
Validation score: 0.004111
Iteration 2, loss = 1764.52872886
Validation score: 0.003539
Iteration 3, loss = 1042.69837609
Validation score: 0.003804
Iteration 4, loss = 803.72397964
Validation score: 0.004718
Iteration 5, loss = 708.93256301
Validation score: 0.016451
Iteration 6, loss = 636.05512302
Validation score: 0.023406
Iteration 7, loss = 563.44539588
Validation score: 0.027925
Iteration 8, loss = 481.56832250
Validation score: 0.030920
Iteration 9, loss = 386.34656049
Validation score: 0.032742
Iteration 10, loss = 286.46424778
Validation score: 0.034528
Iteration 11, loss = 200.71567724
Validation score: 0.036483
Iteration 12, loss = 142.21391850
Validation score: 0.038583
Iteration 13, loss = 108.44307406
Validation score: 0.040850
Iteration 14, loss = 89.25569704
Validation score: 0.044578
Iteration 15, loss = 77.45968372
Validation score: 0.050716
Iteration 16, loss = 69.45184478
Validation score: 0.056756
Iteration 17, loss = 63.70011579


Iteration 137, loss = 23.34371778
Validation score: 0.352996
Iteration 138, loss = 23.24760843
Validation score: 0.355170
Iteration 139, loss = 23.15052337
Validation score: 0.355908
Iteration 140, loss = 23.05542571
Validation score: 0.358900
Iteration 141, loss = 22.95957925
Validation score: 0.359969
Iteration 142, loss = 22.86472297
Validation score: 0.362318
Iteration 143, loss = 22.76969683
Validation score: 0.364020
Iteration 144, loss = 22.67385245
Validation score: 0.365395
Iteration 145, loss = 22.57978147
Validation score: 0.367893
Iteration 146, loss = 22.48558137
Validation score: 0.369591
Iteration 147, loss = 22.39183053
Validation score: 0.371495
Iteration 148, loss = 22.29720486
Validation score: 0.372373
Iteration 149, loss = 22.20474916
Validation score: 0.374010
Iteration 150, loss = 22.11117137
Validation score: 0.376000
Iteration 151, loss = 22.01899710
Validation score: 0.377620
Iteration 152, loss = 21.92645075
Validation score: 0.379266
Iteration 153, loss = 21

Validation score: 0.516392
Iteration 272, loss = 12.30097440
Validation score: 0.517052
Iteration 273, loss = 12.23094135
Validation score: 0.518137
Iteration 274, loss = 12.16161166
Validation score: 0.518871
Iteration 275, loss = 12.09114432
Validation score: 0.519730
Iteration 276, loss = 12.02296530
Validation score: 0.520055
Iteration 277, loss = 11.95278009
Validation score: 0.521153
Iteration 278, loss = 11.88472395
Validation score: 0.521772
Iteration 279, loss = 11.81490262
Validation score: 0.522837
Iteration 280, loss = 11.74674695
Validation score: 0.523179
Iteration 281, loss = 11.67961146
Validation score: 0.524111


## Evaluation

In [9]:
score_train = mlp.score(X_train_transformed, y_train.to_numpy())
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.7398


In [10]:
score_valid = mlp.score(X_valid_transformed, y_valid.to_numpy())
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5195


In [6]:
print(f"{perfection(y_valid, predictions_valid) * 100:.3f}% of examples perfectly predicted")

NameError: name 'predictions_valid' is not defined

In [6]:
sample_idx = 123
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]
actual = y_valid.loc[sample_idx].to_numpy()

In [7]:
np.nonzero(actual)

(array([342, 418, 637, 682, 906, 992], dtype=int64),)

In [8]:
np.nonzero(sample_pred)

(array([128, 358, 637, 682, 992], dtype=int64),)