## Setup

- 90% training data, 10% validation data

In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../code')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from models import ModifiedMLPClassifier
from utils import LRAP

%matplotlib inline

# Seed handed to the classifier's `random_state` further down
SEED = 42

# Number of columns read from the feature files and the label files respectively
NUM_FEATURES, NUM_CLASSES = 5000, 3993

# `names=` is given without `header=`, so pandas treats the first row of each
# file as data and labels the columns 0..N-1.
X_train = pd.read_csv(
    "../data/expanded/train_features.csv",
    names=range(NUM_FEATURES),
)
y_train = pd.read_csv(
    "../data/expanded/train_labels.csv",
    names=range(NUM_CLASSES),
)

X_valid = pd.read_csv(
    "../data/expanded/valid_features.csv",
    names=range(NUM_FEATURES),
)
y_valid = pd.read_csv(
    "../data/expanded/valid_labels.csv",
    names=range(NUM_CLASSES),
)

# Quick sanity check on the size of each split
print(f"{X_train.shape[0]} examples in training set, and {X_valid.shape[0]} in validation.")

13959 examples in training set, and 1552 in validation.


## Pre-processing data

- Subtracting the mean and dividing by the standard deviation (per feature).
- Statistics of the training data are used for the validation data too.

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits (training first, validation second).
standardize = StandardScaler().fit(X_train)
X_train_transformed, X_valid_transformed = (
    standardize.transform(split) for split in (X_train, X_valid)
)

# Disabled alternative, kept for reference: min-max scaling of the raw features.
# If uncommented, it rebinds the *_transformed names and therefore replaces the
# standardized arrays computed above.
# X_train_transformed = X_train
# X_valid_transformed = X_valid
# 
# normalize = MinMaxScaler()
# normalize = normalize.fit(X_train_transformed)
# X_train_transformed = normalize.transform(X_train_transformed)
# X_valid_transformed = normalize.transform(X_valid_transformed)

## Model training

While training the multi-layer perceptron, the loss is reported in each iteration (or epoch). With `custom_validation_data` commented out, as it is below, only the training loss appears in the log.

In [3]:
# Training takes ~10 seconds per iteration
# ModifiedMLPClassifier is project code (../code/models); the parameter names
# suggest it mirrors sklearn's MLPClassifier — TODO confirm before relying on that.
# Construction and fitting are chained, so `mlp` is bound to whatever `fit` returns.
mlp = ModifiedMLPClassifier(
    hidden_layer_sizes=(256,),
    max_iter=100,
    alpha=0.1,
    verbose=True,
    random_state=SEED,
    # Disabled: pass the validation split to the model during training.
    # custom_validation_data=(X_valid_transformed, y_valid)
).fit(X_train_transformed, y_train)

Iteration 1, training loss = 364.64399666
Iteration 2, training loss = 48.41153234
Iteration 3, training loss = 39.32583581
Iteration 4, training loss = 34.96982248
Iteration 5, training loss = 32.14490112
Iteration 6, training loss = 29.97408445
Iteration 7, training loss = 28.15885611
Iteration 8, training loss = 26.59237683
Iteration 9, training loss = 25.19528590
Iteration 10, training loss = 23.95072599
Iteration 11, training loss = 22.81576999
Iteration 12, training loss = 21.75109183
Iteration 13, training loss = 20.78110786
Iteration 14, training loss = 19.86857619
Iteration 15, training loss = 19.01677641
Iteration 16, training loss = 18.22302403
Iteration 17, training loss = 17.47301005
Iteration 18, training loss = 16.76833762
Iteration 19, training loss = 16.11186691
Iteration 20, training loss = 15.50234152
Iteration 21, training loss = 14.91470869
Iteration 22, training loss = 14.35800772
Iteration 23, training loss = 13.84645156
Iteration 24, training loss = 13.35123457




## Evaluation

In [4]:
# Score the fitted model on the same data it was trained on.
# LRAP is imported from the project's utils module; presumably label ranking
# average precision — TODO confirm against utils.
pred_train = mlp.predict_proba(X_train_transformed)
score_train = LRAP(y_train, pred_train)
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.9796


In [5]:
# Score the fitted model on the held-out validation split, which was scaled
# with the statistics fitted on the training data.
# LRAP is imported from the project's utils module; presumably label ranking
# average precision — TODO confirm against utils.
pred_valid = mlp.predict_proba(X_valid_transformed)
score_valid = LRAP(y_valid, pred_valid)
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5246


In [9]:
# Pick a single validation example and collect both the model's predicted
# label vector and the true one so they can be compared side by side.
sample_idx = 123

# Double brackets keep the selection 2-D (a batch of one row); [0] unwraps the
# single prediction that comes back.
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]

# X_valid_transformed is a NumPy array and is indexed by position, so the true
# labels are fetched by position as well. `.loc` would be a label lookup and
# only matches while y_valid keeps its default 0..n-1 index.
actual = y_valid.iloc[sample_idx].to_numpy()

In [10]:
# Indices of the non-zero entries in the true label vector of the sample
actual.nonzero()

(array([342, 418, 637, 682, 906, 992], dtype=int64),)

In [11]:
# Indices of the non-zero entries in the predicted label vector of the sample,
# for comparison with the non-zero indices of `actual`.
np.nonzero(sample_pred)

(array([  88,  128,  342,  637,  992, 1722], dtype=int64),)