## Getting predictions on test data

In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../../code')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataset import CSVDataset

from utils import LRAP, perfection
from train import Net

%matplotlib inline

# Notebook-wide constants. SEED is handed to PCA as random_state further down;
# NUM_CLASSES is the number of columns read from the prediction/label CSV files.
SEED = 42

NUM_FEATURES = 5000
NUM_CLASSES = 3993


def _load_frozen(checkpoint_path):
    """Restore a ``Net`` from ``checkpoint_path``, call ``freeze()`` on it and return it."""
    restored = Net.load_from_checkpoint(checkpoint_path)
    restored.freeze()
    return restored


# The two checkpointed networks that are scored in this notebook.
model_2 = _load_frozen("../../code/saved_models/neural_network_combined_2.ckpt")
model_3 = _load_frozen("../../code/saved_models/neural_network_combined_3.ckpt")

In [2]:
# Test split: a features file only (csv_labels=None). The `standardize` argument
# points at a pickled scaler — presumably applied by CSVDataset; confirm in dataset.py.
test_dataset_options = dict(
    csv_features="test_features.csv",
    csv_labels=None,
    standardize="../../code/saved_models/scaler_combined.pkl",
)
test_data = CSVDataset("../../data/expanded/", **test_dataset_options)

# Reuse the batch size stored in model_2's hyper-parameters.
test_batch_size = model_2.hparams.batch_size
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=test_batch_size)

In [4]:
# The first member's predictions are read back from an existing CSV (no header row,
# columns named 0..NUM_CLASSES-1) and converted straight to a NumPy array.
submission_1_csv = "../../public_data/submission1/final_test_predictions_nn.csv"
predictions_1 = pd.read_csv(submission_1_csv, names=range(NUM_CLASSES)).to_numpy()

In [5]:
# Run model_2 over the test set one batch at a time.
# The module is invoked as model_2(...) rather than model_2.forward(...) so that
# nn.Module.__call__ (and any registered hooks) runs. torch.no_grad() keeps the
# outputs free of autograd history, so the .numpy() conversion below does not
# depend on model_2.freeze() having been called earlier.
with torch.no_grad():
    batch_outputs_2 = [model_2(features) for features in test_dataloader]

# Concatenate the per-batch outputs along dim 0, apply a sigmoid, convert to NumPy.
predictions_2 = torch.sigmoid(torch.cat(batch_outputs_2, dim=0)).numpy()

In [6]:
# Run model_3 over the test set one batch at a time.
# The module is invoked as model_3(...) rather than model_3.forward(...) so that
# nn.Module.__call__ (and any registered hooks) runs. torch.no_grad() keeps the
# outputs free of autograd history, so the .numpy() conversion below does not
# depend on model_3.freeze() having been called earlier.
with torch.no_grad():
    batch_outputs_3 = [model_3(features) for features in test_dataloader]

# Concatenate the per-batch outputs along dim 0, apply a sigmoid, convert to NumPy.
predictions_3 = torch.sigmoid(torch.cat(batch_outputs_3, dim=0)).numpy()

In [9]:
# Weighted blend of the three prediction arrays (weights 0.45 / 0.2 / 0.35).
# The terms are added left to right, in the same order as the weights are listed.
blend_term_1 = 0.45 * predictions_1
blend_term_2 = 0.2 * predictions_2
blend_term_3 = 0.35 * predictions_3

ensemble_predictions = pd.DataFrame(blend_term_1 + blend_term_2 + blend_term_3)
ensemble_predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3983,3984,3985,3986,3987,3988,3989,3990,3991,3992
0,3.108937e-05,0.000101,9.620997e-04,0.555303,0.969274,0.000305,3.057567e-05,3.621244e-05,4.267275e-05,4.274122e-04,...,1.212185e-05,3.910655e-05,4.108277e-05,1.174764e-05,2.192807e-05,1.838839e-05,1.851060e-05,1.826285e-05,1.388625e-05,1.956728e-05
1,9.860905e-06,0.000013,1.792514e-05,0.000126,0.000015,0.000084,2.795740e-05,1.409140e-05,2.009029e-05,2.372759e-05,...,9.276532e-06,1.213027e-05,2.329128e-05,9.589566e-06,1.223225e-05,1.165941e-05,1.761966e-05,1.377298e-05,7.731064e-06,1.648891e-05
2,1.905720e-05,0.000013,1.830872e-05,0.000073,0.000004,0.000029,7.888940e-06,1.035677e-05,1.311132e-05,6.660327e-05,...,1.078622e-05,1.516657e-05,1.962099e-05,8.143626e-06,1.231534e-05,8.680965e-06,6.949415e-06,1.103840e-05,5.983242e-06,8.453922e-06
3,2.040626e-07,0.000002,6.950813e-07,0.000004,0.000012,0.000001,6.073677e-07,1.788910e-07,6.545234e-08,6.475216e-07,...,1.589005e-07,2.451867e-07,5.001527e-07,2.489459e-07,1.045035e-07,3.992606e-07,3.177060e-07,5.331798e-08,1.097176e-07,1.966957e-07
4,1.869619e-04,0.000116,1.383527e-04,0.314147,0.000072,0.000109,7.299317e-05,3.676221e-05,1.639396e-04,9.645365e-05,...,1.151525e-04,9.914888e-05,1.044078e-04,7.073706e-05,7.268348e-05,6.788839e-05,7.319118e-05,4.713295e-05,5.708177e-05,9.913470e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484,9.314351e-05,0.000155,8.715258e-05,0.000465,0.000040,0.000105,7.348468e-05,7.631504e-05,2.979627e-04,2.915976e-05,...,5.497240e-05,9.878089e-05,1.077743e-04,6.841723e-05,7.999352e-05,1.291539e-04,9.507179e-05,6.271216e-05,4.570590e-05,4.940888e-05
2485,1.352206e-05,0.000101,1.542979e-04,0.128217,0.000117,0.000022,4.132132e-05,1.984165e-05,4.754588e-04,5.911247e-05,...,1.433045e-05,2.538677e-05,2.425965e-05,1.305303e-05,2.973110e-05,2.241362e-05,1.426131e-05,2.022973e-05,8.282485e-06,8.680453e-06
2486,3.772350e-05,0.000131,1.367427e-04,0.656208,0.000448,0.000060,1.098349e-04,4.981020e-05,1.431939e-04,4.424132e-05,...,2.180268e-05,5.496768e-05,4.568112e-05,1.912877e-05,3.563252e-05,4.177305e-05,3.933441e-05,1.424834e-05,1.728293e-05,1.682556e-05
2487,1.170001e-03,0.000041,3.292150e-05,0.000070,0.000013,0.000802,1.304044e-04,5.186724e-05,4.958751e-05,1.840150e-05,...,1.982029e-04,7.626776e-05,9.130443e-05,1.364736e-04,4.229788e-05,9.959706e-05,1.145564e-04,3.242202e-05,1.303668e-04,2.369455e-04


In [10]:
# Write the blended predictions without index or header row.
# NOTE(review): the frame is cast to float16 first, which keeps roughly three
# significant decimal digits — confirm that precision is acceptable for the submission.
ensemble_half_precision = ensemble_predictions.astype("float16")
ensemble_half_precision.to_csv(
    "../../public_data/submission1/final_test_predictions_nn_ensemble.csv",
    header=False,
    index=False,
)

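### Scratch experiments from here (these cells rely on names such as `y_valid`, `predictions`, `model` and `mlp` that are not defined above)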

In [51]:
# Score `predictions` against `y_valid` with the project's LRAP helper (from utils).
# NOTE(review): neither `y_valid` nor `predictions` is assigned in any earlier cell
# of this notebook as shown — confirm where they come from before re-running.
LRAP(y_valid, predictions)

0.6254337395360583

In [23]:
# Threshold the scores at 0.5 (kept as floats), then summarise the number of
# positive entries per row.
binary_pred = (predictions >= 0.5).astype("float")
positives_per_row = binary_pred.sum(axis=1)
pd.Series(positives_per_row).describe()

count    1314.000000
mean        2.487062
std         1.777896
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max         9.000000
dtype: float64

In [None]:
# Evaluate the thresholded predictions with the project's `perfection` helper (from utils).
# NOTE(review): `y_valid` is not assigned in any earlier cell shown here, and this cell
# has no recorded execution count — confirm it still runs.
perfection(y_valid, binary_pred)

In [7]:
# Training split, built with CSVDataset's default file arguments and a different
# scaler file ("scaler.pkl") than the test dataset above.
train_data = CSVDataset(
    "../../data/expanded/", standardize="../../code/saved_models/scaler.pkl"
)

# NOTE(review): `model` is not assigned anywhere in the cells above (only `model_2`
# and `model_3` are) — confirm which network this cell was run against.
train_batch_size = model.hparams.batch_size
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=train_batch_size)

# Training labels as a NumPy array; the file has no header row.
y_train = pd.read_csv(
    "../../data/expanded/train_labels.csv", names=range(NUM_CLASSES)
).to_numpy()

In [8]:
# Run `model` over the training loader; each batch is a (features, labels) pair and
# only the features are fed to the network.
# The module is invoked as model(...) rather than model.forward(...) so that
# nn.Module.__call__ (and any registered hooks) runs, and torch.no_grad() keeps the
# outputs free of autograd history so the .numpy() conversion below cannot fail.
# NOTE(review): `model` is not assigned in any earlier cell shown — confirm its origin.
with torch.no_grad():
    batch_outputs = [model(features) for features, _ in train_dataloader]

# Concatenate the per-batch outputs along dim 0, apply a sigmoid, convert to NumPy.
predictions = torch.sigmoid(torch.cat(batch_outputs, dim=0)).numpy()

In [9]:
# Score the training-set predictions against the training labels with the project's
# LRAP helper (from utils).
LRAP(y_train, predictions)

0.9523113911191974

In [10]:
# Threshold the training-set scores at 0.5 (kept as floats), then summarise the
# number of positive entries per row.
binary_pred = (predictions >= 0.5).astype("float")
positives_per_row = binary_pred.sum(axis=1)
pd.Series(positives_per_row).describe()

count    15511.000000
mean         4.269937
std          1.803299
min          0.000000
25%          3.000000
50%          4.000000
75%          6.000000
max         18.000000
dtype: float64

In [None]:
# Evaluate the thresholded training predictions with the project's `perfection`
# helper (from utils). This cell has no recorded execution count.
perfection(y_train, binary_pred)

In [94]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [103]:
# Fit a 1500-component PCA on the training label matrix; fit() returns the fitted
# estimator, so construction and fitting are chained.
pca = PCA(n_components=1500, random_state=SEED).fit(y_train)

In [104]:
# Project both label matrices with the PCA fitted on y_train (train first, then valid).
# NOTE(review): `y_valid` is not assigned in any earlier cell shown — confirm its origin.
y_train_reduced, y_valid_reduced = pca.transform(y_train), pca.transform(y_valid)

In [105]:
# Fit a min-max scaler on the PCA-reduced training labels; fit() returns the fitted
# scaler, so construction and fitting are chained.
rescale = MinMaxScaler().fit(y_train_reduced)

In [106]:
# Apply the fitted min-max scaler to both reduced label matrices (train first, then valid).
y_train_rescaled, y_valid_rescaled = (
    rescale.transform(y_train_reduced),
    rescale.transform(y_valid_reduced),
)

In [107]:
# Round trip for the validation labels: undo the min-max scaling first, then map
# the result back through the PCA's inverse transform.
y_valid_unscaled = rescale.inverse_transform(y_valid_rescaled)
temp = pca.inverse_transform(y_valid_unscaled)

In [108]:
# Binarise the reconstruction in place, step 1: every entry at or above 0.5 becomes 1.
temp[temp >= 0.5] = 1

In [109]:
# Binarise the reconstruction in place, step 2: every entry below 0.5 becomes 0.
temp[temp < 0.5] = 0

In [110]:
# Number of entries where the thresholded reconstruction differs from y_valid:
# total entry count minus the count of entries that compare equal.
total_entries = y_valid.shape[0] * y_valid.shape[1]
matching_entries = (temp == y_valid).sum()
total_entries - matching_entries

600

In [4]:
# Score `mlp` on the transformed training features; the printed label calls the result LRAP.
# NOTE(review): `mlp` and `X_train_transformed` are not assigned in any cell shown, and
# `y_train` is converted to a NumPy array in an earlier cell, whereas `.to_numpy()` here
# implies a DataFrame — confirm the state this cell expects.
score_train = mlp.score(X_train_transformed, y_train.to_numpy())
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.9692


In [5]:
# Score `mlp` on the transformed validation features; the printed label calls the result LRAP.
# NOTE(review): `mlp`, `X_valid_transformed` and `y_valid` are not assigned in any cell
# shown — confirm where they come from before re-running.
score_valid = mlp.score(X_valid_transformed, y_valid.to_numpy())
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5839


In [24]:
# Print the `perfection` score as a percentage with three decimals (the value is
# multiplied by 100, so it is presumably a fraction in [0, 1] — confirm in utils).
# NOTE(review): `predictions_valid` is not assigned in any cell shown.
print(f"{perfection(y_valid, predictions_valid) * 100:.3f}% of examples perfectly predicted")

3.479% of examples perfectly predicted


In [6]:
# Compare the prediction and the true labels for a single validation example.
sample_idx = 123

# The double brackets keep the selection two-dimensional (one row) for predict();
# [0] then takes the single prediction row back out.
# Assumes X_valid_transformed is an array whose rows align positionally with the
# rows of y_valid — TODO confirm.
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]

# The feature row above is selected by position, so the label row is selected by
# position too (.iloc). .loc would treat sample_idx as an index label, which matches
# the position only when y_valid carries a default 0..n-1 index.
actual = y_valid.iloc[sample_idx].to_numpy()

In [7]:
# Indices of the non-zero entries in the sample's true label vector.
np.nonzero(actual)

(array([342, 418, 637, 682, 906, 992], dtype=int64),)

In [8]:
# Indices of the non-zero entries in the sample's predicted label vector.
np.nonzero(sample_pred)

(array([128, 358, 637, 682, 992], dtype=int64),)