## Getting predictions on test data

In [1]:
# Needed to import custom code from other directories
import sys
sys.path.append('../../code')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataset import CSVDataset

from utils import LRAP, perfection
from train import Net

%matplotlib inline

# Seed passed as random_state to the PCA fitted further down in this notebook.
SEED = 42

# Dataset dimensions. NUM_CLASSES is also used to name the label columns when the
# training labels CSV is read.
NUM_CLASSES = 3993
NUM_FEATURES = 5000

# Restore the trained network from its checkpoint, then freeze it.
CHECKPOINT_PATH = "../../code/saved_models/neural_network_combined.ckpt"
model = Net.load_from_checkpoint(CHECKPOINT_PATH)
model.freeze()

In [2]:
# Test split: features only (no label file). The `standardize` argument points at
# the pickled scaler stored next to the model checkpoint; presumably CSVDataset
# applies it to the features — confirm in dataset.py.
TEST_SCALER_PATH = "../../code/saved_models/scaler_combined.pkl"
test_data = CSVDataset(
    "../../data/expanded/",
    csv_features="test_features.csv",
    csv_labels=None,
    standardize=TEST_SCALER_PATH,
)

# Iterate over the test set in batches of the size stored in the model's hparams.
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=model.hparams.batch_size,
)

In [3]:
# Run the network over the test set batch by batch and collect the raw outputs.
# The module is invoked as model(x) instead of model.forward(x) so that any
# registered hooks are honoured, and gradient tracking is disabled because this
# is inference only (it also keeps .numpy() valid regardless of requires_grad).
with torch.no_grad():
    batch_outputs = [model(features) for features in test_dataloader]

# Concatenate the per-batch outputs along the sample axis, squash them into
# (0, 1) with a sigmoid, and hand the result over as a NumPy array.
predictions = torch.sigmoid(torch.cat(batch_outputs, dim=0)).numpy()

In [6]:
# Wrap the prediction array in a DataFrame and display it (pandas abbreviates the
# view of large frames). Note that `predictions` is rebound from an array to a
# DataFrame here; the CSV export cell below relies on the DataFrame version.
predictions = pd.DataFrame(predictions)
predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3983,3984,3985,3986,3987,3988,3989,3990,3991,3992
0,3.676593e-05,0.000112,1.042378e-03,0.559985,9.390361e-01,0.000253,1.799524e-05,4.283153e-05,2.969171e-05,1.629235e-04,...,1.051946e-05,4.275094e-05,4.242583e-05,9.178982e-06,2.670355e-05,1.609088e-05,1.816434e-05,1.420076e-05,1.226968e-05,2.052980e-05
1,6.399192e-06,0.000011,5.614214e-06,0.000106,8.402646e-06,0.000026,8.041120e-06,7.867984e-06,8.602236e-06,5.897121e-06,...,8.765660e-06,7.889398e-06,1.355389e-05,7.217697e-06,6.430950e-06,1.101724e-05,1.122529e-05,3.238719e-06,7.436846e-06,1.418691e-05
2,1.085931e-05,0.000003,6.660365e-06,0.000058,4.442814e-07,0.000014,5.919027e-06,4.642799e-06,4.473854e-06,3.044094e-05,...,5.110882e-06,6.021321e-06,1.828984e-05,6.909427e-06,1.811475e-05,6.964594e-06,5.813019e-06,9.946471e-06,3.944291e-06,6.619838e-06
3,3.810915e-08,0.000001,1.729966e-07,0.000006,2.496302e-05,0.000002,1.697900e-07,1.056886e-07,4.744740e-09,7.212987e-08,...,1.491459e-07,6.375414e-08,1.018992e-07,4.881725e-08,3.415880e-08,1.508116e-07,4.523977e-08,1.626262e-08,2.707338e-08,6.945307e-08
4,2.605153e-04,0.000133,1.144774e-04,0.295935,5.228910e-05,0.000127,5.944542e-05,2.515619e-05,1.607373e-04,1.139463e-04,...,1.714389e-04,1.058863e-04,9.241945e-05,8.457553e-05,1.160852e-04,8.715515e-05,9.220537e-05,5.541691e-05,5.935677e-05,1.416644e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484,8.439861e-05,0.000208,8.424327e-05,0.000802,2.147939e-05,0.000067,7.354322e-05,5.496506e-05,2.362838e-04,1.990799e-05,...,5.218937e-05,1.138681e-04,1.083615e-04,6.220916e-05,1.054774e-04,2.216160e-04,1.387955e-04,6.126818e-05,4.370727e-05,5.372788e-05
2485,1.269877e-05,0.000088,2.347496e-04,0.234704,1.428709e-04,0.000015,5.523857e-05,1.443786e-05,5.857677e-04,1.992908e-05,...,1.665056e-05,2.528906e-05,2.469110e-05,1.181772e-05,4.164413e-05,3.082688e-05,1.375746e-05,2.679876e-05,5.952062e-06,1.158218e-05
2486,3.878100e-05,0.000202,1.626891e-04,0.694287,1.481197e-04,0.000061,1.355623e-04,6.249922e-05,1.534800e-04,4.560781e-05,...,2.707761e-05,6.978666e-05,4.826784e-05,1.633807e-05,5.881900e-05,6.162350e-05,5.012642e-05,1.251163e-05,1.847632e-05,1.853488e-05
2487,1.472609e-03,0.000034,1.452970e-05,0.000137,1.254621e-05,0.000707,8.582918e-05,1.394587e-05,1.064100e-05,7.261075e-06,...,2.545609e-04,5.656041e-05,6.027116e-05,1.633823e-04,3.119126e-05,9.800030e-05,8.647631e-05,1.803897e-05,1.224989e-04,2.296242e-04


In [11]:
# Write the predictions to CSV as float16, without index or header row.
# NOTE(review): float16 cannot represent the smallest probabilities shown in the
# table above (values around 1e-9 round to 0), which can introduce ties between
# classes — confirm this precision loss is acceptable for the downstream metric.
predictions.astype("float16").to_csv("final_test_predictions_nn.csv", index=False, header=False)

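### Scratch experiments from here (exploratory cells; several rely on variables such as `y_valid` and `mlp` that are not defined earlier in this notebook)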

In [51]:
# Evaluate LRAP of `predictions` against the validation labels.
# NOTE(review): `y_valid` is not defined in any cell shown above, and when the
# notebook is run top to bottom `predictions` holds the test-set output at this
# point — this cell appears to depend on state from an earlier session; confirm.
LRAP(y_valid, predictions)

0.6254337395360583

In [23]:
# Turn probabilities into hard 0/1 decisions with a 0.5 cut-off, then summarize
# how many labels end up predicted for each sample.
binary_pred = (predictions >= 0.5).astype("float")
labels_per_sample = binary_pred.sum(axis=1)
pd.Series(labels_per_sample).describe()

count    1314.000000
mean        2.487062
std         1.777896
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max         9.000000
dtype: float64

In [None]:
# Apply the project's `perfection` helper to the thresholded predictions and the
# validation labels (elsewhere in this notebook its result is reported as the
# share of examples predicted perfectly).
# NOTE(review): `y_valid` is not defined in any cell shown above — confirm where
# it is expected to come from before re-running.
perfection(y_valid, binary_pred)

In [7]:
# Training split. No csv_features / csv_labels are passed, so CSVDataset's own
# defaults decide which files are read.
# NOTE(review): this cell uses scaler.pkl, whereas the test set above is loaded
# with scaler_combined.pkl alongside the *_combined checkpoint — confirm that the
# scaler matches the loaded model.
TRAIN_SCALER_PATH = "../../code/saved_models/scaler.pkl"
train_data = CSVDataset("../../data/expanded/", standardize=TRAIN_SCALER_PATH)

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=model.hparams.batch_size,
)

# Training labels as a NumPy array. Explicit column names 0..NUM_CLASSES-1 are
# supplied, so read_csv treats the first row of the file as data.
y_train = pd.read_csv(
    "../../data/expanded/train_labels.csv",
    names=range(NUM_CLASSES),
).to_numpy()

In [8]:
# Run the network over the training set batch by batch; the labels yielded by
# the loader are not needed for prediction and are ignored.
# The module is invoked as model(x) instead of model.forward(x) so that any
# registered hooks are honoured, and gradient tracking is disabled because this
# is inference only (it also keeps .numpy() valid regardless of requires_grad).
with torch.no_grad():
    batch_outputs = [model(features) for features, _labels in train_dataloader]

# Concatenate the per-batch outputs along the sample axis, squash them into
# (0, 1) with a sigmoid, and hand the result over as a NumPy array.
predictions = torch.sigmoid(torch.cat(batch_outputs, dim=0)).numpy()

In [9]:
# LRAP of the network's sigmoid outputs against the training labels.
LRAP(y_train, predictions)

0.9523113911191974

In [10]:
# Turn the training-set probabilities into hard 0/1 decisions with a 0.5
# cut-off, then summarize how many labels are predicted for each sample.
binary_pred = (predictions >= 0.5).astype("float")
labels_per_sample = binary_pred.sum(axis=1)
pd.Series(labels_per_sample).describe()

count    15511.000000
mean         4.269937
std          1.803299
min          0.000000
25%          3.000000
50%          4.000000
75%          6.000000
max         18.000000
dtype: float64

In [None]:
# Apply the project's `perfection` helper to the thresholded training predictions
# (elsewhere in this notebook its result is reported as the share of examples
# predicted perfectly).
perfection(y_train, binary_pred)

In [94]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [103]:
# Fit a 1500-component PCA on the training label matrix; fit() returns the
# estimator itself, so construction and fitting can be chained.
pca = PCA(n_components=1500, random_state=SEED).fit(y_train)

In [104]:
# Project both label matrices into the fitted PCA space.
# NOTE(review): `y_valid` is not defined in any cell shown above — confirm its
# origin before re-running.
y_train_reduced, y_valid_reduced = (
    pca.transform(label_matrix) for label_matrix in (y_train, y_valid)
)

In [105]:
# Learn per-component min/max from the PCA-reduced training labels; fit()
# returns the scaler itself, so it can be chained onto the constructor.
rescale = MinMaxScaler().fit(y_train_reduced)

In [106]:
# Apply the fitted min-max scaling to both PCA-reduced label matrices.
y_train_rescaled, y_valid_rescaled = (
    rescale.transform(reduced) for reduced in (y_train_reduced, y_valid_reduced)
)

In [107]:
# Round trip back to label space: first undo the min-max scaling, then undo the
# PCA projection.
y_valid_unscaled = rescale.inverse_transform(y_valid_rescaled)
temp = pca.inverse_transform(y_valid_unscaled)

In [108]:
# Binarize the reconstruction in place: every entry at or above 0.5 becomes 1.
temp[temp >= 0.5] = 1

In [109]:
# Binarize the reconstruction in place: every entry below 0.5 becomes 0.
temp[temp < 0.5] = 0

In [110]:
# Number of label entries that the binarized PCA round trip got wrong:
# total entries minus the entries that match the original labels.
total_entries = y_valid.shape[0] * y_valid.shape[1]
matching_entries = (temp == y_valid).sum()
total_entries - matching_entries

600

In [4]:
# Score `mlp` on the transformed training features and print the result as LRAP.
# NOTE(review): `mlp` and `X_train_transformed` are not defined in any cell shown
# above, and `y_train` is converted to a NumPy array earlier in this notebook
# (arrays have no .to_numpy()) — this cell looks like a leftover from a different
# session; confirm before re-running.
score_train = mlp.score(X_train_transformed, y_train.to_numpy())
print(f"LRAP on training data: {score_train:.4f}")

LRAP on training data: 0.9692


In [5]:
# Score `mlp` on the transformed validation features and print the result as LRAP.
# NOTE(review): `mlp`, `X_valid_transformed` and `y_valid` are not defined in any
# cell shown above — this cell looks like a leftover from a different session;
# confirm before re-running.
score_valid = mlp.score(X_valid_transformed, y_valid.to_numpy())
print(f"LRAP on validation data: {score_valid:.4f}")

LRAP on validation data: 0.5839


In [24]:
# Report the result of `perfection` as a percentage with three decimals.
# NOTE(review): `predictions_valid` and `y_valid` are not defined in any cell
# shown above — confirm where they are expected to come from.
print(f"{perfection(y_valid, predictions_valid) * 100:.3f}% of examples perfectly predicted")

3.479% of examples perfectly predicted


In [6]:
# Pick one validation example and compare the model's prediction with the truth.
# The feature row is selected with a list index ([[sample_idx]]) and the first
# (only) prediction is taken with [0]; the true labels are looked up by index
# label via .loc.
# NOTE(review): `mlp`, `X_valid_transformed` and `y_valid` are not defined in any
# cell shown above — confirm their origin before re-running.
sample_idx = 123
sample_pred = mlp.predict(X_valid_transformed[[sample_idx]])[0]
actual = y_valid.loc[sample_idx].to_numpy()

In [7]:
# Indices of the non-zero entries in the true label vector of the chosen sample.
np.nonzero(actual)

(array([342, 418, 637, 682, 906, 992], dtype=int64),)

In [8]:
# Indices of the non-zero entries in the predicted label vector of the chosen sample.
np.nonzero(sample_pred)

(array([128, 358, 637, 682, 992], dtype=int64),)