# MLP classifier

This notebook takes the output of `create_dfs.ipynb` as input and trains an MLP classifier.

Inputs:
- `final_df`: dataset for training and testing the model
- `validation_df`: dataset for validation of the model

Outputs:
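- Mean and standard deviation, over the repeated runs, of the class-balanced accuracy, recall, precision and F1 of the MLP classifier on `validation_df`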

Import libraries and set paths for inputs and outputs.

In [None]:
import torch
from torch import nn
import pandas as pd
import random
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score
import numpy as np
from statistics import mean, stdev

# Input data
path_to_final_df = ""  # path to the tab-separated `final_df` file (train/test data)
test_size = 0.3  # test_size for train_test_split

# Model parameters
indim = 384  # 384 = 128 * 3 (BERTwalk embeddings are 128-dimensional; for BIONIC, set as appropriate)
h1 = 128  # by default 128
outdim = 2  # 2 classes (positive/negative triplets)
n_epochs = 300  # number of training epochs

path_to_validation_df = ""  # path to the tab-separated `validation_df` file

cvs = 5  # number of repeated training/evaluation runs (each with a freshly initialised model)

# Set seed for every source of randomness used in this notebook.
# PyTorch has its own generator (used for weight initialisation), so it
# must be seeded separately from `random` and NumPy.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


## Create train and test datasets

Load `final_df` and shuffle data.

In [None]:
# Load the tab-separated table and shuffle its rows. A fixed random_state
# makes this cell idempotent: re-running it yields the same order instead of
# depending on how much of the global NumPy RNG has already been consumed.
triplets = pd.read_csv(path_to_final_df, sep="\t").sample(frac=1, random_state=SEED)

# Parse every "Concat_emb" string into a 1-D float array and stack the rows
# into a matrix. Brackets and newlines are stripped first.
# NOTE(review): the strings look like printed NumPy arrays - confirm against
# the notebook that writes this file.
X = np.stack(
    [
        np.fromstring(
            emb.replace("\n", "").replace("[", "").replace("]", "").replace("  ", " "),
            sep=" ",
        )
        for emb in triplets["Concat_emb"]
    ],
    axis=0,
)
y = triplets["Label"].values

# Fail early, with a readable message, if the embeddings do not match the
# input size the model is configured for.
assert X.shape[1] == indim, f"expected {indim} features per row, got {X.shape[1]}"

# Create training and test set (seeded for the same reason as the shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=SEED
)

# Convert to torch tensors: float32 features, int64 class labels
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

Load `validation_df` for validation purposes.

In [None]:
# Read the tab-separated validation table and randomly permute its rows
validation_embs = pd.read_csv(path_to_validation_df, sep="\t").sample(frac=1)

# Turn each "Concat_emb" string into a 1-D float array (newlines and square
# brackets removed, doubled spaces collapsed) and stack them row-wise
X_val = np.stack(
    [
        np.fromstring(
            raw.translate(str.maketrans("", "", "\n[]")).replace("  ", " "),
            sep=" ",
        )
        for raw in validation_embs["Concat_emb"]
    ],
    axis=0,
)

# Ground-truth labels and features as torch tensors
true_val = torch.LongTensor(validation_embs["Label"].values)
X_val = torch.FloatTensor(X_val)

## Define the model

Define the MLP model and create an instance of it.<br>
The model consists of a single hidden layer with `h1` neurons and a ReLU activation function.

In [None]:
# Define the model
class mynet(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input is mapped to `h1` hidden units, passed through ReLU, and then
    mapped to `outdim` raw scores. No softmax is applied in the model.

    Args:
        indim: number of features in each input row.
        h1: number of hidden units.
        outdim: number of output scores per row.
    """

    def __init__(self, indim, h1, outdim):
        super().__init__()
        # input -> hidden projection
        self.l0 = nn.Linear(in_features=indim, out_features=h1)
        # hidden -> output projection
        self.output = nn.Linear(in_features=h1, out_features=outdim)

    def forward(self, x):
        """Return the unnormalised output scores for the batch `x`."""
        hidden = nn.functional.relu(self.l0(x))
        return self.output(hidden)

## Training and testing

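Train a freshly initialised model for `n_epochs` epochs, repeat this `cvs` times, and evaluate each trained model on `validation_df`. The mean and standard deviation of the metrics over the runs are printed at the end.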

In [None]:
# Validation metrics collected over the repeated runs (one entry per run)
precisions = []
recalls = []
accs = []
f1s = []


for cv in range(cvs):
    print(f"Iter: {cv+1}")
    # Create a new, randomly initialised instance of the model for this run
    model = mynet(indim, h1, outdim)

    # Set criterion and init optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.05)

    # Train the model: full-batch, i.e. one optimizer step on all of
    # X_train per epoch
    losses = []
    model.train()
    targets = y_train.squeeze()  # loop-invariant, so computed once
    for i in range(n_epochs):
        y_pred = model(X_train)
        loss = criterion(y_pred, targets)
        losses.append(loss.item())  # store a plain Python float
        # do backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print(f"Epoch: {i}, Loss: {loss}")

    # Evaluate on the validation set. Switch to eval mode so the model is
    # not left in training mode during inference (a no-op for the current
    # layers, but required as soon as dropout/batch-norm is added).
    model.eval()
    with torch.no_grad():
        pred_val = model(X_val)
        preds = pred_val.argmax(dim = 1)

    # Overall fraction of correct predictions
    tot_acc = ((preds == true_val).sum()/len(preds)).item()

    # Per-class accuracy: fraction of label-0 rows predicted 0 and of
    # label-1 rows predicted 1.
    # NOTE(review): "noass"/"ass" presumably mean "no association" /
    # "association" - confirm.
    neg_mask = true_val == 0
    pos_mask = true_val == 1
    noass_acc = ((preds[neg_mask] == 0).sum()/neg_mask.sum()).item()
    ass_acc = ((preds[pos_mask] == 1).sum()/pos_mask.sum()).item()

    # scikit-learn works on array-likes; pass NumPy arrays rather than tensors
    y_true_np = true_val.numpy()
    y_pred_np = preds.numpy()
    f1 = f1_score(y_true_np, y_pred_np)
    recall = recall_score(y_true_np, y_pred_np)
    precision = precision_score(y_true_np, y_pred_np)
    # The reported accuracy is the mean of the two per-class accuracies
    # (class-balanced accuracy), not `tot_acc`.
    acc = (noass_acc + ass_acc)/2

    # Append results
    precisions.append(precision)
    recalls.append(recall)
    accs.append(acc)
    f1s.append(f1)

print("Final results...")
for metric_name, values in (
    ("Accuracy", accs),
    ("Recall", recalls),
    ("Precision", precisions),
    ("F1", f1s),
):
    # stdev needs at least two data points; report 0.0 for a single run
    # instead of raising after all the training work is done.
    spread = stdev(values) if len(values) > 1 else 0.0
    print(f"Mean {metric_name}: {mean(values)} +/- {spread}")