# ML classifiers

This notebook takes as input the output from `create_dfs.ipynb` and trains Logistic Regression and Random Forest classifiers.

Inputs:
- `final_df`: dataset for training and testing the model
- `validation_df`: dataset for validation of the model

Outputs:
- Accuracy, recall, precision and F1 score (mean ± standard deviation over the repeated runs) of the Random Forest and Logistic Regression classifiers on `validation_df`

Import libraries and set the input paths, the test-split size, the number of repeated runs and the random seed.

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score
from statistics import mean, stdev

path_to_final_df = ""
test_size = 0.3 # test_size for train_test_split

path_to_validation_df = ""
cvs = 5 # Number of cross validations
# Set seed
SEED=42
random.seed(SEED)
np.random.seed(SEED)


## Create train, test and validation datasets

Load `final_df` and create train and test datasets.

In [None]:
# Load data
triplets = pd.read_csv(path_to_final_df, sep = "\t")

# Shuffle the dataframe before splitting
triplets = triplets.sample(frac = 1)

# Divide X and y (np.fromstring to obtain numpy arrays)
X = triplets["Concat_emb"].apply(lambda x: np.fromstring(x.replace('\n','').replace('[','').replace(']','').replace('  ',' '), sep=' '))
y = triplets["Label"].values

# Stack the arrays to create a matrix
X = np.stack(X.values, axis = 0)
# Create training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

Load `validation_df` dataset and create validation set.

In [None]:
# Load validation_df
validation_embs = pd.read_csv(path_to_validation_df, sep = "\t")

# Shuffle data
validation_embs = validation_embs.sample(frac = 1)

# Separate data from labels
X_val = validation_embs["Concat_emb"].apply(lambda x: np.fromstring(x.replace('\n','').replace('[','').replace(']','').replace('  ',' '), sep=' '))
true_val = validation_embs["Label"].values

# Stack the arrays to create a matrix
X_val = np.stack(X_val.values, axis = 0)

## Random Forest Classifier

Define the model, train it on the training set and evaluate it on the validation set.<br>

In [None]:
# Validation metrics collected over the repeated runs (one entry per run).
accs = []
precisions = []
recalls = []
f1s = []

# Each iteration fits a fresh Random Forest on the same training split and
# scores it on the validation set; only the forest's own randomness varies.
for cv in range(cvs):
    print(f"Iter: {cv+1}")
    # A distinct but fixed seed per run keeps the runs different from each
    # other while making the whole cell reproducible when it is re-executed.
    clf = RandomForestClassifier(random_state=SEED + cv)

    # Train the model
    clf.fit(X_train, y_train)

    # Make predictions
    val_preds = clf.predict(X_val)

    # Take results
    acc = accuracy_score(true_val, val_preds)
    recall = recall_score(true_val, val_preds)
    precision = precision_score(true_val, val_preds)
    f1 = f1_score(true_val, val_preds)

    # Append results
    precisions.append(precision)
    recalls.append(recall)
    accs.append(acc)
    f1s.append(f1)

print("Final results...")
for metric_name, metric_values in (
    ("Accuracy", accs),
    ("Recall", recalls),
    ("Precision", precisions),
    ("F1", f1s),
):
    # statistics.stdev needs at least two data points; with a single run the
    # spread is undefined, so report NaN instead of raising StatisticsError.
    metric_spread = stdev(metric_values) if len(metric_values) > 1 else float("nan")
    print(f"Mean {metric_name}: {mean(metric_values)} +/- {metric_spread}")


## Logistic Regression

Define the model, train it on the training set and evaluate it on the validation set.<br>

In [None]:
# Validation metrics collected over the repeated runs (one entry per run).
accs = []
precisions = []
recalls = []
f1s = []

# Each iteration fits a fresh Logistic Regression on the same training split
# and scores it on the validation set.
# NOTE(review): per the scikit-learn docs, `random_state` only affects the
# 'sag', 'saga' and 'liblinear' solvers, so with the default solver every run
# is expected to give identical metrics (spread of 0) - confirm whether the
# repetition is intended here.
for cv in range(cvs):
    print(f"Iter: {cv+1}")
    # Fixed per-run seed, so the cell stays reproducible if a solver that
    # shuffles the data is selected later.
    clf = LogisticRegression(random_state=SEED + cv)

    # Train the model
    clf.fit(X_train, y_train)

    # Make predictions
    val_preds = clf.predict(X_val)

    # Take results
    acc = accuracy_score(true_val, val_preds)
    recall = recall_score(true_val, val_preds)
    precision = precision_score(true_val, val_preds)
    f1 = f1_score(true_val, val_preds)

    # Append results
    precisions.append(precision)
    recalls.append(recall)
    accs.append(acc)
    f1s.append(f1)

print("Final results...")
for metric_name, metric_values in (
    ("Accuracy", accs),
    ("Recall", recalls),
    ("Precision", precisions),
    ("F1", f1s),
):
    # statistics.stdev needs at least two data points; with a single run the
    # spread is undefined, so report NaN instead of raising StatisticsError.
    metric_spread = stdev(metric_values) if len(metric_values) > 1 else float("nan")
    print(f"Mean {metric_name}: {mean(metric_values)} +/- {metric_spread}")