This notebook uses the CSVs created with the OULAD - 1 - Feature engineering notebook. Make sure to run the feature engineering code before continuing with this notebook. Additionally, this notebook uses some non-standard libraries which you may need to install first, such as [PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/stable/), and [CatBoost](https://catboost.ai/).

We furthermore use the 'pass-fail' scenario as an example for this notebook. Results for the other OULAD scenarios can easily be obtained by altering the CSV file names.

### Import libraries, load and pre-process data

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as torch_utils

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Load the engineered features and labels for the 'pass-fail' scenario as
# NumPy arrays; the label file is flattened to a 1-D array with ravel().
X = pd.read_csv('oulad_pass_fail_x.csv').to_numpy()
y = pd.read_csv('oulad_pass_fail_y.csv').to_numpy().ravel()

# Number of feature columns and number of distinct label values.
input_dim = X.shape[1]
output_dim = len(np.unique(y))

# Standardize the features with a scaler fitted on the full dataset.
# NOTE(review): the scaler is fitted before any train/test split, so rows that
# later end up in a test set also contribute to the scaling statistics --
# confirm this is acceptable for the reported scores.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Cross-validation experiments with scikit-learn, XGBoost, and CatBoost

In [None]:
def random_cross_val(X, y, k, classifier):
    """Score one classifier type on ``k`` random 80/20 train/test splits.

    Round ``i`` splits the data with ``random_state=i``, fits a freshly
    constructed model on the training part and scores its predictions on the
    held-out part.

    Args:
        X: Feature matrix, passed to ``train_test_split`` and the estimator.
        y: Labels aligned with the rows of ``X``.
        k: Number of rounds (splits) to run.
        classifier: Key selecting the model: 'rf' (random forest), 'lr'
            (logistic regression), 'mlp' (scikit-learn MLP), 'gb' (XGBoost)
            or 'cb' (CatBoost, configured with ``task_type='GPU'``).

    Returns:
        A tuple ``(acc, f1)`` of two lists holding the accuracy and the F1
        score of each round, in round order.

    Raises:
        ValueError: If ``classifier`` is not one of the supported keys.
    """
    # Factories are lazy so that every round gets a brand-new, unfitted model.
    factories = {
        'rf': lambda: RandomForestClassifier(random_state=42),
        'lr': lambda: LogisticRegression(max_iter=1000, random_state=42),
        'mlp': lambda: MLPClassifier(hidden_layer_sizes=(30, 10), random_state=42),
        'gb': lambda: XGBClassifier(random_state=42),
        'cb': lambda: CatBoostClassifier(task_type='GPU', silent=True, random_state=42),
    }
    # Fail fast with a clear error instead of calling .fit() on None later on.
    if classifier not in factories:
        raise ValueError(
            f"Classifier {classifier} is not a valid classifier. "
            "Choose from: rf, lr, mlp, gb, or cb."
        )
    make_classifier = factories[classifier]

    acc, f1 = [], []
    for i in range(k):
        print(f"Round {i} of {k}")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i
        )

        clf = make_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    return acc, f1

In [None]:
# Run 10 rounds per model; each call returns the per-round accuracy and F1 lists.
acc_rf, f1_rf = random_cross_val(X, y, 10, 'rf')  # random forest
acc_lr, f1_lr = random_cross_val(X, y, 10, 'lr')  # logistic regression
acc_mlp, f1_mlp = random_cross_val(X, y, 10, 'mlp')  # scikit-learn MLP
acc_gb, f1_gb = random_cross_val(X, y, 10, 'gb')  # XGBoost
acc_cb, f1_cb = random_cross_val(X, y, 10, 'cb')  # CatBoost

### Cross-validation experiments with PyTorch

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network: in_dim -> 30 -> 10 -> out_dim.

    ReLU is applied after the first two linear layers; the output layer
    returns its raw values without any activation.
    """

    def __init__(self, in_dim, out_dim):
        """Create the three linear layers and the shared ReLU module."""
        super().__init__()
        self.input_layer = nn.Linear(in_features=in_dim, out_features=30)
        self.hidden_layer1 = nn.Linear(in_features=30, out_features=10)
        self.output_layer = nn.Linear(in_features=10, out_features=out_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer values for the input batch ``x``."""
        first_hidden = self.relu(self.input_layer(x))
        second_hidden = self.relu(self.hidden_layer1(first_hidden))
        return self.output_layer(second_hidden)
    
def train_model(X, y, input_dim, output_dim, loss_fn, lr, optim_str, num_epochs, batch_size):
    """Build a new ``NeuralNetwork`` and train it on shuffled mini-batches.

    Args:
        X: Training samples; iterated row by row and paired with ``y``.
        y: Training targets, aligned with ``X``.
        input_dim: Input size handed to ``NeuralNetwork``.
        output_dim: Output size handed to ``NeuralNetwork``.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning the loss.
        lr: Learning rate for the optimizer.
        optim_str: ``'adam'`` selects Adam; any other value selects SGD.
        num_epochs: Number of full passes over the training data.
        batch_size: Number of samples per mini-batch.

    Returns:
        The trained model.
    """
    model = NeuralNetwork(input_dim, output_dim)

    # Adam only when requested by name; everything else falls back to SGD.
    optimizer_cls = optim.Adam if optim_str == 'adam' else optim.SGD
    optimizer = optimizer_cls(model.parameters(), lr=lr)

    samples = list(zip(X, y))
    batches = torch.utils.data.DataLoader(samples, batch_size=batch_size, shuffle=True)

    for _ in range(num_epochs):
        model.train()
        for inputs, labels in batches:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()

    return model

In [None]:
# Training hyper-parameters shared by all rounds.
N_EPOCHS = 100
BATCH_SIZE = 64
loss_fn = nn.CrossEntropyLoss()
lr = 0.02
optimizer = 'adam'

# Convert once: float32 features for the network, int64 class labels for the loss.
X_torch = torch.tensor(X, dtype=torch.float32)
y_torch = torch.tensor(y, dtype=torch.int64)

# Same protocol as the scikit-learn experiments: 10 random 80/20 splits,
# with the split of round i seeded by random_state = i.
acc_nn, f1_nn = [], []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_torch, y_torch, test_size = 0.2, random_state = i)

    # Fix the seed so weight initialisation and batch shuffling are repeatable,
    # matching the fixed random_state = 42 used for the other models.
    torch.manual_seed(42)
    model = train_model(X_train, y_train, input_dim, output_dim, loss_fn, lr, optimizer, N_EPOCHS, BATCH_SIZE)

    model.eval()
    with torch.no_grad():
        # as_tensor reuses an existing float32 tensor instead of copying it;
        # torch.tensor(tensor) makes a redundant copy and emits a UserWarning.
        predictions = model(torch.as_tensor(X_test, dtype=torch.float32))

    # The predicted class is the index of the largest output value per row.
    y_pred = predictions.argmax(dim=1).numpy()

    acc_nn.append(accuracy_score(y_test, y_pred))
    f1_nn.append(f1_score(y_test, y_pred))

### Store results as CSV

In [None]:
# Gather the accuracy and F1 lists of every model into one table;
# each list becomes one column named after its metric and model.
df = pd.DataFrame({'acc_rf': acc_rf, 'f1_rf': f1_rf, 'acc_lr': acc_lr, 'f1_lr': f1_lr,
                 'acc_mlp': acc_mlp, 'f1_mlp': f1_mlp, 'acc_nn': acc_nn, 'f1_nn': f1_nn,
                 'acc_gb': acc_gb, 'f1_gb': f1_gb, 'acc_cb': acc_cb, 'f1_cb': f1_cb})

# Write the table to CSV without the DataFrame index column.
df.to_csv('oulad_pass_fail_central.csv', index = False)