This notebook uses the CSVs created with the EdNet - 1 - Feature engineering notebook. Make sure to run the feature engineering code before continuing with this notebook. Additionally, this notebook uses some non-standard libraries which you may need to install first, such as [PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/stable/), and [CatBoost](https://catboost.ai/).

### Import libraries

In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as torch_utils
import torch.nn.functional as F

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### Define function to create final question features based on users in training set

In [None]:
def create_question_features(x):
    """Add running question, part and user performance features to ``x``.

    Rows are processed in ascending ``timestamp`` order. The features of a
    row are derived only from rows processed before it; the running totals
    are updated with the row's own outcome afterwards, so a row never
    contributes to its own features.

    Args:
        x: DataFrame with the columns ``timestamp``, ``part``,
            ``question_id``, ``user_id`` and ``correct_response``.
            ``correct_response`` is used arithmetically (presumably 0/1).

    Returns:
        A copy of ``x`` sorted by ``timestamp`` with three extra columns:

        * ``q_acc``: share of earlier answers to the same question that
          were correct.
        * ``part_acc``: share of earlier answers in the same part that were
          correct.
        * ``usr_excess_correct``: mean over the user's earlier answers of
          ``correct_response`` minus the accuracy expected at that time.

        Each column is NaN the first time its question / part / user occurs.
    """
    ordered = x.sort_values('timestamp')

    # Running totals keyed by question, part and user respectively.
    question_stats = {}
    part_stats = {}
    user_stats = {}

    q_acc_values, part_acc_values, usr_excess_values = [], [], []

    rows = zip(ordered['part'], ordered['question_id'],
               ordered['user_id'], ordered['correct_response'])
    for part, q_id, u_id, correct in rows:
        q_entry = question_stats.get(q_id)
        p_entry = part_stats.get(part)
        u_entry = user_stats.get(u_id)

        # Accuracies as they stood *before* this answer is counted.
        prior_q_acc = np.nan if q_entry is None else q_entry['n_correct'] / q_entry['n_ans']
        prior_part_acc = np.nan if p_entry is None else p_entry['n_correct'] / p_entry['n_ans']

        # Expected outcome: question history if available, else part
        # history, else a neutral 0.5.
        if q_entry is not None:
            expected = prior_q_acc
        elif p_entry is not None:
            expected = prior_part_acc
        else:
            expected = 0.5
        excess = correct - expected

        q_acc_values.append(prior_q_acc)
        if q_entry is None:
            question_stats[q_id] = {'n_ans': 1, 'n_correct': correct}
        else:
            q_entry['n_ans'] += 1
            q_entry['n_correct'] += correct

        part_acc_values.append(prior_part_acc)
        if p_entry is None:
            part_stats[part] = {'n_ans': 1, 'n_correct': correct}
        else:
            p_entry['n_ans'] += 1
            p_entry['n_correct'] += correct

        if u_entry is None:
            usr_excess_values.append(np.nan)
            user_stats[u_id] = {'n_ans': 1, 'sum_excess_correct': excess}
        else:
            usr_excess_values.append(u_entry['sum_excess_correct'] / u_entry['n_ans'])
            u_entry['n_ans'] += 1
            u_entry['sum_excess_correct'] += excess

    ordered['q_acc'] = q_acc_values
    ordered['part_acc'] = part_acc_values
    ordered['usr_excess_correct'] = usr_excess_values

    return ordered

### Define functions for splitting data and training models

In [None]:
def custom_train_test_split(full_df, train_ratio, seed):
    """Split ``full_df`` into train and test sets at the user level.

    All rows of a given user end up on the same side of the split, so no
    user appears in both sets.

    Args:
        full_df: DataFrame containing a ``user_id`` column.
        train_ratio: Fraction of the unique users assigned to the training
            set; the number of training users is
            ``int(n_users * train_ratio)``.
        seed: Seed for the user shuffle, making the split reproducible.

    Returns:
        A ``(train_df, test_df)`` tuple of row subsets of ``full_df``.
    """
    user_ids = full_df['user_id'].unique()

    # A private legacy RandomState produces the same shuffle as
    # np.random.seed(seed) followed by np.random.choice, but does not
    # reseed the process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(seed)
    shuffled_user_ids = rng.choice(user_ids, len(user_ids), replace=False)

    split_index = int(len(shuffled_user_ids) * train_ratio)
    train_user_ids = shuffled_user_ids[:split_index]
    test_user_ids = shuffled_user_ids[split_index:]

    train_df = full_df.loc[full_df['user_id'].isin(train_user_ids)]
    test_df = full_df.loc[full_df['user_id'].isin(test_user_ids)]

    return train_df, test_df

In [None]:
def train_model(df, classifier, train_ratio, seed):
    """Train one classifier on a user-level split and score it on the test set.

    Args:
        df: Feature DataFrame. Must contain the columns ``timestamp``,
            ``solving_id``, ``question_id``, ``elapsed_time``, ``user_id``,
            ``part`` and ``correct_response``; every remaining column is
            used as a model input.
        classifier: One of ``'mlp'``, ``'lr'``, ``'rf'``, ``'gb'`` (XGBoost)
            or ``'cb'`` (CatBoost).
        train_ratio: Fraction of users placed in the training set.
        seed: Seed for the train/test user split.

    Returns:
        A ``(accuracy, auc)`` tuple measured on the held-out users.

    Raises:
        ValueError: If ``classifier`` is not one of the supported names.
    """
    # Fail fast on a bad name instead of printing and then crashing on
    # ``None.fit`` further down.
    if classifier not in ('mlp', 'lr', 'rf', 'gb', 'cb'):
        raise ValueError(f"Classifier {classifier} is not a valid choice. Select from mlp, lr, rf, gb or cb.")

    df_copy = df.copy()
    # Only the XGBoost ('gb') path keeps NaNs; every other classifier gets
    # missing values replaced by 0.
    if classifier != 'gb':
        df_copy = df_copy.fillna(0)

    # Honour the caller's ratio (previously a hard-coded 0.8 was used here).
    train_df, test_df = custom_train_test_split(df_copy, train_ratio, seed)

    # Identifier, raw and target columns that must not be fed to the model.
    non_feature_columns = ['timestamp', 'solving_id', 'question_id', 'elapsed_time',
                           'user_id', 'part', 'correct_response']
    X_train = train_df.drop(columns=non_feature_columns)
    y_train = train_df['correct_response']
    X_test = test_df.drop(columns=non_feature_columns)
    y_test = test_df['correct_response']

    if classifier == 'mlp':
        clf = MLPClassifier(hidden_layer_sizes = (16,8), random_state = 42)
    elif classifier == 'lr':
        clf = LogisticRegression(max_iter = 1000, random_state = 42)
    elif classifier == 'rf':
        clf = RandomForestClassifier(random_state = 42)
    elif classifier == 'gb':
        clf = XGBClassifier(tree_method = 'gpu_hist', random_state = 42)
    else:  # 'cb'
        clf = CatBoostClassifier(task_type = 'GPU', silent = True, random_state = 42)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # Probability of the positive class (second column) for the AUC.
    y_pred_probs = clf.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_probs)

    return accuracy, auc

### Read data and train scikit-learn, XGBoost, and CatBoost models

In [None]:
# Load the engineered features and append the running question/user features.
df = create_question_features(pd.read_csv('ednet_features_10000_users.csv'))

TRAIN_RATIO = 0.8

# One accuracy list and one AUC list per model, filled round by round.
accuracies_mlp, aucs_mlp = [], []
accuracies_lr, aucs_lr = [], []
accuracies_rf, aucs_rf = [], []
accuracies_gb, aucs_gb = [], []
accuracies_cb, aucs_cb = [], []

# Models in the order they are trained within each round.
model_runs = (
    ('mlp', accuracies_mlp, aucs_mlp),
    ('lr', accuracies_lr, aucs_lr),
    ('rf', accuracies_rf, aucs_rf),
    ('gb', accuracies_gb, aucs_gb),
    ('cb', accuracies_cb, aucs_cb),
)

# Ten rounds; the round index doubles as the seed of the user split.
for i in range(10):
    print(f"Round {i+1} of 10.")

    for model_name, accuracy_log, auc_log in model_runs:
        acc, auc = train_model(df, model_name, TRAIN_RATIO, i)
        accuracy_log.append(acc)
        auc_log.append(auc)

### Define PyTorch functions and train neural network

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network: in_dim -> 16 -> 8 -> out_dim.

    ReLU is applied after the first two linear layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self, in_dim, out_dim):
        """Create the three linear layers.

        Args:
            in_dim: Number of input features.
            out_dim: Number of output units.
        """
        super().__init__()
        self.input_layer = nn.Linear(in_dim, 16)
        self.hidden_layer1 = nn.Linear(16, 8)
        self.output_layer = nn.Linear(8, out_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer activations for the input batch ``x``."""
        first_hidden = self.relu(self.input_layer(x))
        second_hidden = self.relu(self.hidden_layer1(first_hidden))
        return self.output_layer(second_hidden)
    
def train_nn(X, y, input_dim, output_dim, loss_fn, lr, optim_str, num_epochs, batch_size):
    """Train a fresh ``NeuralNetwork`` with mini-batch gradient descent.

    Args:
        X: Training inputs, iterated sample by sample.
        y: Training targets aligned with ``X``.
        input_dim: Number of input features of the network.
        output_dim: Number of output units of the network.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning the loss.
        lr: Learning rate passed to the optimizer.
        optim_str: ``'adam'`` selects Adam; any other value selects SGD.
        num_epochs: Number of passes over the training data.
        batch_size: Mini-batch size.

    Returns:
        The trained model.
    """
    model = NeuralNetwork(input_dim, output_dim)

    optimizer_cls = optim.Adam if optim_str == 'adam' else optim.SGD
    optimizer = optimizer_cls(model.parameters(), lr=lr)

    samples = list(zip(X, y))
    dataloader = torch.utils.data.DataLoader(samples, batch_size=batch_size, shuffle=True)

    for _ in range(num_epochs):
        model.train()
        for features, labels in dataloader:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(features), labels)
            batch_loss.backward()
            optimizer.step()

    return model

In [None]:
# Hyper-parameters of the PyTorch network.
N_EPOCHS = 50
BATCH_SIZE = 512
loss_fn = nn.CrossEntropyLoss()
lr = 0.02
optimizer = 'adam'

# One output unit per distinct label value.
output_dim = len(np.unique(df['correct_response']))
# The network cannot consume NaNs, so missing values are replaced by 0.
df_copy = df.copy().fillna(0)

# Identifier, raw and target columns that must not be fed to the model.
non_feature_columns = ['timestamp', 'solving_id', 'question_id', 'elapsed_time',
                       'user_id', 'part', 'correct_response']

accuracies_nn, aucs_nn = [], []
# Ten rounds; the round index doubles as the seed of the user split.
for i in range(10):
    train_df, test_df = custom_train_test_split(df_copy, TRAIN_RATIO, i)
    X_train = train_df.drop(columns=non_feature_columns)
    y_train = train_df['correct_response']
    X_test = test_df.drop(columns=non_feature_columns)
    y_test = test_df['correct_response']

    input_dim = X_train.shape[1]
    X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
    X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy().ravel(), dtype=torch.int64)
    y_test = torch.tensor(y_test.to_numpy().ravel(), dtype=torch.int64)

    model = train_nn(X_train, y_train, input_dim, output_dim, loss_fn, lr, optimizer, N_EPOCHS, BATCH_SIZE)

    model.eval()
    with torch.no_grad():
        # X_test is already a float32 tensor; wrapping it in torch.tensor()
        # again would only copy it and trigger a UserWarning.
        predictions = model(X_test)

    # Predicted class = index of the largest output per row.
    _, y_pred = torch.max(predictions, dim=1)
    y_pred = y_pred.numpy()
    # Softmax probability of class 1, used for the AUC.
    y_pred_probs = F.softmax(predictions, dim=1).numpy()[:, 1]

    # Hand scikit-learn plain NumPy arrays rather than a torch tensor.
    y_true = y_test.numpy()
    accuracies_nn.append(accuracy_score(y_true, y_pred))
    aucs_nn.append(roc_auc_score(y_true, y_pred_probs))

### Store results as CSV

In [None]:
# Per-round metrics of every model, in the column order of the output file.
result_columns = [
    ('acc_mlp', accuracies_mlp), ('auc_mlp', aucs_mlp),
    ('acc_lr', accuracies_lr), ('auc_lr', aucs_lr),
    ('acc_rf', accuracies_rf), ('auc_rf', aucs_rf),
    ('acc_gb', accuracies_gb), ('auc_gb', aucs_gb),
    ('acc_cb', accuracies_cb), ('auc_cb', aucs_cb),
    ('acc_nn', accuracies_nn), ('auc_nn', aucs_nn),
]
df_results = pd.DataFrame(dict(result_columns))

# One row per round, without the index column.
df_results.to_csv('ednet_central_10000users.csv', index=False)