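This notebook uses the feature CSV (`kdd_cup_2015_features.csv`) created by the "KDD Cup 2015 - 1 - Feature engineering" notebook, so make sure to run the feature engineering code before continuing with this notebook. Additionally, this notebook uses some libraries that are not part of the Python standard library and that you may need to install first, such as [scikit-learn](https://scikit-learn.org/), [PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/stable/), and [CatBoost](https://catboost.ai/). Note that the XGBoost and CatBoost models below are configured to train on a GPU.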

### Import libraries, load and pre-process data

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as torch_utils
import torch.nn.functional as F

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the engineered feature table; every column except the enrollment id
# and the dropout label is used as a model input.
df = pd.read_csv('kdd_cup_2015_features.csv')
X = df.drop(['enroll_id', 'dropout'], axis=1).to_numpy()
y = np.ravel(df['dropout'].to_numpy())

# Network sizes: one input per feature column, one output per distinct label.
input_dim = X.shape[1]
output_dim = np.unique(y).size

# Standardise the features.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split is made, so rows that later end up in a test split
# contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

### Define cross-validation function and train scikit-learn, XGBoost, and CatBoost models

In [None]:
def random_cross_val(X, y, k, classifier):
    """Evaluate one classifier over ``k`` random 80/20 train/test splits.

    Round ``i`` (0-based) splits the data with ``random_state=i``, fits a
    freshly constructed classifier on the training part and scores it on the
    held-out part.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with ``X``. The F1 and AUC computations use the
            probability column at index 1 and default ``f1_score`` settings.
        k: Number of random splits to evaluate.
        classifier: One of ``'rf'`` (random forest), ``'lr'`` (logistic
            regression), ``'mlp'`` (scikit-learn MLP), ``'gb'`` (XGBoost,
            GPU) or ``'cb'`` (CatBoost, GPU).

    Returns:
        A tuple ``(acc, f1, auc)`` of three lists with one score per round.

    Raises:
        ValueError: If ``classifier`` is not one of the supported keys.
    """
    # Lazy factories: only the requested estimator is ever constructed, and a
    # new, unfitted instance is created for every round.
    factories = {
        'rf': lambda: RandomForestClassifier(random_state=42),
        'lr': lambda: LogisticRegression(max_iter=1000, random_state=42),
        'mlp': lambda: MLPClassifier(hidden_layer_sizes=(30, 10), random_state=42),
        'gb': lambda: XGBClassifier(tree_method='gpu_hist', random_state=42),
        'cb': lambda: CatBoostClassifier(task_type='GPU', silent=True, random_state=42),
    }

    # Fail fast with a clear error instead of printing a message and then
    # crashing on ``None.fit(...)``.
    if classifier not in factories:
        raise ValueError(
            f"Classifier {classifier} is not a valid classifier. "
            "Choose from: rf, lr, mlp, gb, or cb."
        )
    make_classifier = factories[classifier]

    acc, f1, auc = [], [], []
    for i in range(k):
        # Report rounds 1..k rather than 0..k-1.
        print(f"Round {i + 1} of {k}")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i
        )

        clf = make_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Probability of the class in column 1, used for ROC AUC.
        y_pred_probs = clf.predict_proba(X_test)[:, 1]

        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred_probs))

    return acc, f1, auc

In [None]:
# Run 10 random train/test rounds for each model family and keep the
# per-round accuracy, F1 and ROC AUC lists.
acc_rf, f1_rf, auc_rf = random_cross_val(X, y, k=10, classifier='rf')
acc_lr, f1_lr, auc_lr = random_cross_val(X, y, k=10, classifier='lr')
acc_mlp, f1_mlp, auc_mlp = random_cross_val(X, y, k=10, classifier='mlp')
acc_gb, f1_gb, auc_gb = random_cross_val(X, y, k=10, classifier='gb')
acc_cb, f1_cb, auc_cb = random_cross_val(X, y, k=10, classifier='cb')

### Define PyTorch functions and train neural network

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network: in_dim -> 30 -> 10 -> out_dim, with ReLU
    after each of the first two linear layers."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        # Layers are created in forward order.
        self.input_layer = nn.Linear(in_features=in_dim, out_features=30)
        self.hidden_layer1 = nn.Linear(in_features=30, out_features=10)
        self.output_layer = nn.Linear(in_features=10, out_features=out_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer activations for ``x`` (no final activation)."""
        first_hidden = self.relu(self.input_layer(x))
        second_hidden = self.relu(self.hidden_layer1(first_hidden))
        return self.output_layer(second_hidden)
    
def train_model(X, y, input_dim, output_dim, loss_fn, lr, optim_str, num_epochs, batch_size):
    """Train a new ``NeuralNetwork`` on ``(X, y)`` and return it.

    Args:
        X: Tensor of training inputs.
        y: Tensor of training targets, paired row-by-row with ``X``.
        input_dim: Input size passed to ``NeuralNetwork``.
        output_dim: Output size passed to ``NeuralNetwork``.
        loss_fn: Callable invoked as ``loss_fn(model_output, target)``; its
            result must support ``.backward()``.
        lr: Learning rate handed to the optimizer.
        optim_str: ``'adam'`` selects Adam; any other value selects SGD.
        num_epochs: Number of full passes over the training data.
        batch_size: Mini-batch size for the shuffled data loader.

    Returns:
        The trained model.
    """
    # Training is pinned to the CPU.
    device = torch.device("cpu")
    X = X.to(device)
    y = y.to(device)

    model = NeuralNetwork(input_dim, output_dim).to(device)

    # Adam only when explicitly requested; everything else falls back to SGD.
    optimizer_cls = optim.Adam if optim_str == 'adam' else optim.SGD
    optimizer = optimizer_cls(model.parameters(), lr=lr)

    dataset = torch.utils.data.TensorDataset(X, y)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for _ in range(num_epochs):
        model.train()
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()

    return model

In [None]:
# Training hyper-parameters for the PyTorch network.
N_EPOCHS = 100
BATCH_SIZE = 64

loss_fn = nn.CrossEntropyLoss()
lr = 0.02
optimizer = 'adam'

# float32 inputs and int64 class indices for the network and loss function.
X_torch = torch.tensor(X, dtype=torch.float32)
y_torch = torch.tensor(y, dtype=torch.int64)

# Inference runs on the CPU, like training in train_model.
device = torch.device("cpu")

acc_nn, f1_nn, auc_nn = [], [], []
for i in range(10):
    # Same split scheme as random_cross_val: 80/20 split seeded by the round index.
    X_train, X_test, y_train, y_test = train_test_split(X_torch, y_torch, test_size=0.2, random_state=i)

    model = train_model(X_train, y_train, input_dim, output_dim, loss_fn, lr, optimizer, N_EPOCHS, BATCH_SIZE)

    model.eval()
    with torch.no_grad():
        # as_tensor reuses an existing float32 tensor instead of re-wrapping it
        # with torch.tensor (needless copy plus a UserWarning); the input, not
        # the output, is what gets moved to the inference device.
        predictions = model(torch.as_tensor(X_test, dtype=torch.float32).to(device))

    # Predicted class = index of the largest output; probability of class 1
    # comes from a softmax over the outputs.
    _, y_pred = torch.max(predictions, dim=1)
    y_pred = y_pred.cpu().numpy()
    y_pred_probs = F.softmax(predictions, dim=1).cpu().numpy()[:, 1]

    # Hand plain NumPy labels to the scikit-learn metrics.
    y_true = torch.as_tensor(y_test).cpu().numpy()

    acc_nn.append(accuracy_score(y_true, y_pred))
    f1_nn.append(f1_score(y_true, y_pred))
    auc_nn.append(roc_auc_score(y_true, y_pred_probs))

### Store results as CSV

In [None]:
# One column per (metric, model) pair, named '<metric>_<model>', with one row
# per evaluation round. Models appear in the order rf, lr, mlp, nn, gb, cb and
# each contributes its acc, f1 and auc columns in that order.
df_results = pd.DataFrame({
    f'{metric}_{model_name}': scores
    for model_name, per_metric in (
        ('rf', (acc_rf, f1_rf, auc_rf)),
        ('lr', (acc_lr, f1_lr, auc_lr)),
        ('mlp', (acc_mlp, f1_mlp, auc_mlp)),
        ('nn', (acc_nn, f1_nn, auc_nn)),
        ('gb', (acc_gb, f1_gb, auc_gb)),
        ('cb', (acc_cb, f1_cb, auc_cb)),
    )
    for metric, scores in zip(('acc', 'f1', 'auc'), per_metric)
})

# Write the table without the row index.
df_results.to_csv('kdd_cup_2015_central.csv', index=False)