This notebook uses the CSVs created with the OULAD - 1 - Feature engineering notebook; specifically, it reads `data/kdd_cup_2015_features.csv`. Make sure to run the feature engineering code before continuing with this notebook. Additionally, this notebook uses the non-standard library [XGBoost](https://xgboost.readthedocs.io/en/stable/), which you may need to install before proceeding.

### Import libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from xgboost import XGBClassifier

### Define function to split data over local clients

In [None]:
def split_into_clients(X, y, n_clients, seed):
    """Randomly partition a dataset into ``n_clients`` disjoint client shards.

    The sample indices are shuffled with a generator seeded by ``seed`` and
    cut into ``n_clients`` parts with ``np.array_split``; when the samples
    cannot be divided evenly, the first shards receive one extra sample.

    Parameters
    ----------
    X : array-like supporting integer-array indexing (e.g. ``np.ndarray``)
        Feature matrix, one row per sample.
    y : array-like supporting integer-array indexing
        Labels aligned row-by-row with ``X``.
    n_clients : int
        Number of shards to create.
    seed : int
        Seed for the shuffle; the same seed always yields the same split.

    Returns
    -------
    list of tuple
        One ``(X_client, y_client)`` pair per client.
    """
    # A private RandomState reproduces exactly the permutation that
    # ``np.random.seed(seed)`` followed by ``np.random.choice`` would give,
    # but without resetting NumPy's global random state as a side effect.
    rng = np.random.RandomState(seed)
    n_samples = len(X)
    shuffled_indices = rng.choice(n_samples, n_samples, replace = False)

    return [
        (X[shard].copy(), y[shard])
        for shard in np.array_split(shuffled_indices, n_clients)
    ]

### Define function to train model

In [None]:
def train_model(X, y, seed):
    """Fit an XGBoost classifier on an 80/20 split and score the held-out part.

    ``seed`` drives both the train/test split and the classifier's
    ``random_state``.

    Returns a tuple ``(accuracy, f1, roc_auc)`` computed on the 20% test
    split. Accuracy and F1 use the predicted labels; ROC AUC uses column 1 of
    ``predict_proba``.
    """
    split = train_test_split(X, y, test_size = 0.2, random_state = seed)
    X_fit, X_eval, y_fit, y_eval = split

    # NOTE(review): 'gpu_hist' presumably needs a CUDA-enabled XGBoost build,
    # and newer XGBoost releases appear to deprecate it in favour of
    # tree_method='hist' with device='cuda' -- confirm against the installed
    # version before changing it.
    booster = XGBClassifier(tree_method = 'gpu_hist', random_state = seed)
    booster.fit(X_fit, y_fit)

    predicted_labels = booster.predict(X_eval)
    predicted_scores = booster.predict_proba(X_eval)[:, 1]

    accuracy = accuracy_score(y_eval, predicted_labels)
    f1 = f1_score(y_eval, predicted_labels)
    auc = roc_auc_score(y_eval, predicted_scores)
    return accuracy, f1, auc

### Read data and train model

In this code example the number of clients is set to 10. To obtain results for a different number of local clients, change `N_CLIENTS` to the desired value.

In [None]:
# Load the engineered feature table: every column except the enrollment id
# and the 'dropout' label is used as a model input.
df = pd.read_csv('data/kdd_cup_2015_features.csv')
X = df.drop(columns = ['enroll_id', 'dropout']).to_numpy()
y = df['dropout'].to_numpy().ravel()

input_dim = X.shape[1]
output_dim = len(np.unique(y))

# Standardize the features once, on the full dataset, before it is split
# over the clients.
scaler = StandardScaler()
X = scaler.fit_transform(X)

N_CLIENTS = 10
size_dataset = len(X)

# One entry per round (seed): the client scores averaged with weights
# proportional to each client's share of the dataset.
acc_arr, f1_arr, auc_arr = [], [], []
for seed in range(10):
    clients = split_into_clients(X, y, N_CLIENTS, seed)

    # Each row holds one client's (acc, f1, auc), already multiplied by the
    # client's weight, so summing a column gives the weighted average.
    weighted_scores = []
    for client_X, client_y in clients:
        client_scores = train_model(client_X, client_y, seed)
        weight_client = len(client_X) / size_dataset
        weighted_scores.append([score * weight_client for score in client_scores])

    acc_round, f1_round, auc_round = (sum(column) for column in zip(*weighted_scores))

    acc_arr.append(acc_round)
    f1_arr.append(f1_round)
    auc_arr.append(auc_round)

    print(f"Round {seed + 1} complete. Avg auc round: {auc_round}, Avg f1 round: {f1_round}")

### Store results as CSV

In [None]:
# Collect the per-round weighted scores (one row per seed) into a table.
df = pd.DataFrame({'acc': acc_arr, 'f1': f1_arr, 'auc': auc_arr})

# Derive the file name from N_CLIENTS so that runs with a different number of
# clients are stored under a matching name instead of overwriting the
# 10-client results. With N_CLIENTS = 10 the name is unchanged.
df.to_csv(f'kdd_cup_2015_local_{N_CLIENTS}clients.csv', index = False)