This notebook uses the CSVs created with the OULAD - 1 - Feature engineering notebook. Make sure to run the feature engineering code before continuing with this notebook. Additionally, this notebook uses the non-standard library [XGBoost](https://xgboost.readthedocs.io/en/stable/), which you may need to install before proceeding.

Furthermore, this notebook uses the 'pass-fail' scenario as its example. Results for the other OULAD scenarios can be obtained by changing the CSV file names.

### Import libraries and define training functions

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier

In [None]:
def split_into_clients(X, y, n_clients, region_dummy, seed, region_columns=range(38, 51)):
    """Partition a dataset into per-client ``(X, y)`` subsets.

    Two splitting modes are supported:

    * ``region_dummy`` falsy: the sample indices are shuffled with a random
      generator seeded by ``seed`` and divided into ``n_clients`` nearly
      equal parts (``np.array_split``).
    * ``region_dummy`` truthy: one client is created per column index in
      ``region_columns``; a client receives every row whose value in that
      column is greater than zero. ``n_clients`` is still validated but is
      not used to form the split in this mode.

    Parameters
    ----------
    X : 2-D numpy array of features (rows are samples).
    y : 1-D numpy array of labels with the same length as ``X``.
    n_clients : number of clients for the random split.
    region_dummy : whether to split by indicator columns instead of randomly.
    seed : seed for the random split.
    region_columns : iterable of column indices used in region mode.
        Defaults to columns 38..50 (13 columns), the values this function
        originally hard-coded -- presumably the region dummy columns of the
        engineered feature matrix; verify against the feature CSV.

    Returns
    -------
    list of ``(X_client, y_client)`` tuples, one per client.

    Raises
    ------
    AssertionError
        If ``X`` and ``y`` differ in length, or ``n_clients`` is not in
        ``1..len(X)``.
    """
    n_samples = len(X)

    assert n_samples == len(y), 'Number of samples in X and y must be the same.'
    assert n_clients > 0, 'Number of clients must be greater than 0.'
    assert n_clients <= n_samples, 'Number of clients cannot be greater than number of samples.'

    if region_dummy:
        # Due to use of standard scaler should not check for = 1, but > 0.
        clients_indices = [
            np.where(X[:, col_idx] > 0)[0] for col_idx in region_columns
        ]
    else:
        # A private RandomState yields the same permutation as
        # np.random.seed(seed) followed by np.random.choice, but leaves the
        # global NumPy random state untouched.
        rng = np.random.RandomState(seed)
        shuffled_indices = rng.choice(n_samples, n_samples, replace=False)
        clients_indices = np.array_split(shuffled_indices, n_clients)

    return [(X[indices].copy(), y[indices]) for indices in clients_indices]

def train_model(X, y, seed):
    """Train an XGBoost classifier on one dataset and score it on a hold-out.

    The data is split 80/20 into train and test parts using ``seed``; the
    same ``seed`` is used as the classifier's ``random_state``.

    Returns a tuple ``(accuracy, f1)`` computed on the 20% test part.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    classifier = XGBClassifier(random_state=seed)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)

    accuracy = accuracy_score(labels_test, predictions)
    f1 = f1_score(labels_test, predictions)
    return accuracy, f1

### Read data, set relevant constants, and train models

In [None]:
# Load the feature matrix and the label vector of the pass-fail scenario.
features_path = 'oulad_pass_fail_x.csv'
labels_path = 'oulad_pass_fail_y.csv'
X = pd.read_csv(features_path).to_numpy()
y = pd.read_csv(labels_path).to_numpy().ravel()  # flatten labels to 1-D

# Standardize every feature column using statistics of the full dataset.
# NOTE(review): the scaler is fitted before train_model() makes its
# train/test split, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In this example the number of clients is set to 10. Change N_CLIENTS to obtain results for a different number of local clients. Setting REGION_DUMMY to True instead creates one client per region column, which fixes the number of clients at 13.

In [None]:
# Experiment configuration. With REGION_DUMMY enabled the split produces one
# client per region column (13 columns), so the client count is fixed at 13.
REGION_DUMMY = False
N_CLIENTS = 13 if REGION_DUMMY else 10
size_dataset = len(X)

# One entry per round: the size-weighted average score over all clients.
acc_arr, f1_arr = [], []
for seed in range(10):
    clients = split_into_clients(X, y, N_CLIENTS, REGION_DUMMY, seed)

    acc_arr_round, f1_arr_round = [], []
    for client_X, client_y in clients:
        # Each client trains and evaluates a model on its own data only.
        local_acc, local_f1 = train_model(client_X, client_y, seed)

        # Weight the client's scores by its share of the full dataset.
        share = len(client_X) / size_dataset
        acc_arr_round.append(local_acc * share)
        f1_arr_round.append(local_f1 * share)

    acc_arr.append(sum(acc_arr_round))
    f1_arr.append(sum(f1_arr_round))

    print(f"Round {seed + 1} complete. Avg acc round: {sum(acc_arr_round)}, Avg f1 round: {sum(f1_arr_round)}")

### Store results as CSV

In [None]:
# Collect the per-round weighted accuracy and F1 scores, one row per round.
df = pd.DataFrame({'acc': acc_arr, 'f1': f1_arr})

# Derive the file name from N_CLIENTS so that runs with a different client
# count (or the 13-client region split) do not overwrite or mislabel the
# 10-client results. With N_CLIENTS = 10 the name is unchanged.
df.to_csv(f'oulad_pass_fail_local_{N_CLIENTS}clients.csv', index=False)