This notebook uses the CSVs created by the *EdNet - 1 - Feature engineering* notebook, so make sure to run that notebook first. It also depends on the third-party library [XGBoost](https://xgboost.readthedocs.io/en/stable/), which you may need to install before proceeding.

### Import libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from xgboost import XGBClassifier

### Define function to create final question, part, and user features from each client's data

In [None]:
def create_question_features(x):
    """Add running accuracy features computed in timestamp order.

    The rows of ``x`` are sorted by ``'timestamp'`` and walked once. For every
    row, three values are recorded using only the rows that came earlier in
    that order (the current row is folded into the running statistics
    afterwards):

    * ``q_acc``: fraction of correct answers seen so far for the row's
      ``question_id`` (NaN the first time the question appears).
    * ``part_acc``: the same fraction for the row's ``part`` (NaN the first
      time the part appears).
    * ``usr_excess_correct``: the user's mean "excess correctness" so far
      (NaN for the user's first row). A row's excess correctness is
      ``correct_response`` minus a baseline: the question's prior accuracy if
      the question was seen before, otherwise the part's prior accuracy,
      otherwise 0.5.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``timestamp``, ``part``, ``question_id``,
        ``user_id`` and ``correct_response`` (presumably 0/1 -- confirm
        against the feature engineering notebook).

    Returns
    -------
    pandas.DataFrame
        The sorted frame returned by ``sort_values`` with the three columns
        added.
    """
    x_new = x.sort_values('timestamp')

    # Running statistics, keyed by id. Values are mutable two-item lists.
    question_stats = {}  # question_id -> [n_correct, n_ans]
    part_stats = {}      # part        -> [n_correct, n_ans]
    user_stats = {}      # user_id     -> [sum_excess_correct, n_ans]

    q_accuracy, part_accuracy, user_excess_correct = [], [], []

    rows = zip(x_new['part'], x_new['question_id'], x_new['user_id'], x_new['correct_response'])
    for part, q_id, u_id, correct in rows:
        q_seen = question_stats.get(q_id)
        p_seen = part_stats.get(part)
        u_seen = user_stats.get(u_id)

        # Accuracies *before* this answer is counted, so the current label
        # never leaks into its own features.
        prior_q_acc = np.nan if q_seen is None else q_seen[0] / q_seen[1]
        prior_p_acc = np.nan if p_seen is None else p_seen[0] / p_seen[1]

        # Baseline preference: question history, then part history, then 0.5.
        if q_seen is not None:
            baseline = prior_q_acc
        elif p_seen is not None:
            baseline = prior_p_acc
        else:
            baseline = 0.5
        excess_correct = correct - baseline

        q_accuracy.append(prior_q_acc)
        if q_seen is None:
            question_stats[q_id] = [correct, 1]
        else:
            q_seen[1] += 1
            q_seen[0] += correct

        part_accuracy.append(prior_p_acc)
        if p_seen is None:
            part_stats[part] = [correct, 1]
        else:
            p_seen[1] += 1
            p_seen[0] += correct

        if u_seen is None:
            user_excess_correct.append(np.nan)
            user_stats[u_id] = [excess_correct, 1]
        else:
            user_excess_correct.append(u_seen[0] / u_seen[1])
            u_seen[1] += 1
            u_seen[0] += excess_correct

    x_new['q_acc'] = q_accuracy
    x_new['part_acc'] = part_accuracy
    x_new['usr_excess_correct'] = user_excess_correct

    return x_new

### Define functions to split data into train-test and into local clients

In [None]:
def X_y_from_df(df):
    """Split a feature frame into train/test arrays, grouping rows by user.

    The unique ``user_id`` values are shuffled with NumPy's global random
    state (seed it beforehand for reproducibility). The first 80% of the
    shuffled users form the training set and the rest the test set, so no
    user contributes rows to both.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``correct_response`` and the other
        identifier columns listed in ``non_feature_cols`` below; every
        remaining column is used as a model feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays; the ``y``
        arrays are one-dimensional ``correct_response`` values.
    """
    # Identifier / label columns that must not be fed to the model.
    non_feature_cols = ['timestamp', 'solving_id', 'question_id', 'elapsed_time',
                        'user_id', 'part', 'correct_response']

    uids = list(df['user_id'].unique())
    np.random.shuffle(uids)
    cut = int(len(uids) * 0.8)

    in_train = df['user_id'].isin(uids[0:cut])
    in_test = df['user_id'].isin(uids[cut:])

    def _features_and_labels(mask):
        # Select the rows for one side of the split and separate X from y.
        subset = df.loc[mask]
        features = subset.drop(columns=non_feature_cols).to_numpy()
        labels = subset['correct_response'].to_numpy().ravel()
        return features, labels

    X_train, y_train = _features_and_labels(in_train)
    X_test, y_test = _features_and_labels(in_test)

    return X_train, X_test, y_train, y_test

In [None]:
def split_into_clients(df, n_clients, seed = 42):
    """Partition users into ``n_clients`` groups and build each group's data.

    NumPy's global random state is seeded with ``seed``, the unique
    ``user_id`` values are shuffled and cut into ``n_clients`` chunks with
    ``np.array_split``. For every chunk, the matching rows are passed through
    ``create_question_features`` and then ``X_y_from_df``.

    Note that seeding here mutates the global NumPy RNG, which also drives
    the shuffle performed inside ``X_y_from_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset containing a ``user_id`` column.
    n_clients : int
        Number of local clients to create.
    seed : int, default 42
        Seed passed to ``np.random.seed``.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple per client.
    """
    np.random.seed(seed)

    uids = list(df['user_id'].unique())
    np.random.shuffle(uids)

    def _build_client(client_uids):
        # Features are computed from this client's rows only, then split.
        client_rows = df.loc[df['user_id'].isin(client_uids)]
        return tuple(X_y_from_df(create_question_features(client_rows)))

    return [_build_client(chunk) for chunk in np.array_split(uids, n_clients)]

### Define function to train model

In [None]:
def train_model(client, seed):
    """Fit an XGBoost classifier on one client's data and score it.

    Parameters
    ----------
    client : sequence
        Indexable whose first four items are
        ``X_train, X_test, y_train, y_test``.
    seed : int
        Passed to the classifier as ``random_state``.

    Returns
    -------
    tuple
        ``(accuracy, f1, roc_auc)`` on the client's test split. Accuracy and
        F1 use the hard predictions; ROC AUC uses the second column of
        ``predict_proba``.
    """
    X_train, X_test, y_train, y_test = (client[i] for i in range(4))

    # NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build
    # and may be deprecated in newer releases -- confirm for your environment.
    clf = XGBClassifier(tree_method = 'gpu_hist', random_state = seed)
    clf.fit(X_train, y_train)

    hard_preds = clf.predict(X_test)
    positive_probs = clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, hard_preds)
    f1 = f1_score(y_test, hard_preds)
    auc = roc_auc_score(y_test, positive_probs)
    return acc, f1, auc

### Read data and train the model

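For this example we use N_CLIENTS = 10. Results for other numbers of local clients can easily be obtained by changing the value of N_CLIENTS. The experiment is repeated for 10 rounds, each with a different seed for the client split and the model; within a round, every client's metrics are weighted by that client's share of the dataset rows and summed.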

In [None]:
# Load the engineered features; the total row count normalises client weights.
df = pd.read_csv('ednet_features_10000_users.csv')
size_dataset = len(df)

N_CLIENTS = 10
acc_arr, f1_arr, auc_arr = [], [], []
for seed in range(10):
    # The round index doubles as the seed for the client split and the model.
    clients = split_into_clients(df, N_CLIENTS, seed)

    acc_arr_round, f1_arr_round, auc_arr_round = [], [], []
    for client in clients:
        client_acc, client_f1, client_auc = train_model(client, seed)

        # Weight = this client's share of all rows (train rows + test rows).
        n_rows_client = len(client[0]) + len(client[1])
        weight_client = n_rows_client / size_dataset

        for bucket, score in ((acc_arr_round, client_acc),
                              (f1_arr_round, client_f1),
                              (auc_arr_round, client_auc)):
            bucket.append(score * weight_client)

    # Summing the weighted scores gives the size-weighted average for the round.
    round_acc, round_f1, round_auc = sum(acc_arr_round), sum(f1_arr_round), sum(auc_arr_round)
    acc_arr.append(round_acc)
    f1_arr.append(round_f1)
    auc_arr.append(round_auc)

    print(f"Round {seed + 1} complete. Avg acc round: {round_acc}, Avg auc round: {round_auc}")

### Store results as CSV

In [None]:
# One row per round, holding the size-weighted accuracy, F1 and ROC AUC.
# A dedicated name is used so the dataset held in `df` is not overwritten;
# otherwise re-running the training cell after this one would operate on the
# results table instead of the data.
results_df = pd.DataFrame({'acc': acc_arr, 'f1': f1_arr, 'auc': auc_arr})

# Derive the client count in the filename from N_CLIENTS so that runs with a
# different number of clients do not overwrite each other's results
# (for N_CLIENTS = 10 this is 'ednet_local_10000users_10clients.csv').
results_df.to_csv(f'ednet_local_10000users_{N_CLIENTS}clients.csv', index = False)