In [None]:
# import libraries
import pandas as pd
import numpy as np
import sklearn

import pickle
import random

from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, cohen_kappa_score

# Seed every global RNG this notebook has imported so a Restart & Run All is
# repeatable. Python's `random` and NumPy keep separate generator state, so
# seeding one does not seed the other.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the aggregated feature table and the human-coded labels.
# (The feature file includes the counts of nouns, verbs and pronouns.)
df_features = pd.read_csv("Features_aggToPhase_2021-12-26.csv")
label_bookkeeping_columns = ['index', 'filename', 'X']
df_label = pd.read_csv("Cuethink_Coding_JoyceAlexis_Dec29.csv").drop(columns=label_bookkeeping_columns)

# Join the labels with the features on (user_id, thinklet_id).
# This is an inner join, so thinklets without extracted features are excluded
# (per the original note: features were extracted for complete thinklets only).
df = df_label.merge(df_features, on=['user_id', 'thinklet_id'])

# Drop the columns that are not used for modelling, then impute every
# remaining missing value as 0.
unused_columns = ['thinklet_id', 'assignment_name', 'strategiesSelected']
df_train = df.drop(columns=unused_columns).fillna(0)

# Everything except the five coded label columns is a feature.
# `user_id` stays in X for now; it is removed inside the CV loop.
label_columns = [
    'Numerical_Representation',
    'Contextual_Representation',
    'Strategy_Orientation',
    'Outcome_Orientation',
    'Data_Transformation',
]
X = df_train.drop(columns=label_columns)

# The label this detector predicts.
y = df_train['Data_Transformation']

In [None]:
# split data

# Construct grouping data to ensure the same student does not end up in both
# training and test splits.
#
# Each student's group id is the index label of that student's first row in
# df_train. It is computed with vectorised pandas operations rather than
# iterrows() + np.append, which re-copied the whole array on every row (O(n^2)).
user_ids = df_train['user_id']
is_first_row = ~user_ids.duplicated()

# user_id -> index label of that student's first row
group_dict = dict(zip(user_ids[is_first_row], df_train.index[is_first_row.to_numpy()]))

# One group id per row, aligned with df_train / X / y. Kept as float64 to match
# the dtype the previous np.append-based construction produced.
groups = user_ids.map(group_dict).to_numpy(dtype=float)

# Set up the splitter with 10 splits
gkf = GroupKFold(n_splits = 10)

In [None]:
# xgboost

# Set up the XGBoost classifier. The same estimator object is re-fit on every
# fold, so after the loop it holds the model trained on the LAST fold's
# training split only.
classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# set up storage arrays for each round of validation
roc_auc_scores = np.array([])
accuracy_scores = np.array([])
cohen_kappa_scores = np.array([])
f1_scores = np.array([])
# Not filled in this cell; kept in case later cells rely on these names.
list_shap_values = list()
list_test_sets = list()

# split, train, test and store performance metrics
for train_index, test_index in gkf.split(X, y, groups=groups):

    # Get the training and test data for this fold. `user_id` is only needed
    # for grouping, so it is dropped before fitting.
    X_train = X.iloc[train_index].drop(['user_id'], axis=1)
    X_test = X.iloc[test_index].drop(['user_id'], axis=1)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # train classifier on this round of training group
    classifier.fit(X_train, y_train)

    # test classifier on this round of testing group
    predictions = classifier.predict(X_test)
    predictions_prob = classifier.predict_proba(X_test)

    # ROC AUC is a ranking metric, so it is computed from the predicted
    # probability of the positive class (column 1 of predict_proba), not from
    # the hard 0/1 predictions, which would collapse the ROC curve to a single
    # operating point.
    positive_class_prob = predictions_prob[:, 1]

    # compute some metrics and store them for averaging later on
    roc_auc_scores = np.append(roc_auc_scores, roc_auc_score(y_test, positive_class_prob))
    accuracy_scores = np.append(accuracy_scores, accuracy_score(y_test, predictions))
    cohen_kappa_scores = np.append(cohen_kappa_scores, cohen_kappa_score(y_test, predictions))
    f1_scores = np.append(f1_scores, f1_score(y_test, predictions))


# print mean scores for the 10-fold CV
print("average roc_auc score: ", np.round(roc_auc_scores.mean(), 3))
print("stdv roc_auc score: ", np.round(roc_auc_scores.std(), 3))
print("max roc_auc score: ", np.round(roc_auc_scores.max(), 3))
print("average Cohen's Kappa score: ", np.round(cohen_kappa_scores.mean(), 3))
print("stdev Cohen's Kappa score: ", np.round(cohen_kappa_scores.std(), 3))
print("average F1 score: ", np.round(f1_scores.mean(), 3))
print("average Accuracy score: ", np.round(accuracy_scores.mean(), 3))

In [None]:
# save the model to disk
# NOTE(review): `classifier` was last fit inside the cross-validation loop, so
# the pickled model is the one trained on the final fold's training split, not
# on the full dataset - confirm this is the model intended for deployment.
filename = 'detector_DT.sav'
# Use a context manager so the file is flushed and closed even if dump() raises.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# load the model from disk
# Security: pickle.load can execute arbitrary code; only load files this
# notebook (or another trusted source) produced.
# Use a context manager so the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: `X_test` is the test split left over from the last iteration
# of the cross-validation loop.
result = loaded_model.predict(X_test)
print(result)