In [1]:
import pickle
from spkit import bkt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): unpickling can execute arbitrary code embedded in the file,
    so these artefacts must come from a trusted source.
    """
    with open(path, 'rb') as pklfile:
        return pickle.load(pklfile)


# Artefacts consumed by the cells below. What each one holds is inferred from
# how later cells use it -- confirm against the notebook that produced them.
df = _load_pickle('kdd_2005_2005_df.pkl')              # indexed by column name to build `data`
q_matrix = _load_pickle('kdd_2005_2005_q_matrix.pkl')  # passed to model.predict_proba
students = _load_pickle('kdd_2005_2005_students.pkl')  # iterated, one evaluation per entry
kcs = _load_pickle('kdd_2005_2005_kcs.pkl')            # indexed by position to label learning states
model = _load_pickle('kdd_2005_2005_model.pkl')        # exposes predict_proba / score / learning_state

In [5]:
%%time
# Data matrix
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id']].values.tolist()

CPU times: user 422 ms, sys: 55.6 ms, total: 477 ms
Wall time: 484 ms


In [6]:
%%time
data_arr = np.asarray(data)
rmse = 0
n = 0
score_df = []
learning_state_df = []
outcome_all = []
predicted_proba = []
for student in students:
    student_idx = np.where(data_arr[:,1] == student)
    outcome = data_arr[student_idx, 0].astype('int64')
    outcome_all += outcome.tolist()[0]
    question_id = data_arr[student_idx, 2].astype('int64')
    data_student = np.vstack((outcome, question_id)).T.astype('int64').tolist()
    predicted = model.predict_proba(data_student, q_matrix)
    predicted_proba += predicted[:,1].tolist()
    s_ll, s_aic, s_bic, s_rmse, s_acc = model.score()
    fpr, tpr, _ = roc_curve(outcome.flatten(), predicted[:,1], pos_label=1)
    auc_score = auc(fpr, tpr)
    score_df.append({"student": student,
                     "AIC": s_aic,
                     "BIC": s_bic,
                     "RMSE": s_rmse,
                     "Acc": s_acc,
                     "n questions": model.n_questions,
                     "LL": s_ll,
                     "correct_outcome": np.where(outcome == 1)[1].shape[0],
                     "incorrect_outcome": np.where(outcome == 0)[1].shape[0],
                     "AUC": auc_score})
    learning_state_dict = {"student": student}
    for idx, kc in enumerate(model.learning_state):
        learning_state_dict[kcs[idx]] = kc
    learning_state_df.append(learning_state_dict)

  if np.any(dx < 0):


CPU times: user 1min 4s, sys: 129 ms, total: 1min 4s
Wall time: 1min 4s


In [7]:
# `score_df` is a list of per-student dicts (records), which the DataFrame
# constructor accepts directly; `from_dict` is documented for dict input.
score = pd.DataFrame(score_df)

In [8]:
# Pooled RMSE: per-student squared RMSE weighted by the 'n questions' column,
# divided by the total of that column, then square-rooted.
rmse_train = np.sqrt(
    score['RMSE'].pow(2).mul(score['n questions']).sum()
    / score['n questions'].sum()
)
# Unweighted spread of the per-student RMSE values.
rmse_std = score['RMSE'].std()
rmse_avg = score['RMSE'].mean()

In [9]:
# Report the pooled RMSE, then the per-student mean +- standard deviation.
print(
    "RMSE de treino: %.2f" % rmse_train,
    "RMSE médio: %.2f +- %.2f" % (rmse_avg, rmse_std),
    sep="\n",
)

RMSE de treino: 0.41
RMSE médio: 0.43 +- 0.05


In [10]:
# Pooled accuracy: per-student accuracy weighted by the 'n questions' column,
# divided by the total of that column.
acc_train = (
    score['Acc'].mul(score['n questions']).sum()
    / score['n questions'].sum()
)
# Unweighted spread of the per-student accuracy values.
acc_std = score['Acc'].std()
acc_avg = score['Acc'].mean()

In [11]:
# Report the pooled accuracy, then the per-student mean +- standard deviation.
print(
    "Acc de treino: %.2f" % acc_train,
    "Acc médio: %.2f +- %.2f" % (acc_avg, acc_std),
    sep="\n",
)

Acc de treino: 0.75
Acc médio: 0.69 +- 0.12


In [13]:
# `learning_state_df` is a list of per-student dicts (records), which the
# DataFrame constructor accepts directly; `from_dict` is documented for dict input.
learning_state = pd.DataFrame(learning_state_df)

# Spot-check one student's value for one KC column. A single .loc lookup
# selects row mask and column together instead of chaining two [] lookups.
learning_state.loc[learning_state['student'] == '0BrbPbwCMz', 'perform-mult-row2-sp']

0    1.0
Name: perform-mult-row2-sp, dtype: float64