In [203]:
from spkit import bkt
import pandas as pd
import re
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [223]:
%load_ext autoreload
%autoreload 0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [211]:
%aimport spkit.bkt

### Reading KDD train file
Drop rows with null in any of the important columns

In [2]:
%%time
df_all = pd.read_csv('algebra_2005_2006_train.txt', delimiter='\t')
print(df_all.shape)

(809694, 19)
CPU times: user 4.37 s, sys: 184 ms, total: 4.56 s
Wall time: 4.56 s


In [140]:
df = df_all[['Anon Student Id', 'Correct First Attempt', 'Problem Hierarchy', 'Problem Name', 'Problem View', 'Step Name', 'KC(Default)']]

In [141]:
df = df.dropna()

In [142]:
df.shape

(607025, 7)

### Preprocessing
Transform dataframe in a format understood by the BKT module

In [143]:
skipped = []
def get_kcs(row):
    kc_list = row['KC(Default)'].split('~~')
    kc = []
    for item in kc_list:
        # Option 1: with SkillRule prefix
        found_opt1 = re.findall(r'^\[SkillRule: ([\w ]+[,/?]*[\w ]*);.*', item, flags=re.M)
        # Option 2: regular words (hyphenized or spaced)
        found_opt2 = re.findall(r'^([\w -]+).*', text, flags=re.M)
        if found_opt1:
            kc.append(found_opt1[0])
        elif found_opt2:
            kc.append(found_opt2[0])
        else:
            skipped.append(item)
    return kc

def get_steps(row):
    step = row['Problem Hierarchy'] + '_' + row["Problem Name"] + '_' + row['Step Name']
    return step

Separate KCs in a list and create an identifier name for the step

In [144]:
%%time
df['KC'] = df.apply(get_kcs, axis=1)
df['step'] = df.apply(get_steps, axis=1)

CPU times: user 32.8 s, sys: 128 ms, total: 33 s
Wall time: 33 s


In [145]:
%%time
kcs = []
for row in df['KC'].iteritems():
    kcs.extend(row[1])
kcs = list(set(kcs))
steps = df['step'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['step'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

CPU times: user 12min 33s, sys: 686 ms, total: 12min 33s
Wall time: 12min 35s


Construct Q-Matrix

In [146]:
%%time
q_matrix = np.zeros((len(steps), len(kcs)))

for idx, item in df.iterrows():
    for kc in item['KC']:
        kc_col = kcs.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

(176630, 25)
CPU times: user 1min 44s, sys: 91 ms, total: 1min 44s
Wall time: 1min 44s


Save data

In [201]:
import pickle
with open('kdd_2005_2005_df.pkl', 'wb') as pklfile:
    pickle.dump(df, pklfile)
with open('kdd_2005_2005_q_matrix.pkl', 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

### Fitting data

In [148]:
%%time
# Data matrix
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id']].values.tolist()

# Instantiate model
model = bkt.BKT()
model.fit(data, q_matrix)

<spkit.bkt.BKT at 0x7f8bb82257b8>

In [158]:
students = df['Anon Student Id'].unique().tolist()
len(students)

574

In [217]:
with open('kdd_2005_2005_students.pkl', 'wb') as pklfile:
    pickle.dump(students, pklfile)

In [222]:
%%time
data_arr = np.asarray(data)
rmse = 0
n = 0
score_df = []
outcome_all = []
predicted_proba = []
for student in students:
    student_idx = np.where(data_arr[:,1] == student)
    outcome = data_arr[student_idx, 0]
    question_id = data_arr[student_idx, 2]
    data_student = np.vstack((outcome, question_id)).T.astype('int64').tolist()
    inferred_outcome = model.predict_proba(data_student, q_matrix)
    s_aic, s_bic, s_rmse, s_acc = model.score()
    score_df.append({"student": student,
                     "AIC": s_aic,
                     "BIC": s_bic,
                     "RMSE": s_rmse,
                     "Acc": s_acc,
                     "n questions": model.n_questions,
                     "LL": model.loglikelihood.sum(),
                     "correct_outcome": })

CPU times: user 57.7 s, sys: 123 ms, total: 57.8 s
Wall time: 57.8 s


In [224]:
score = pd.DataFrame.from_dict(score_df)

In [227]:
with open('kdd_2005_2005_score.pkl', 'wb') as pklfile:
    pickle.dump(score, pklfile)

In [225]:
rmse_train = np.sqrt((score['RMSE']**2*score['n questions']).sum()/score['n questions'].sum())
rmse_avg = score['RMSE'].mean()
rmse_std = score['RMSE'].std()

In [226]:
print("RMSE de treino: %.2f" % rmse_train)
print("RMSE médio: %.2f +- %.2f" % (rmse_avg, rmse_std))

RMSE de treino: 0.42
RMSE médio: 0.43 +- 0.05


In [228]:
acc_train = (score['Acc']*score['n questions']).sum()/score['n questions'].sum()
acc_avg = score['Acc'].mean()
acc_std = score['Acc'].std()

In [229]:
print("Acc de treino: %.2f" % acc_train)
print("Acc médio: %.2f +- %.2f" % (acc_avg, acc_std))

Acc de treino: 0.75
Acc médio: 0.72 +- 0.11
