In [1]:
#Loading needed libraries
import os
import argparse
import json
import pickle
import pandas as pd
import numpy as np
import math
import tensorflow as tf
import random

import torch

from torch.utils.data import DataLoader, random_split
from torch.optim import SGD, Adam

from data_loaders.assist2009 import ASSIST2009
from data_loaders.assist2015 import ASSIST2015
from data_loaders.algebra2005 import Algebra2005
from data_loaders.statics2011 import Statics2011
from models.dkt import DKT
from models.dkt_plus import DKTPlus
from models.dkvmn import DKVMN
from models.sakt import SAKT
from models.gkt import PAM, MHA
from models.utils import collate_fn
from api import *

A dataset with the format needed for training a model can be loaded using the function with same name as the dataset. Notice that cognitive tutor does not have a loader. We might need to implement our own one or opt for any of the other datasets (KDDCup Algebra 2005-2006 seems promising)

In [2]:
dataset = ASSIST2009(100)

It is extremely important to understand the data. Note that we are inputing a tuple <sequence of skills_id, sequence of correct/incorrect>. We are never introducing any other type of information, such as the id of the exercise or the response time. Excluding the id of the exercise might be inconvenient for our proposal, as we can no longer explain a prediction in terms of exercises done in the past. However, we can still try to identify sequences of skill_ids in our data.

In [3]:
for i in dataset :
    print(i)

(array([67, 18, 67, 18, 67, 18, 18, 67, 67, 18, 18, 67, 32, 35, 18, 32, 18,
       74, 32, 18, 32, 18, 74, 32, 18, 32, 35, 18, 49, 77, 77, 49, 77, 49,
       49, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]), array([ 0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  1,  1,  0,  0,  0,  1,  1,  0,  0,  0,  0,  1,  1,  0,  1,  0,
        0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
      dtype=int64))
(array([48, 99, 48, 99, 48, 99, 99, 48, 49, 99, 99, 49, 49, 99, 54, 54, 54,
       5

      dtype=int64))
(array([105, 105,  14,  11, 103, 103,  67,  67,  67,  67,  67,  67,  67,
        67,  67,  67,  12,  40, 103,  14, 104,  67, 104, 105, 105,  30,
        30,  30,  30,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]), array([ 1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  0,  1,  0,  1,  1,  1,
        1,  1,  1,  0,  1,  1,  1,  1,  0,  1,  1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1

       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int64))
(array([ 35,  32,  35,  32,  18,  67,  18,  32,  18,  35,  35,  18,  32,
        32,  35,  18,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,
        23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,
        23,  23,  23,  23,  23,  23,  23,  23,  23, 102, 102,  35,  35,
        35,  35,  35,  35,  35,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]), array([ 1,  1,  1,  1,  1,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        0,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  1,  1,  0,  1,
        0,  1,  0,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,


       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64))
(array([ 90,  57,  90,  90,  57,  57,  90,  90,  57,  57,  90,  65,  96,
        65,  96,  65,  65,  96,  96,  65,  96,  65,  96,  65,  65,  96,
        96,  65,  65,  96,  96,  65,  96,  65,  65,  96,  65,  96,  65,
        96,  65, 109, 109,  65,  65, 109,  65, 109,  65, 109, 109,  65,
       109,  65, 109,  65, 109,  65, 109,  65,  65, 109,  57,  90,  90,
        57,  57,  90,  57,  90,  90,  57,  90,  57,  90,  57,  57,  90,
        90,  57,  31,  65, 109,  65, 109,  65, 109, 108, 109, 109, 109,
       107,  13,  10,  10,  10,  10,  10,  10,  10,  10]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1], dtype=int64))
(array([ 10,  10,  10,

       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0], dtype=int64))
(array([ 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  35,  35,  35,
        35,  35,  35,  35,  35,  35,  35,  23,  23,  23,  23,  23,  23,
        23,  23,  23,  23,  23, 102, 102, 102, 102, 102, 102, 102, 102,
       102, 102, 102,  35,  35,  35,  35,  35,  35, 102, 102, 102, 102,
       102, 102, 102, 102,  23, 102, 102, 102, 102, 102, 102, 102,  23,
        23,  23,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]), array([ 0,  1,  1,  1,  1,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  1,  1,  1,  1,  0,  1,  0,  1,  0,  1,  1,  1,  1,
        0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



If we want to get the dataset in a pandas dataframe, we can easily get it with the read_csv function over the .csv file

In [4]:
pd.read_csv("./datasets/ASSIST2009/skill_builder_data.csv",encoding_errors = "ignore")

  pd.read_csv("./datasets/ASSIST2009/skill_builder_data.csv",encoding_errors = "ignore")


Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,33022537,277618,64525,33139,51424,1,1,1,32454,tutor,...,0,3,32454,30799,,26,0,,1,1.0
1,33022709,277618,64525,33150,51435,1,1,1,4922,tutor,...,0,3,4922,30799,,55,0,,2,2.0
2,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
3,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
4,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525529,38214014,291495,96299,57830,108976,0,1,1,38234,tutor,...,0,2,38234,55692,200260.0,,0,,54,
525530,38214016,291495,96299,57843,109015,0,1,1,6500,tutor,...,0,0,6500,55693,200299.0,,0,,55,
525531,38214195,291503,96299,34577,54060,0,1,1,18344,tutor,...,0,3,18344,30677,,0.8,0,,56,
525532,38214196,291503,96299,34577,54061,0,1,1,12922,tutor,...,0,2,12922,30677,,-6,0,,57,


The following function takes as input the dataset name and model type, trains a model and returns it as an output. It is basically an adaptation of the function "main.py" of the library. Advanced options can be specified in the file "config.json" and the implementation can be found in api.py. 

Here, we train a DKT model with the ASSIST2009 dataset 

In [5]:
model = get_model("dkt","ASSIST2009")

Epoch: 1,   AUC: 0.6924365309718088,   Loss Mean: 0.6734675765037537
Epoch: 2,   AUC: 0.7370199017312276,   Loss Mean: 0.6386783123016357
Epoch: 3,   AUC: 0.7586559174211449,   Loss Mean: 0.6237490773200989


Get a prediction over an input (a sequence of pairs $<question,answer>$)

In [6]:
model.eval()
input_1= torch.from_numpy(dataset[5][0])
answer_1 = torch.from_numpy(dataset[5][1])
output_1 = model(input_1,answer_1)

The model returns a matrix $[seq\_len,n\_skills]$, where the position $[i,j]$ the level of knowledge of the j-th knowledge concept after the i-th interaction. We can get the increment in skill mastery throughout the learning process as follows:

In [7]:
output_1[-1]-output_1[0]

tensor([ 0.0222, -0.0112, -0.0297,  0.0170, -0.0244, -0.0045, -0.0164,  0.0072,
         0.0131, -0.1165, -0.0628,  0.0088,  0.0598,  0.0155, -0.0312,  0.1001,
         0.0151, -0.0018, -0.0311,  0.0047,  0.0214, -0.0156, -0.0578,  0.0187,
         0.0048,  0.0071,  0.0369,  0.0047,  0.0008,  0.0188,  0.0488, -0.0560,
        -0.0483, -0.0010,  0.0336,  0.0072,  0.0062, -0.0084,  0.0284, -0.0330,
         0.0841,  0.0353,  0.0042,  0.0822,  0.0974,  0.0204,  0.0325, -0.0502,
        -0.0153,  0.1000, -0.0506,  0.0442, -0.0108,  0.0395,  0.0512,  0.0615,
         0.0531, -0.0595,  0.0141,  0.0396,  0.0700, -0.0381,  0.0642, -0.1479,
         0.0079,  0.0454,  0.0036,  0.0955,  0.0181, -0.0712, -0.0240, -0.0368,
         0.0807,  0.0053,  0.0770,  0.0073, -0.0019,  0.0154,  0.0477, -0.0248,
         0.0063,  0.0489, -0.0314,  0.0391,  0.0036,  0.0347,  0.0450,  0.0143,
        -0.0572,  0.0421, -0.0047, -0.0108, -0.0068, -0.0640, -0.0123,  0.0104,
         0.0131, -0.0173,  0.0399, -0.08

We can see that the probabilities of skill mastery output by the system orbit around 0.5 in order to maximize the AUC. However, this could be a problem if we try to use the model to generate our own sequences

In [8]:
sum = 0
print("First 5 interactions")
print("Question \t Answer \t Predicted answer \t Absolute error")
for i in range(1,len(output_1)) :
    if (i<6) :
        print(dataset[5][0][i],"\t \t",int(answer_1[i]),"\t \t",float(output_1[i-1,input_1[i]]),"\t",abs(float(output_1[i-1,input_1[i]])-float(answer_1[i])))
    sum = sum + abs(float(output_1[i-1,input_1[i]])-float(answer_1[i]))
print("MAE",(sum/(len(output_1)-1)))

First 5 interactions
Question 	 Answer 	 Predicted answer 	 Absolute error
9 	 	 0 	 	 0.5648069977760315 	 0.5648069977760315
1 	 	 1 	 	 0.5215309858322144 	 0.47846901416778564
0 	 	 1 	 	 0.6107142567634583 	 0.38928574323654175
1 	 	 1 	 	 0.7012269496917725 	 0.29877305030822754
0 	 	 1 	 	 0.7297891974449158 	 0.27021080255508423
MAE 0.40901722729206086


The function generate is able to generate synthetic data. It requires as input the model, a sequence of question, a sequence of answer and the interaction from which we want to start to generate the data and outputs a synthetic squence that starts from ini and its likelihood. Let's generate 200 sequences starting from the second interaction

In [9]:
res = []
for i in range(0,200) :
    res.append(generate_seq(model,[46,46,46,46],[0,1,1,1],ini=2, method = "ran"))
res[0:5]

[([0, 1, 1, 1], 0.4077725991032892),
 ([0, 1, 1, 0], 0.2103098050333685),
 ([0, 1, 1, 1], 0.4077725991032892),
 ([0, 1, 1, 0], 0.2103098050333685),
 ([0, 1, 0, 1], 0.21003640273535495)]

We create an alternative input by deleting the 95 interaction from the sequence. The altered input sequence of questions will be renamed as "input_alt"

In [10]:
print(input_1[96],answer_1[96])
input_alt = torch.cat([input_1[:96], input_1[97:]])

tensor(61, dtype=torch.int32) tensor(0)


We generate the set of answers for our new input starting from the 95-th interaction (the one we deleted). Then, we input the sequence of altered questions and the synthetically computed answer to the model and get the skill mastery (variable "output"). We do this 500 and compute the weighted average of the difference of skill mastery between the original input and for each of the 500 generated sequences (where the weights are the likelihood of the sequences).

In [11]:
res = []
l_list = []
for i in range(0,500):
    answer_alt,likelihood = generate_seq(model, input_alt, answer_1, ini = 96, method = "ran")
    output_alt= model(input_alt, answer_alt)
    res.append(likelihood * ((output_1[-1]-output_alt[-1])).detach().numpy())
    l_list.append(likelihood)
print(np.sum(res,axis=0)/np.sum(l_list))
print(np.mean(np.sum(res,axis=0)/np.sum(l_list)))

  output = model(torch.tensor(input_seq[ini:i+1]), torch.tensor(rec_answer_seq[ini:i+1]))


[-0.02199388 -0.00434964 -0.0207602  -0.00147472 -0.06567274  0.02175658
  0.02784846 -0.01559796 -0.03557725 -0.06468824 -0.03068843 -0.01198477
  0.02313635 -0.01618033 -0.03938122  0.00179009  0.01974914  0.03054206
 -0.04657226 -0.03024339 -0.00972771  0.0139936  -0.08505667  0.00679879
 -0.014056    0.02554102  0.03375208 -0.03193484  0.01030694  0.01132895
 -0.00581719 -0.05009058 -0.04959981 -0.05920492 -0.01658422 -0.03399539
  0.02530809  0.00426204 -0.00723911 -0.05948037  0.01700849 -0.06138287
 -0.04029353  0.02859268  0.04280101  0.02566999  0.03989921 -0.02209388
 -0.03191847  0.03348454 -0.03562469  0.00274285 -0.00196771  0.0291902
 -0.02258406  0.02779372 -0.01333307 -0.04807809 -0.04023832 -0.03429754
  0.00292423 -0.15415287  0.00458647 -0.17707434 -0.01757341 -0.01935355
  0.05990072  0.02104251 -0.00567544 -0.02190243 -0.0495753  -0.01465493
  0.03812497 -0.01358842 -0.00538733 -0.00422351 -0.03623495 -0.03578109
  0.03122752 -0.03794049 -0.01027422  0.01174793 -0.