# DKT Model

This notebook trains a DKT (Deep Knowledge Tracing) model on the Assistments data and then tests the model on held-out student sequences.

In [8]:
# Modules
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
import random
import math

Read in the data we pre-processed. It contains each student's action sequence.

In [9]:
# Limit the data size
skill_cut = 150        # keep only records whose skill id is below this value
student_cut = 5000     # upper bound on the number of student sequences used later

# Load the pre-processed records (columns: student, skill, correct).
dataset = pd.read_csv("Assistments/assistment_for_dkt.csv")
dataset = dataset[dataset['skill'] < skill_cut]
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(dataset.columns)
num_records = len(dataset)
# NOTE(review): skill ids are used directly as one-hot indices further down,
# which presumably relies on the ids being contiguous in [0, num_skills) --
# confirm against the pre-processing step.
num_skills = dataset['skill'].nunique()
num_actions = 2 * num_skills    # action: every skill correct/wrong
num_labels = num_skills + 1     # one-hot question, plus one bit for correct/wrong
num_students = dataset['student'].nunique()
print(str(num_records) + " problem records")
print(str(num_skills) + " skills")
print(str(num_students) + " students")
print(str(np.sum(dataset['correct'].values)) + " correct answers")

Index([u'student', u'skill', u'correct'], dtype='object')
338001 problem records
123 skills
4163 students
220802 correct answers


The following LSTM is based on the one in the Udacity assignment. Its structure is the one introduced in this [article](http://colah.github.io/posts/2015-08-Understanding-LSTMs/).

In [10]:
# Hyper parameters to Tune
# -- model size and weight initialisation (truncated normal: mean / stddev)
num_hidden = 200
init_mean, init_stddev = 0, 0.001
# -- batching: each step feeds batch_size sequences, time_window steps long
batch_size = time_window = 100
# -- training: Adam is used with its default settings, so the only knob here
#    is the global-norm threshold for gradient clipping
clipping_norm = 2

| set | num hidden | init mean | init stddev | batch size | time window | clipping norm | AUC    | Overfit After |
|:---:|:----------:|:---------:|:-----------:|:----------:|-------------|---------------|--------|---------------|
|  1  |     200    |     0     |    0.001    |     50     |      50     |       10      | 0.8152 | epoch 8       |
|  2  |     200    |     0     |    0.001    |    100     |      50     |       10      | 0.8172 | epoch 9       |
|  3  |     200    |     0     |    0.001    |    100     |      50     |        5      | 0.8173 | epoch 9       |
|  4  |     200    |     0     |    0.001    |    100     |      50     |        2      | 0.8175 | epoch 8       | 
|  5  |     200    |   0.01    |    0.001    |    100     |      50     |        2      | 0.8152 | epoch 10      |
|  6  |     200    |     0     |    0.001    |    100     |     100     |        2      | 0.8169 | epoch 19      |

So far, none of the hyperparameters seems to have a major influence on the performance, so we will probably leave them as they are. Note that we are just using the default AdamOptimizer and have not tuned it at all.

A drop in AUC in a single epoch does not necessarily mean that the model has overfitted. However, our model seems to overfit after only about 10 epochs, so we need to add regularization tricks, like dropout.

In [11]:
# LSTM Model
graph = tf.Graph()
with graph.as_default():

    def _gate_parameters():
        """Create the three variables of one gate: input weights, recurrent weights, bias."""
        weights_x = tf.Variable(tf.truncated_normal([num_actions, num_hidden], init_mean, init_stddev))
        weights_m = tf.Variable(tf.truncated_normal([num_hidden, num_hidden], init_mean, init_stddev))
        bias = tf.Variable(tf.zeros([1, num_hidden]))
        return weights_x, weights_m, bias

    # Parameters: _x multiplies the new input, _m multiplies the previous
    # output, _b is the bias.
    input_x, input_m, input_b = _gate_parameters()       # input gate
    forget_x, forget_m, forget_b = _gate_parameters()    # forget gate
    update_x, update_m, update_b = _gate_parameters()    # candidate cell update
    output_x, output_m, output_b = _gate_parameters()    # output gate

    # Non-trainable variables that carry output/state from one window of a
    # sequence to the next window.
    saved_output = tf.Variable(tf.zeros([batch_size, num_hidden]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_hidden]), trainable=False)

    # Classifier weights and biases (hidden output -> one logit per skill).
    classify_w = tf.Variable(tf.truncated_normal([num_hidden, num_skills], init_mean, init_stddev))
    classify_b = tf.Variable(tf.zeros([num_skills]))

    def lstm_cell(i, o, state):
        """Run one LSTM step.

        i: input of this time step; o: previous (or saved) output;
        state: previous (or saved) cell state.
        Returns the pair (new_output, new_state).
        """
        def affine(weights_x, weights_m, bias):
            # Shared pre-activation used by every gate.
            return tf.matmul(i, weights_x) + tf.matmul(o, weights_m) + bias

        input_gate = tf.sigmoid(affine(input_x, input_m, input_b))
        forget_gate = tf.sigmoid(affine(forget_x, forget_m, forget_b))
        update = tf.tanh(affine(update_x, update_m, update_b))
        new_state = forget_gate * state + input_gate * update
        output_gate = tf.sigmoid(affine(output_x, output_m, output_b))
        return output_gate * tf.tanh(new_state), new_state

    # Input data: one placeholder triple per time step of a window.
    inputs = list()
    question_labels = list()    # only when training
    action_labels = list()
    for _ in range(time_window):
        inputs.append(tf.placeholder(tf.float32, shape=[batch_size, num_actions]))
        question_labels.append(tf.placeholder(tf.float32, shape=[batch_size, num_skills]))
        action_labels.append(tf.placeholder(tf.float32, shape=[batch_size, ]))

    # Run this op to zero the carried output/state when a new sequence starts.
    reset_state = tf.group(saved_output.assign(tf.zeros([batch_size, num_hidden])),
                           saved_state.assign(tf.zeros([batch_size, num_hidden])))

    # Unroll the cell over the window, starting from the carried output/state.
    outputs = list()
    output, state = saved_output, saved_state
    for step_input in inputs:
        output, state = lstm_cell(step_input, output, state)
        outputs.append(output)

    # Everything built under these control dependencies first stores the final
    # output/state, so the next window of the same sequence continues from them.
    carry_over = [saved_output.assign(output), saved_state.assign(state)]
    with tf.control_dependencies(carry_over):
        stacked_outputs = tf.concat(0, outputs)
        logits = tf.nn.xw_plus_b(stacked_outputs, classify_w, classify_b)
        # Keep only the logit of the problem that was actually encountered:
        # the one-hot question mask zeroes every other skill before summing.
        question_mask = tf.concat(0, question_labels)
        logits_of_interest = tf.reduce_sum(tf.mul(logits, question_mask), 1)
        truth = tf.reshape(tf.concat(0, action_labels), [-1])    # flatten
        # binary cross entropy: padding would introduce some constant loss
        per_step_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits_of_interest, truth)
        loss = tf.reduce_mean(per_step_loss)

    # Adam with default settings; gradients are clipped by their global norm.
    # After this section `optimizer` names the training op that applies them.
    adam = tf.train.AdamOptimizer()
    gradients, var = zip(*adam.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, clipping_norm)
    optimizer = adam.apply_gradients(zip(gradients, var))

    prediction = tf.sigmoid(logits_of_interest)

Generating input sequences for the LSTM is a bit complicated. The general idea is to first take a batch of students and then pad their sequences to the same length. When feeding the LSTM, we feed one "window" (time interval) at a time.

In [12]:
class DataGenerator(object):
    """Builds padded, windowed LSTM batches from (student, skill, correct) records.

    The class reads the notebook-level settings ``student_cut``, ``batch_size``,
    ``time_window``, ``num_skills``, ``num_actions`` and ``num_labels``.

    Layout of the generated data: a batch is a list of windows, a window is a
    list of ``time_window`` frames, and a frame is a float32 array with one
    row per sequence in the batch (``[batch_size, num_actions]`` for inputs,
    ``[batch_size, num_labels]`` for labels). Unused slots stay all-zero.
    """

    def __init__(self, dataset, train_ratio):
        """Group records into sequences, split train/test and build all batches.

        dataset: table exposing ``.values`` whose rows are laid out as
            (student, skill, correct); consecutive rows with the same student
            value form one sequence.
        train_ratio: fraction in [0, 1] of the kept sequences used for
            training; the remaining ones form the test split.

        Raises ValueError when train_ratio is outside [0, 1].
        """
        if not 0 <= train_ratio <= 1:
            raise ValueError("train_ratio must be within [0, 1], got %r" % (train_ratio,))

        # convert file to sequence
        records = dataset.values
        print(records.shape)
        seqs = list()
        last_student = -1
        for row in records:
            step = (row[1], row[2])    # (skill, correct)
            # `not seqs` makes the very first row always open a sequence, even
            # if its student id happens to equal the initial sentinel.
            if not seqs or row[0] != last_student:    # a new student
                last_student = row[0]
                seqs.append([step])
            else:     # same student
                seqs[-1].append(step)
        del records

        tot_seqs = min(len(seqs), student_cut)
        print("total: %d sequences" % tot_seqs)

        # split train and test: the first train_size sequences are for training
        train_size = int(tot_seqs * train_ratio)
        self._train_seqs = seqs[:train_size]
        self._test_seqs = seqs[train_size:tot_seqs]
        print("%d records for train" % sum(len(seq) for seq in self._train_seqs))
        print("%d records for test" % sum(len(seq) for seq in self._test_seqs))

        # takes around 2GB memory:
        self._train_inputs = []
        self._train_labels = []
        self.generate_batch(self._train_seqs, self._train_inputs, self._train_labels)

        self._test_inputs = []
        self._test_labels = []
        self.generate_batch(self._test_seqs, self._test_inputs, self._test_labels)

        print("all batch generated")

        # Cursors start before the first batch; each get_*_batch call advances.
        self._train_cursor = -1
        self._test_cursor = -1

    def get_train_batch_num(self):
        """Return the number of training batches."""
        return len(self._train_inputs)

    def get_test_batch_num(self):
        """Return the number of test batches."""
        return len(self._test_inputs)

    def get_train_batch(self):
        """Return the next training batch as (inputs, labels), wrapping around at the end."""
        self._train_cursor += 1
        if self._train_cursor == len(self._train_inputs):
            self._train_cursor = 0
        return self._train_inputs[self._train_cursor], self._train_labels[self._train_cursor]

    def get_test_batch(self):
        """Return the next test batch as (inputs, labels), wrapping around at the end."""
        self._test_cursor += 1
        if self._test_cursor == len(self._test_inputs):
            self._test_cursor = 0
        return self._test_inputs[self._test_cursor], self._test_labels[self._test_cursor]

    @staticmethod
    def _slot_from_back(back_offset):
        """Map a 1-based distance from the sequence end to (window, frame) indices.

        The window index is negative (counted from the last window). The frame
        index is non-negative: Python's modulo of a negative number is
        non-negative, so distance 1 maps to the last frame and a distance that
        is a multiple of time_window maps to frame 0.
        """
        window_offset = - int(math.ceil(float(back_offset) / time_window))
        frame_offset = (- back_offset) % time_window
        return window_offset, frame_offset

    def generate_batch(self, seqs_pool, inputs, labels):
        """Append one padded, windowed batch per ``batch_size`` sequences.

        seqs_pool: list of sequences, each a list of (skill, correct) pairs.
        inputs, labels: lists that are extended in place (one entry per
            batch); nothing is returned.

        Sequences are right-aligned: the last record of every sequence sits in
        the last frame of the last window, and shorter sequences are padded
        with zeros at the front. A label row one-hot encodes the skill and
        stores ``correct`` at index ``num_skills``. An input row one-hot
        encodes the action ``2 * skill + correct`` and is placed one time step
        after its record, so the input at a step describes the previous action.
        """
        seq_count = len(seqs_pool)
        for start in range(0, seq_count, batch_size):
            end = min(seq_count, start + batch_size)
            maxlen = max(len(seqs_pool[i]) for i in range(start, end))
            num_window = int(math.ceil(float(maxlen) / time_window))

            # setup empty data (i.e., padded with full 0s)
            batch_inputs = [[np.zeros([batch_size, num_actions], dtype=np.float32)
                             for _ in range(time_window)]
                            for _ in range(num_window)]
            batch_labels = [[np.zeros([batch_size, num_labels], dtype=np.float32)
                             for _ in range(time_window)]
                            for _ in range(num_window)]
            inputs.append(batch_inputs)
            labels.append(batch_labels)

            # fill in data
            for pos_in_batch, seq in enumerate(seqs_pool[start:end]):
                # from back to front
                for back_offset in range(1, len(seq) + 1):
                    skill, correct = seq[- back_offset]

                    # code the record by setting ones
                    window, frame = self._slot_from_back(back_offset)
                    label_row = batch_labels[window][frame][pos_in_batch]
                    label_row[skill] = 1
                    label_row[num_skills] = correct

                    # skew input backward 1 time step; the last record of a
                    # sequence has no following step, so it is never an input
                    if back_offset == 1:
                        continue
                    in_window, in_frame = self._slot_from_back(back_offset - 1)
                    batch_inputs[in_window][in_frame][pos_in_batch][2 * skill + correct] = 1

The following session trains and runs the LSTM.

In [None]:
# Running Specifications
train_ratio = 0.6       # share of the sequences that goes to the training split
test_frequency = 1      # run the test pass every `test_frequency` epochs
num_epochs = 50         # number of passes over the training batches

# Builds every train/test batch up front.
data_generator = DataGenerator(dataset=dataset, train_ratio=train_ratio)

(338001, 3)
total: 4163 sequences
206404 records for train
131597 records for test
all batch generated


In [None]:
def _window_feed_dict(input_window, label_window, with_targets=True):
    """Build the feed dict for one window of a batch.

    input_window, label_window: lists of time_window frames, as produced by
        the data generator.
    with_targets: when True the correct/wrong column of the labels is fed as
        the target; when False zeros are fed instead (used for prediction,
        where the target is not needed).
    """
    no_targets = np.zeros([batch_size, ])
    feed_dict = dict()
    for step in range(time_window):
        feed_dict[inputs[step]] = input_window[step]
        feed_dict[question_labels[step]] = label_window[step][:, 0:num_skills]
        if with_targets:
            feed_dict[action_labels[step]] = label_window[step][:, num_skills]
        else:
            feed_dict[action_labels[step]] = no_targets
    return feed_dict


with tf.Session(graph=graph) as session:
    # Initialize
    tf.initialize_all_variables().run()
    # NOTE: despite its name this accumulates the SUM of the per-window losses
    # of one epoch; it is printed and reset at the end of every epoch.
    mean_loss = 0
    for epoch in range(num_epochs):
        for batch_no in range(data_generator.get_train_batch_num()):
            batch_inputs, batch_labels = data_generator.get_train_batch()
            reset_state.run()    # new sequence
            for input_window, label_window in zip(batch_inputs, batch_labels):
                feed_dict = _window_feed_dict(input_window, label_window)
                _, l = session.run([optimizer, loss], feed_dict=feed_dict)
                mean_loss += l

        print("epoch " + str(epoch) + ": loss = " + str(mean_loss))
        mean_loss = 0

        if epoch % test_frequency == 0:
            pred_all = []
            truth_all = []
            for batch_no in range(data_generator.get_test_batch_num()):
                batch_inputs, batch_labels = data_generator.get_test_batch()
                reset_state.run()
                for input_window, label_window in zip(batch_inputs, batch_labels):
                    feed_dict = _window_feed_dict(input_window, label_window, with_targets=False)
                    pred = prediction.eval(feed_dict)
                    # Stack the frames in the same order the graph concatenates
                    # its per-step outputs, so rows line up with `pred`.
                    label_all = np.concatenate(label_window, axis=0)
                    # Exclude padded actions: a padded row is all zeros.
                    is_real = np.sum(label_all, axis=1) != 0
                    pred_all.extend(pred[is_real])
                    truth_all.extend(label_all[is_real, num_skills])
            print("Test AUC = " + str(metrics.roc_auc_score(truth_all, pred_all)) + "    ")

epoch 0: loss = 169.199989498
Test AUC = 0.694248942971    
epoch 1: loss = 168.180951178
Test AUC = 0.758151402947    
epoch 2: loss = 167.458352327
Test AUC = 0.776980935197    
epoch 3: loss = 167.064126432
Test AUC = 0.78581568835    
epoch 4: loss = 166.804977894
Test AUC = 0.792739664357    
epoch 5: loss = 166.616363168
Test AUC = 0.79761515851    
epoch 6: loss = 166.471256316
Test AUC = 0.802961889688    
epoch 7: loss = 166.332275212
Test AUC = 0.806399543644    
epoch 8: loss = 166.206695676
Test AUC = 0.80953400805    
epoch 9: loss = 166.08298558
Test AUC = 0.811898844887    
epoch 10: loss = 166.017994046
Test AUC = 0.811594235753    
epoch 11: loss = 165.930454969
Test AUC = 0.814531147132    
epoch 12: loss = 165.844239295
Test AUC = 0.81505496762    
epoch 13: loss = 165.779064178
Test AUC = 0.815770047319    
epoch 14: loss = 165.706398845
Test AUC = 0.816363040141    
epoch 15: loss = 165.610882163
Test AUC = 0.816819624032    
epoch 16: loss = 165.540887356
Test AUC

#### Output Log
[Set 1]  
epoch 0: loss = 567.766662002    Test AUC = 0.769160009625    
epoch 1: loss = 561.678450704    Test AUC = 0.792352762107    
epoch 2: loss = 559.633854687    Test AUC = 0.802843348663    
epoch 3: loss = 558.392439961    Test AUC = 0.809475230732    
epoch 4: loss = 557.570638657    Test AUC = 0.813102115054    
epoch 5: loss = 556.914604604    Test AUC = 0.814558511321    
epoch 6: loss = 556.38486594     Test AUC = 0.814910906558    
epoch 7: loss = 555.848721504    Test AUC = 0.815193798888    
epoch 8: loss = 555.357498288    Test AUC = 0.815226028808    
epoch 9: loss = 554.890223265    Test AUC = 0.814296198834    
epoch 10: loss = 554.397933245   Test AUC = 0.81007721421    

[Set 2]  
epoch 0: loss = 332.910510778
Test AUC = 0.674394648502    
epoch 1: loss = 330.410343766
Test AUC = 0.769532576628    
epoch 2: loss = 328.604476213
Test AUC = 0.789874732514    
epoch 3: loss = 327.884168208
Test AUC = 0.799412435034    
epoch 4: loss = 327.37021488
Test AUC = 0.80563279165    
epoch 5: loss = 326.957753122
Test AUC = 0.809847454097    
epoch 6: loss = 326.599114776
Test AUC = 0.814033516832    
epoch 7: loss = 326.286964297
Test AUC = 0.816342685584    
epoch 8: loss = 326.063906908
Test AUC = 0.816886560374    
epoch 9: loss = 325.838874578
Test AUC = 0.817185123433    
epoch 10: loss = 325.609891176
Test AUC = 0.817061722639    
epoch 11: loss = 325.378716528
Test AUC = 0.816330687684    
epoch 12: loss = 325.160521328
Test AUC = 0.815502604237  

[Set 3]   
epoch 0: loss = 332.835750043
Test AUC = 0.718647725378    
epoch 1: loss = 330.070697308
Test AUC = 0.777552152737    
epoch 2: loss = 328.405154288
Test AUC = 0.790903209804    
epoch 3: loss = 327.849017143
Test AUC = 0.800270027838    
epoch 4: loss = 327.271425068
Test AUC = 0.807289911952    
epoch 5: loss = 326.908729076
Test AUC = 0.811421508002    
epoch 6: loss = 326.619960666
Test AUC = 0.813599784297    
epoch 7: loss = 326.336533487
Test AUC = 0.815815462967    
epoch 8: loss = 326.063607395
Test AUC = 0.817161858418    
epoch 9: loss = 325.825604141
Test AUC = 0.81731912728    
epoch 10: loss = 325.611338079
Test AUC = 0.816776815415    
epoch 11: loss = 325.375428975
Test AUC = 0.815700919001    
epoch 12: loss = 325.164224088
Test AUC = 0.814964095224    
epoch 13: loss = 324.97730583
Test AUC = 0.81452376418   

[Set 4]   
Forgot to copy ...

[Set 5]
Not so good, didn't copy again ...
