In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import numpy.random as rng
import tensorflow as tf

# Recurrent Neural Networks

- RNNs are designed to handle time series data
- An RNN consists of a series of cells as follows:

![alt text](figures/rnn.png)

An RNN cell stores a _hidden state_  $\boldsymbol{h}_t$. The cell takes the input at time $t$ and combines it with the hidden state from time $t-1$ to produce an output $\boldsymbol{o}_t$.

- It is important to note that the _same_ cell is repeated across time; the same weights and the same mathematical operations.
- With RNNs, multiple prediction tasks can be performed:
    - Predict the next output given the previous outputs and inputs
    - Predict a label associated with the sequence
    - Predict the output sequence, given the input sequence (used in tasks such as machine translation)
- There are different kinds of RNN cells, but the best known are:
    - The simple (vanilla) RNN cell
    - Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) cells, which mitigate the vanishing-gradient problem of simple RNN cells

In this module, we'll apply RNNs to the task of student performance prediction. Here, we are given the following dataset:


## Loading the data

In [85]:
# Load the synthetic student-response data into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer="data/synthetic.csv")
df

Unnamed: 0,correct,skill,student,problem,skill_name
0,0,0,0,0,0
1,1,1,0,1,1
2,1,2,0,2,2
3,1,3,0,3,3
4,0,4,0,4,4
...,...,...,...,...,...
199995,1,45,3999,45,45
199996,1,46,3999,46,46
199997,1,47,3999,47,47
199998,1,48,3999,48,48


In [86]:
# Overall proportion of correct responses.
mean_correct = np.mean(df['correct'], axis=0)
print("Mean correct: %0.2f" % mean_correct)

# Number of distinct students and distinct skills in the data.
n_students = len(set(df['student']))
n_distinct_skills = len(set(df['skill']))
print("Number of students: %d, Skills: %d" % (n_students, n_distinct_skills))

Mean correct: 0.67
Number of students: 4000, Skills: 50


## Splitting into Sequences

In [87]:
# Group the rows by student, keeping file order: each student maps to the
# list of (skill, correct) pairs they produced.
sequences_by_student = defaultdict(list)
for student_id, skill_id, was_correct in zip(df['student'], df['skill'], df['correct']):
    sequences_by_student[student_id].append((skill_id, was_correct))

In [88]:
# Length of every student's sequence; the student ids themselves are not needed.
seq_lens = list(map(len, sequences_by_student.values()))
print(
    "Median sequence length: %d, Range: %d-%d"
    % (np.median(seq_lens), np.min(seq_lens), np.max(seq_lens))
)

Median sequence length: 50, Range: 50-50


In [89]:
# we don't care about student id, just that a sequence comes from the same student
all_seqs = list(sequences_by_student.values())

# Seed NumPy's global generator before shuffling so that the train/test split
# (and everything computed from it) is the same on every Restart & Run All.
rng.seed(42)

# shuffle all sequences (reorders the outer list in place)
rng.shuffle(all_seqs)

# split into training and testing sequences
p_train = 0.8
n_train = int(p_train * len(all_seqs))

train_seqs = all_seqs[:n_train]
test_seqs = all_seqs[n_train:]

# Every sequence must end up in exactly one of the two sets.
assert len(train_seqs) + len(test_seqs) == len(all_seqs)

print("Training sequences: %d, testing: %d" % (len(train_seqs), len(test_seqs)))

Training sequences: 3200, testing: 800


### Chopping sequences up

This is not ideal, but to keep the later code simple we'll chop the sequences so that none exceeds 50 trials in length.

In [90]:
def chop_sequences(seqs, max_len):
    """Split every sequence longer than ``max_len`` into consecutive chunks.

    Args:
        seqs: iterable of sliceable sequences (e.g. lists of trials).
        max_len: maximum allowed length of a returned sequence; must be a
            positive integer.

    Returns:
        A new list. Sequences of length <= ``max_len`` (including empty
        ones) are passed through unchanged; longer ones are replaced, in
        order, by chunks of ``max_len`` elements with a shorter final chunk
        holding any remainder.

    Raises:
        ValueError: if ``max_len`` is smaller than 1. Such a value can never
            shorten a sequence, so chopping would not terminate.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer, got %r" % (max_len,))

    new_seqs = []
    for seq in seqs:
        if len(seq) <= max_len:
            # Short enough already: keep the sequence as is.
            new_seqs.append(seq)
        else:
            # Consecutive windows of max_len; the last one may be shorter.
            new_seqs.extend(
                seq[start:start + max_len]
                for start in range(0, len(seq), max_len)
            )

    return new_seqs

# Apply the same 50-trial cap to both the training and the testing sequences.
train_seqs, test_seqs = [
    chop_sequences(seqs, 50) for seqs in (train_seqs, test_seqs)
]

n_train_chopped = len(train_seqs)
n_test_chopped = len(test_seqs)
print("After limiting to 50 trials, training sequences: %d, testing: %d" % (n_train_chopped, n_test_chopped))

After limiting to 50 trials, training sequences: 3200, testing: 800


### Padding

We'll now pad all sequences so that they are the same length (50 trials). This is necessary for RNN training.

In [91]:
# Pad after the last real trial with -1 (not a valid skill or correctness value).
# maxlen is pinned to 50 -- the cap applied by chop_sequences above -- so both
# arrays always have 50 time steps. Without it each set is padded only to its
# own longest sequence, and the two widths could differ from each other and
# from the 50 time steps the model is built with.
padded_train_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    train_seqs, maxlen=50, padding="post", value=-1)
padded_test_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    test_seqs, maxlen=50, padding="post", value=-1)

## Feature Transformation

- The input to the RNN cell consists of three things:
    - The skill that was practiced at the previous time step
    - Whether the student answered correctly or not at the previous time step
    - The skill at the current time step
- As you may have noticed, while the skill is numerically coded, it is a qualitative variable. So we have to use a one-hot encoding representation of it.

In [92]:
def transform_seqs(batch_seqs, n_skills):
    """ Performs feature transformation on a batch of sequences.

        Input:
            batch_seqs: integer array with shape (B, N, 2)
                        where B is # of sequences in batch
                              N is # of trials
                        Entry [b, t] is (skill, correct). Trials whose skill
                        is negative (the -1 padding value) are treated as
                        padding.
            n_skills:   number of skills S; valid skill ids are 0 .. S-1

        Output: tuple (transformed, y)
            transformed: float array with shape (B, N, 2 * S + 1), laid out
                         per trial as
                           [0, S)    one-hot of the previous trial's skill
                           [S, 2S)   one-hot of the current trial's skill
                           2S        correctness of the previous trial
                         The first trial has no predecessor, so only its
                         current-skill slot is set. Padded trials stay all
                         zero.
            y:           float array with shape (B, N, 1) holding the
                         `correct` column of batch_seqs unchanged (padding
                         values included).

        Raises:
            ValueError: if any skill id is >= n_skills (it would otherwise
                        be written into the wrong feature slot).
    """
    batch_seqs = np.asarray(batch_seqs)
    n_batch, n_trials, _ = batch_seqs.shape

    skills = batch_seqs[:, :, 0].astype(int)
    if skills.size and skills.max() >= n_skills:
        raise ValueError(
            "skill id %d is out of range for n_skills=%d" % (skills.max(), n_skills)
        )

    transformed = np.zeros((n_batch, n_trials, 2 * n_skills + 1))
    y = np.zeros((n_batch, n_trials, 1))
    y[:, :, 0] = batch_seqs[:, :, 1]

    for seq_id in range(n_batch):
        for i in range(n_trials):
            curr_skill = skills[seq_id, i]
            if curr_skill < 0:
                # Padded trial: leave the feature vector all zero.
                continue

            # The current skill always goes in the second one-hot block,
            # including at the first trial.
            transformed[seq_id, i, n_skills + curr_skill] = 1

            if i == 0:
                # No previous trial to describe.
                continue

            prev_skill = skills[seq_id, i - 1]
            if prev_skill < 0:
                # Previous trial is padding; a negative index would wrap
                # around into an unrelated slot.
                continue

            transformed[seq_id, i, prev_skill] = 1
            transformed[seq_id, i, 2 * n_skills] = batch_seqs[seq_id, i - 1, 1]

    return transformed, y
#transform_seqs(padded_train_seqs[:5,:,:], 97)

- You may be inclined to apply this operation to all training and testing sequences but your computer would probably run out of memory if you do so
- Consider this: if you have a single sequence with $N=50$ trials and $S=100$ skills, then the input shape for that sequence will be $50 \times 201 = 10050$ elements!
- We'll have to use mini-batch learning and we'll have to take control of the keras training loop ourselves

## Model Training

In [96]:
# Input dimensions: 50 time steps per sequence, and 2 * 97 + 1 features per
# time step -- the width transform_seqs produces for n_skills = 97.
# NOTE(review): the data summary above reports 50 distinct skills; 97 is
# larger than that -- confirm it is intentional.
timesteps = 50
features = 97 * 2 + 1
n_hidden = 100

model = tf.keras.models.Sequential()
# Padded trials are encoded as all-zero feature vectors; mask_value=0. makes
# the downstream layers skip those time steps.
model.add(tf.keras.layers.Masking(mask_value=0.,input_shape=(timesteps, features)))
# return_sequences=True: emit an output at every time step, not just the last.
model.add(tf.keras.layers.LSTM(n_hidden, return_sequences=True))
model.add(tf.keras.layers.Dropout(0.5))
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

# Apply the same sigmoid unit at each time step: one probability per trial.
model.add(tf.keras.layers.TimeDistributed(output_layer))

opt = tf.keras.optimizers.Nadam(learning_rate=0.01)

model.compile(opt, 'binary_crossentropy')

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()


Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_20 (Masking)         (None, 50, 195)           0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 50, 100)           118400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
time_distributed_17 (TimeDis (None, 50, 1)             101       
Total params: 118,501
Trainable params: 118,501
Non-trainable params: 0
_________________________________________________________________
None


In [97]:
n_skills = 97
n_batch = 10
n_epochs = 5

for e in range(n_epochs):
    # Reshuffle the training sequences (in place, along the first axis)
    # at the start of every epoch.
    rng.shuffle(padded_train_seqs)
    for i in range(0, len(padded_train_seqs), n_batch):
        # Features are built one mini-batch at a time rather than for the
        # whole training set at once.
        inputs, outputs = transform_seqs(padded_train_seqs[i:i + n_batch, :, :], n_skills)

        model.train_on_batch(inputs, outputs)

    print("Finished epoch %d" % e)

    # Evaluate on the test sequences after each epoch. The reported value is
    # the SUM of the per-batch losses, not their mean.
    total_loss = 0.0
    for i in range(0, len(padded_test_seqs), n_batch):
        inputs, outputs = transform_seqs(padded_test_seqs[i:i + n_batch, :, :], n_skills)

        total_loss += model.test_on_batch(inputs, outputs)

    print("Loss = ", total_loss)

Finished epoch 0
Loss =  33.509444147348404
Finished epoch 1
Loss =  33.3130176961422
Finished epoch 2
Loss =  33.205979228019714
Finished epoch 3
Loss =  32.98287257552147
Finished epoch 4
Loss =  32.95540598034859


In [98]:
# Gather per-trial predictions and labels over the whole test set, one
# mini-batch at a time, dropping padded trials.
all_preds = []
all_y = []
for batch_start in range(0, len(padded_test_seqs), n_batch):
    batch = padded_test_seqs[batch_start:batch_start + n_batch, :, :]
    inputs, outputs = transform_seqs(batch, n_skills)

    # Keep only trials whose feature vector does not sum to zero; flattened
    # to line up with the flattened predictions and labels below.
    keep = np.reshape(np.sum(inputs, axis=2) != 0, -1)

    flat_preds = np.reshape(model.predict_on_batch(inputs), -1)
    flat_y = np.reshape(outputs, -1)

    all_preds.extend(flat_preds[keep])
    all_y.extend(flat_y[keep])

In [99]:
import sklearn.metrics

In [100]:
# Accuracy of the predictions thresholded at 0.5.
acc = sklearn.metrics.accuracy_score(
    y_true=all_y,
    y_pred=np.array(all_preds) > 0.5,
)
acc

0.803725

In [101]:
# Area under the ROC curve, computed from the raw predicted probabilities.
aucroc = sklearn.metrics.roc_auc_score(y_true=all_y, y_score=all_preds)
aucroc

0.8711255708583006

In [102]:
# Balanced accuracy of the predictions thresholded at 0.5. Stored under its
# own name so it does not overwrite the plain accuracy held in `acc`.
balanced_acc = sklearn.metrics.balanced_accuracy_score(all_y, np.array(all_preds) > 0.5)
balanced_acc

0.7755635518638788

In [103]:
# Confusion matrix of true labels against predictions thresholded at 0.5.
cm = sklearn.metrics.confusion_matrix(
    y_true=all_y,
    y_pred=np.array(all_preds) > 0.5,
)
cm

array([[ 8833,  3802],
       [ 4049, 23316]], dtype=int64)