In [1]:
import gc
import psutil
import joblib
import random
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import math

In [2]:
TRAIN_SAMPLES = 320000

MAX_SEQ = 100
MIN_SAMPLES = 5
EMBED_DIM = 128
DROPOUT_RATE = 0.2
LEARNING_RATE = 1e-3
MAX_LEARNING_RATE = 2e-3
# EPOCHS = 30
EPOCHS = 10
# TRAIN_BATCH_SIZE = 2048
TRAIN_BATCH_SIZE = 64

In [3]:
# from transformers import AutoTokenizer, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

# inputs = tokenizer("Hello world!", return_tensors="pt")
# outputs = model(**inputs)

In [4]:
%%time

dtypes = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}
# train_df = pd.read_feather('../input/riiid-cross-validation-dataset/train.feather')[[
#     'timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly'
# ]]
train_df = pd.read_csv('./input/riiid-test-answer-prediction/train.csv')[['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly']]
for col, dtype in dtypes.items():
    train_df[col] = train_df[col].astype(dtype)
    
    
#train_df have only rows with False in content_type_id (0 if the event was a question being posed to the user)
train_df = train_df[train_df.content_type_id == False]  

train_df = train_df.sort_values(['timestamp'], ascending=True)
train_df.reset_index(drop=True, inplace=True)


skills = train_df["content_id"].unique()
# joblib.dump(skills, "skills.pkl.zip")
n_skill = len(skills)  # (unique content IDs)
print("number skills", n_skill)


group = train_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
# joblib.dump(group, "group.pkl.zip")  # Save models?
del train_df
gc.collect()
# group

# The training data is sorted by timestamp and split into two sets using an 80/20 split
TRAIN_SAMPLES = int(len(group.index)*0.8)
print('TRAIN_SAMPLES',TRAIN_SAMPLES)


# The method then creates a dictionary of samples, where each key is a user ID and the corresponding value is a tuple containing the user's content IDs and answered correctly values.
train_indexes = list(group.index)[:TRAIN_SAMPLES]
valid_indexes = list(group.index)[TRAIN_SAMPLES:]
train_group = group[group.index.isin(train_indexes)]
valid_group = group[group.index.isin(valid_indexes)]
print('train_group \n', train_group[:5] )
print('valid_group \n', valid_group[:5] )

del group, train_indexes, valid_indexes
print(len(train_group), len(valid_group))

number skills 13523
TRAIN_SAMPLES 314924
train_group 
 user_id
115     ([5692, 5716, 128, 7860, 7922, 156, 51, 50, 78...
124     ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
2746    ([5273, 758, 5976, 236, 404, 382, 405, 873, 53...
5382    ([5000, 3944, 217, 5844, 5965, 4990, 5235, 605...
8623    ([3915, 4750, 6456, 3968, 6104, 5738, 6435, 54...
dtype: object
valid_group 
 user_id
1720820513    ([3849, 1320, 5285, 8918, 3644, 6111, 8397, 94...
1720823127    ([7900, 7876, 175, 1278, 2064, 2065, 2063, 336...
1720823509    ([128, 7860, 7922, 156, 51, 50, 7896, 7863, 15...
1720827508    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
1720827841    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
dtype: object
314924 78732
CPU times: user 55.6 s, sys: 8.47 s, total: 1min 4s
Wall time: 1min 4s


### The BERTDataset class returns three values for each sample in the dataset:

- `x`: the input data for the BERT model, which consists of the content IDs for each sample shifted by one and the answered correctly values added to the content IDs
- `target_id`: the target IDs for each sample, which consist of the content IDs shifted by one
- `label`: the labels for each sample, which consist of the answered correctly values shifted by one

These values are used as input to the BERT model and are used to calculate the model's performance during training and evaluation. The `x` and `target_id` arrays are used as input to the BERT model, while the `label` array is used to calculate the model's loss and accuracy.

The __init__ method iterates over the users in the `group` object and retrieves the questions and answers for each user. If a user has answered fewer than `min_samples` questions, their questions and answers are not included in the `samples` dictionary. If a user has answered more than `max_seq` questions, their questions and answers are split into multiple sequences of length `max_seq` and each sequence is added to the `samples` dictionary using a unique key that includes the user's ID and the sequence number. For example, if the user's ID is `123` and they have answered 150 questions, their questions and answers will be split into two sequences with lengths 128 and 22, and the keys `123_0` and `123_1` will be added to the samples dictionary with the values of the first and second sequence, respectively.


In [5]:
class BERTDataset(Dataset):
    def __init__(self, group, n_skill, min_samples=1, max_seq=128):
        super(BERTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = {}
        
        self.user_ids = []
        for user_id in group.index:
            q, qa = group[user_id]  # q:content_id(questions); qa:answered_correctly(user's question answer)
            if len(q) < min_samples:  # If a user has answered fewer than min_samples questions, their questions and answers are not included in the 'samples' dictionary
                continue 
            
            # Main Contribution
            if len(q) > self.max_seq:
                total_questions = len(q)
                initial = total_questions % self.max_seq
                if initial >= min_samples:
                    self.user_ids.append(f"{user_id}_0")
                    self.samples[f"{user_id}_0"] = (q[:initial], qa[:initial])
                for seq in range(total_questions // self.max_seq):
                    self.user_ids.append(f"{user_id}_{seq+1}")
                    start = initial + seq * self.max_seq
                    end = start + self.max_seq
                    self.samples[f"{user_id}_{seq+1}"] = (q[start:end], qa[start:end])
            else:
                user_id = str(user_id)
                self.user_ids.append(user_id)
                self.samples[user_id] = (q, qa)
    
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        q_, qa_ = self.samples[user_id]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len == self.max_seq:
            q[:] = q_
            qa[:] = qa_
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_
        
        # 'x' also has a length of max_seq-1
        target_id = q[1:]  
        label = qa[1:]
        
        x = np.zeros(self.max_seq-1, dtype=int)
        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill  # the model needs to be able to distinguish between the question IDs and the correct answers in order to make predictions.
          
        return x, target_id, label
    
    
train_dataset = BERTDataset(train_group, n_skill, min_samples=MIN_SAMPLES, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=8)
valid_dataset = BERTDataset(valid_group, n_skill, max_seq=MAX_SEQ)
valid_dataloader = DataLoader(valid_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False, num_workers=8)


## Define model



- the `d_model` parameter specifies the size of the hidden states used by the model. This is also known as the "model size" or the "embedding size" of the model.
- The `d_model` parameter is used as a scaling factor when computing the dot product between the query and key vectors in the Attention mechanism. It is also used to specify the size of the input and output vectors for the linear layers in the MultiHeadedAttention class, as well as the size of the input and output vectors for the LayerNorm and SublayerConnection classes. In general, a larger d_model value will result in a more expressive BERT model, but will also increase the computational complexity and memory usage of the model.



### Sublayer Connection:
- ### residual connection:
    - A residual connection is a type of connection in a neural network that allows information to bypass one or more layers of the network. This allows the network to learn to perform tasks more efficiently by allowing the information to flow more directly from the input to the output layers. Residual connections can help improve the performance of the network, particularly on tasks that require the network to process long sequences of data. They are often used in deep learning networks, where they can help prevent the vanishing gradient problem, allowing the network to learn more effectively.
    
- ### LayerNorm:
    - the `eps` parameter specifies a small value used to stabilize the division operation in the layer normalization computation. This is necessary because division by zero is undefined, and division by a very small value can lead to numerical instability.

In [15]:
class FFN(nn.Module):
    def __init__(self, state_size=200):
        super(FFN, self).__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


class SAKTModel(nn.Module):
    def __init__(self, n_skill, max_seq=128, embed_dim=128, dropout_rate=0.2):
        super(SAKTModel, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim
        
# self.embedding: an nn.Embedding layer that maps the input sequence of student responses to a sequence of vectors of dimension embed_dim
# self.pos_embedding: an nn.Embedding layer that maps the positions of each token in the student response to a vector of dimension embed_dim
# self.e_embedding: an nn.Embedding layer that maps the input sequence of question IDs to a sequence of vectors of dimension embed_dim
# self.multi_att: an nn.MultiheadAttention layer that computes attention weights for each student response
        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=dropout_rate)

        self.dropout = nn.Dropout(dropout_rate)
        self.layer_normal = nn.LayerNorm(embed_dim) 

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)
    
    def forward(self, x, question_ids):
        device = x.device        
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)

        pos_x = self.pos_embedding(pos_id)
        x = x + pos_x

        e = self.e_embedding(question_ids)

        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = e.permute(1, 0, 2)
        att_mask = future_mask(x.size(0)).to(device)
        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)
#         att_output = self.multi_att(e, x, x, attn_mask=att_mask)
        att_output = self.layer_normal(att_output + e)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1)

In [16]:
n_skill

13523

In [17]:
model = SAKTModel(n_skill, max_seq=MAX_SEQ, embed_dim=EMBED_DIM, dropout_rate=DROPOUT_RATE)
model

SAKTModel(
  (embedding): Embedding(27047, 128)
  (pos_embedding): Embedding(99, 128)
  (e_embedding): Embedding(13524, 128)
  (multi_att): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (layer_normal): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (ffn): FFN(
    (lr1): Linear(in_features=128, out_features=128, bias=True)
    (relu): ReLU()
    (lr2): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (pred): Linear(in_features=128, out_features=1, bias=True)
)

## Define Train process

In [9]:
def train_fn(model, dataloader, optimizer, scheduler, criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
#         x = item[0]
#         print('x',x, 'x.size()',x.size())
        x = item[0].to(device).long()
        segment_info = item[1].to(device).long()
        label = item[2].to(device).float()

        
        optimizer.zero_grad()
        output = model(x, segment_info)
#         print('output',output, 'output.size()',output.size())
        
        
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss.append(loss.item())
        
        
        target_mask = (segment_info != 0)  # Create a mask indicating which values in segment_info are not 0
        last_nonzero_idx = target_mask.sum(dim=1) - 1  # Find the last non-zero value for each batch
        
        output = output[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the output tensor using last_nonzero_idx
        label = label[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the label tensor using last_nonzero_idx

#         print('output = output[range(64), last_nonzero_idx]', output,'output.size()',output.size())
    
        pred = (torch.sigmoid(output) >= 0.5).long()

        
        
        
#         output = torch.masked_select(output, target_mask)
#         label = torch.masked_select(label, target_mask)
#         pred = (torch.sigmoid(output) >= 0.5).long()
        
#         print('output = torch.masked_select(output, target_mask)', output,'output.size()',output.size())
        
#         print('label = torch.masked_select(label, target_mask)', label,'label.size()',label.size())
#         print('pred = (torch.sigmoid(output) >= 0.5).long()', pred, 'pred.size()',pred.size())
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

## Define Test process

In [10]:
def valid_fn(model, dataloader, criterion, device="cpu"):
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
        x = item[0].to(device).long()
        segment_info = item[1].to(device).long()
        label = item[2].to(device).float()
    
#         target_mask = (segment_info != 0)

        output= model(x, segment_info)
        loss = criterion(output, label)
        valid_loss.append(loss.item())

        
        
        target_mask = (segment_info != 0)  # Create a mask indicating which values in segment_info are not 0
        
        last_nonzero_idx = target_mask.sum(dim=1) - 1  # Find the last non-zero value for each batch
        
        output = output[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the output tensor using last_nonzero_idx
        label = label[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the label tensor using last_nonzero_idx

#         output = torch.masked_select(output, target_mask)
#         label = torch.masked_select(label, target_mask)
        pred = (torch.sigmoid(output) >= 0.5).long()
#         pred = (torch.sigmoid(output) >= 0.5)
    
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, auc

## Training

In [18]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=MAX_LEARNING_RATE, steps_per_epoch=len(train_dataloader), epochs=EPOCHS
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

best_auc = 0
max_steps = 3
step = 0
for epoch in range(EPOCHS):
    loss, acc, auc = train_fn(model, train_dataloader, optimizer, scheduler, criterion, device)
    print("epoch - {}/{} train: - {:.4f} acc - {:.4f} auc - {:.4f}".format(epoch+1, EPOCHS, loss, acc, auc))
    loss, acc, auc = valid_fn(model, valid_dataloader, criterion, device)
    print("epoch - {}/{} valid: - {:.4f} acc - {:.4f} auc - {:.4f}".format(epoch+1, EPOCHS, loss, acc, auc))
    if auc > best_auc:
        best_auc = auc
        step = 0
        torch.save(model.state_dict(), "sakt_model.pt")
    else:
        step += 1
        if step >= max_steps:
            break

epoch - 1/10 train: - 0.4616 acc - 0.7705 auc - 0.8489
epoch - 1/10 valid: - 0.4427 acc - 0.7850 auc - 0.8688
epoch - 2/10 train: - 0.4437 acc - 0.7828 auc - 0.8669
epoch - 2/10 valid: - 0.4416 acc - 0.7868 auc - 0.8707
epoch - 3/10 train: - 0.4418 acc - 0.7835 auc - 0.8681
epoch - 3/10 valid: - 0.4387 acc - 0.7870 auc - 0.8720
epoch - 4/10 train: - 0.4400 acc - 0.7852 auc - 0.8695
epoch - 4/10 valid: - 0.4383 acc - 0.7891 auc - 0.8733
epoch - 5/10 train: - 0.4383 acc - 0.7863 auc - 0.8711
epoch - 5/10 valid: - 0.4360 acc - 0.7898 auc - 0.8739
epoch - 6/10 train: - 0.4367 acc - 0.7877 auc - 0.8724
epoch - 6/10 valid: - 0.4354 acc - 0.7903 auc - 0.8751
epoch - 7/10 train: - 0.4351 acc - 0.7890 auc - 0.8738
epoch - 7/10 valid: - 0.4349 acc - 0.7896 auc - 0.8757
epoch - 8/10 train: - 0.4335 acc - 0.7900 auc - 0.8751
epoch - 8/10 valid: - 0.4351 acc - 0.7912 auc - 0.8760
epoch - 9/10 train: - 0.4322 acc - 0.7909 auc - 0.8761
epoch - 9/10 valid: - 0.4345 acc - 0.7911 auc - 0.8760
epoch - 10

In [None]:
0.7912