In [None]:
import os
import re
import pickle
import glob
import numpy as np
import pandas as pd

from torch.utils.data import Dataset



# Root directory used as the default `datset_dir` of `sdata`: the source CSV is
# looked up directly under it and the pickled preprocessing cache is written
# into sub-directories of it.
DATASET_DIR = "/content/drive/MyDrive"

# Make the q (question) and r (response) sequences a uniform length.
def match_seq_len(q_seqs, r_seqs, seq_len, pad_val=-1):
    """Cut every (question, response) sequence pair into fixed-size chunks.

    Each sequence is split into consecutive, non-overlapping chunks of
    ``seq_len + 1`` items. The last chunk of every sequence is right-padded
    with ``pad_val`` so that it also has ``seq_len + 1`` items.

    Args:
        q_seqs: iterable of question-index sequences.
        r_seqs: iterable of response sequences, paired one-to-one with
            ``q_seqs``. Each pair is assumed to have equal length; the chunk
            boundaries and the padding length are computed from the question
            sequence only.
        seq_len: chunk length minus one (each chunk holds ``seq_len + 1``
            items).
        pad_val: value used to fill the tail of the last chunk.

    Returns:
        ``(proc_q_seqs, proc_r_seqs)``: two lists of chunks in input order.
        Full chunks are plain slices of the input sequence; the last chunk of
        each sequence is a NumPy array.
    """
    chunk_len = seq_len + 1
    proc_q_seqs = []
    proc_r_seqs = []

    for q_seq, r_seq in zip(q_seqs, r_seqs):
        i = 0
        # Strict '<': a remainder of exactly chunk_len items is handled by the
        # tail branch below (with zero padding), not by this loop.
        while i + chunk_len < len(q_seq):
            proc_q_seqs.append(q_seq[i:i + chunk_len])
            proc_r_seqs.append(r_seq[i:i + chunk_len])
            i += chunk_len

        pad_len = i + chunk_len - len(q_seq)
        # np.full takes its dtype from pad_val even when pad_len == 0, whereas
        # np.array([]) is float64 and would silently promote the chunk to
        # floats whenever no padding is needed.
        padding = np.full(pad_len, pad_val)
        proc_q_seqs.append(np.concatenate([q_seq[i:], padding]))
        proc_r_seqs.append(np.concatenate([r_seq[i:], padding]))

    return proc_q_seqs, proc_r_seqs



class sdata(Dataset):
    """Per-student question/response sequences built from a CSV interaction log.

    On first use the CSV at ``<datset_dir>/.csv`` is preprocessed and the
    results are pickled into a cache directory derived from the filter
    arguments. Later instantiations with the same arguments load the cache
    instead of reading the CSV again.

    Attributes set by ``__init__``:
        q_seqs / r_seqs: question-index and correctness sequences (chunked and
            padded by ``match_seq_len`` when ``seq_len`` is truthy).
        q_list / u_list: arrays of the distinct units and member ids.
        q2idx / u2idx: mappings from unit / member id to integer index.
        num_q / num_u: number of distinct units / members.
        len: number of sequences served by the dataset.
    """

    # File stems of the cached artefacts, in the order preprocess() returns them.
    _CACHE_NAMES = ("q_seqs", "r_seqs", "q_list", "u_list", "q2idx", "u2idx")

    # window = (start_unit, end_unit)
    def __init__(self, seq_len, unit_depth=5, datset_dir=DATASET_DIR, window=None, subject_t=None, test_t=None) -> None:
        """Load the cached sequences, or build them from the CSV.

        Args:
            seq_len: when truthy, sequences are cut into chunks of
                ``seq_len + 1`` items via ``match_seq_len``.
            unit_depth: suffix of the ``chapter_nm<depth>`` column used as the
                unit; also part of the cache path when ``<= 5``.
            datset_dir: root directory holding the CSV and the cache.
            window: optional ``(start_unit, end_unit)`` pair restricting the
                units that are kept.
            subject_t: optional value that must be contained in ``subject_cd``.
            test_t: optional value that ``test_kind_cd`` must equal.
        """
        super().__init__()
        self.dataset_dir = datset_dir
        self.dataset_pdir = datset_dir
        self.subject_t = subject_t
        self.seq_len = seq_len
        self.unit_depth = unit_depth
        self.test_t = test_t
        self.window = window

        # The cache directory encodes every filter so that differently
        # filtered datasets never share pickles.
        if self.subject_t:
            self.dataset_dir = os.path.join(self.dataset_dir, f"subject_{self.subject_t}")
        if self.test_t:
            self.dataset_dir = os.path.join(self.dataset_dir, f"test_{self.test_t}")
        if self.unit_depth <= 5:
            self.dataset_dir = os.path.join(self.dataset_dir, f"unit_depth_{unit_depth}")

        self.dataset_path = os.path.join(self.dataset_pdir, ".csv")

        if self.window:
            self.dataset_dir = os.path.join(
                self.dataset_dir, f"{window[0]}_{window[1]}"
            )

        os.makedirs(self.dataset_dir, exist_ok=True)

        # The presence of q_seqs.pkl is taken as proof that the whole cache exists.
        if os.path.exists(self._cache_file("q_seqs")):
            cached = self._load_cache()
        else:
            cached = self.preprocess()
        self.q_seqs, self.r_seqs, self.q_list, self.u_list, self.q2idx, \
            self.u2idx = cached

        self.num_u = self.u_list.shape[0]
        self.num_q = self.q_list.shape[0]

        if self.seq_len:
            self.q_seqs, self.r_seqs = \
                match_seq_len(self.q_seqs, self.r_seqs, self.seq_len)

        self.len = len(self.q_seqs)

    def __getitem__(self, index):
        """Return the ``(q_seq, r_seq)`` pair stored at ``index``."""
        return self.q_seqs[index], self.r_seqs[index]

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return self.len

    def _cache_file(self, stem):
        """Return the path of the cache pickle called ``stem``."""
        return os.path.join(self.dataset_dir, f"{stem}.pkl")

    def _load_cache(self):
        """Load every cached artefact, in ``_CACHE_NAMES`` order."""
        loaded = []
        for stem in self._CACHE_NAMES:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # point dataset_dir at caches this class wrote itself.
            with open(self._cache_file(stem), "rb") as f:
                loaded.append(pickle.load(f))
        return tuple(loaded)

    def _save_cache(self, artefacts):
        """Pickle ``artefacts`` (given in ``_CACHE_NAMES`` order) to the cache."""
        for stem, obj in zip(self._CACHE_NAMES, artefacts):
            with open(self._cache_file(stem), "wb") as f:
                pickle.dump(obj, f)

    def preprocess(self):
        """Build the sequences from the CSV and write them to the cache.

        Reads the columns ``memb_no``, ``chg_dt``, ``chapter_nm``,
        ``chapter_nm<unit_depth>``, ``ox_yn``, ``subject_cd``, ``chapter_cd``
        and (when ``test_t`` is set) ``test_kind_cd``.

        Returns:
            ``(q_seqs, r_seqs, q_list, u_list, q2idx, u2idx)``.
        """
        # Sorting by chg_dt makes every per-member sequence follow that column.
        df = pd.read_csv(self.dataset_path)\
            .dropna(subset=["memb_no", "chg_dt", "chapter_nm"]).sort_values(by=["chg_dt"])

        if self.subject_t is not None:
            df = df[df["subject_cd"].apply(lambda x: self.subject_t in x)]
        if self.test_t is not None:
            df = df[df["test_kind_cd"].apply(lambda x: x == self.test_t)]
        df["memb_no"] = df["memb_no"].astype("str")
        df["unit"] = df[f"chapter_nm{self.unit_depth}"].astype("str")
        df["correct"] = df["ox_yn"].apply(lambda x: 1 if x == "O" else 0)

        # De-duplicate unit names: keep the text after the last '.', then drop
        # parenthesised numbers such as "(2)".
        df["unit"] = df["unit"].apply(lambda x: x.split(".")[-1].strip()).apply(lambda x: re.sub(r'\(\s*\d+\s*\)', '', x).strip())

        if self.window is not None:
            (start, end) = self.window
            # Element-wise '&' is required here; Python's 'and' on two Series
            # raises "truth value of a Series is ambiguous".
            # NOTE(review): "unit" is a string column at this point, so the
            # comparison is lexicographic -- confirm the window bounds are
            # meant to be compared that way.
            df = df[(df["unit"] >= start) & (df["unit"] <= end)]

        u_list = np.unique(df["memb_no"].values)
        q_list = df.sort_values(by=["subject_cd", "chapter_cd"])['unit'].unique()

        u2idx = {u: idx for idx, u in enumerate(u_list)}
        q2idx = {q: idx for idx, q in enumerate(q_list)}

        q_seqs = []
        r_seqs = []
        for u in u_list:
            u_df = df[df["memb_no"] == u]

            q_seqs.append([q2idx[q] for q in u_df["unit"].values])
            r_seqs.append(u_df["correct"].values)

        artefacts = (q_seqs, r_seqs, q_list, u_list, q2idx, u2idx)
        self._save_cache(artefacts)

        return q_seqs, r_seqs, q_list, u_list, q2idx, u2idx


if __name__ == "__main__":
    # Script entry point: build (or load from cache) the dataset with
    # seq_len=100 at unit depth 5.
    dataset = sdata(seq_len=100, unit_depth=5)


  df = pd.read_csv(self.dataset_path)\


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Bind FloatTensor to the constructor matching the available device, so the
# tensors built by the collate function live where the model runs.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor
    # NOTE(review): set_default_tensor_type is reported as deprecated in recent
    # PyTorch releases (torch.set_default_dtype / torch.set_default_device are
    # the suggested replacements) -- confirm against the installed version
    # before migrating.
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
    from torch import FloatTensor


In [None]:
def collate_fn(batch, pad_val=-1):
    """Collate ``(q_seq, r_seq)`` pairs into padded, masked next-step tensors.

    For every pair the sequence without its last item is the input and the
    sequence without its first item is the one-step-shifted target. All four
    resulting lists are padded with ``pad_val`` to a common length (batch
    first). Positions where either the input question or the shifted question
    equals ``pad_val`` are masked out and zeroed in all four tensors.

    Args:
        batch: iterable of ``(q_seq, r_seq)`` pairs.
        pad_val: padding marker, both in the incoming sequences and for
            ``pad_sequence``.

    Returns:
        ``(q_seqs, r_seqs, qshft_seqs, rshft_seqs, mask_seqs)`` where
        ``mask_seqs`` is a boolean tensor marking the valid positions.
    """
    q_seqs, r_seqs, qshft_seqs, rshft_seqs = [], [], [], []

    for q_seq, r_seq in batch:
        q_seqs.append(FloatTensor(q_seq[:-1]))
        r_seqs.append(FloatTensor(r_seq[:-1]))      # drop the last value
        qshft_seqs.append(FloatTensor(q_seq[1:]))
        rshft_seqs.append(FloatTensor(r_seq[1:]))   # drop the first value

    # Pad every list to the longest sequence in the batch.
    q_seqs, r_seqs, qshft_seqs, rshft_seqs = (
        pad_sequence(seqs, batch_first=True, padding_value=pad_val)
        for seqs in (q_seqs, r_seqs, qshft_seqs, rshft_seqs)
    )

    # A position is usable only if both the current and the next question are
    # real (non-padding) entries.
    mask_seqs = (q_seqs != pad_val) * (qshft_seqs != pad_val)

    # Zero the padded positions so that pad_val never reaches the model.
    return (
        q_seqs * mask_seqs,
        r_seqs * mask_seqs,
        qshft_seqs * mask_seqs,
        rshft_seqs * mask_seqs,
        mask_seqs,
    )


In [None]:
import os

import numpy as np
import torch

from torch.nn import Module, Parameter, Embedding, Linear
from torch.nn.init import kaiming_normal_
from torch.nn.functional import binary_cross_entropy
from sklearn import metrics


class DKVMN(Module):
    '''
        Key-value memory model that scores a question(KC)/response sequence.

        Args:
            num_q: the total number of the questions(KCs) in the given dataset
            dim_s: the dimension of the state vectors in this model
            size_m: the memory size of this model
            window: optional (low, high) pair of question indices; when set,
                training and evaluation only score positions whose question
                index lies within [low, high]
    '''
    def __init__(self, num_q, dim_s, size_m, window=None):
        super().__init__()
        self.num_q = num_q
        self.dim_s = dim_s
        self.size_m = size_m
        self.window = window

        self.k_emb_layer = Embedding(self.num_q, self.dim_s)    # question embedding (query)
        self.Mk = Parameter(torch.Tensor(self.size_m, self.dim_s))  # key memory
        self.Mv0 = Parameter(torch.Tensor(self.size_m, self.dim_s))  # initial value memory

        # torch.Tensor(...) allocates uninitialised storage; fill it here.
        kaiming_normal_(self.Mk)
        kaiming_normal_(self.Mv0)

        # One embedding per (question, response) pair: index q + num_q * r.
        self.v_emb_layer = Embedding(self.num_q * 2, self.dim_s)

        self.f_layer = Linear(self.dim_s * 2, self.dim_s)
        self.p_layer = Linear(self.dim_s, 1)

        self.e_layer = Linear(self.dim_s, self.dim_s)   # erase gate
        self.a_layer = Linear(self.dim_s, self.dim_s)   # add vector

    def forward(self, q, r):
        '''
            Args:
                q: the question(KC) sequence with the size of [batch_size, n]
                r: the response sequence with the size of [batch_size, n]
            Returns:
                p: the knowledge level about q, with the size of
                    [batch_size, n]
                Mv: the value matrices from q, r, with the size of
                    [batch_size, n + 1, size_m, dim_s] (the initial memory
                    followed by the memory after each step)
        '''
        x = q + self.num_q * r

        batch_size = x.shape[0]
        Mvt = self.Mv0.unsqueeze(0).repeat(batch_size, 1, 1)

        Mv = [Mvt]

        k = self.k_emb_layer(q)
        v = self.v_emb_layer(x)

        # Attention weight of every step over the memory slots.
        w = torch.softmax(torch.matmul(k, self.Mk.T), dim=-1)

        # Write Process
        e = torch.sigmoid(self.e_layer(v))
        a = torch.tanh(self.a_layer(v))

        for et, at, wt in zip(
            e.permute(1, 0, 2), a.permute(1, 0, 2), w.permute(1, 0, 2)
        ):
            Mvt = Mvt * (1 - (wt.unsqueeze(-1) * et.unsqueeze(1))) + \
                (wt.unsqueeze(-1) * at.unsqueeze(1))
            Mv.append(Mvt)

        Mv = torch.stack(Mv, dim=1)

        # Read Process: step t reads Mv[:, t], i.e. the memory *before* the
        # write of step t.
        f = torch.tanh(
            self.f_layer(
                torch.cat(
                    [
                        (w.unsqueeze(-1) * Mv[:, :-1]).sum(-2),
                        k
                    ],
                    dim=-1
                )
            )
        )
        # Squeeze only the trailing size-1 feature axis. A bare squeeze() would
        # also drop the batch or time axis when either has size 1, and the
        # result would then mis-broadcast against the [batch_size, n] mask.
        p = torch.sigmoid(self.p_layer(f)).squeeze(-1)

        return p, Mv

    def train_model(
        self, train_loader, test_loader, num_epochs, opt, ckpt_path, targets=None, logging_steps=-1
    ):
        '''
            Args:
                train_loader: the PyTorch DataLoader instance for training
                test_loader: the PyTorch DataLoader instance for test
                num_epochs: the number of epochs
                opt: the optimization to train this model
                ckpt_path: the path to save this model's parameters
                targets: optional question indices forwarded to test_model
                logging_steps: evaluate every this many optimisation steps;
                    -1 means once per pass over train_loader
            Returns:
                aucs: the AUC measured at every evaluation
                loss_means: the mean training loss between evaluations, or,
                    when targets is truthy, the sum of the unit counts
                    reported by test_model
        '''
        aucs = []
        loss_means = []
        unit_counts = []
        max_auc = 0
        steps = 0
        loss_mean = []
        if logging_steps == -1:
            logging_steps = len(train_loader)

        # Resolve the checkpoint name up front so the final rename never
        # touches an unbound variable when no checkpoint was written.
        if self.window:
            name = f"tmp_model_{self.window[0]}_{self.window[1]}.ckpt"
        else:
            name = "tmp_model.ckpt"
        saved = False

        for i in range(1, num_epochs + 1):
            for data in train_loader:
                q, r, _, _, m = data

                self.train()
                if self.window:
                    m = m * (q >= self.window[0]) * (q <= self.window[1])

                p, _ = self(q.long(), r.long())
                p = torch.masked_select(p, m)
                t = torch.masked_select(r, m).float()

                opt.zero_grad()
                loss = binary_cross_entropy(p, t)
                loss.backward()
                opt.step()

                loss_mean.append(loss.detach().cpu().numpy())

                steps += 1
                if steps % logging_steps == 0:
                    auc, val_loss, unit_count = self.test_model(test_loader=test_loader, targets=targets)
                    unit_counts.append(unit_count)
                    loss_mean = np.mean(loss_mean)
                    print(
                        "Epoch: {},   Steps: {},   AUC: {},   Loss Mean: {}"
                        .format(i, steps, auc, loss_mean)
                    )

                    # NOTE(review): wandb is not imported in the visible part
                    # of this file; it must be imported and initialised
                    # elsewhere before training.
                    wandb.log({"auc": auc, "loss_mean": loss_mean, "val_loss": val_loss, "max_auc": max_auc})

                    if auc > max_auc:
                        torch.save(
                            self.state_dict(),
                            os.path.join(ckpt_path, name)
                        )
                        max_auc = auc
                        saved = True
                    aucs.append(auc)
                    loss_means.append(loss_mean)
                    loss_mean = []

        # Promote the best temporary checkpoint to its final name.
        if saved:
            os.rename(os.path.join(ckpt_path, name), os.path.join(ckpt_path, name.replace("tmp_", "")))

        if targets:
            loss_means = sum(unit_counts)
        return aucs, loss_means

    def test_model(self, test_loader, targets=None):
        '''
            Score the first batch of test_loader that can be evaluated.

            Args:
                test_loader: the PyTorch DataLoader instance for test
                targets: optional question indices; when non-empty, only
                    positions whose question is one of them are scored
            Returns:
                (auc, val_loss, unit_count) for the first batch whose metrics
                can be computed; (0, 0, 0) when the mask selects nothing in
                that batch or when no batch could be scored
            Raises:
                Exception: if targets is non-empty and none of them occurs in
                    a batch
        '''
        # Convert once instead of re-wrapping the tensor on every batch.
        target_ids = None
        if targets is not None and len(targets) > 0:
            target_ids = torch.tensor(targets)

        with torch.no_grad():
            for data in test_loader:
                q, r, _, _, m = data
                unit_count = 0

                if self.window:
                    m = m * (q >= self.window[0]) * (q <= self.window[1]) * (q != 0)
                if target_ids is not None:
                    q1 = torch.eq(q.unsqueeze(-1), target_ids).any(-1)
                    unit_count = q1.sum().item()
                    print("unit counts: ", unit_count)
                    if unit_count == 0:
                        raise Exception("no target unit in this batch")
                    m = m * q1

                self.eval()

                p, _ = self(q.long(), r.long())
                p = torch.masked_select(p, m).detach().cpu()
                t = torch.masked_select(r, m).float().detach().cpu()

                if len(p) == 0:
                    return 0, 0, 0

                try:
                    val_loss = binary_cross_entropy(p, t)
                    auc = metrics.roc_auc_score(
                        y_true=t.numpy(), y_score=p.numpy()
                    )
                except (ValueError, RuntimeError):
                    # The metrics cannot be computed for this batch (e.g. only
                    # one class present); try the next batch instead.
                    continue

                return auc, val_loss, unit_count

        # No batch could be scored (empty loader or every batch skipped).
        return 0, 0, 0