In [1]:
import csv
import numpy as np

# Example dimensions: number of records and length of each per-record vector.
batch_size = 3
K = 4

# Simulated data. A seeded, local generator keeps the demo reproducible
# without touching NumPy's global random state.
rng = np.random.default_rng(0)
student_id = np.arange(1, batch_size + 1)  # [1, 2, 3]
item_id = np.arange(101, 101 + batch_size)  # [101, 102, 103]
Q_matrix = rng.random((batch_size, K))
t_e = rng.random((batch_size, K))
t_s = rng.random((batch_size, K))
s_e = rng.random((batch_size, K))
diff = rng.random((batch_size, K))
evaluate = rng.random((batch_size, K))
correct = rng.integers(0, 2, size=batch_size)  # random 0/1 labels

# Destination CSV file and its column layout.
output_file = "vectors_output.csv"
header = ["student_id", "item_id", "Q_matrix", "t_e", "t_s", "s_e", "diff", "evaluate", "correct"]

# Vector-valued columns, in the same order as they appear in `header`.
vector_columns = (Q_matrix, t_e, t_s, s_e, diff, evaluate)

with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One CSV row per record; every vector is stored as a single "[...]" cell.
    for i in range(batch_size):
        # ndarray.tolist() yields plain Python floats. list(ndarray) would keep
        # NumPy scalars, which NumPy >= 2 renders as "np.float64(...)" inside
        # the cell and makes the file impossible to parse back.
        vector_cells = [column[i].tolist() for column in vector_columns]
        writer.writerow([int(student_id[i]), int(item_id[i]), *vector_cells, int(correct[i])])

print(f"数据已成功保存到 {output_file}")


数据已成功保存到 vectors_output.csv


In [1]:
# Load the data from files
import pandas as pd

import ast

# Interaction logs for the three splits (columns shown below: user_id, item_id, score).
train_data = pd.read_csv("../../data/a0910/train.csv")
valid_data = pd.read_csv("../../data/a0910/valid.csv")
test_data = pd.read_csv("../../data/a0910/test.csv")

# Item metadata: each row carries an item id and a stringified collection of knowledge codes.
df_item = pd.read_csv("../../data/a0910/item.csv")

# item2knowledge: item id -> de-duplicated list of knowledge codes.
# knowledge_set: every knowledge code seen across all items.
item2knowledge = {}
knowledge_set = set()
for item_key, raw_codes in zip(df_item['item_id'], df_item['knowledge_code']):
    # literal_eval only accepts Python literals, so file content cannot execute
    # arbitrary code the way eval() would.
    codes = list(set(ast.literal_eval(raw_codes)))
    item2knowledge[item_key] = codes
    knowledge_set.update(codes)

train_data.head(5)

Unnamed: 0,user_id,item_id,score
0,1615,12977,1
1,782,13124,0
2,1084,16475,0
3,593,8690,0
4,127,14225,1


In [2]:
# Number of interaction rows in the train / valid / test splits, in that order.
tuple(len(split) for split in (train_data, valid_data, test_data))

(186049, 25606, 55760)

In [3]:
# Get basic data info for model initialization
import numpy as np
# Size every id space from ALL splits, not just train: an id that occurs only in
# valid/test would otherwise index past the end of the model's embedding tables.
# The maximum id is used as the count because `transform` shifts ids down by one.
splits = (train_data, valid_data, test_data)
user_n = int(max(split['user_id'].max() for split in splits))
item_n = int(max(split['item_id'].max() for split in splits))
knowledge_n = int(max(knowledge_set))

user_n, item_n, knowledge_n

(4128, 17746, 123)

In [4]:
# Transform data to torch Dataloader (i.e., batchify)
# batch_size is set to 32

import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32
def transform(user, item, item2knowledge, score, batch_size, shuffle=True, n_knowledge=None):
    """Pack interaction records into a batched torch ``DataLoader``.

    Args:
        user: sequence of user ids (list, ndarray or pandas Series); shifted
            down by one to become 0-based indices.
        item: sequence of item ids aligned with ``user``; shifted down by one
            in the same way.
        item2knowledge: mapping from raw item id to an iterable of knowledge
            codes; code ``c`` sets column ``c - 1`` of the multi-hot vector.
        score: sequence of response scores aligned with ``user``.
        batch_size: number of records per batch.
        shuffle: whether the loader reshuffles records each epoch. Defaults to
            ``True``, which is what this function always did before.
        n_knowledge: width of the multi-hot knowledge vector. Defaults to the
            notebook-level ``knowledge_n``.

    Returns:
        DataLoader yielding ``(user_idx, item_idx, knowledge_multi_hot, score)``
        batches.
    """
    if n_knowledge is None:
        n_knowledge = knowledge_n  # computed in an earlier notebook cell

    # Work on positional arrays: indexing a pandas Series with `series[idx]` is
    # label-based and breaks (or silently misaligns rows) when the index is not
    # the default 0..n-1 range, e.g. after filtering or sampling a frame.
    user_ids = np.asarray(user)
    item_ids = np.asarray(item)
    scores = np.asarray(score)

    knowledge_emb = torch.zeros((len(item_ids), int(n_knowledge)))
    for row, raw_item in enumerate(item_ids.tolist()):
        # An explicit long dtype keeps the index valid even for an empty code list.
        codes = torch.as_tensor(list(item2knowledge[raw_item]), dtype=torch.long)
        knowledge_emb[row, codes - 1] = 1.0

    data_set = TensorDataset(
        torch.tensor(user_ids, dtype=torch.int64) - 1,  # (1, user_n) to (0, user_n-1)
        torch.tensor(item_ids, dtype=torch.int64) - 1,  # (1, item_n) to (0, item_n-1)
        knowledge_emb,
        torch.tensor(scores, dtype=torch.float32)
    )
    return DataLoader(data_set, batch_size=batch_size, shuffle=shuffle)


def _to_loader(frame):
    """Wrap one split's user/item/score columns in a DataLoader via `transform`."""
    return transform(frame["user_id"], frame["item_id"], item2knowledge, frame["score"], batch_size)


# Same order as before: train first, then valid, then test.
train_set, valid_set, test_set = map(_to_loader, (train_data, valid_data, test_data))

train_set, valid_set, test_set

(<torch.utils.data.dataloader.DataLoader at 0x1e80529fc40>,
 <torch.utils.data.dataloader.DataLoader at 0x1e80784faf0>,
 <torch.utils.data.dataloader.DataLoader at 0x1e80784f8b0>)

In [5]:
import logging
# Lower the root logger threshold to INFO so INFO-level records are emitted.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [8]:
from EduCDM import KaNCD

In [7]:
# Use the GPU when one is visible, otherwise fall back to CPU so the cell still
# runs on machines without CUDA (assumes KaNCD accepts "cpu" as a device — confirm).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the model from the dataset dimensions computed earlier, train it for
# three epochs with validation after each one, then persist the parameters.
cdm = KaNCD(exer_n=item_n, student_n=user_n, knowledge_n=knowledge_n, mf_type='gmf', dim=20)
cdm.train(train_set, valid_set, epoch_n=3, device=device, lr=0.002)
cdm.save("kancd.snapshot")

INFO:root:traing... (lr=0.002)
Epoch 0: 100%|████████████████████████████████████████████████████████████████████| 5815/5815 [00:47<00:00, 123.17it/s]
INFO:root:[Epoch 0] average loss: 0.569911
INFO:root:eval ... 


[Epoch 0] average loss: 0.569911


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:02<00:00, 270.70it/s]
INFO:root:[Epoch 0] auc: 0.763524, acc: 0.734476


[Epoch 0] auc: 0.763524, acc: 0.734476


Epoch 1: 100%|████████████████████████████████████████████████████████████████████| 5815/5815 [00:49<00:00, 117.69it/s]
INFO:root:[Epoch 1] average loss: 0.492857
INFO:root:eval ... 


[Epoch 1] average loss: 0.492857


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:02<00:00, 353.55it/s]
INFO:root:[Epoch 1] auc: 0.766779, acc: 0.734984


[Epoch 1] auc: 0.766779, acc: 0.734984


Epoch 2: 100%|████████████████████████████████████████████████████████████████████| 5815/5815 [00:46<00:00, 125.03it/s]
INFO:root:[Epoch 2] average loss: 0.463993
INFO:root:eval ... 


[Epoch 2] average loss: 0.463993


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:02<00:00, 314.35it/s]
INFO:root:[Epoch 2] auc: 0.766093, acc: 0.731352
INFO:root:save parameters to kancd.snapshot


[Epoch 2] auc: 0.766093, acc: 0.731352


In [9]:
# Use the GPU when one is visible, otherwise fall back to CPU so evaluation
# does not fail on machines without CUDA (assumes KaNCD accepts "cpu" — confirm).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Restore the saved parameters and report AUC / accuracy on the test loader.
cdm.load("kancd.snapshot")
auc, accuracy = cdm.eval(test_set, device=device)
print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))

INFO:root:load parameters from kancd.snapshot
INFO:root:eval ... 
Evaluating: 100%|█████████████████████████████████████████████████████████████████| 1743/1743 [00:05<00:00, 334.30it/s]

auc: 0.768029, accuracy: 0.731923



