In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 生成文本表示

In [4]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').cuda()


In [5]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('..\..\data\selected_book_top_1200_data_tag.csv')

tag_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # 将标签列表转换为字符串
        tags_str = " ".join(rows.Tags)
        # 使用BERT中文模型对标签进行编码
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
        outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(),
                        inputs.attention_mask.cuda())
        # 使用最后一层的平均隐藏状态作为标签的向量表示
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Book] = tag_embedding


1200it [02:57,  6.74it/s]


In [6]:
import pickle

# 将映射表存储为二进制文件
with open('../../data/tag_embedding_dict.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)


In [7]:
# 从二进制文件中读取映射表
with open('../../data/tag_embedding_dict.pkl', 'rb') as f:
    tag_embedding_dict = pickle.load(f)

In [8]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('../../data\\book_score.csv')
# 新加的，把评分为0的数据删了
loaded_data.drop(loaded_data[loaded_data["Rate"] == 0].index, inplace=True)
# 显示加载的数据
print(loaded_data)

           User     Book  Rate                       Time          Tag
45      1398478  2348372     4  2009-11-10T18:42:00+08:00          NaN
164     1779492  1851385     3  2011-03-13T12:37:12+08:00  奥尔罕·帕慕克,土耳其
165     1779492  3266345     3  2010-10-20T19:31:20+08:00      葛瑞格·摩顿森
166     1779492  1001885     3  2010-10-20T19:29:16+08:00     林达,法国,旅行
168     1779492  1424741     3  2010-10-04T01:24:33+08:00      卡森·麦卡勒斯
...         ...      ...   ...                        ...          ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00   张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00   金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00      彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00    小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00           爱情

[403807 rows x 5 columns]


In [9]:
class BookRatingDataset(Dataset):

    def __init__(self, data, user_to_idx, book_to_idx, tag_embedding_dict):
        self.data = data
        self.user_to_idx = user_to_idx
        self.book_to_idx = book_to_idx
        self.tag_embedding_dict = tag_embedding_dict

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_to_idx[row['User']]
        book = self.book_to_idx[row['Book']]
        rating = row['Rate'].astype('float32')
        text_embedding = self.tag_embedding_dict.get(row['Book'])
        return user, book, rating, text_embedding


class MatrixFactorization(nn.Module):

    def __init__(self, num_users, num_books, embedding_dim, hidden_state):
        super(MatrixFactorization, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings = nn.Embedding(num_books, embedding_dim)
        self.linear_embedding = nn.Linear(hidden_state, embedding_dim)
        self.output = nn.Linear(embedding_dim, 6)

    def forward(self, user, book, tag_embedding):
        user_embedding = self.user_embeddings(user)
        book_embedding = self.book_embeddings(book)
        tag_embedding_proj = self.linear_embedding(tag_embedding)
        book_intergrate = book_embedding + tag_embedding_proj
        return (user_embedding * book_intergrate).sum(dim=1)


def create_id_mapping(id_list):
    # 从ID列表中删除重复项并创建一个排序的列表
    unique_ids = sorted(set(id_list))

    # 创建将原始ID映射到连续索引的字典
    id_to_idx = {id: idx for idx, id in enumerate(unique_ids)}

    # 创建将连续索引映射回原始ID的字典
    idx_to_id = {idx: id for id, idx in id_to_idx.items()}

    return id_to_idx, idx_to_id


# 按用户分组计算NDCG
def compute_ndcg(group):
    true_ratings = group['true'].tolist()
    pred_ratings = group['pred'].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k=50)

In [10]:
user_ids = loaded_data['User'].unique()
book_ids = loaded_data['Book'].unique()

user_to_idx, idx_to_user = create_id_mapping(user_ids)
book_to_idx, idx_to_book = create_id_mapping(book_ids)

# 划分训练集和测试集
train_data, test_data = train_test_split(loaded_data,
                                         test_size=0.5,
                                         random_state=42)

# 创建训练集和测试集的数据集对象
train_dataset = BookRatingDataset(train_data, user_to_idx, book_to_idx,
                                  tag_embedding_dict)
test_dataset = BookRatingDataset(test_data, user_to_idx, book_to_idx,
                                 tag_embedding_dict)

# 创建训练集和测试集的数据加载器
train_dataloader = DataLoader(train_dataset,
                              batch_size=4096,
                              shuffle=True,
                              drop_last=True)
test_dataloader = DataLoader(test_dataset,
                             batch_size=4096,
                             shuffle=False,
                             drop_last=True)

num_users = loaded_data['User'].nunique()
num_books = loaded_data['Book'].nunique()
embedding_dim, hidden_state = 32, 768

model = MatrixFactorization(num_users, num_books, embedding_dim,
                            hidden_state).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### 训练

In [13]:
num_epochs = 20
lambda_u, lambda_b = 0.001, 0.001

for epoch in range(num_epochs):
    model.train()
    total_loss_train, total_loss_test = 0.0, 0.0

    for idx, (user_ids, book_ids, ratings,
              tag_embedding) in tqdm(enumerate(train_dataloader)):
        # 使用user_ids, book_ids, ratings进行训练

        optimizer.zero_grad()

        predictions = model(user_ids.to(device), book_ids.to(device),
                            tag_embedding.squeeze(1).to(device))
        loss = criterion(
            predictions,
            ratings.to(device)) + lambda_u * model.user_embeddings.weight.norm(
                2) + lambda_b * model.book_embeddings.weight.norm(2)

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()

        # if idx % 100 == 0:
        #     print(f'Step {idx}, Loss: {loss.item()}')

    output_loss_train = total_loss_train / (idx + 1)

    results = []
    model.eval()

    with torch.no_grad():
        for idx, (user_ids, item_ids, true_ratings,
                  tag_embedding) in enumerate(test_dataloader):
            pred_ratings = model(user_ids.to(device), item_ids.to(device),
                                 tag_embedding.squeeze(1).to(device))

            loss = criterion(pred_ratings, ratings.to(device))
            total_loss_test += loss.item()

            # 将结果转换为 numpy arrays
            user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
            pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
            true_ratings_np = true_ratings.numpy().reshape(-1, 1)

            # 将这三个 arrays 合并成一个 2D array
            batch_results = np.column_stack(
                (user_ids_np, pred_ratings_np, true_ratings_np))

            # 将这个 2D array 添加到 results
            results.append(batch_results)

        # 将结果的 list 转换为一个大的 numpy array
        results = np.vstack(results)

        # 将结果转换为DataFrame
        results_df = pd.DataFrame(results, columns=['user', 'pred', 'true'])
        results_df['user'] = results_df['user'].astype(int)

        # ndcg_scores = results_df.groupby('user').apply(compute_ndcg)
        ndcg_scores = results_df[results_df.groupby('user')['user'].transform(
            'count') > 1].groupby('user').apply(compute_ndcg)

        # 计算平均NDCG
        avg_ndcg = ndcg_scores.mean()
        print(
            f'Epoch {epoch}, Train loss: {output_loss_train}, Test loss:, {total_loss_test / (idx + 1)}, Average NDCG: {avg_ndcg}'
        )

49it [00:19,  2.53it/s]


Epoch 0, Train loss: 26.369487295345383, Test loss:, 23.221381518305563, Average NDCG: 0.9233733540324441


49it [00:19,  2.55it/s]


Epoch 1, Train loss: 15.285121723097198, Test loss:, 15.06255087560537, Average NDCG: 0.9244758813142464


49it [00:27,  1.76it/s]


Epoch 2, Train loss: 9.815733092171806, Test loss:, 11.487129814770757, Average NDCG: 0.9253648917514238


49it [00:31,  1.54it/s]


Epoch 3, Train loss: 7.198552462519432, Test loss:, 9.189735685076032, Average NDCG: 0.926020061911951


49it [00:23,  2.09it/s]


Epoch 4, Train loss: 5.546880663657675, Test loss:, 7.6025031537425765, Average NDCG: 0.9265247357956407


49it [00:21,  2.28it/s]


Epoch 5, Train loss: 4.435245513916016, Test loss:, 6.437704592334981, Average NDCG: 0.9273020069476213


49it [00:26,  1.84it/s]


Epoch 6, Train loss: 3.6252392554769712, Test loss:, 5.4880097934177945, Average NDCG: 0.9280192584368241


49it [00:26,  1.83it/s]


Epoch 7, Train loss: 3.0265131045360953, Test loss:, 4.842234796407271, Average NDCG: 0.928708680414276


49it [00:23,  2.05it/s]


Epoch 8, Train loss: 2.58396552046951, Test loss:, 4.325096548820029, Average NDCG: 0.9294034492018324


49it [00:20,  2.43it/s]


Epoch 9, Train loss: 2.2604465922530816, Test loss:, 3.850972136672662, Average NDCG: 0.9303971251984335


49it [00:19,  2.49it/s]


Epoch 10, Train loss: 1.9867549648090286, Test loss:, 3.4742989394129538, Average NDCG: 0.9310176959895201


49it [00:21,  2.27it/s]


Epoch 11, Train loss: 1.7684075540425825, Test loss:, 3.1998845460463543, Average NDCG: 0.9317327787522168


49it [00:19,  2.46it/s]


Epoch 12, Train loss: 1.5873881043220053, Test loss:, 2.881617229811999, Average NDCG: 0.9322232782857586


49it [00:19,  2.46it/s]


Epoch 13, Train loss: 1.4745973810857655, Test loss:, 2.7427254890909, Average NDCG: 0.9332083480438842


49it [00:19,  2.53it/s]


Epoch 14, Train loss: 1.367330782267512, Test loss:, 2.5423279100534866, Average NDCG: 0.9339863136107134


49it [00:32,  1.50it/s]


Epoch 15, Train loss: 1.2896408548160476, Test loss:, 2.420883689607893, Average NDCG: 0.9344482042343306


49it [00:35,  1.37it/s]


Epoch 16, Train loss: 1.2100390171518132, Test loss:, 2.261963275014138, Average NDCG: 0.9352152936389899


49it [00:38,  1.28it/s]


Epoch 17, Train loss: 1.164652240519621, Test loss:, 2.1611609361609636, Average NDCG: 0.935721823460319


49it [00:35,  1.40it/s]


Epoch 18, Train loss: 1.1123629102901536, Test loss:, 2.0371815428441886, Average NDCG: 0.9365047217222056


49it [00:35,  1.37it/s]


Epoch 19, Train loss: 1.0773406174718116, Test loss:, 2.0027465163444984, Average NDCG: 0.9374121103798688
