In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from itertools import islice
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Custom Dataset over a tab-separated interaction file
class MovieLensDataset(Dataset):
    """Row-addressable wrapper around a tab-separated interaction file.

    The file is parsed with pandas and its columns are labelled
    ``user_id``, ``movie_id``, ``index`` and ``timestamp``. The parsed
    DataFrame is kept on ``self.data``.
    """

    def __init__(self, path):
        """Parse the file at ``path`` into ``self.data``."""
        column_names = ['user_id', 'movie_id', 'index', 'timestamp']
        self.data = pd.read_csv(path, sep='\t', names=column_names)

    def __len__(self):
        """Return the number of rows held in ``self.data``."""
        frame = self.data
        return len(frame)

    def __getitem__(self, idx):
        """Return ``(user_id, movie_id, index, timestamp)`` for row ``idx``.

        Values are looked up by position: column 0 is the user id, column 1
        the movie id, column 2 the order of the interaction within the
        session (per the original author's note), column 3 the timestamp.
        """
        frame = self.data
        # Columns are read left to right, one positional lookup per field.
        return tuple(frame.iloc[idx, position] for position in (0, 1, 2, 3))


# Read the training data file into a MovieLensDataset
train_dataset = MovieLensDataset(path='./data/ml-100k/u.train_data')

In [3]:
# Build per-user sessions from the training data
user_num = 943
item_num = 1682

# y_ui[u][i] is set to 1 for every (user, item) pair seen in the training data.
# Both axes get one extra slot, presumably so 1-based ids index directly -- TODO confirm.
y_ui = np.zeros((user_num + 1, item_num + 1), int)
# Maps user id -> list of [item, timestamp] pairs.
train_session = {}

for user, item, index, timestamp in tqdm(train_dataset):
    y_ui[user, item] = 1
    train_session.setdefault(user, []).append([item, timestamp])

# Order every user's session by ascending timestamp
for user, session in train_session.items():
    session.sort(key=lambda entry: entry[1])

100%|██████████| 99057/99057 [00:04<00:00, 20573.94it/s]


In [4]:
# sessions[u] is the list of item ids from train_session[u], in stored order.
# Slot 0 is never filled by the loop below and stays an empty list.
sessions = [[] for _ in range(user_num + 1)]
for u in range(1, user_num + 1):
    # .get() keeps the pre-initialised empty list for a user without any
    # training rows instead of raising KeyError.
    sessions[u] = [entry[0] for entry in train_session.get(u, [])]

# NOTE(review): the training file is parsed with sep='\t' while this file uses
# the default comma separator -- confirm the test file really is comma-separated.
test_data = pd.read_csv('./data/ml-100k/u.test_data', header=None)

# labels[u] is the item id from column 1 of the test row whose column 0 is u.
# Users with no test row keep the initial value 0; if a user appears more
# than once, the last row wins.
labels = np.zeros(user_num + 1, int)
# Iterating the two columns directly keeps each column's own dtype;
# iterrows() would upcast a whole row to one common dtype.
for user, item in zip(test_data[0], test_data[1]):
    labels[user] = item


# Look up a user's session
def get_session(u):
    """Return the entry stored for user ``u`` in the module-level ``sessions``.

    The stored object itself is returned; no copy is made.
    """
    user_session = sessions[u]
    return user_session


# Look up a user's label
def get_label(u):
    """Return the entry stored for user ``u`` in the module-level ``labels``."""
    user_label = labels[u]
    return user_label

In [5]:
# All items user u has rated
def I_u(u):
    """Return the column indices of row ``u`` of ``y_ui`` whose value is 1."""
    rated_mask = y_ui[u] == 1
    return np.nonzero(rated_mask)[0]


# All items user u has not rated
def I_not_u(u):
    """Return the column indices of row ``u`` of ``y_ui`` whose value is 0.

    Every column of the row is considered, so index 0 is part of the result
    whenever its entry is 0.
    """
    unrated_mask = y_ui[u] == 0
    return np.nonzero(unrated_mask)[0]

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The direct formula evaluates ``exp(-x)``, which overflows to ``inf`` (with
    a RuntimeWarning) for large negative ``x``. This version only evaluates
    ``exp(-|x|)`` and picks the algebraically equivalent branch by sign:

        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))

    Args:
        x: a scalar or array-like of numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    values = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(values))
    stable = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a NumPy scalar and leaves
    # arrays with one or more dimensions unchanged.
    return stable[()]

In [6]:
# Evaluation metrics
def Pre(predict_rule, k):
    """Mean precision@k over users ``1..user_num``.

    For each user the top-``k`` list from ``get_recommended_items`` is
    intersected with the single target item from ``get_label``; the hit count
    divided by ``k`` is averaged over all users.
    """
    accumulated = 0.
    for uid in range(1, user_num + 1):
        top_k = get_recommended_items(uid, k, predict_rule)
        # The item the user is expected to be interested in
        targets = [get_label(uid)]
        hit_count = np.intersect1d(top_k, targets).size
        accumulated += hit_count / k
    return accumulated / user_num


def Rec(predict_rule, k):
    """Mean recall@k over users ``1..user_num``.

    For each user the top-``k`` list from ``get_recommended_items`` is
    intersected with the target list built from ``get_label``; the hit count
    divided by the number of targets (always one here) is averaged over all
    users.
    """
    accumulated = 0.
    for uid in range(1, user_num + 1):
        top_k = get_recommended_items(uid, k, predict_rule)
        # The item the user is expected to be interested in
        targets = [get_label(uid)]
        hit_count = np.intersect1d(top_k, targets).size
        accumulated += hit_count / len(targets)
    return accumulated / user_num


def NDCG(predict_rule, k):
    """Mean NDCG@k over users ``1..user_num``.

    For each user, DCG adds ``1 / log(rank + 2)`` for every recommended item
    (0-based ``rank``) that is in the user's target list, and IDCG is the best
    achievable DCG for that many targets. The per-user ratio ``DCG / IDCG`` is
    averaged over all users. The natural logarithm is used; the base cancels
    in the ratio.

    Args:
        predict_rule: scoring callable forwarded to ``get_recommended_items``.
        k: cut-off length of the recommendation list.

    Returns:
        The average NDCG@k as a float.
    """
    total = 0.
    for user in range(1, user_num + 1):
        recommended_items = get_recommended_items(user, k, predict_rule)
        # The item the user is expected to be interested in
        prefer_items = [get_label(user)]
        # Walk the items actually returned (at most k) rather than indexing
        # range(k), so a list shorter than k does not raise IndexError.
        dcg = 0.0
        for rank, item in enumerate(recommended_items[:k]):
            if item in prefer_items:
                dcg += 1.0 / np.log(rank + 2)
        idcg = sum(1.0 / np.log(i + 2) for i in range(min(len(prefer_items), k)))
        # idcg is 0 only when k is 0; that user then contributes 0.
        total += dcg / idcg if idcg > 0 else 0.0
    return total / user_num

In [7]:
# Top-k recommendation list for user u
def get_recommended_items(u, k, predict_rule):
    """Return up to ``k`` item ids for user ``u``, best score first.

    Every item id in ``1..item_num`` that ``I_u(u)`` does not report as rated
    is scored with ``predict_rule(u, item)``, in ascending id order. Items are
    then ranked by descending score; ties keep ascending id order.

    Args:
        u: user id.
        k: maximum number of items to return.
        predict_rule: callable ``(user, item) -> score``.

    Returns:
        A list of at most ``k`` item ids.
    """
    # A set gives constant-time membership tests; `in` against the NumPy
    # array would rescan the whole array for every candidate item.
    seen = set(I_u(u).tolist())
    scored = [
        (item, predict_rule(u, item))
        for item in range(1, item_num + 1)
        if item not in seen
    ]
    # list.sort is stable even with reverse=True, so tied scores stay in
    # ascending item-id order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in islice(scored, k)]

In [8]:
# Evaluate a prediction rule
def predict(predict_rule, k):
    """Print precision, recall and NDCG at cut-off ``k`` for ``predict_rule``.

    Each metric is computed and printed in turn, formatted to four decimals.
    Nothing is returned.
    """
    precision_at_k = Pre(predict_rule, k)
    print(f"Pre: {precision_at_k:.4f}")
    recall_at_k = Rec(predict_rule, k)
    print(f"Rec: {recall_at_k:.4f}")
    ndcg_at_k = NDCG(predict_rule, k)
    print(f"NDCG: {ndcg_at_k:.4f}")