In [3]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split

# 加载MovieLens数据
data = movielens.load_pandas_df(
    size="100k",
    header=["userID", "itemID", "rating"],
    genres_col="genre"
)

# 将电影类型编码为多热向量
genres_encoder = MultiLabelBinarizer()
data["genre"] = genres_encoder.fit_transform(
    data["genre"].apply(lambda s: s.split("|"))
).tolist()

# 划分训练集和测试集
train, test = python_random_split(data, ratio=0.75, seed=42)

100%|█████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [01:44<00:00, 46.0KB/s]


In [19]:
import torch
import torch.nn as nn

class WideAndDeepModel(nn.Module):
    def __init__(self, num_users, num_items, genre_dim, embedding_dim, hidden_units):
        super(WideAndDeepModel, self).__init__()
        
        # Embedding 层
        self.user_embedding = nn.Embedding(num_users, embedding_dim)  # 用户 ID 的 embedding
        self.item_embedding = nn.Embedding(num_items, embedding_dim)  # 物品 ID 的 embedding
        
        # Wide 部分
        self.wide = nn.Linear(embedding_dim * 2, 1)  # Wide 部分的输入为用户和物品的 embedding 拼接
        
        # Deep 部分
        self.deep = nn.Sequential(
            nn.Linear(embedding_dim * 2 + genre_dim, hidden_units[0]),  # Deep 部分的输入包括用户 embedding，物品 embedding 和类型特征
            nn.ReLU(),
            nn.Linear(hidden_units[0], hidden_units[1]),
            nn.ReLU(),
            nn.Linear(hidden_units[1], 1)
        )
    
    def forward(self, user_ids, item_ids, genre_features):
        # 获取用户和物品的 embedding
        user_embedded = self.user_embedding(user_ids)  # 输出形状：[batch_size, embedding_dim]
        item_embedded = self.item_embedding(item_ids)  # 输出形状：[batch_size, embedding_dim]
        
        # Wide 部分
        wide_input = torch.cat([user_embedded, item_embedded], dim=1)  # 拼接用户和物品的 embedding
        wide_output = self.wide(wide_input)
        
        # Deep 部分
        deep_input = torch.cat([user_embedded, item_embedded, genre_features], dim=1)  # 拼接所有特征
        deep_output = self.deep(deep_input)
        
        # 合并 Wide 和 Deep 部分
        return wide_output + deep_output

In [20]:
from torch.utils.data import DataLoader, Dataset

class MovieLensDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user_id = torch.tensor(row["userID"], dtype=torch.long)  # 用户 ID（索引）
        item_id = torch.tensor(row["itemID"], dtype=torch.long)  # 物品 ID（索引）
        genre_features = torch.tensor(row["genre"], dtype=torch.float32)  # 电影类型特征
        rating = torch.tensor(row["rating"], dtype=torch.float32)  # 评分
        return user_id, item_id, genre_features, rating

train_dataset = MovieLensDataset(train)
test_dataset = MovieLensDataset(test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [21]:
# 初始化模型
num_users = data["userID"].nunique()  # 用户总数
num_items = data["itemID"].nunique()  # 物品（电影）总数
genre_dim = 19  # 类型特征的维度
embedding_dim = 8  # Embedding 向量的维度
hidden_units = [64, 32]  # Deep 部分的隐藏层单元数

model = WideAndDeepModel(
    num_users=num_users,
    num_items=num_items,
    genre_dim=genre_dim,
    embedding_dim=embedding_dim,
    hidden_units=hidden_units
)
# 训练模型

num_epochs = 20
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    for user_ids, item_ids, genre_features, ratings in train_loader:
        optimizer.zero_grad()
        outputs = model(user_ids, item_ids, genre_features)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)


IndexError: index out of range in self

In [13]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def calculate_rmse(y_true, y_pred):
    """计算 RMSE（均方根误差）"""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_mae(y_true, y_pred):
    """计算 MAE（平均绝对误差）"""
    return mean_absolute_error(y_true, y_pred)

def calculate_ndcg(y_true, y_pred, k=10):
    """计算 NDCG@K（归一化折损累积增益）"""
    def dcg_score(y_true, y_pred, k):
        order = np.argsort(y_pred)[::-1]
        y_true = np.take(y_true, order[:k])
        gain = 2 ** y_true - 1
        discounts = np.log2(np.arange(len(y_true)) + 2)
        return np.sum(gain / discounts)
    
    dcg = dcg_score(y_true, y_pred, k)
    idcg = dcg_score(y_true, y_true, k)
    return dcg / idcg if idcg > 0 else 0

def calculate_precision_at_k(y_true, y_pred, k=10):
    """计算 Precision@K"""
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    return np.sum(y_true) / k

In [17]:

# 测试集评估
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for user_features, item_features, genre_features, ratings in test_loader:
        outputs = model(user_features, item_features, genre_features)
        y_true.extend(ratings.squeeze().tolist())
        y_pred.extend(outputs.squeeze().tolist())

y_true = np.array(y_true)
y_pred = np.array(y_pred)

rmse = calculate_rmse(y_true, y_pred)
mae = calculate_mae(y_true, y_pred)
ndcg = calculate_ndcg(y_true, y_pred, k=10)
precision_at_k = calculate_precision_at_k(y_true, y_pred, k=10)

print(f"测试集上的 RMSE: {rmse:.4f}")
print(f"测试集上的 MAE: {mae:.4f}")
print(f"测试集上的 NDCG@10: {ndcg:.4f}")
print(f"测试集上的 Precision@10: {precision_at_k:.4f}")

# 推荐电影
recommendations = recommend_movies(196, top_k=10)
print("为用户196推荐的电影:")
for movie, rating in recommendations:
    print(f"电影ID: {movie}, 预测评分: {rating:.2f}")

测试集上的 RMSE: 1.1008
测试集上的 MAE: 0.9041
测试集上的 NDCG@10: 0.4188
测试集上的 Precision@10: 3.6000
为用户196推荐的电影:
电影ID: 214, 预测评分: 4.19
电影ID: 51, 预测评分: 4.15
电影ID: 286, 预测评分: 4.12
电影ID: 483, 预测评分: 4.11
电影ID: 299, 预测评分: 4.10
电影ID: 89, 预测评分: 4.08
电影ID: 484, 预测评分: 4.08
电影ID: 133, 预测评分: 4.07
电影ID: 302, 预测评分: 4.07
电影ID: 549, 预测评分: 4.06


In [9]:
def recommend_movies(user_id, top_k=10):
    # 获取所有电影ID
    all_movies = data["itemID"].unique()
    
    # 构造用户-电影对
    user_features = torch.tensor([[user_id]], dtype=torch.float32).repeat(len(all_movies), 1)
    item_features = torch.tensor(all_movies, dtype=torch.float32).unsqueeze(1)
    
    # 获取所有电影的 genre 特征
    genre_features = torch.tensor(
        data[data["itemID"].isin(all_movies)]["genre"].tolist(), 
        dtype=torch.float32
    )
    
    # 确保 genre_features 的形状与 user_features 和 item_features 匹配
    if genre_features.shape[0] != len(all_movies):
        # 如果 genre_features 的长度不匹配，可能需要重新获取 genre 特征
        genre_features = torch.tensor(
            [data[data["itemID"] == movie]["genre"].values[0] for movie in all_movies],
            dtype=torch.float32
        )
    
    # 使用模型预测评分
    with torch.no_grad():
        predictions = model(user_features, item_features, genre_features)
    
    # 将预测结果与电影ID配对并排序
    movie_ratings = list(zip(all_movies, predictions.squeeze().tolist()))
    top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:top_k]
    
    return top_movies
# 为用户196推荐10部电影
recommendations = recommend_movies(196, top_k=10)
print("为用户196推荐的电影:")
for movie, rating in recommendations:
    print(f"电影ID: {movie}, 预测评分: {rating:.2f}")

为用户196推荐的电影:
电影ID: 1366, 预测评分: 4.96
电影ID: 1065, 预测评分: 4.88
电影ID: 1313, 预测评分: 4.82
电影ID: 1294, 预测评分: 4.78
电影ID: 1232, 预测评分: 4.78
电影ID: 1307, 预测评分: 4.77
电影ID: 1201, 预测评分: 4.76
电影ID: 1318, 预测评分: 4.75
电影ID: 1184, 预测评分: 4.75
电影ID: 1212, 预测评分: 4.74
