In [6]:
import pandas as pd
from recommenders.datasets import movielens
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from recommenders.datasets.python_splitters import python_random_split

# 加载MovieLens数据
data = movielens.load_pandas_df(
    size="100k",
    header=["userID", "itemID", "rating"],
    genres_col="genre"
)

# 将电影类型编码为多热向量
genres_encoder = MultiLabelBinarizer()
data["genre"] = genres_encoder.fit_transform(
    data["genre"].apply(lambda s: s.split("|"))
).tolist()

# 打印电影类型
print("电影类型:", genres_encoder.classes_)
print(data.head())

# 划分训练集和测试集
train, test = python_random_split(data, ratio=0.75, seed=42)
print(f"{len(train)} 条训练样本和 {len(test)} 条测试样本")

100%|████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:02<00:00, 1.94kKB/s]


电影类型: ['Action' 'Adventure' 'Animation' "Children's" 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western' 'unknown']
   userID  itemID  rating                                              genre
0     196     242     3.0  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1     186     302     3.0  [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...
2      22     377     1.0  [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3     244      51     2.0  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...
4     166     346     1.0  [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
75000 条训练样本和 25000 条测试样本


In [25]:
import torch.nn as nn
import torch.nn.functional as F

class WideAndDeepModel(nn.Module):
    def __init__(self, user_dim, item_dim, num_genres, hidden_units):
        super(WideAndDeepModel, self).__init__()
        
        # Wide部分：交叉特征
        self.user_embedding_wide = nn.Embedding(num_embeddings=1000, embedding_dim=1)
        self.item_embedding_wide = nn.Embedding(num_embeddings=2000, embedding_dim=1)
        
        # Deep部分：嵌入层和全连接层
        self.user_embedding_deep = nn.Embedding(num_embeddings=1000, embedding_dim=user_dim)
        self.item_embedding_deep = nn.Embedding(num_embeddings=2000, embedding_dim=item_dim)
        self.genre_embedding = nn.Linear(num_genres, item_dim)
        
        self.deep = nn.Sequential(
            nn.Linear(user_dim + item_dim * 2, hidden_units[0]),
            nn.ReLU(),
            nn.Linear(hidden_units[0], hidden_units[1]),
            nn.ReLU(),
            nn.Linear(hidden_units[1], 1)
        )
    
    def forward(self, user_ids, item_ids, genres):
        # Wide部分
        user_emb_wide = self.user_embedding_wide(user_ids).squeeze()
        item_emb_wide = self.item_embedding_wide(item_ids).squeeze()
        wide_input = user_emb_wide + item_emb_wide
        
        # Deep部分
        user_emb_deep = self.user_embedding_deep(user_ids)
        item_emb_deep = self.item_embedding_deep(item_ids)
        genre_emb = self.genre_embedding(genres)
        
        deep_input = torch.cat([user_emb_deep, item_emb_deep, genre_emb], dim=-1)
        deep_output = self.deep(deep_input)
        
        # 合并Wide和Deep部分
        output = wide_input + deep_output.squeeze()
        return output

In [11]:
from torch.utils.data import DataLoader, Dataset

class MovieLensDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user_id = torch.tensor(row["userID"], dtype=torch.long)
        item_id = torch.tensor(row["itemID"], dtype=torch.long)
        genre = torch.tensor(row["genre"], dtype=torch.float)
        rating = torch.tensor(row["rating"], dtype=torch.float)
        return user_id, item_id, genre, rating

# 创建数据加载器
train_dataset = MovieLensDataset(train)
test_dataset = MovieLensDataset(test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [28]:
import torch.optim as optim

# 超参数
DNN_USER_DIM = 32
DNN_ITEM_DIM = 32
DNN_HIDDEN_UNITS = [64, 32]
DNN_OPTIMIZER_LR = 0.001
LINEAR_OPTIMIZER_LR = 0.01
DNN_DROPOUT = 0.5
DNN_BATCH_NORM = True
STEPS = 20

# 初始化模型
model = WideAndDeepModel(DNN_USER_DIM, DNN_ITEM_DIM, len(genres_encoder.classes_), DNN_HIDDEN_UNITS)
optimizer = optim.Adam(model.parameters(), lr=DNN_OPTIMIZER_LR)
criterion = nn.MSELoss()

# 训练模型
for epoch in range(STEPS):
    model.train()
    for user_ids, item_ids, genres, ratings in train_loader:
        optimizer.zero_grad()
        predictions = model(user_ids, item_ids, genres)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{STEPS}, Loss: {loss.item():.4f}")

Epoch 1/20, Loss: 1.8598
Epoch 2/20, Loss: 0.9601
Epoch 3/20, Loss: 0.8334
Epoch 4/20, Loss: 0.8266
Epoch 5/20, Loss: 0.8306
Epoch 6/20, Loss: 0.8440
Epoch 7/20, Loss: 0.9627
Epoch 8/20, Loss: 0.8207
Epoch 9/20, Loss: 0.9529
Epoch 10/20, Loss: 0.9054
Epoch 11/20, Loss: 0.6285
Epoch 12/20, Loss: 0.6978
Epoch 13/20, Loss: 0.6394
Epoch 14/20, Loss: 1.0055
Epoch 15/20, Loss: 0.7117
Epoch 16/20, Loss: 0.8786
Epoch 17/20, Loss: 0.6169
Epoch 18/20, Loss: 0.7495
Epoch 19/20, Loss: 0.5488
Epoch 20/20, Loss: 0.8523


In [21]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def calculate_rmse(y_true, y_pred):
    """计算 RMSE（均方根误差）"""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_mae(y_true, y_pred):
    """计算 MAE（平均绝对误差）"""
    return mean_absolute_error(y_true, y_pred)

def calculate_ndcg(y_true, y_pred, k=10):
    """计算 NDCG@K（归一化折损累积增益）"""
    def dcg_score(y_true, y_pred, k):
        order = np.argsort(y_pred)[::-1]
        y_true = np.take(y_true, order[:k])
        gain = 2 ** y_true - 1
        discounts = np.log2(np.arange(len(y_true)) + 2)
        return np.sum(gain / discounts)
    
    dcg = dcg_score(y_true, y_pred, k)
    idcg = dcg_score(y_true, y_true, k)
    return dcg / idcg if idcg > 0 else 0

def calculate_precision_at_k(y_true, y_pred, k=10):
    """计算 Precision@K"""
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    return np.sum(y_true) / k


In [29]:
# 评估模型
model.eval()
pred_ratings = []
true_ratings = []

with torch.no_grad():
    for user_ids, item_ids, genres, ratings in test_loader:
        predictions = model(user_ids, item_ids, genres)
        pred_ratings.extend(predictions.tolist())
        true_ratings.extend(ratings.tolist())

# 计算评估指标
rmse_value = calculate_rmse(true_ratings, pred_ratings)
mae_value = calculate_mae(true_ratings, pred_ratings)
ndcg_value = calculate_ndcg(true_ratings, pred_ratings, k=10)
precision_value = calculate_precision_at_k(true_ratings, pred_ratings, k=10)

print("RMSE:", rmse_value)
print("MAE:", mae_value)
print("NDCG@10:", ndcg_value)
print("Precision@10:", precision_value)

RMSE: 1.022181516793539
MAE: 0.8009374121320247
NDCG@10: 0.8706325607206493
Precision@10: 4.6


NameError: name 'recommendations' is not defined