In [1]:
# 推荐系统：基于 PyTorch 的神经协同过滤模型（NCF）

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 1. Load the interaction log and keep only the columns this script uses.
df = pd.read_csv('filtered_data.csv')
df = df.loc[:, ['user_id', 'item_id', 'category_id', 'behavior_type']]
# For a quick test run, the frame can be truncated here, e.g. df.head(10000000).

# 2. Purchases are the only behavior treated as a positive sample.
df = df.loc[df['behavior_type'].eq('buy')]

# 3. Collapse repeated (user, item) purchases, then map the raw IDs to
#    contiguous 0-based indices in order of first appearance.
df = df.drop_duplicates(subset=['user_id', 'item_id'])

user2idx = {
    raw_user: position
    for position, raw_user in enumerate(pd.unique(df['user_id']))
}
item2idx = {
    raw_item: position
    for position, raw_item in enumerate(pd.unique(df['item_id']))
}

# Attach the dense indices as the 'user' and 'item' columns.
df = df.assign(
    user=df['user_id'].map(user2idx),
    item=df['item_id'].map(item2idx),
)

# 4. Build the labelled sample table: every observed purchase is a positive
#    (label=1) and, for each positive, randomly drawn (user, item) pairs that
#    were never observed serve as negatives (label=0).
import random

NEGATIVES_PER_POSITIVE = 1
# Same seed value as the train/test split's random_state, so that the whole
# sample table (not just the split) is reproducible between runs.
NEGATIVE_SAMPLING_SEED = 42

interactions = set(zip(df['user'], df['item']))
all_users = list(user2idx.values())
all_items = list(item2idx.values())

# Count the distinct items each user has bought. A user who has bought every
# item has no valid negative, and the rejection loop below would spin forever.
positives_per_user = {}
for u, _pos_item in interactions:
    positives_per_user[u] = positives_per_user.get(u, 0) + 1
num_items = len(all_items)

# A private generator keeps the sampling reproducible without reseeding the
# global `random` module state.
rng = random.Random(NEGATIVE_SAMPLING_SEED)
neg_samples = []
for u, _pos_item in interactions:
    if positives_per_user[u] >= num_items:
        continue  # no unobserved item exists for this user
    for _ in range(NEGATIVES_PER_POSITIVE):
        # Rejection sampling: redraw until the pair is not an observed purchase.
        j = rng.choice(all_items)
        while (u, j) in interactions:
            j = rng.choice(all_items)
        neg_samples.append([u, j, 0])  # label=0

df_pos = df[['user', 'item']].copy()
df_pos['label'] = 1
# An explicit dtype keeps the columns integer even when no negatives were drawn;
# an empty frame would otherwise be object-typed and break tensor conversion.
df_neg = pd.DataFrame(neg_samples, columns=['user', 'item', 'label'], dtype='int64')
df_all = pd.concat([df_pos, df_neg], ignore_index=True)

# 5. 构建Dataset和DataLoader
class InteractionDataset(Dataset):
    """Tensor-backed dataset of (user, item, label) interaction rows.

    The frame passed to the constructor must provide ``user``, ``item`` and
    ``label`` columns. The first two are stored as int64 tensors and the
    label as a float32 tensor.
    """

    def __init__(self, df):
        def column_as_tensor(name, dtype):
            # Convert one column of the frame to a tensor of the given dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column_as_tensor('user', torch.long)
        self.items = column_as_tensor('item', torch.long)
        self.labels = column_as_tensor('label', torch.float32)

    def __len__(self):
        # All three tensors come from the same frame, so any one gives the size.
        return self.users.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.labels[idx])
        return sample

# Hold out 20% of the labelled samples for testing; the fixed random_state
# makes the split itself repeatable.
train_df, test_df = train_test_split(df_all, random_state=42, test_size=0.2)
train_dataset, test_dataset = (
    InteractionDataset(frame) for frame in (train_df, test_df)
)

# Only the training loader shuffles; both serve batches of 1024 samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1024)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024)

# 6. 构建神经协同过滤模型（NCF）
class NCF(nn.Module):
    """Neural collaborative filtering model built from an embedding pair and an MLP.

    User and item indices are each looked up in their own embedding table,
    the two vectors are concatenated, and a three-layer MLP ending in a
    sigmoid turns the result into a score in [0, 1].

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, user, item):
        """Score each (user, item) index pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of sigmoid scores with the same shape as ``user``.
        """
        user_vec = self.user_embedding(user)
        item_vec = self.item_embedding(item)
        x = torch.cat([user_vec, item_vec], dim=-1)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove a batch axis of length 1, producing a 0-d tensor whose
        # shape no longer matches a (1,)-shaped label tensor in the loss.
        return self.mlp(x).squeeze(-1)

# 7. Pick the compute device, then create the model, loss and optimizer.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The embedding tables are sized by the number of distinct users and items.
model = NCF(num_users=len(user2idx), num_items=len(item2idx))
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# 8. Train for five epochs over the training loader.
for epoch_num in range(1, 6):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f'Epoch {epoch_num}')
    for batch in progress:
        # Move the (users, items, labels) tensors of this batch to the device.
        user_batch, item_batch, label_batch = (part.to(device) for part in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(user_batch, item_batch), label_batch)
        batch_loss.backward()
        optimizer.step()

        # The reported figure is the sum of the per-batch loss values.
        running_loss += batch_loss.item()
    print(f"Epoch {epoch_num}, Loss: {running_loss:.4f}")

# 9. 推荐函数
def recommend_top_k(model, user_id_raw, k=10):
    """Return the raw IDs of the ``k`` highest-scoring items for one user.

    Every item in ``item2idx`` is scored against the user in a single
    forward pass. Items the user has already interacted with are not
    filtered out. The model is switched to eval mode and left that way.

    Args:
        model: Callable taking (user index tensor, item index tensor) and
            returning one score per pair.
        user_id_raw: Raw user ID, looked up in ``user2idx``.
        k: Maximum number of items to return.

    Returns:
        List of raw item IDs ordered from highest to lowest score. The list
        is empty when the user is unknown or ``k`` is not positive, and is
        shorter than ``k`` when fewer items exist.
    """
    user_id = user2idx.get(user_id_raw)
    # k <= 0 must short-circuit: scores[-0:] would select every item.
    if user_id is None or k <= 0:
        return []

    # Materialize keys and values once; both follow the dict's insertion
    # order, so position p in one list corresponds to position p in the other.
    item_ids = list(item2idx.keys())
    item_indices = list(item2idx.values())

    user_tensor = torch.full(
        (len(item_indices),), int(user_id), dtype=torch.long, device=device
    )
    item_tensor = torch.tensor(item_indices, dtype=torch.long, device=device)

    model.eval()
    with torch.no_grad():
        # reshape(-1) keeps the scores 1-D even if the model returns a 0-d
        # tensor for a single-item catalogue.
        scores = model(user_tensor, item_tensor).reshape(-1).cpu().numpy()

    # argsort is ascending: take the last k positions and reverse them.
    top_positions = scores.argsort()[-k:][::-1]
    return [item_ids[pos] for pos in top_positions]

# 10. Demo: recommend ten items for the user in the first remaining row.
test_user_raw = df['user_id'].iat[0]
recommended_items = recommend_top_k(model=model, user_id_raw=test_user_raw, k=10)
print(
    f"为用户 {test_user_raw} 推荐的商品：{recommended_items}"
)


Epoch 1: 100%|██████████| 165/165 [00:04<00:00, 36.88it/s]


Epoch 1, Loss: 114.4658


Epoch 2: 100%|██████████| 165/165 [00:04<00:00, 39.15it/s]


Epoch 2, Loss: 114.2007


Epoch 3: 100%|██████████| 165/165 [00:04<00:00, 35.52it/s]


Epoch 3, Loss: 113.6063


Epoch 4: 100%|██████████| 165/165 [00:04<00:00, 36.29it/s]


Epoch 4, Loss: 111.5075


Epoch 5: 100%|██████████| 165/165 [00:04<00:00, 38.67it/s]


Epoch 5, Loss: 106.7102
为用户 100 推荐的商品：[np.int64(1272704), np.int64(1578482), np.int64(984545), np.int64(740947), np.int64(692135), np.int64(3839718), np.int64(1116492), np.int64(4940506), np.int64(593132), np.int64(4695988)]
