In [90]:
import numpy as np
import pandas as pd

n_samples = 100000
user_ids = np.random.randint(1, 10001, size=n_samples)
movie_ids = np.random.randint(1, 1001, size=n_samples)
ratings = np.random.randint(1, 6, size=n_samples)

df = pd.DataFrame({
    'UserID': user_ids,
    'MovieID': movie_ids,
    'Rating': ratings
})

df.to_csv('data.csv', index=False, header=True)

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('data.csv')
X = df[['UserID', 'MovieID']]
y = df['Rating']

# UserID和MovieID进行编码
le_u = LabelEncoder()
le_m = LabelEncoder()
X['UserID'] = le_u.fit_transform(X['UserID'])
X['MovieID'] = le_m.fit_transform(X['MovieID'])

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')
X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['UserID'] = le_u.fit_transform(X['UserID'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['MovieID'] = le_m.fit_transform(X['MovieID'])


In [92]:
import torch.nn as nn


class Recommender(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size, hidden_size):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        self.fc = nn.Linear(embedding_size * 2, hidden_size)
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, user_ids, movie_ids):
        user_emb = self.user_embedding(user_ids)
        movie_emb = self.movie_embedding(movie_ids)
        x = torch.cat([user_emb, movie_emb], dim=1)
        x = self.fc(x)
        x = nn.ReLU()(x)
        x = self.output(x)
        return x

In [93]:
import torch.optim as optim


model = Recommender(num_users=len(le_u.classes_), num_movies=len(le_m.classes_), embedding_size=32, hidden_size=64)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [94]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


batch_size = 128
train_dataset = TensorDataset(torch.tensor(X_train.values), torch.tensor(y_train.values))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(tqdm(train_dataloader, desc=f'Epoch {epoch+1}')):
        optimizer.zero_grad()
        outputs = model(inputs[:, 0].long(), inputs[:, 1].long())
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_dataloader)}')


Epoch 1: 100%|██████████| 625/625 [00:02<00:00, 210.70it/s]


Epoch 1, Loss: 2.7117916975021363


Epoch 2: 100%|██████████| 625/625 [00:03<00:00, 181.50it/s]


Epoch 2, Loss: 2.039401442337036


Epoch 3: 100%|██████████| 625/625 [00:03<00:00, 178.21it/s]


Epoch 3, Loss: 1.9783723110198974


Epoch 4: 100%|██████████| 625/625 [00:03<00:00, 198.02it/s]


Epoch 4, Loss: 1.9296103435516356


Epoch 5: 100%|██████████| 625/625 [00:03<00:00, 190.28it/s]


Epoch 5, Loss: 1.8833528064727783


Epoch 6: 100%|██████████| 625/625 [00:03<00:00, 205.72it/s]


Epoch 6, Loss: 1.8338251295089723


Epoch 7: 100%|██████████| 625/625 [00:02<00:00, 224.13it/s]


Epoch 7, Loss: 1.7792311292648315


Epoch 8: 100%|██████████| 625/625 [00:02<00:00, 222.54it/s]


Epoch 8, Loss: 1.7197045665740966


Epoch 9: 100%|██████████| 625/625 [00:02<00:00, 226.55it/s]


Epoch 9, Loss: 1.653863657951355


Epoch 10: 100%|██████████| 625/625 [00:02<00:00, 225.86it/s]

Epoch 10, Loss: 1.5859351306915284





In [95]:
test_dataset = TensorDataset(torch.tensor(X_test.values), torch.tensor(y_test.values))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
with torch.no_grad():
    running_loss = 0.0
    for inputs, labels in tqdm(test_dataloader):
        outputs = model(inputs[:, 0].long(), inputs[:, 1].long())
        loss = criterion(outputs.squeeze(), labels)
        running_loss += loss.item()

    print(f'Test Loss: {running_loss / len(test_dataloader)}')


100%|██████████| 157/157 [00:00<00:00, 813.28it/s]

Test Loss: 2.3194596129617873





In [108]:
import random
import torch


# 随机生成一些用户和电影
user_ids = X_test['UserID'].unique()
movie_ids = X_test['MovieID'].unique()
random_user_ids = random.sample(list(user_ids), 5)
random_movie_ids = random.sample(list(movie_ids), 5)

# 构造输入数据
random_user_ids = np.array(random_user_ids)
random_movie_ids = np.array(random_movie_ids)

# 将 random_user_ids 和 random_movie_ids 扩展为 5x5 的矩阵
random_user_ids = np.repeat(random_user_ids.reshape(-1, 1), 5, axis=1)
random_movie_ids = np.repeat(random_movie_ids.reshape(1, -1), 5, axis=0)

# 生成输入数据
inputs = torch.tensor(np.column_stack((
    random_user_ids.ravel(),
    random_movie_ids.ravel()
))).long()

# 预测评分
preds = model(inputs[:, 0], inputs[:, 1]).detach().numpy().ravel()

# 输出预测结果
for i, user_id in enumerate(random_user_ids):
    print(f'对于用户 {int(user_id[0])}，推荐电影评分如下：')
    for j in range(5):
        movie_id = movie_ids[i * 5 + j]
        print(f'    电影 {movie_id} 的评分为 {preds[i*5+j]:.2f}')

KeyboardInterrupt: 