# Deep Learning Recommendation System

## 数据集准备
基于Movie-Lens 32M数据集进行实验
*ml-32m*
* ml-32m/ratings.csv
* ml-32m/movies.csv
* ml-32m/tags.csv

In [1]:
"""
# ml-32m/ratings.csv
userId,movieId,rating,timestamp
1,17,4.0,944249077
1,25,1.0,944250228
1,29,2.0,943230976
1,30,5.0,944249077
# ml-32m/movies.csv
movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
# ml-32m/tags.csv
userId,movieId,tag,timestamp
22,26479,Kevin Kline,1583038886
22,79592,misogyny,1581476297
22,247150,acrophobia,1622483469
34,2174,music,1249808064
"""

'\n# ml-32m/ratings.csv\nuserId,movieId,rating,timestamp\n1,17,4.0,944249077\n1,25,1.0,944250228\n1,29,2.0,943230976\n1,30,5.0,944249077\n# ml-32m/movies.csv\nmovieId,title,genres\n1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\n2,Jumanji (1995),Adventure|Children|Fantasy\n3,Grumpier Old Men (1995),Comedy|Romance\n4,Waiting to Exhale (1995),Comedy|Drama|Romance\n5,Father of the Bride Part II (1995),Comedy\n# ml-32m/tags.csv\nuserId,movieId,tag,timestamp\n22,26479,Kevin Kline,1583038886\n22,79592,misogyny,1581476297\n22,247150,acrophobia,1622483469\n34,2174,music,1249808064\n'

## 基础NCF模型

#### 导入必要包

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import csv
import os

#### 加载设备

In [3]:
# Choose the compute device: the first CUDA GPU when one is available,
# otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
if device.type == 'cuda':
    # Release cached CUDA allocator memory before any training starts.
    torch.cuda.empty_cache()
    print("使用GPU加速")
print(device)

使用GPU加速
cuda:0


#### 数据清洗

In [4]:
# Load the ratings table and build dense, zero-based index lookups for the
# sparse raw user and movie ids.
ratings = pd.read_csv('../Dataset/ml-32m/ratings.csv')
csv_file_path = 'loss_data.csv'  # per-epoch loss log written by the training loop
print("数据读取成功")
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()
# Each id is numbered by its position in the corresponding unique() array.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

数据读取成功


In [5]:
# Attach the dense index columns by looking each raw id up in its mapping.
for raw_col, idx_col, lookup in (
    ('userId', 'user_idx', user_to_idx),
    ('movieId', 'movie_idx', movie_to_idx),
):
    ratings[idx_col] = ratings[raw_col].map(lookup)
# Divide by 5 so the regression targets are normalised to the 0-1 range.
ratings['rating'] = ratings['rating'].div(5.0)

#### 数据集划分

In [6]:
# Split the ratings: hold out 20% as the test set, then carve 10% of the
# remainder off as the validation set (72% train / 8% val / 20% test overall).
# The fixed random_state makes both splits reproducible.
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

#### 实现Dataset类

In [7]:
# Dataset wrapper around the parallel user / movie / rating sequences.
class MovieLensDataset(Dataset):
    """Serve (user index, movie index, rating) samples as tensors.

    Args:
        users: Indexable sequence of user indices, one per sample.
        movies: Indexable sequence of movie indices, one per sample.
        ratings: Indexable sequence of rating values, one per sample.
    """

    def __init__(self, users, movies, ratings):
        self.users, self.movies, self.ratings = users, movies, ratings

    def __len__(self):
        # The number of samples is taken from the users sequence.
        return len(self.users)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a ``(user, movie, rating)`` tuple."""
        user = torch.tensor(self.users[idx], dtype=torch.long)
        movie = torch.tensor(self.movies[idx], dtype=torch.long)
        rating = torch.tensor(self.ratings[idx], dtype=torch.float)
        return user, movie, rating

#### 加载数据

In [19]:
# Build the datasets and data loaders for the three splits.
batch_size = 4096

# Each dataset wraps the index and rating columns of one split as NumPy arrays.
train_dataset, val_dataset, test_dataset = (
    MovieLensDataset(
        split['user_idx'].values,
        split['movie_idx'].values,
        split['rating'].values,
    )
    for split in (train_df, val_df, test_df)
)
print(train_dataset[0])  # sanity check: show the first training sample
# Only the training loader shuffles; the evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, pin_memory=True)
    for dataset in (val_dataset, test_dataset)
)
print("数据加载成功")

(tensor(75039), tensor(3064), tensor(0.9000))
数据加载成功


#### 定义推荐模型

In [16]:
# Base recommendation model.
class Recommender(nn.Module):
    """Predict a normalised rating from a user index and a movie index.

    The user and movie embeddings are concatenated and passed through a small
    MLP that ends in a sigmoid, so every prediction lies in (0, 1).

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Width of the first hidden layer; the second is half of it.
    """

    def __init__(self, num_users, num_movies, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.movie_emb = nn.Embedding(num_movies, embedding_dim)

        self.fc = nn.Sequential(
            nn.Linear(2*embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Linear(hidden_dim//2, 1),
            nn.Sigmoid()
        )

    def forward(self, user, movie):
        """Return one prediction per sample.

        Args:
            user: 1-D long tensor of user indices.
            movie: 1-D long tensor of movie indices, same length as ``user``.

        Returns:
            A 1-D float tensor with one value in (0, 1) per input pair.
        """
        user_emb = self.user_emb(user)
        movie_emb = self.movie_emb(movie)
        # The embedding outputs already live on the module's device, so no
        # extra transfer (and no dependency on a global `device`) is needed.
        x = torch.cat([user_emb, movie_emb], dim=1)
        # Drop only the trailing feature axis; a bare squeeze() would also drop
        # the batch axis when the batch holds a single sample.
        return self.fc(x).squeeze(-1)

#### 加载模型，指定优化器

In [20]:

# Instantiate the model, resume from a saved checkpoint when one exists, and
# set up the loss function and optimizer.
n_users, n_movies = len(user_ids), len(movie_ids)

model = Recommender(n_users, n_movies)
model = model.to(device)
print(model)
if os.path.exists('best_model.pth'):
    # map_location remaps the stored tensors onto the active device.
    saved_state = torch.load('best_model.pth', map_location=device)
    model.load_state_dict(saved_state)
    print("已加载保存的模型参数")
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

Recommender(
  (user_emb): Embedding(200948, 64)
  (movie_emb): Embedding(84432, 64)
  (fc): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=1, bias=True)
    (6): Sigmoid()
  )
)


In [11]:
#### 训练模型

In [21]:
# Train for a fixed number of epochs, validating after each one.
epochs = 20
best_val_loss = float('inf')
loss_array = []  # one [train_loss, val_loss] pair per finished epoch

for epoch in range(epochs):
    # --- training pass ---
    model.train()
    train_loss = 0.0
    for user, movie, rating in tqdm(train_loader, desc=f'Epoch {epoch+1} Training'):
        user, movie, rating = user.to(device), movie.to(device), rating.to(device)
        optimizer.zero_grad()
        loss = criterion(model(user, movie), rating)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below yields a per-sample mean.
        train_loss += loss.item() * user.size(0)
    train_loss /= len(train_loader.dataset)

    # --- validation pass (no gradients, dropout disabled via eval()) ---
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user, movie, rating in tqdm(val_loader, desc=f'Epoch {epoch+1} Validation'):
            user, movie, rating = user.to(device), movie.to(device), rating.to(device)
            val_loss += criterion(model(user, movie), rating).item() * user.size(0)
    val_loss /= len(val_loader.dataset)

    loss_array.append([train_loss, val_loss])

    print(f'Epoch {epoch+1}:')
    print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')

    # Keep only the checkpoint with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
    # Rewrite the whole loss log every epoch so an interrupted run still
    # leaves a complete file behind.
    with open(csv_file_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows([['train_loss', 'val_loss'], *loss_array])

print("训练完成，开始测试阶段")


Epoch 1 Training: 100%|██████████| 5626/5626 [11:43<00:00,  7.99it/s]
Epoch 1 Validation: 100%|██████████| 626/626 [00:59<00:00, 10.51it/s]


Epoch 1:
Train Loss: 0.0304 | Val Loss: 0.0289


Epoch 2 Training: 100%|██████████| 5626/5626 [11:40<00:00,  8.03it/s]
Epoch 2 Validation: 100%|██████████| 626/626 [01:01<00:00, 10.18it/s]


Epoch 2:
Train Loss: 0.0279 | Val Loss: 0.0276


Epoch 3 Training: 100%|██████████| 5626/5626 [11:36<00:00,  8.07it/s]
Epoch 3 Validation: 100%|██████████| 626/626 [01:01<00:00, 10.17it/s]


Epoch 3:
Train Loss: 0.0269 | Val Loss: 0.0271


Epoch 4 Training: 100%|██████████| 5626/5626 [11:36<00:00,  8.07it/s]
Epoch 4 Validation: 100%|██████████| 626/626 [00:59<00:00, 10.51it/s]


Epoch 4:
Train Loss: 0.0261 | Val Loss: 0.0271


Epoch 5 Training: 100%|██████████| 5626/5626 [11:37<00:00,  8.06it/s]
Epoch 5 Validation: 100%|██████████| 626/626 [00:58<00:00, 10.73it/s]


Epoch 5:
Train Loss: 0.0253 | Val Loss: 0.0268


Epoch 6 Training:  32%|███▏      | 1776/5626 [03:39<07:55,  8.10it/s]


KeyboardInterrupt: 

#### 测试阶段

In [None]:
# Evaluate the best checkpoint on the held-out test split.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        user, movie, rating = [x.to(device) for x in batch]
        pred = model(user, movie)
        # Weight the batch loss by the batch size to obtain a per-sample mean.
        test_loss += criterion(pred, rating).item() * user.size(0)
test_loss /= len(test_loader.dataset)

test_loss_csv = 'test_loss.csv'

# Persist the test loss as a one-row CSV with a header.
with open(test_loss_csv, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['test_loss'])
    csv_writer.writerow([test_loss])

print(loss_array)
print(f'Test Loss: {test_loss:.4f}')
# Ratings were divided by 5 before training, so the MSE is scaled back by 5**2
# before taking the square root.
print(f'Test RMSE: {np.sqrt(test_loss * 5.0**2):.4f}')

#### 数据分析

In [None]:
import math
import matplotlib.pyplot as plt
import csv
# Read the loss log from the current working directory.
csv_path = 'loss_data.csv'

# Load the per-epoch training and validation losses from loss_data.csv.
train_loss = []
val_loss = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        train_loss.append(float(row['train_loss']))
        val_loss.append(float(row['val_loss']))

# Epoch numbers start at 1 on the x axis.
epochs = range(1, len(train_loss) + 1)

# The logged values are MSE on ratings scaled into 0-1. Multiplying by 5**2
# restores the original rating scale, and the square root turns that MSE into
# the RMSE the curves are labelled with.
train_rmse = [math.sqrt(tl * 5.0**2) for tl in train_loss]
val_rmse = [math.sqrt(vl * 5.0**2) for vl in val_loss]

# Create the plot
plt.figure(figsize=(8,6))
plt.plot(epochs, train_rmse, 'p-', label="Train RMSE")
plt.plot(epochs, val_rmse, 'p-', label="Validating RMSE")

# Customize the plot
plt.title('Training and Validating Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel(ylabel='RMSE')
plt.grid(True)
plt.legend()

# Show the plot
plt.show()

## 添加BERT模块对标题进行编码

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import csv

#### 添加BERT模型

In [None]:
# Choose the compute device: first CUDA GPU when available, otherwise CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

print(device)
# Load the pretrained BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)
bert_model.eval()  # inference mode

#### 定义电影名称转BERT向量的函数

In [None]:
import numpy as np

def title_to_bert_vec(title):
    """Encode a movie title as a vector using BERT's [CLS] token.

    Args:
        title: Title text to encode; it is truncated to 32 tokens.

    Returns:
        A NumPy array holding the final-layer hidden state of the [CLS]
        token (1-D for a single title).
    """
    with torch.no_grad():
        inputs = tokenizer(title, return_tensors='pt', truncation=True, max_length=32).to(device)
        outputs = bert_model(**inputs)
        # Sequence position 0 is the [CLS] token, used as the title summary.
        cls_state = outputs.last_hidden_state[:, 0, :].squeeze()
    # Tensor.numpy() only works on CPU tensors, so copy back to the host
    # first; this is a no-op when the model already runs on the CPU.
    return cls_state.cpu().numpy()

#### 数据清洗

In [None]:
# Load the ratings table and build dense, zero-based index lookups for the
# sparse raw user and movie ids.
ratings = pd.read_csv('../Dataset/ml-32m/ratings.csv')
csv_file_path = 'loss_data.csv'  # per-epoch loss log written by the training loop
print("数据读取成功")
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()
# Each id is numbered by its position in the corresponding unique() array.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

In [None]:
# Attach the dense index columns by looking each raw id up in its mapping.
for raw_col, idx_col, lookup in (
    ('userId', 'user_idx', user_to_idx),
    ('movieId', 'movie_idx', movie_to_idx),
):
    ratings[idx_col] = ratings[raw_col].map(lookup)
# Divide by 5 so the regression targets are normalised to the 0-1 range.
ratings['rating'] = ratings['rating'].div(5.0)

#### 生成BERT向量

In [None]:
# Pre-compute one BERT title vector per movie, keyed by the raw movieId.
movies = pd.read_csv('../Dataset/ml-32m/movies.csv')
movieid2bertvec = {}
for _row_label, movie_row in movies.iterrows():
    movieid2bertvec[movie_row['movieId']] = title_to_bert_vec(movie_row['title'])
# Read the vector width from the first stored entry.
first_vec = next(iter(movieid2bertvec.values()))
bert_dim = first_vec.shape[0]

#### 数据集划分

In [None]:
# Split the ratings: hold out 20% as the test set, then carve 10% of the
# remainder off as the validation set (72% train / 8% val / 20% test overall).
# The fixed random_state makes both splits reproducible.
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

#### 修改Dataset，返回BERT向量

In [None]:
class MovieLensDataset(Dataset):
    """Serve (user index, movie index, rating, title vector) samples.

    Args:
        users: Indexable sequence of user indices, one per sample.
        movies: Indexable sequence of movie indices, one per sample.
        ratings: Indexable sequence of rating values, one per sample.
        movie_ids: Indexable sequence of raw movie ids, one per sample and
            aligned with ``movies``; used as keys into ``movieid2bertvec``.
        movieid2bertvec: Mapping from raw movie id to its title vector.
    """

    def __init__(self, users, movies, ratings, movie_ids, movieid2bertvec):
        self.users = users
        self.movies = movies
        self.ratings = ratings
        self.movie_ids = movie_ids
        self.movieid2bertvec = movieid2bertvec

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, movie, rating, bert_vec)`` tensors for sample ``idx``.

        Raises:
            KeyError: If the sample's raw movie id has no stored title vector.
        """
        movie_id = self.movie_ids[idx]
        bert_vec = self.movieid2bertvec[movie_id]
        # Samples stay on the CPU: a DataLoader with pin_memory=True can only
        # pin CPU tensors, and the consumer moves whole batches to the device.
        # This also removes the dependency on a global `device`.
        return (
            torch.tensor(self.users[idx], dtype=torch.long),
            torch.tensor(self.movies[idx], dtype=torch.long),
            torch.tensor(self.ratings[idx], dtype=torch.float),
            torch.tensor(bert_vec, dtype=torch.float)
        )

#### 加载数据

In [None]:
# Build the datasets and data loaders for the three splits.
batch_size = 2048

# Every dataset must receive the raw movieId column of ITS OWN split: the ids
# are looked up per sample, so passing the training column to the validation
# or test dataset would pair those samples with the wrong title vectors.
train_dataset = MovieLensDataset(train_df['user_idx'].values,
                               train_df['movie_idx'].values,
                               train_df['rating'].values,
                               train_df["movieId"].values,
                               movieid2bertvec)
val_dataset = MovieLensDataset(val_df['user_idx'].values,
                             val_df['movie_idx'].values,
                             val_df['rating'].values,
                             val_df["movieId"].values,
                             movieid2bertvec)
test_dataset = MovieLensDataset(test_df['user_idx'].values,
                              test_df['movie_idx'].values,
                              test_df['rating'].values,
                              test_df["movieId"].values,
                              movieid2bertvec)

# Only the training loader shuffles; the evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size,pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size,pin_memory=True)
print("数据加载成功")

#### 定义推荐模型

In [None]:
class Recommender(nn.Module):
    """Predict a normalised rating from user, movie and title-vector inputs.

    The user embedding, movie embedding and the pre-computed title vector are
    concatenated and passed through an MLP that ends in a sigmoid, so every
    prediction lies in (0, 1).

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Width of the first hidden layer; later layers halve it.
        bert_dim: Width of the title vector passed to ``forward``.
    """

    def __init__(self, num_users, num_movies, embedding_dim=64, hidden_dim=128, bert_dim=768):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.movie_emb = nn.Embedding(num_movies, embedding_dim)
        self.fc = nn.Sequential(
            # Input width: both embeddings plus the title vector.
            nn.Linear(2*embedding_dim + bert_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Linear(hidden_dim//2, hidden_dim//4),
            nn.ReLU(),
            nn.Linear(hidden_dim//4, 1),

            nn.Sigmoid()
        )

    def forward(self, user, movie, bert_vec):
        """Return one prediction per sample.

        Args:
            user: 1-D long tensor of user indices.
            movie: 1-D long tensor of movie indices, same length as ``user``.
            bert_vec: Float tensor of shape ``(batch, bert_dim)``.

        Returns:
            A 1-D float tensor with one value in (0, 1) per sample.
        """
        user_emb = self.user_emb(user)
        movie_emb = self.movie_emb(movie)
        x = torch.cat([user_emb, movie_emb, bert_vec], dim=1)
        # Drop only the trailing feature axis; a bare squeeze() would also drop
        # the batch axis when the batch holds a single sample.
        return self.fc(x).squeeze(-1)

#### 加载模型，指定优化器

In [None]:
import os

# Build the model, resume from a compatible checkpoint when one exists, and
# set up the loss function and optimizer.

n_users = len(user_ids)
n_movies = len(movie_ids)

# Use the width measured from the pre-computed title vectors instead of
# relying on the constructor default.
model = Recommender(n_users, n_movies, bert_dim=bert_dim).to(device)
if os.path.exists('best_model.pth'):
    saved_state = torch.load('best_model.pth', map_location=device)
    expected_state = model.state_dict()
    # 'best_model.pth' is also the file the base model (without the title
    # vector input) saves to, and its layer shapes differ. Check compatibility
    # up front so a foreign checkpoint neither raises nor partially overwrites
    # the freshly initialised weights.
    is_compatible = saved_state.keys() == expected_state.keys() and all(
        saved_state[name].shape == expected_state[name].shape
        for name in expected_state
    )
    if is_compatible:
        model.load_state_dict(saved_state)
        print("已加载保存的模型参数")
    else:
        print("已保存的模型参数与当前模型结构不匹配，已跳过加载")
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

## 训练模型

In [None]:
# Train the BERT-augmented model, validating after every epoch.
epochs = 30
best_val_loss = float('inf')
loss_array = []  # one [train_loss, val_loss] pair per finished epoch

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1} Training'):
        user, movie, rating, bert_vec = [x.to(device) for x in batch]
        optimizer.zero_grad()
        pred = model(user, movie, bert_vec)
        loss = criterion(pred, rating)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below yields a per-sample mean.
        train_loss += loss.item() * user.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Epoch {epoch+1} Validation'):
            # Batches carry four tensors and the model requires the title
            # vector, exactly as in the training pass above.
            user, movie, rating, bert_vec = [x.to(device) for x in batch]
            pred = model(user, movie, bert_vec)
            val_loss += criterion(pred, rating).item() * user.size(0)
    val_loss /= len(val_loader.dataset)

    loss_array.append([train_loss,val_loss])

    print(f'Epoch {epoch+1}:')
    print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')

    # Keep only the checkpoint with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
    # Rewrite the whole loss log every epoch so an interrupted run still
    # leaves a complete file behind.
    with open(csv_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['train_loss', 'val_loss'])  # header row
        csv_writer.writerows(loss_array)

print("训练完成，开始测试阶段")


#### 测试阶段

In [None]:
# Evaluate the best checkpoint on the held-out test split.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        user, movie, rating, bert_vec = [x.to(device) for x in batch]
        pred = model(user, movie, bert_vec)
        # Weight the batch loss by the batch size to obtain a per-sample mean.
        test_loss += criterion(pred, rating).item() * user.size(0)
test_loss /= len(test_loader.dataset)

test_loss_csv = 'test_loss.csv'

# Persist the test loss as a one-row CSV with a header.
with open(test_loss_csv, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['test_loss'])
    csv_writer.writerow([test_loss])

print(loss_array)
print(f'Test Loss: {test_loss:.4f}')
# Ratings were divided by 5 before training, so the MSE is scaled back by 5**2
# before taking the square root.
print(f'Test RMSE: {np.sqrt(test_loss * 5.0**2):.4f}')

#### 数据分析

In [None]:
import math
import matplotlib.pyplot as plt
import csv
import os
# Read the loss log from the current working directory.
csv_path = 'loss_data.csv'

# Load the per-epoch training and validation losses from loss_data.csv.
train_loss = []
val_loss = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        train_loss.append(float(row['train_loss']))
        val_loss.append(float(row['val_loss']))

# Epoch numbers start at 1 on the x axis.
epochs = range(1, len(train_loss) + 1)

# The logged values are MSE on ratings scaled into 0-1. Multiplying by 5**2
# restores the original rating scale, and the square root turns that MSE into
# the RMSE the curves are labelled with.
train_rmse = [math.sqrt(tl * 5.0**2) for tl in train_loss]
val_rmse = [math.sqrt(vl * 5.0**2) for vl in val_loss]

# Create the plot
plt.figure(figsize=(8,6))
plt.plot(epochs, train_rmse, 'p-', label="Train RMSE")
plt.plot(epochs, val_rmse, 'p-', label="Validating RMSE")

# Customize the plot
plt.title('Training and Validating Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel(ylabel='RMSE')
plt.grid(True)
plt.legend()

# Show the plot
plt.show()