抄袭自 https://www.kaggle.com/code/jamesloy/deep-learning-based-recommender-systems

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# np.random.seed(1024)

print(f'using {device} device')

using cuda device


### Read dataset from file

In [2]:
# From https://grouplens.org/datasets/movielens/latest/
# Should use ml-latest instead of ml-latest-small for final project 
ratings = pd.read_csv('./ml-1m/ratings.dat',
                      sep='::',
                      engine='python',
                      header=None,
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      parse_dates=['timestamp'],
                      date_parser=lambda s: datetime.fromtimestamp(int(s))
                     )
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
329039,1943,1259,5,2000-11-21 08:55:00
284273,1698,2348,4,2000-11-21 10:10:10
739016,4415,110,5,2000-08-02 01:40:05
107971,710,1196,5,2000-11-30 16:02:19
265740,1613,2891,3,2000-11-20 23:02:20


### Sort entry by timestamp

In [3]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

ratings['rank_latest']

0           42.0
1           23.0
2           28.0
3           47.0
4            4.0
           ...  
1000204    161.0
1000205    293.0
1000206    305.0
1000207    234.0
1000208    246.0
Name: rank_latest, Length: 1000209, dtype: float64

### Train-test split by timestamp
We use earlier entries as train data, latest entries as test data, because it does not makes sense to use future data to predict earlier behaviour.

In [4]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
       .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] >= 4.0]
test_ratings = ratings[ratings['rank_latest'] < 4.0]

train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

train_ratings

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


### Define dataset class

In [5]:
class MovieLensTrainDataset(Dataset):
    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []        
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))
        num_negative = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
        
            for _ in range(num_negative):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
        
                users.append(u)
                items.append(negative_item)
                labels.append(0)
 
        users_tensor = torch.tensor(users)
        items_tensor = torch.tensor(items)
        labels_tensor = torch.tensor(labels)

        return users_tensor, items_tensor, labels_tensor

### Define NCF model structure

In [6]:
class NCF(torch.nn.Module):
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=256)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=256)
        self.fc1 = nn.Linear(in_features=256 + 256, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=64)
        self.output = nn.Linear(in_features=64, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
    
    def forward(self, user_input, item_input):        
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        vector = torch.cat([user_embedded, item_embedded], dim=1)
        vector = nn.Tanh()(self.fc1(vector))
        vector = nn.Dropout(p=0.2)(vector)
        vector = nn.ReLU()(self.fc2(vector))
        vector = nn.Dropout(p=0.2)(vector)
        vector = nn.ReLU()(self.fc3(vector))
        vector = nn.Dropout(p=0.2)(vector)

        output = nn.Sigmoid()(self.output(vector))
        return output

### Construct model

In [7]:
num_users = ratings['userId'].max() + 1
num_items = ratings['movieId'].max() + 1

print(num_users, num_items)

all_movieIds = ratings['movieId'].unique()
model = NCF(num_users, num_items, train_ratings, all_movieIds)
model = model.to(device)

train_dataloader = DataLoader(MovieLensTrainDataset(ratings, all_movieIds),
                              batch_size=64, num_workers=4)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), 1e-4)

def train_one_step(batch):
    optimizer.zero_grad()
    
    user_inputs, item_inputs, labels = batch
    user_inputs = user_inputs.to(device)
    item_inputs = item_inputs.to(device)
    labels = labels.to(device)
    predicted_labels = model(user_inputs, item_inputs)
    loss = criterion(predicted_labels, labels.view(-1, 1).float())
    loss.backward()
    optimizer.step()

    return loss

6041 3953


### Prepare test data

In [8]:
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# { userId: movieIds }
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()
user_interacted_items_set = ratings.groupby('userId')['movieId'].apply(set).to_dict()

### Do the train

In [9]:
import random

def eval():
    sample_count = 0
    hit = 0
    
    with torch.no_grad():
        for u, i in sorted(test_user_item_set):
            not_interacted_items = set(all_movieIds) - user_interacted_items_set[u]
            selected_not_interacted_items = random.sample(list(not_interacted_items), 99)
        
            test_items = selected_not_interacted_items + [i]
        
            user_input = torch.tensor([u] * 100).to(device)
            item_input = torch.tensor(test_items).to(device)
        
            predicted = model(user_input, item_input)
            predicted = predicted.squeeze()
    
            items_to_recommend = torch.topk(predicted, 10).indices.tolist()
            sample_count += 1
            if 99 in items_to_recommend:
                hit += 1
    # Recommend success accuracy
    return (hit / sample_count)

print(eval())
epochs = 100
for i in range(epochs):
    loss_sum = 0.0
    sample = 0
    for batch_idx, batch in enumerate(train_dataloader):
        loss = train_one_step(batch)
        loss_sum += loss.item()
        
    print(f'Epoch {i + 1}/{epochs}, avg_loss={loss_sum/len(train_dataloader)}, hitrate={eval()}')

0.10353200883002207
Epoch 1/100, avg_loss=0.3742671354599027, hitrate=0.45684326710816775
Epoch 2/100, avg_loss=0.35656329303566453, hitrate=0.45921633554083885
Epoch 3/100, avg_loss=0.34897858804831083, hitrate=0.47560706401766006
Epoch 4/100, avg_loss=0.33484954359813873, hitrate=0.5012693156732891
Epoch 5/100, avg_loss=0.3222918165566148, hitrate=0.5320640176600442
Epoch 6/100, avg_loss=0.3133495983783923, hitrate=0.5540838852097131
Epoch 7/100, avg_loss=0.3068103324837876, hitrate=0.5653421633554084
Epoch 8/100, avg_loss=0.3014813748659258, hitrate=0.5834988962472406
Epoch 9/100, avg_loss=0.29699324513682446, hitrate=0.5909492273730684
Epoch 10/100, avg_loss=0.29289986607661656, hitrate=0.6039183222958058
Epoch 11/100, avg_loss=0.2891788255887741, hitrate=0.6138520971302428
Epoch 12/100, avg_loss=0.28584281576022724, hitrate=0.6255518763796909
Epoch 13/100, avg_loss=0.28283163108178944, hitrate=0.6282560706401766
Epoch 14/100, avg_loss=0.27976754228542394, hitrate=0.637086092715231

### Do the test

In [10]:
eval()

0.8125275938189845