抄袭自 https://www.kaggle.com/code/jamesloy/deep-learning-based-recommender-systems

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# np.random.seed(1024)

print(f'using {device} device')

using cuda device


### Read dataset from file

In [2]:
# From https://grouplens.org/datasets/movielens/latest/
# Should use ml-latest instead of ml-latest-small for final project 
ratings = pd.read_csv('./ml-latest-small/ratings.csv', parse_dates=['timestamp'], date_parser=lambda s: datetime.fromtimestamp(int(s)))
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
58019,380,122906,3.5,2018-09-14 05:04:59
95900,601,4963,4.5,2015-09-07 23:31:15
70305,448,118900,3.0,2015-12-13 04:21:19
3085,20,3159,5.0,2003-05-27 20:21:55
29050,200,2599,3.0,2008-12-22 00:20:30


### Sort entry by timestamp

In [3]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

ratings['rank_latest']

0          86.0
1         196.0
2         141.0
3          18.0
4          66.0
          ...  
100831    314.0
100832     95.0
100833     23.0
100834    615.0
100835    610.0
Name: rank_latest, Length: 100836, dtype: float64

### Train-test split by timestamp
We use earlier entries as train data, latest entries as test data, because it does not makes sense to use future data to predict earlier behaviour.

In [4]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
       .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] >= 4.0]
test_ratings = ratings[ratings['rank_latest'] < 4.0]

train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

train_ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


### Define dataset class

In [5]:
class MovieLensTrainDataset(Dataset):
    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []        
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))
        num_negative = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
        
            for _ in range(num_negative):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
        
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        users_tensor = torch.tensor(users)
        items_tensor = torch.tensor(items)
        labels_tensor = torch.tensor(labels)

        return users_tensor, items_tensor, labels_tensor

### Define NCF model structure

In [6]:
class NCF(torch.nn.Module):
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=32)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=32)
        self.fc1 = nn.Linear(in_features=32 + 32, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.output = nn.Linear(in_features=64, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
    
    def forward(self, user_input, item_input):        
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        vector = torch.cat([user_embedded, item_embedded], dim=1)
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        output = nn.Sigmoid()(self.output(vector))
        return output

### Construct model

In [7]:
num_users = ratings['userId'].max() + 1
num_items = ratings['movieId'].max() + 1

print(num_users, num_items)

all_movieIds = ratings['movieId'].unique()
model = NCF(num_users, num_items, train_ratings, all_movieIds)
model = model.to(device)

train_dataloader = DataLoader(MovieLensTrainDataset(ratings, all_movieIds),
                              batch_size=512, num_workers=4)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

def train_one_step(batch):
    optimizer.zero_grad()
    
    user_inputs, item_inputs, labels = batch
    user_inputs = user_inputs.to(device)
    item_inputs = item_inputs.to(device)
    labels = labels.to(device)
    predicted_labels = model(user_inputs, item_inputs)
    loss = criterion(predicted_labels, labels.view(-1, 1).float())
    loss.backward()
    optimizer.step()

    return loss

611 193610


### Do the train

In [8]:
epochs = 100
for i in range(epochs):
    loss_sum = 0.0
    sample = 0
    for batch_idx, batch in enumerate(train_dataloader):
        loss = train_one_step(batch)
        loss_sum += loss.item()
        
    print(f'Epoch {i + 1}/{epochs}, avg_loss={loss_sum/len(train_dataloader)}')

Epoch 1/100, avg_loss=0.4473012905738075
Epoch 2/100, avg_loss=0.34722361404278557
Epoch 3/100, avg_loss=0.3255049181014753
Epoch 4/100, avg_loss=0.3164929483293882
Epoch 5/100, avg_loss=0.31079592241853626
Epoch 6/100, avg_loss=0.30625686052486983
Epoch 7/100, avg_loss=0.3019910601038618
Epoch 8/100, avg_loss=0.29765245049132916
Epoch 9/100, avg_loss=0.29301929224263595
Epoch 10/100, avg_loss=0.2880580403931855
Epoch 11/100, avg_loss=0.28260333867847615
Epoch 12/100, avg_loss=0.2767180361420975
Epoch 13/100, avg_loss=0.27038728615959284
Epoch 14/100, avg_loss=0.26373357588264545
Epoch 15/100, avg_loss=0.2566913501562806
Epoch 16/100, avg_loss=0.2494507082071401
Epoch 17/100, avg_loss=0.24208052070612834
Epoch 18/100, avg_loss=0.234645957223655
Epoch 19/100, avg_loss=0.22728408885183674
Epoch 20/100, avg_loss=0.21995036659506978
Epoch 21/100, avg_loss=0.21264794568725043
Epoch 22/100, avg_loss=0.2055503193194491
Epoch 23/100, avg_loss=0.1985582836690893
Epoch 24/100, avg_loss=0.1918601

### Prepare test data

In [12]:
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# { userId: movieIds }
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()
user_interacted_items_set = ratings.groupby('userId')['movieId'].apply(set).to_dict()

### Do the test

In [13]:
import random

sample_count = 0
hit = 0

with torch.no_grad():
    for u, i in sorted(test_user_item_set):
        interacted_items = user_interacted_items[u]
        not_interacted_items = set(all_movieIds) - user_interacted_items_set[u]
        selected_not_interacted_items = random.sample(list(not_interacted_items), 99)
    
        test_items = selected_not_interacted_items + [i]
    
        user_input = torch.tensor([u] * 100).to(device)
        item_input = torch.tensor(test_items).to(device)
    
        predicted = model(user_input, item_input)
        predicted = predicted.squeeze()

        items_to_recommend = torch.topk(predicted, 10).indices.tolist()
        sample_count += 1
        if 99 in items_to_recommend:
            hit += 1

In [14]:
# Recommend success accuracy
print(hit / sample_count)

0.9120218579234972
