In [36]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [37]:
# Execute the shared helper script inside this notebook's namespace (-i), so the
# names it defines are available to the cells below.
# NOTE(review): CudaUtils, EncoderUtils, DfUtils and RatingsPredictorMLP are
# presumably defined by this script - confirm.
%run -i ../recommender/utils.py

In [38]:
# Ask the project helper for the torch device; the same `device` is used further
# down when the input tensors are built with `.to(device)`.
device = CudaUtils.get_device()
device

device(type='cpu')

In [39]:
# Load the fitted encoders for user ids and ISBNs. Their `classes_` arrays give the
# vocabulary sizes and the encoded-index <-> raw-id mapping used in later cells.
userid_encoder, isbn_encoder = EncoderUtils.load_encoders()

In [86]:
# Vocabulary sizes come straight from the fitted encoders.
n_books = len(isbn_encoder.classes_)
n_users = len(userid_encoder.classes_)

# Build the model and load the trained weights in this cell, so that every later
# cell that reads `model` also works on a fresh kernel (Restart & Run All).
# NOTE(review): constructor argument order and checkpoint path are taken from the
# previously commented-out lines - confirm they are still current.
model = RatingsPredictorMLP(n_books, n_users).to(device)
# map_location lets a checkpoint that was saved on a GPU load on the current device.
model.load_state_dict(torch.load('./models/model_cross_val_3folds.pth', map_location=device))
# Inference only from here on: switch off training-time behaviour such as dropout.
model.eval()

print(f"Number of Books: {n_books}")
print(f"Number of Users: {n_users}")

Number of Books: 58478
Number of Users: 54426


### Calculate default_user_index - an encoded userid that represents the "average user"

In [41]:
# Raw weight matrix of the model's user embedding layer (`.data` = the tensor
# without autograd tracking).
# NOTE(review): presumably one row per encoded user id - confirm against the model.
user_embeddings = model.user_embed.weight.data
user_embeddings

tensor([[-0.2894,  0.4537,  1.5684,  ...,  0.0081, -2.0207,  0.7572],
        [-0.2934,  0.9662, -0.3722,  ..., -1.0948,  0.1260,  1.9815],
        [ 0.7165,  0.9545, -1.2309,  ..., -0.0577,  0.2474,  0.9960],
        ...,
        [ 0.0465,  1.1541, -0.1318,  ...,  0.6238,  0.5805, -1.0340],
        [ 1.0899, -0.2101,  0.9566,  ...,  0.3194,  1.2263, -1.3410],
        [-1.4835,  0.1813, -0.6118,  ...,  0.1732, -0.2838, -0.1646]])

In [42]:
# Collapse the row axis: the element-wise mean of all rows gives a single
# "average user" latent vector.
average_user_embedding = user_embeddings.mean(dim=0)
average_user_embedding

tensor([-0.0075,  0.0054,  0.0143, -0.0121,  0.0136, -0.0145,  0.0130, -0.0074,
        -0.0053,  0.0158, -0.0071,  0.0145,  0.0087, -0.0054,  0.0181, -0.0141,
         0.0091,  0.0139, -0.0219,  0.0209, -0.0081, -0.0180, -0.0102,  0.0099,
        -0.0028, -0.0216,  0.0066,  0.0123,  0.0169, -0.0077,  0.0114,  0.0030])

In [43]:
# Cosine similarity between each row of the embedding matrix and the average
# embedding (broadcast as a single row); one similarity value per row.
cos_sim = F.cosine_similarity(user_embeddings, average_user_embedding[None, :], dim=1)
cos_sim

tensor([-0.0634,  0.0805,  0.1056,  ..., -0.0116,  0.2423, -0.1365])

The default user is the existing user whose embedding is most similar (by cosine similarity) to the average latent feature vector of all users in the dataset. A first-time user can be approximated by this average user with reasonable accuracy (as all humans are alike at a surface level). Later, once they start rating books, their individual tastes will be learned and addressed by the model. Essentially, all users start out as "the average reader".

In [44]:
# Position of the highest cosine similarity, as a plain Python int.
default_user_idx = cos_sim.argmax().item()
default_user_idx

35553

In [85]:
# Round-trip check: map default_user_idx back to its raw user id via the encoder's
# classes_, then re-encode that id; the result should equal default_user_idx.
# Note: the `user_id` assigned here is reused by the recommendation cell below.
user_id = userid_encoder.classes_[default_user_idx]
userid_encoder.transform([user_id])[0]

35553

### Recommend top 10 books to the user, based on predicted ratings for all books

In [46]:
# Ids the encoder has never seen fall back to the "average user" index.
if user_id in userid_encoder.classes_:
    user_index = userid_encoder.transform([user_id])[0]
else:
    user_index = default_user_idx

# Pair the single user with every book: the user index repeated once per book,
# next to the full range of book indices.
user_tensor = torch.full((len(isbn_encoder.classes_),), int(user_index), dtype=torch.long).to(device)
book_tensor = torch.arange(len(isbn_encoder.classes_), dtype=torch.long).to(device)

In [61]:
# Run the model on every (user, book) pair; no_grad because this is inference only.
with torch.no_grad():
    predictions = model(user_tensor, book_tensor)

# Ratings this user actually gave, to compare against the predictions.
# NOTE(review): filtering "user_id" by the encoded user_index (and indexing predictions
# by "isbn" below) assumes the CSV already stores encoded ids - confirm.
ratings_df = DfUtils.get_ratings_df('./main_dataset/updated_ratings.csv')
expected = ratings_df.loc[ratings_df["user_id"] == user_index, ["rating", "isbn"]]
print(expected)

# Pick the predictions at the positions given by the "isbn" column values.
predictions_filtered = predictions[expected["isbn"].values]
# Boolean mask: True where a prediction differs from the actual rating by more than 0.26.
x = ( predictions_filtered.flatten() - torch.tensor(expected["rating"].values).to(device) ).abs() > 0.26
x
# predictions_filtered.view(-1) and predictions_filtered.flatten() are functionally the same


        rating   isbn
305877     5.0    399
305879     5.0   7068
305884     5.0   8775
305886     5.0   8851
305887     5.0   8883
...        ...    ...
306105     5.0  52823
306106     5.0  52825
306107     5.0  52831
306108     5.0  52835
306109     5.0  53110

[118 rows x 2 columns]


tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False,  True,
        False, False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False])

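The model's predictions for this user stay within ~0.3 of the actual ratings: only 3 of the 118 predictions differ from the true rating by more than 0.26.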

In [79]:
# Show the full descending sort (values and indices), then the first 10 indices
# obtained two equivalent ways: from sort()'s indices and from argsort().
print(torch.sort(predictions.view(-1), descending=True), end="\n\n")
print(predictions.view(-1).sort(descending=True).indices[:10], end="\n\n")
print(torch.argsort(predictions.view(-1), descending=True)[:10], end="\n\n")

torch.return_types.sort(
values=tensor([4.9463, 4.9461, 4.9458,  ..., 4.5933, 4.5868, 4.5769]),
indices=tensor([25753, 32809, 54501,  ..., 39340, 33206, 46078]))

tensor([25753, 32809, 54501, 29826, 23189, 36665, 20530,   833, 14083, 23190])

tensor([25753, 32809, 54501, 29826, 23189, 36665, 20530,   833, 14083, 23190])



In [82]:
# Indices of the 10 highest predicted ratings, best first; these are decoded back
# to ISBNs with isbn_encoder in the next cell.
top_n_books = torch.argsort(predictions.view(-1), descending=True)[:10]
top_n_books

tensor([25753, 32809, 54501, 29826, 23189, 36665, 20530,   833, 14083, 23190])

In [83]:
# Decode the top indices back to raw ISBN strings.
# .cpu() is a no-op on CPU but is required before .numpy() when `device` is a GPU,
# which keeps this cell consistent with the `.to(device)` calls above.
recommended_books = isbn_encoder.inverse_transform(top_n_books.cpu().numpy())
recommended_books

array(['0446310786', '0553205587', '1575450607', '0451457552',
       '0439139597', '0618002235', '039480029X', '0060194995',
       '0373257988', '0439139600'], dtype=object)

In [None]:
def recommend_for_user(user_id, model, book_encoder, userid_encoder, default_user_index, top_n=10):
    """Return the raw book ids with the highest predicted rating for one user.

    Every book known to ``book_encoder`` is scored for the user, and the
    ``top_n`` best-scoring books are decoded back to their raw ids.

    Args:
        user_id: Raw (un-encoded) user id. Ids that are not in
            ``userid_encoder.classes_`` fall back to ``default_user_index``.
        model: Callable taking ``(user_indices, book_indices)`` long tensors of
            equal length and returning one predicted rating per pair.
        book_encoder: Fitted encoder for book ids (needs ``classes_`` and
            ``inverse_transform``).
        userid_encoder: Fitted encoder for user ids (needs ``classes_`` and
            ``transform``).
        default_user_index: Encoded user index to use for unknown users.
        top_n: Number of books to return (default 10, the previous fixed value).

    Returns:
        Array of raw book ids, best prediction first; at most ``top_n`` long.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    n_books = len(book_encoder.classes_)

    if user_id in userid_encoder.classes_:
        # transform() returns a numpy scalar; torch.full wants a plain number.
        user_index = int(userid_encoder.transform([user_id])[0])
    else:
        user_index = int(default_user_index)

    # Build the inputs on whatever device the model lives on, so the function
    # works for CPU and GPU models alike. Plain callables default to CPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    user_tensor = torch.full((n_books,), user_index, dtype=torch.long, device=model_device)
    book_indices = torch.arange(n_books, dtype=torch.long, device=model_device)

    # Score in eval mode (dropout etc. would make recommendations random), then
    # put the model back into the mode the caller had it in.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            predictions = model(user_tensor, book_indices)
    finally:
        if was_training:
            model.train()

    top_n_books = predictions.reshape(-1).argsort(descending=True)[:top_n]
    # .cpu() is required before .numpy() when the predictions are on a GPU.
    recommended_books = book_encoder.inverse_transform(top_n_books.cpu().numpy())

    return recommended_books