# Setting Training Data

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
device

device(type='cpu')

In [3]:
import pandas as pd
import numpy as np
DATA_PATH = "../../data/"

books_data = pd.read_csv(f"{DATA_PATH}/raw/books_data.csv")
cleaned_reviews = pd.read_csv(f"{DATA_PATH}/cleaned_data/cleaned_reviews.csv")

with open(f"{DATA_PATH}/embeddings/bert_embeddings.npy", "rb") as f:
    bert_embeddings = np.load(f)

In [4]:
bert_embeddings.shape, books_data.shape

((212404, 384), (212404, 10))

In [5]:
# Assuming you have loaded the dataframe as cleaned_reviews
# Filter out users with less than a minimum number of reviews
MIN_REVIEWS = 2
user_counts = cleaned_reviews['User_id'].value_counts()
valid_users = user_counts[user_counts >= MIN_REVIEWS].index

filtered_reviews = cleaned_reviews.copy()[cleaned_reviews['User_id'].isin(valid_users)]

In [6]:
# Compute mean and std per user for rescaling
user_stats = filtered_reviews.groupby('User_id')['review/score'].agg(['mean', 'std'])

# Apply rescaling per user
filtered_reviews['normalized_score'] = filtered_reviews.apply(lambda row: (row['review/score'] - user_stats.loc[row['User_id'], 'mean']) / user_stats.loc[row['User_id'], 'std'], axis=1)

  filtered_reviews['normalized_score'] = filtered_reviews.apply(lambda row: (row['review/score'] - user_stats.loc[row['User_id'], 'mean']) / user_stats.loc[row['User_id'], 'std'], axis=1)


In [7]:
users_with_nan_norm_score = filtered_reviews[filtered_reviews['normalized_score'].isna()].User_id.unique()
filtered_reviews = filtered_reviews[~filtered_reviews.User_id.isin(users_with_nan_norm_score)]

In [8]:
# Create a dictionary where the key is the title and the value is its index in the books_data DataFrame
title_to_index = {title: idx for idx, title in enumerate(books_data['Title'])}

# Map the Title in filtered_reviews to its index in books_data
filtered_reviews = filtered_reviews.copy()
filtered_reviews['Title_Index'] = filtered_reviews['Title'].map(title_to_index)

In [9]:
# Sort the data by user and time
filtered_reviews = filtered_reviews.sort_values(by=['User_id', 'review/time'])

In [10]:
filtered_reviews.shape

(1003101, 7)

In [11]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

MAX_REVIEWS = 10
# We can either use the normalized score or the raw scores
SCORE_COLUMN = "normalized_score"

# Split the unique user_ids into train, validation and test sets
train_users, temp_users = train_test_split(filtered_reviews['User_id'].unique(), test_size=0.4, random_state=42)
valid_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=42)

def prepare_sequences(users, df):
    sequences = []
    for user in tqdm(users):
        user_data = df[df['User_id'] == user]
        
        # Ensure that we keep only the most recent MAX_REVIEWS reviews
        if len(user_data) > MAX_REVIEWS + 1:
            user_data = user_data.iloc[-(MAX_REVIEWS + 1):]  # +1 to account for the query doc

        title_indexes = user_data['Title_Index'].values[:-1]
        scores = user_data[SCORE_COLUMN].values[:-1]
        
        mask = [1] * len(title_indexes) + [0] * (MAX_REVIEWS - len(title_indexes))
        title_indexes = list(title_indexes) + [0] * (MAX_REVIEWS - len(title_indexes))
        scores = list(scores) + [0] * (MAX_REVIEWS - len(scores))
        query_id = user_data['Title_Index'].values[-1]
        target = user_data[SCORE_COLUMN].values[-1]
        
        sequences.append([title_indexes, scores, mask, query_id, target])
    return sequences



train_sequences = prepare_sequences(train_users, filtered_reviews)
valid_sequences = prepare_sequences(valid_users, filtered_reviews)
test_sequences = prepare_sequences(test_users, filtered_reviews)


100%|██████████| 69099/69099 [38:22<00:00, 30.01it/s]
100%|██████████| 23033/23033 [12:48<00:00, 29.97it/s]
100%|██████████| 23033/23033 [12:49<00:00, 29.94it/s]


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        title_indexes, scores, mask, query_id, target = self.sequences[idx]
        return torch.tensor(title_indexes, dtype=torch.long), \
               torch.tensor(scores, dtype=torch.float32), \
               torch.tensor(mask, dtype=torch.float32), \
               torch.tensor(query_id, dtype=torch.long), \
               torch.tensor(target, dtype=torch.float32)

train_dataset = ReviewDataset(train_sequences)
valid_dataset = ReviewDataset(valid_sequences)
test_dataset = ReviewDataset(test_sequences)

In [13]:
BATCH_SIZE = 32

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# If we use the raw scores, we change the prefix string to 'raw_score_'
prefix = "normalized_score_"
filepath_prefix = DATA_PATH + "/datasets/" + prefix

torch.save(train_loader, f"{filepath_prefix}train_loader.pth")
torch.save(valid_loader, f"{filepath_prefix}valid_loader.pth")
torch.save(test_loader, f"{filepath_prefix}test_loader.pth")

In [15]:
train_dataset[0]

(tensor([150003,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 tensor([0.7071, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000]),
 tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor(62165),
 tensor(-0.7071))

# Model Definition

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserBehaviorEncoder(nn.Module):
    def __init__(self, embeddings, num_heads, output_dim, num_layers=1):
        super(UserBehaviorEncoder, self).__init__()
        
        self.doc_emb_dim = embeddings.shape[1]
        
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embeddings).float(), padding_idx=0, freeze=True)
        
        # Embedding to transform scores to the same dimensionality as document embeddings
        self.score_emb = nn.Linear(1, self.doc_emb_dim)
        
        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.doc_emb_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Dense layer to produce final user behavior representation
        self.fc = nn.Linear(self.doc_emb_dim, output_dim)

    def forward(self, doc_indices, scores, attention_mask=None):
        # Get the document embeddings using the indices
        doc_embeddings = self.embedding(doc_indices)
        
        # Transform scores to embeddings
        scores_emb = self.score_emb(scores.unsqueeze(-1)).squeeze(2)
        
        # Combine embeddings
        combined_emb = doc_embeddings + scores_emb

        # Permute to match expected shape
        combined_emb_permuted = combined_emb.permute(1, 0, 2)
        
        # Apply the transformer encoder
        attended_values_permuted = self.transformer_encoder(
            combined_emb_permuted,
            src_key_padding_mask=(attention_mask == 0).bool()
        )

        # Permute the values back to (batch_size, sequence_length, embedding_dim)
        attended_values = attended_values_permuted.permute(1, 0, 2)
        
        # Produce user behavior embedding
        user_behavior_emb = attended_values.mean(dim=1)
        
        # Pass through a dense layer
        user_behavior_rep = self.fc(user_behavior_emb)
        
        return user_behavior_rep

class UserPreferenceRegressor(nn.Module):
    def __init__(self, embeddings, num_heads, user_behavior_output_dim, predictor_hidden_dim, num_transformer_layers):
        super(UserPreferenceRegressor, self).__init__()
        
        self.encoder = UserBehaviorEncoder(embeddings, num_heads, user_behavior_output_dim, num_layers=num_transformer_layers)
        
        # Define the predictor
        self.predictor = nn.Sequential(
            nn.Linear(user_behavior_output_dim + embeddings.shape[1], predictor_hidden_dim),
            nn.ReLU(),
            nn.Linear(predictor_hidden_dim, 1)
        )
    
    def forward(self, doc_indices, scores, attention_mask, new_doc_embedding):
        user_behavior = self.encoder(doc_indices, scores, attention_mask)
        combined = torch.cat([user_behavior, new_doc_embedding], dim=1)
        return self.predictor(combined)


In [19]:
# Hyperparameters
LEARNING_RATE = 0.001
NUM_EPOCHS = 20
NUM_HEADS = 2  # for MultiHeadAttention
OUTPUT_DIM = 128  # User behavior representation dimension
HIDDEN_DIM = 256  # Predictor hidden layer dimension
NUM_TRANSFORMER_LAYERS = 1

# Initialize model
model = UserPreferenceRegressor(bert_embeddings, NUM_HEADS, OUTPUT_DIM, HIDDEN_DIM, NUM_TRANSFORMER_LAYERS)
model = model.to(device)  # Move model to device
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Model Training

In [20]:
# Training Function
def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=NUM_EPOCHS):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for idx, (title_indexes, scores, mask, query_id, target) in enumerate(train_loader):
            title_indexes, scores, mask, query_id, target = title_indexes.to(device), scores.to(device), mask.to(device), query_id.to(device), target.to(device)
            optimizer.zero_grad()
            
            new_doc_embedding = model.encoder.embedding(query_id).squeeze(1)
            outputs = model(title_indexes, scores.unsqueeze(2), mask, new_doc_embedding)
            
            loss = criterion(outputs.squeeze(1), target)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}")
        
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for title_indexes, scores, mask, query_id, target in valid_loader:
                title_indexes, scores, mask, query_id, target = title_indexes.to(device), scores.to(device), mask.to(device), query_id.to(device), target.to(device)
                new_doc_embedding = model.encoder.embedding(query_id).squeeze(1)
                outputs = model(title_indexes, scores.unsqueeze(2), mask, new_doc_embedding)
                loss = criterion(outputs.squeeze(1), target)
                val_loss += loss.item()
        
        print(f"Validation Loss: {val_loss/len(valid_loader):.4f}")
        
# Call the training function
train_model(model, train_loader, valid_loader, criterion, optimizer)


Epoch [1/20], Train Loss: 0.4839
Validation Loss: 0.4517
Epoch [2/20], Train Loss: 0.4456
Validation Loss: 0.4376
Epoch [3/20], Train Loss: 0.4306
Validation Loss: 0.4218
Epoch [4/20], Train Loss: 0.4217
Validation Loss: 0.4168
Epoch [5/20], Train Loss: 0.4131
Validation Loss: 0.4348
Epoch [6/20], Train Loss: 0.4121
Validation Loss: 0.4291
Epoch [7/20], Train Loss: 0.3987
Validation Loss: 0.4214
Epoch [8/20], Train Loss: 0.3954
Validation Loss: 0.4336
Epoch [9/20], Train Loss: 0.3877
Validation Loss: 0.4070
Epoch [10/20], Train Loss: 0.4235
Validation Loss: 0.4321
Epoch [11/20], Train Loss: 0.4309
Validation Loss: 0.4468
Epoch [12/20], Train Loss: 0.4167
Validation Loss: 0.4256
Epoch [13/20], Train Loss: 0.4103
Validation Loss: 0.3867
Epoch [14/20], Train Loss: 0.3197
Validation Loss: 0.4232
Epoch [15/20], Train Loss: 0.3720
Validation Loss: 0.3127
Epoch [16/20], Train Loss: 0.3012
Validation Loss: 0.2967
Epoch [17/20], Train Loss: 0.2886
Validation Loss: 0.3183
Epoch [18/20], Train Lo

# Recommendation Example

In [21]:
def predict_scores_for_all_books(model, embeddings, doc_indices, scores):
    """
    Predict the scores for a user for all N books.

    Parameters:
    - model: The trained UserPreferenceRegressor model.
    - embeddings: The embeddings for all N books.
    - doc_indices: A list of document indices representing user reviews.
    - scores: A list of scores associated with the user reviews.

    Returns:
    - List of tuples with each tuple containing (book_id, predicted_score).
    """

    # Convert the doc_indices and scores to tensors
    doc_indices_tensor = torch.tensor([doc_indices + [0] * (10 - len(doc_indices))], dtype=torch.long)
    scores_tensor = torch.tensor([scores + [0.0] * (MAX_REVIEWS - len(scores))], dtype=torch.float).unsqueeze(-1)
    
    # Create an attention mask for valid reviews
    attention_mask = [False] * len(doc_indices) + [True] * (MAX_REVIEWS - len(doc_indices))
    attention_mask_tensor = torch.tensor([attention_mask])

    all_predictions = []

    # Loop over all book embeddings to predict the scores
    for book_id in tqdm(range(embeddings.shape[0])):
        # Get the book embedding tensor
        new_doc_embedding_tensor = torch.tensor(embeddings[book_id]).unsqueeze(0)

        # Use the model to predict the score
        doc_indices_tensor, scores_tensor, attention_mask_tensor, new_doc_embedding_tensor = doc_indices_tensor.to(device), scores_tensor.to(device), attention_mask_tensor.to(device), new_doc_embedding_tensor.to(device)
        predicted_score_tensor = model(doc_indices_tensor, scores_tensor, attention_mask_tensor, new_doc_embedding_tensor)
        
        # Convert tensor to scalar and append to all_predictions list
        all_predictions.append(predicted_score_tensor.item())

    return all_predictions

preds = predict_scores_for_all_books(model, bert_embeddings, [36776, 99559], [5., 5.])

100%|██████████| 212404/212404 [05:28<00:00, 645.67it/s]


In [28]:
top_results = pd.Series(preds).sort_values(ascending=False)
top_results.name = "score"
top_results = pd.merge(top_results, books_data, left_index=True, right_index=True)[["score","Title","description"]]
top_results.head(20)

Unnamed: 0,score,Title,description
71641,0.999039,Monty Python And The Holy Grail (Book): Monti Python ik den Holie Grailen (Bok),The Monty Python team's first feature film is a mock-heroic tale set in Medieval Britain.
20426,0.901003,"White jacket: Or, The world in a man-of-war (The Works of Herman Melville, standard edition)",
428,0.89191,Munchkins Remember,"Profiles the numerous men and women who played the citizens of Munchkinland, offering their perspectives on the creation of ""The Wizard of Oz,"" the film's stars, and their own lives before and after the film"
78353,0.88572,The Annotated Wizard of Oz. The Wonderful Wizard of Oz,"An annotated version of the classic tale of Dorothy's trip to Oz includes character sources, contemporary references, and inspirations behind the 1900 classic."
27354,0.771891,"An essay on the principle of population;: Or, A view of its past and present effects on human happiness","This book provides a student audience with the best scholarly edition of Malthus' Essay on Population. Written in 1798 as a polite attack on post-French revolutionary speculations on the theme of social and human perfectibility, it remains one of the most powerful statements of the limits to human hopes set by the tension between population growth and natural resources. Based on the authoritat..."
34005,0.753142,The Wonderful Wizard of Oz (Oxford World's Classics),"Published at the dawn of the twentieth century, The Wonderful Wizard of Oz (1900) immediately captivated children and adult readers alike. This new edition is the only one to include many of W.W. Denslow's original illustrations. The Introduction considers both the famous MGM film version and recent literary theory in a fascinating discussion of this timeless classic of children's literature. ..."
80577,0.739985,White Jacket or The World in A Man-of-War,
15669,0.676415,Mad About the Oscars: 38 Best Picture Winners (and Losers!),"This is the first book of its kind. Aubrey Malone has gone back to the start of the Oscar ceremonies and discovered that mistakes have been made every year in the choice of what has been deemed “best” in the categories of acting, directing, producing and the subsidiary awards. He has identified all the great stars (Garbo, Montgomery Clift, Peter O’Toole, Barbara Stanwyck, etc.) who never held ..."
65203,0.657831,The Wizard of Oz & Who he Was,"Dorothy is transported over the rainbow in this picture book adaptation of the classic movie,The Wizard of Oz."
120672,0.651185,The Pinecroft Thoroughbreds,Beautiful young Irish horsewoman Caitlin Cleary wanted to be a jockey but knew it wasn't possible for a woman in the early 20th century.


In [29]:
top_results.loc[[36776, 99559]]

Unnamed: 0,score,Title,description
36776,-1.556391,Harry Potter and The Sorcerer's Stone,"Celebrate 20 years of Harry Potter magic! Harry Potter has never even heard of Hogwarts when the letters start dropping on the doormat at number four, Privet Drive. Addressed in green ink on yellowish parchment with a purple seal, they are swiftly confiscated by his grisly aunt and uncle. Then, on Harry's eleventh birthday, a great beetle-eyed giant of a man called Rubeus Hagrid bursts in with..."
99559,-1.190139,Harry Potter & the Prisoner of Azkaban,"Through classroom activities, wizard rock concerts, and organizations like the Harry Potter Alliance, Harry Potter fans are using creativity to positively impact the world. This collection of essays and interviews examines how playful fandom--from fanfiction to Muggle quidditch, cosplay, role-playing games, and even Harry Potter burlesque--not only reimagines the canon but also challenges cons..."


In the previous example, we have asked the model to recommend us books based on positive reviews for some Harry Potter books. We can notice the following:
- The top results include many fantasy and magic related books, which is good.
- The books we told the model we like have a very low score given by the model.

The last obeservation shows that there's still room for improvement in the model