# Setting Training Data

In [3]:
import pandas as pd
import numpy as np
DATA_PATH = "../../data/"

books_data = pd.read_csv(f"{DATA_PATH}/raw/books_data.csv")
cleaned_reviews = pd.read_csv(f"{DATA_PATH}/cleaned_data/cleaned_reviews.csv")

with open(f"{DATA_PATH}/embeddings/bert_embeddings.npy", "rb") as f:
    bert_embeddings = np.load(f)

In [4]:
# Assuming you have loaded the dataframe as cleaned_reviews
# Filter out users with less than a minimum number of reviews
MIN_REVIEWS = 3
user_counts = cleaned_reviews['User_id'].value_counts()
valid_users = user_counts[user_counts >= MIN_REVIEWS].index

filtered_reviews = cleaned_reviews.copy()[cleaned_reviews['User_id'].isin(valid_users)]

In [5]:
# Compute mean and std per user for rescaling
user_stats = filtered_reviews.groupby('User_id')['review/score'].agg(['mean', 'std'])

# Apply rescaling per user
filtered_reviews['normalized_score'] = filtered_reviews.apply(lambda row: (row['review/score'] - user_stats.loc[row['User_id'], 'mean']) / user_stats.loc[row['User_id'], 'std'], axis=1)

  filtered_reviews['normalized_score'] = filtered_reviews.apply(lambda row: (row['review/score'] - user_stats.loc[row['User_id'], 'mean']) / user_stats.loc[row['User_id'], 'std'], axis=1)


In [6]:
users_with_nan_norm_score = filtered_reviews[filtered_reviews['normalized_score'].isna()].User_id.unique()
filtered_reviews = filtered_reviews[~filtered_reviews.User_id.isin(users_with_nan_norm_score)]

In [7]:
# Create a dictionary where the key is the title and the value is its index in the books_data DataFrame
title_to_index = {title: idx for idx, title in enumerate(books_data['Title'])}

# Map the Title in filtered_reviews to its index in books_data
filtered_reviews = filtered_reviews.copy()
filtered_reviews['Title_Index'] = filtered_reviews['Title'].map(title_to_index)

In [8]:
# Sort the data by user and time
filtered_reviews = filtered_reviews.sort_values(by=['User_id', 'review/time'])

In [9]:
filtered_reviews

Unnamed: 0,Id,Title,User_id,review/score,review/time,normalized_score,Title_Index
17852,B0000CJ9GZ,The richest man in Babylon,A0015610VMNR0JC9XVL1,5.0,1358985600,0.408248,105455
17939,0785263500,ATTITUDE 101,A0015610VMNR0JC9XVL1,3.0,1358985600,-2.041241,119672
17972,B000GQMVWI,The Richest Man in Babylon,A0015610VMNR0JC9XVL1,5.0,1358985600,0.408248,51729
18079,B0007DRIT6,The richest man in Babylon,A0015610VMNR0JC9XVL1,5.0,1358985600,0.408248,105455
18135,B0007G66WI,The richest man in Babylon,A0015610VMNR0JC9XVL1,5.0,1358985600,0.408248,105455
...,...,...,...,...,...,...,...
358987,B000OTPXI6,Our Yanks a Love Story,AZZVZL4QEHEHO,4.0,1201824000,0.225387,175010
359126,0552148229,Our Yanks,AZZVZL4QEHEHO,4.0,1201824000,0.225387,164117
122469,B000GSQ910,Redeeming Love,AZZVZL4QEHEHO,5.0,1322784000,1.030339,127021
107493,0553563793,Long Night Moon,AZZVZL4QEHEHO,3.0,1329523200,-0.579566,145343


## Generating Datasets

In [10]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

MAX_REVIEWS = 10
SCORE_COLUMN = "review/score"

# Split the unique user_ids into train, validation and test sets
train_users, temp_users = train_test_split(filtered_reviews['User_id'].unique(), test_size=0.4, random_state=42)
valid_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=42)

def prepare_sequences(users, df):
    sequences = []
    for user in tqdm(users):
        user_data = df[df['User_id'] == user]
        
        # Ensure that we keep only the most recent MAX_REVIEWS reviews
        if len(user_data) > MAX_REVIEWS + 1:
            user_data = user_data.iloc[-(MAX_REVIEWS + 1):]  # +1 to account for the query doc

        title_indexes = user_data['Title_Index'].values[:-1]
        scores = user_data[SCORE_COLUMN].values[:-1]
        
        mask = [1] * len(title_indexes) + [0] * (MAX_REVIEWS - len(title_indexes))
        title_indexes = list(title_indexes) + [0] * (MAX_REVIEWS - len(title_indexes))
        scores = list(scores) + [0] * (MAX_REVIEWS - len(scores))
        query_id = user_data['Title_Index'].values[-1]
        target = user_data[SCORE_COLUMN].values[-1]
        
        sequences.append([title_indexes, scores, mask, query_id, target])
    return sequences



train_sequences = prepare_sequences(train_users, filtered_reviews)
valid_sequences = prepare_sequences(valid_users, filtered_reviews)
test_sequences = prepare_sequences(test_users, filtered_reviews)


100%|██████████| 47333/47333 [26:17<00:00, 30.00it/s]
100%|██████████| 15778/15778 [09:09<00:00, 28.70it/s]
100%|██████████| 15778/15778 [09:11<00:00, 28.59it/s]


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        title_indexes, scores, mask, query_id, target = self.sequences[idx]
        return torch.tensor(title_indexes, dtype=torch.long), \
               torch.tensor(scores, dtype=torch.float32), \
               torch.tensor(mask, dtype=torch.float32), \
               torch.tensor(query_id, dtype=torch.long), \
               torch.tensor(target, dtype=torch.float32)

train_dataset = ReviewDataset(train_sequences)
valid_dataset = ReviewDataset(valid_sequences)
test_dataset = ReviewDataset(test_sequences)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [13]:
train_dataset[0]

(tensor([  5956, 212159, 202391,      0,      0,      0,      0,      0,      0,
              0]),
 tensor([5., 4., 3., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([1., 1., 1., 0., 0., 0., 0., 0., 0., 0.]),
 tensor(203884),
 tensor(5.))

# Model Definition

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserBehaviorEncoder(nn.Module):
    def __init__(self, embeddings, num_heads, output_dim):
        super(UserBehaviorEncoder, self).__init__()
        
        self.doc_emb_dim = embeddings.shape[1]
        
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embeddings).float(), padding_idx=0, freeze=True)
        
        # Embedding to transform scores to the same dimensionality as document embeddings
        self.score_emb = nn.Linear(1, self.doc_emb_dim)
        
        # MultiHeadAttention Layer
        self.attention = nn.MultiheadAttention(embed_dim=self.doc_emb_dim, num_heads=num_heads)
        
        # Dense layer to produce final user behavior representation
        self.fc = nn.Linear(self.doc_emb_dim, output_dim)

    def forward(self, doc_indices, scores, attention_mask=None):
        # Get the document embeddings using the indices
        doc_embeddings = self.embedding(doc_indices)
        
        # Transform scores to embeddings
        scores_emb = self.score_emb(scores.unsqueeze(-1)).squeeze(2)  # Add an extra dimension to the last axis and then squeeze it out
        
        # Combine embeddings
        combined_emb = doc_embeddings + scores_emb

        # Permute to match expected shape
        combined_emb_permuted = combined_emb.permute(1, 0, 2)
        
        # Apply the attention mechanism
        attended_values_permuted, _ = self.attention(
            combined_emb_permuted,
            combined_emb_permuted,
            combined_emb_permuted,
            key_padding_mask=(attention_mask == 0).bool()
        )

        # Permute the values back to (batch_size, sequence_length, embedding_dim)
        attended_values = attended_values_permuted.permute(1, 0, 2)
        
        # Produce user behavior embedding
        user_behavior_emb = attended_values.mean(dim=1)
        
        # Pass through a dense layer
        user_behavior_rep = self.fc(user_behavior_emb)
        
        return user_behavior_rep

class UserPreferenceRegressor(nn.Module):
    def __init__(self, embeddings, num_heads, user_behavior_output_dim, predictor_hidden_dim):
        super(UserPreferenceRegressor, self).__init__()
        
        self.encoder = UserBehaviorEncoder(embeddings, num_heads, user_behavior_output_dim)
        
        # Define the predictor
        self.predictor = nn.Sequential(
            nn.Linear(user_behavior_output_dim + embeddings.shape[1], predictor_hidden_dim),
            nn.ReLU(),
            nn.Linear(predictor_hidden_dim, 1)
        )
    
    def forward(self, doc_indices, scores, attention_mask, new_doc_embedding):
        user_behavior = self.encoder(doc_indices, scores, attention_mask)
        combined = torch.cat([user_behavior, new_doc_embedding], dim=1)
        return self.predictor(combined)


In [44]:
# Hyperparameters
LEARNING_RATE = 0.001
NUM_EPOCHS = 100
NUM_HEADS = 2  # for MultiHeadAttention
OUTPUT_DIM = 128  # User behavior representation dimension
HIDDEN_DIM = 256  # Predictor hidden layer dimension

# Initialize model
model = UserPreferenceRegressor(bert_embeddings, NUM_HEADS, OUTPUT_DIM, HIDDEN_DIM)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [45]:
# Training Function
def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=NUM_EPOCHS):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for idx, (title_indexes, scores, mask, query_id, target) in enumerate(train_loader):
            optimizer.zero_grad()
            
            new_doc_embedding = model.encoder.embedding(query_id).squeeze(1)
            outputs = model(title_indexes, scores.unsqueeze(2), mask, new_doc_embedding)
            
            loss = criterion(outputs.squeeze(1), target)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}")
        
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for title_indexes, scores, mask, query_id, target in valid_loader:
                new_doc_embedding = model.encoder.embedding(query_id).squeeze(1)
                outputs = model(title_indexes, scores.unsqueeze(2), mask, new_doc_embedding)
                loss = criterion(outputs.squeeze(1), target)
                val_loss += loss.item()
        
        print(f"Validation Loss: {val_loss/len(valid_loader):.4f}")
        
# Call the training function
train_model(model, train_loader, valid_loader, criterion, optimizer)


Epoch [1/100], Train Loss: 1.7129
Validation Loss: 2.3008
Epoch [2/100], Train Loss: 1.6166
Validation Loss: 1.6528
Epoch [3/100], Train Loss: 1.6163
Validation Loss: 1.5950
Epoch [4/100], Train Loss: 1.6035
Validation Loss: 1.5990
Epoch [5/100], Train Loss: 4.1110
Validation Loss: 1.6286
Epoch [6/100], Train Loss: 1.5241
Validation Loss: 1.5363
Epoch [7/100], Train Loss: 1.5047
Validation Loss: 1.4903
Epoch [8/100], Train Loss: 1.5319
Validation Loss: 1.4735
Epoch [9/100], Train Loss: 1.5665
Validation Loss: 1.5696
Epoch [10/100], Train Loss: 1.5510
Validation Loss: 1.5247
Epoch [11/100], Train Loss: 1.5611
Validation Loss: 1.5898
Epoch [12/100], Train Loss: 1.5354
Validation Loss: 1.6701
Epoch [13/100], Train Loss: 1.5063
Validation Loss: 1.4764
Epoch [14/100], Train Loss: 1.4940
Validation Loss: 1.5019
Epoch [15/100], Train Loss: 1.4915
Validation Loss: 1.4820
Epoch [16/100], Train Loss: 1.4770
Validation Loss: 1.4902
Epoch [17/100], Train Loss: 1.4743
Validation Loss: 1.4580
Epoch 

In [37]:
def predict_scores_for_all_books(model, embeddings, doc_indices, scores):
    """
    Predict the scores for a user for all N books.

    Parameters:
    - model: The trained UserPreferenceRegressor model.
    - embeddings: The embeddings for all N books.
    - doc_indices: A list of document indices representing user reviews.
    - scores: A list of scores associated with the user reviews.

    Returns:
    - List of tuples with each tuple containing (book_id, predicted_score).
    """

    # Convert the doc_indices and scores to tensors
    doc_indices_tensor = torch.tensor([doc_indices + [0] * (10 - len(doc_indices))], dtype=torch.long)
    scores_tensor = torch.tensor([scores + [0.0] * (MAX_REVIEWS - len(scores))], dtype=torch.float).unsqueeze(-1)
    
    # Create an attention mask for valid reviews
    attention_mask = [False] * len(doc_indices) + [True] * (MAX_REVIEWS - len(doc_indices))
    attention_mask_tensor = torch.tensor([attention_mask])

    all_predictions = []

    # Loop over all book embeddings to predict the scores
    for book_id in tqdm(range(embeddings.shape[0])):
        # Get the book embedding tensor
        new_doc_embedding_tensor = torch.tensor(embeddings[book_id]).unsqueeze(0)

        # Use the model to predict the score
        predicted_score_tensor = model(doc_indices_tensor, scores_tensor, attention_mask_tensor, new_doc_embedding_tensor)
        
        # Convert tensor to scalar and append to all_predictions list
        all_predictions.append(predicted_score_tensor.item())

    return all_predictions

preds = predict_scores_for_all_books(model, bert_embeddings, [81605, ], [5., ])

100%|██████████| 212404/212404 [01:11<00:00, 2986.81it/s]


In [38]:
top_results = pd.Series(preds).sort_values(ascending=False)
top_results.name = "score"

In [39]:
top_results = pd.merge(top_results, books_data, left_index=True, right_index=True)[["score","Title","description"]]

In [40]:
top_results.head(20)

Unnamed: 0,score,Title,description
132571,4.613784,"The People Came (To Osborne County, Kansas) in Their Prairie Schooners Through the Waves of the Seas of Grass and Stayed (Volume 1)","The first volume in the series State Bibliographies, this book provides comprehensive coverage of secondary materials on Kansas history and also includes useful references to major archival and manuscript collections. Its broad and diverse scope ranges from standard political and economic studies to social and environmental histories, to local studies, to regional studies with special signific..."
153834,4.533461,"No tears for the general;: The life of Alfred Sully, 1821-1879 (Western biography series)","""Letters of Sully, printed for the first time, provide a vivid picture of California in the gold rush, of Minnesota frontier in the 1850s, Civil War, Sioux uprising, etc.""--Bookseller's catalogue."
174319,4.51074,Come and git' it!,"In this colorful tale of the food culture of cattle drives in the 1800s, chuckwagon chef Cookie takes young readers along on a rootin'-tootin' adventure. The days start at three o'clock in the morning, when Cookie makes coffee so thick ""you could float a horseshoe on it!"" With informational sidebars, a historical note, bibliography, and glossary for cowboy food terms such as ""calf slobbers"" an..."
32008,4.494857,"Fifty famous farmers,",
46496,4.474873,Landmarked: Stories of Peggy Simson Curry,"Peggy Simson Curry's memorable short stories, many set on Wyoming and Colorado ranches, originally appeared in magazines during the heyday of short stories in the 1950s-60s. Now they are available in book form. This collection includes two Spur winners and stories from Saturday Evening Post, Boy's Life and other magazines."
93031,4.464562,Just Fishing,"The catching of fish, said the Sage of Chocoloskee, is but an incident in fishing. He told the frozen truth. To be out in the open where fish are; to watch them at their great business of living; to see them in the water or out of the water; to fish for them, and even to hook them and have them get away-all this is wonderfully worthwhile-wonderfully better worthwhile than merely to catch and k..."
86331,4.46264,The Kickapoos: Lords of the Middle Border,"The Kickapoo Indians, members of the Algonquian linguistic community, resisted white settlement for more than three hundred years on a front that extended across half a continent. In turn, France, Great Britain, the United States, Spain, and Mexico sought to placate and exploit this fiercely independent people. Eventually forced to remove from their historic homeland to territory west of the M..."
5541,4.458817,OF CABBAGES AND KINGS COUNTY: AGRICULTURE AND THE FORMATION OF MODERN BROOKLYN.,"No one today thinks of Brooklyn, New York, as an agricultural center. Yet Kings County enjoyed over two centuries of farming prosperity. Even as late as 1880 it was one of the nation's leading vegetable producers, second only to neighboring Queens County. In Of Cabbages and Kings County, Marc Linder and Lawrence Zacharias reconstruct the history of a lost agricultural community. Their study fo..."
59841,4.457653,The Quotable Farmer (Quotable Series),"From the gentlemen farmers among our founders to today’s grandchild walking the beans, rural life has always been at the very heart of the American story. This is the life that unfolds page by page in this heartfelt book about working the land. One remarkable photograph after another celebrates the farming life, finding the beauty in work well done, land well tended, and a rest well earned—all..."
84369,4.455821,Glory days of logging,"The reissue of this classic history allows us to once again journey into the past and rediscover for the first time the forgotten men and methods of logging history in the Northwest United States and Canada. This book contain the best photographs of a dozen famous collections: Davis and Benson rafts, river drives, hand logging spar topping big wheels in the pine, saw mills of 1890 to 1915, his..."


In [32]:
pd.set_option('max_colwidth', 400)

In [42]:
top_results.loc[81605]

score                                                                                                                                                                                                                                                                                                                                                                                                                 2.690624
Title                                                                                                                                                                                                                                                                                                                                                                                                       George Orwell 1984
description    "Nineteen Eighty-Four: A Novel", often published as "1984", is a dystopian social science fiction novel by English novelist George Orwell. It was published

In [34]:
books_data[(books_data.Title.str.len()>3) & (books_data.Title.str.lower().str.contains(""))].Title.head(20)

8531                                                                                                                                                     Orwell's London
25642                                                                                                                    George Orwell: Animal Farm-Nineteen Eighty-Four
34435                                                                  History Will Not Absolve Us : Orwellian Control, Public Denial, & the Murder of President Kennedy
37594                                                                                                                                       George Orwell's 1984: A Play
57832                                                                                                                                       CliffsNotes on Orwell's 1984
74376                                                                                                                                       Orwell (Life & 

In [23]:
books_data.iloc[36776]

Title                        Harry Potter and The Sorcerer's Stone
description      Celebrate 20 years of Harry Potter magic! Harr...
authors                                          ['J. K. Rowling']
image            http://books.google.com/books/content?id=HksgD...
previewLink      http://books.google.com/books?id=HksgDQAAQBAJ&...
publisher                                    Bloomsbury Publishing
publishedDate                                           2014-01-09
infoLink         http://books.google.com/books?id=HksgDQAAQBAJ&...
categories                                    ['Juvenile Fiction']
ratingsCount                                                   1.0
Name: 36776, dtype: object