In [17]:
# Data treatment

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import duckdb
from sklearn.preprocessing import OrdinalEncoder


# OS and Dotenv

from dotenv import load_dotenv
import os
load_dotenv()  

# Deep Learning

import torch                  
import torch.nn as nn          
import torch.nn.functional as F  
import torch.optim as optim  
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset, TensorDataset

# GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths

dataset_path = os.getenv('data_dir')
dataset_ratings = os.path.join(dataset_path, "Books_rating.csv")
dataset_books = os.path.join(dataset_path, "books_data.csv")

In [3]:
books_data = pl.read_csv(dataset_books)
ratings = pl.read_csv(dataset_ratings)

con = duckdb.connect("proto.duckdb")

con.execute(f"""
CREATE OR REPLACE TABLE books AS
SELECT * FROM read_csv_auto('{dataset_books}');
""")

con.execute(f"""
CREATE OR REPLACE TABLE ratings AS
SELECT * FROM read_csv_auto('{dataset_ratings}');
""")



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x2078a424b70>

In [4]:
query = """SELECT 
    b.Title, 
    b.authors, 
    b.categories, 
    r.Id, 
    r.User_id, 
    r."review/score",
    b.ratingsCount
FROM books b
JOIN ratings r ON b.Title = r.Title;"""

df_merged = con.execute(query).fetchdf()

## Pre-processing

### EDA


In [5]:
df_merged

Unnamed: 0,Title,authors,categories,Id,User_id,review/score,ratingsCount
0,The coming race,['Edward Bulwer-Lytton'],,B0007EKQS0,AS66VOYLSRMBN,5.0,
1,The coming race,['Edward Bulwer-Lytton'],,B0007EKQS0,A25DO4LKI6AZ9,3.0,
2,The coming race,['Edward Bulwer-Lytton'],,B0007EKQS0,A27DHFFWMH042Y,4.0,
3,The coming race,['Edward Bulwer-Lytton'],,B0007EKQS0,A1BZRECABLFF8G,1.0,
4,The coming race,['Edward Bulwer-Lytton'],,B0007EKQS0,A2RZ7I99IMLVV6,2.0,
...,...,...,...,...,...,...,...
2999787,"The story of San Michele, (A Dutton, Everyman ...",,['American literature'],B00085CZ9G,A913JBH2B0U32,4.0,
2999788,"The story of San Michele, (A Dutton, Everyman ...",,['American literature'],B00085CZ9G,,5.0,
2999789,"The story of San Michele, (A Dutton, Everyman ...",,['American literature'],B00085CZ9G,,5.0,
2999790,"The story of San Michele, (A Dutton, Everyman ...",,['American literature'],B00085CZ9G,,5.0,


I'm going to use only the ratings, since our project will use model-based collaborative filtering.

In [6]:
len(df_merged['Title'].unique())

212403

In [7]:

filtering_ratings_per_user = df_merged.groupby("User_id")["review/score"].count().reset_index(name="total_reviews")


In [8]:
filtering_ratings_per_user['total_reviews'].describe()

count    1.008961e+06
mean     2.416365e+00
std      1.213252e+01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      5.795000e+03
Name: total_reviews, dtype: float64

In [9]:
reviews_median = filtering_ratings_per_user['total_reviews'].median()

Since our dataset is asymmetric, I'm using median instead of mean

In [12]:
n_items = len(df_merged['Title'].unique())
n_users = len(df_merged['User_id'].unique())

print(f"The number of books is {n_items}")
print("---------------------------")
print(f"The number of users is {n_users}")
print("---------------------------")
print(f'The number of ratings per user is {reviews_median} ')
print("---------------------------")
print(f'Full rating matrix will have: {n_users*n_items} elements')
print("---------------------------")
print(f'Number of rating: {len(df_merged)}')
print("---------------------------")
print(f'Therefore: {(len(df_merged) / (n_users * n_items)) * 100}% of matrix is filled')

The number of books is 212403
---------------------------
The number of users is 1008962
---------------------------
The number of ratings per user is 1.0 
---------------------------
Full rating matrix will have: 214306555686 elements
---------------------------
Number of rating: 2999792
---------------------------
Therefore: 0.0013997667921998931% of matrix is filled


### Processing

In [13]:
df_merged['categories'] = df_merged['categories'].fillna('No Category')

df_merged['ratingsCount'] = df_merged['ratingsCount'].fillna(0)

In [14]:
df_merged.describe()

Unnamed: 0,review/score,ratingsCount
count,2999792.0,2999792.0
mean,4.215263,148.676
std,1.203066,598.6146
min,1.0,0.0
25%,4.0,0.0
50%,5.0,1.0
75%,5.0,13.0
max,5.0,4895.0


As expected, we have a lot of NaN values in User_id.

I can't use these IDs in the way they are right now. Im going to treatment them using ordinalencoder, since the matrix factorization presuppose continuous IDs

In [18]:
# 0. Preprocessing the strings

df_merged['categories'] = df_merged['categories'].str.replace('[', '', regex=False).str.replace(']', '', regex=False)
df_merged['authors'] = df_merged['authors'].str.replace('[', '', regex=False).str.replace(']', '', regex=False)

# 1. Ordinal Encoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoded = encoder.fit_transform(df_merged[['User_id', 'Id', 'categories', 'authors']].to_numpy())

df_encoded = df_merged.copy()
df_encoded['User_id'] = encoded[:, 0].astype(int)
df_encoded['Id'] = encoded[:, 1].astype(int)
df_encoded['categories'] = encoded[:, 2].astype(int)
df_encoded['authors'] = encoded[:, 3].astype(int)


df_encoded = df_encoded[(df_encoded['User_id'] != -1) & (df_encoded['Id'] != -1)]

n_users = df_encoded['User_id'].max() + 1
n_items = df_encoded['Id'].max() + 1

print(f"n_users = {n_users}, n_items = {n_items}")




n_users = 1008962, n_items = 221989


I didnt drop the nulls before, but i'm treating with the line df_encoded = df_encoded[(df_encoded['User_id'] != -1) & (df_encoded['Id'] != -1)]


In [20]:
n_genders = len(df_encoded['categories'].unique())
n_authors = len(df_encoded['authors'].unique())

print(f'Number of authors: {n_authors}')
print(f'Number of genders: {n_genders}')

Number of authors: 127278
Number of genders: 10884


To avoid data leak, because we have a lot of users, im going to divide the dataset per user, not only per percentage

In [24]:
from sklearn.model_selection import train_test_split

user_score = df_encoded.groupby("User_id")["review/score"].median().reset_index()

# 2. Stratified split 
train_users, temp_users = train_test_split(
    user_score,
    test_size=0.4,
    stratify=user_score["review/score"],
    random_state=42
)

val_users, test_users = train_test_split(
    temp_users,
    test_size=0.5,
    stratify=temp_users["review/score"],
    random_state=42
)

df_train = df_encoded[df_encoded["User_id"].isin(train_users["User_id"])]
df_val = df_encoded[df_encoded["User_id"].isin(val_users["User_id"])]
df_test = df_encoded[df_encoded["User_id"].isin(test_users["User_id"])]




In [26]:
# Creating the book dict for decoding

book_df = df_encoded[['Id', 'Title']]

book_dict = dict(zip(book_df['Id'], book_df['Title']))

## Data Processing Class for Pytorch

In [25]:
n_users = torch.tensor(n_users, dtype=torch.int64)
n_items = torch.tensor(n_items, dtype=torch.int64)

In [27]:
## Data Processing Class for Pytorch

# Creating dataloader 

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader


class Loader(Dataset):
    def __init__(self, df):
        super().__init__()
        self.ratings = df
        self.ratings = self.ratings.drop(["Title"], axis=1)
        self.X = self.ratings.drop(['review/score'], axis=1).to_numpy()
        self.y = self.ratings['review/score'].to_numpy()
        self.X, self.y = torch.tensor(self.X, dtype=torch.long), torch.tensor(self.y)
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    def __len__(self):
        return len(self.ratings)



In [28]:
train_dataset = Loader(df_train)
test_dataset = Loader(df_test)
eval_dataset = Loader(df_val)

trainloader = DataLoader(train_dataset, batch_size=2048, shuffle=True, drop_last=True)
testloader = DataLoader(test_dataset, batch_size=2048, shuffle=True, drop_last=True)
evalloader = DataLoader(eval_dataset, batch_size=2048, shuffle=True, drop_last=True)

In [29]:
batch_numbers = []
for batch, (X,y)in enumerate(trainloader):
    batch_numbers.append(batch)
    
print(f'The number of batches is {len(batch_numbers)}')
    

The number of batches is 989


## Model Creation

In [30]:

import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm

class NeuMF(nn.Module):
    def __init__(self, n_users, n_items,n_genders, n_authors, device, n_factors=8):
        super().__init__()

        # Embeddings for GMF (Generalized Matrix Factorization) path
        self.user_gmf = nn.Embedding(n_users, n_factors)
        self.item_gmf = nn.Embedding(n_items, n_factors)

        # Embeddings for MLP path
        self.user_mlp = nn.Embedding(n_users, n_factors)
        self.item_mlp = nn.Embedding(n_items, n_factors)
        self.item_gender_emb = nn.Embedding(n_genders, n_factors)
        self.item_authors_emb = nn.Embedding(n_authors, n_factors)

        
        # Initialize embeddings with small uniform values
        self.user_gmf.weight.data.uniform_(0, 0.05)
        self.item_gmf.weight.data.uniform_(0, 0.05)
        self.user_mlp.weight.data.uniform_(0, 0.05)
        self.item_mlp.weight.data.uniform_(0, 0.05)

        # MLP input: user + item embedding + text embedding
        input_dim = n_factors * 2

        # MLP: several layers with ReLU and Dropout to prevent overfitting
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(256, 32)
        )

        # Final prediction layer: GMF + MLP outputs concatenated
        self.final_layer = nn.Linear(n_factors + 32, 1)

    def forward(self, data):
        users = data[:, 1]
        items = data[:, 0]
        genders = data[:, 3]
        authors = data[:, 2]
        ratingsCount = data[:, 4]



        # GMF path
        gmf_user = self.user_gmf(users)
        gmf_item = self.item_gmf(items)
        gmf_out = gmf_user * gmf_item  # element-wise product

        # Gender and authors 
        gender_emb = self.item_gender_emb(genders)
        authors_emb = self.item_authors_emb(authors)


        # MLP path
        mlp_user = self.user_mlp(users)
        mlp_item = self.item_mlp(items)
        mlp_items = torch.cat([mlp_item, gender_emb, authors_emb,ratingsCount.unsqueeze(1)], dim=1)

        # Concatenate user, item, and text embeddings
        mlp_input = torch.cat([mlp_user, mlp_item], dim=1)
       # print(mlp_input.shape)
        mlp_out = self.mlp(mlp_input)

        # Combine GMF and MLP paths and make final prediction
        final_input = torch.cat([gmf_out, mlp_out], dim=1)
        out = self.final_layer(final_input).squeeze(1)

        return out



## Training

In [31]:
from sklearn.metrics import mean_squared_error, r2_score


#i dont want this code to run right now # -> to force error

num_epochs = 128
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

print('Model Architecture\n')

print("Is running on GPU:", use_cuda)

model = NeuMF(n_users, n_items,n_genders,n_authors, device, n_factors=16)

model.to(device)

print(model, '\n')

#for name, param in model.named_parameters():
#    if param.requires_grad:
#        print(name, param.data)

loss_fn = nn.MSELoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-5)

# Early Stopping Parameters
patience = 10          # Quantas épocas esperar por melhora
min_delta = 0.001     # Mudança mínima para ser considerada melhoria
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Assuming optimizer uses lr = 0.05 for all groups
# lr = 0.05     if epoch < 30
# lr = 0.005    if 30 <= epoch < 60
# lr = 0.0005   if 60 <= epoch < 90
# ...
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

for it in range(num_epochs):
    model.train()
    number_batch = 0
    losses = []
    y_true = []
    y_pred = []
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        number_batch += 1

        if number_batch % 250 == 0:
            
            print(f'Batch atual: {number_batch}, Batch_Loss: {loss.item()}')
    
    print(f'\nIter #{it}', f'Loss: {sum(losses)/len(losses)}, LearningRate: {optimizer.param_groups[0]["lr"]}\n')
    print(f'\nEvaluating...')

    model.eval()
    val_loss = 0.0

    for X, y in evalloader:
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            outputs = model(X)
            eval_loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            val_loss += eval_loss.item() * X.size(0)


            y_pred.append(outputs.squeeze().cpu().numpy())

            y_true.append(y.cpu().numpy())
    val_loss /= len(evalloader.dataset) 


    mse = mean_squared_error(y_true, y_pred)

    rmse = np.sqrt(mse)
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f'Evaluation Loss: {val_loss}\n')

    
    # Early Stopping
    if it >= 15:
        if val_loss < best_val_loss - min_delta:
            best_val_loss = val_loss
            epochs_no_improve = 0
    
            torch.save(model.state_dict(), f'best_model.pth')
            print(f'    --> New best eval loss: {best_val_loss:.4f}. Model saved.')
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f'Early stopping activated in epoch {it+1}.')
                early_stop = True
                break

    if early_stop:
        break
    scheduler.step()
    




Model Architecture

Is running on GPU: True
NeuMF(
  (user_gmf): Embedding(1008961, 16)
  (item_gmf): Embedding(216014, 16)
  (user_mlp): Embedding(1008961, 16)
  (item_mlp): Embedding(216014, 16)
  (item_gender_emb): Embedding(10669, 16)
  (item_authors_emb): Embedding(124098, 16)
  (mlp): Sequential(
    (0): Linear(in_features=32, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): GELU(approximate='none')
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): GELU(approximate='none')
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=256, out_features=32, bias=True)
  )
  (final_layer): Linear(in_features=48, out_features=1, bias=True)
) 

Batch atual: 250, Batch_Loss: 1.3149428367614746
Batch atual: 500, Batch_Loss: 1.134939432144165

Iter #0 Loss: 1.579522149736474, LearningRate: 0.0005




## Evaluation

In [32]:
# Loading our model from save state

model = NeuMF(n_users, n_items,n_genders,n_authors, device, n_factors=16)

model.to(device)

model.load_state_dict(torch.load("best_model.pth", weights_only=True))

<All keys matched successfully>

### Predictive Metrics

In [33]:
from sklearn.metrics import mean_squared_error, r2_score


model.eval()
val_loss = 0.0
y_true = []
y_pred = []

for X, y in testloader:
    X, y = X.to(device), y.to(device)
    with torch.no_grad():
        outputs = model(X)
        eval_loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        val_loss += eval_loss.item() * X.size(0)


        y_pred.append(outputs.squeeze().cpu().numpy())

        y_true.append(y.cpu().numpy())
val_loss /= len(evalloader.dataset) 


mse = mean_squared_error(y_true, y_pred)

rmse = np.sqrt(mse)
r2 = r2_score(y_true,y_pred)
print(f"R2: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f'Evaluation Loss: {val_loss}\n')

R2: -0.0315
MSE: 1.4306
RMSE: 1.1961
Evaluation Loss: 1.4375979105343142



### Recommendation Metrics

In [34]:
from collections import defaultdict

def evaluate_batch_precision_recall(df, model, k=10, threshold=4, device="cpu"):
    model.eval()
    user_item_scores = defaultdict(list)

    with torch.no_grad():
        for X, y in df:
            item_ids = X[:, 0].to(device)
            user_ids = X[:, 1].to(device)
            scores_real = y.to(device)

            scores_pred = model(X).squeeze()

            for uid, iid, pred, real in zip(user_ids, item_ids, scores_pred, scores_real):
                user_item_scores[uid.item()].append((iid.item(), pred.item(), real.item()))

    precisions, recalls = [], []

    for uid, interactions in user_item_scores.items():
        # Ordena por score predito (desc)
        topk = sorted(interactions, key=lambda x: x[1], reverse=True)[:k]
        topk_items = {i[0] for i in topk}
        relevant_items = {i[0] for i in interactions if i[2] >= threshold}

        if not relevant_items:
            continue

        hits = topk_items & relevant_items
        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(relevant_items))

    avg_precision = sum(precisions) / len(precisions) if precisions else 0.0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    f_score = (
        2 * avg_precision * avg_recall / (avg_precision + avg_recall)
        if avg_precision + avg_recall > 0
        else 0.0
    )

    return avg_precision, avg_recall, f_score, user_item_scores




In [35]:

k = 10

precision, recall, f_score, item_r = evaluate_batch_precision_recall(testloader, model, k=k, threshold=4, device="cpu")
print(f"Precision@{k}: {precision * 100:.4f}%")
print(f"Recall@{k}: {recall * 100:.4f}%")
print(f'F-Score@{k}: {f_score * 100:4f}%')


Precision@10: 18.2293%
Recall@10: 98.7676%
F-Score@10: 30.778012%


In [36]:
for i, (uid, items) in enumerate(item_r.items()):
    if i >= 20:
        break
    print(f"User {uid}:")
    for iid, pred, real in items[:10]:  
        book = book_dict[iid]
        print(f" Book {book} - Pred: {pred:.3f} | Real: {real:.3f}")




User 119378:
 Book Q is for Quarry: A Kinsey Millhone Mystery - Pred: 2.960 | Real: 3.000
 Book The Fox, The Captain's Doll, The Ladybird (The Cambridge Edition of the Works of D. H. Lawrence) - Pred: 4.058 | Real: 5.000
 Book The Count Of Monte Cristo, Abridged - Pred: 4.383 | Real: 5.000
 Book A study of North Atlantic ventilation using transient tracers (WHOI-91-27) - Pred: 4.155 | Real: 4.000
 Book Cat Among the Pigeons - Pred: 3.746 | Real: 5.000
 Book Emma (The World's Classics) - Pred: 3.717 | Real: 5.000
 Book The Pursuit of the Well-Beloved and The Well-Beloved (Penguin Classics) - Pred: 3.613 | Real: 3.000
 Book The Great Divorce - Pred: 4.302 | Real: 5.000
 Book Wuthering Heights - Pred: 3.273 | Real: 5.000
 Book Silas Marner (The Classic Collection) - Pred: 3.478 | Real: 5.000
User 16546:
 Book Virgin Land: The American West as Symbol and Myth - Pred: 3.659 | Real: 5.000
 Book Feudal Society: Social Classes and Political Organization, Volume II: Vol 2 - Pred: 4.516 | Real: 