In [1]:

# Data treatment

import numpy as np # linear algebra
import polars as pl # data processing, CSV file I/O (e.g. pd.read_csv)

# Deep Learning

import torch                  
import torch.nn as nn          
import torch.nn.functional as F  
import torch.optim as optim  
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset, TensorDataset

# GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
df_data = pl.read_csv(r'C:\Users\Lucas\Documents\GitHub\Recommind\data\processed\books_data.csv')
df_ratings = pl.read_csv(r'C:\Users\Lucas\Documents\GitHub\Recommind\data\processed\Books_rating.csv')

In [3]:
df_data = df_data[['Title', 'authors', 'categories', 'ratingsCount']]

In [4]:
df_data = df_data.with_columns(
    pl.col("ratingsCount").fill_null(0),
    pl.col("categories").fill_null("No Category")
)


In [5]:
df_ratings2 = df_ratings[['Id','Title', 'User_id', 'review/score']]

In [6]:
df_ratings2 = df_ratings2.drop_nulls()

In [7]:
df_merged = df_ratings2.join(
    df_data.select(["Title", "authors", "categories", "ratingsCount"]),
    on='Title',
    how='left'
)

In [8]:
import polars as pl
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
import numpy as np

# 0. Preprocessing the strings

df_merged = df_merged.with_columns(  

    pl.col("categories").str.replace_all('[','',literal=True).str.replace_all(']','',literal=True),
    pl.col("authors").str.replace_all('[','',literal=True).str.replace_all(']','',literal=True),

)


# 1. Ordinal Encoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoded = encoder.fit_transform(df_merged.select(['User_id', 'Id']).to_numpy())

df_encoded = df_merged.with_columns([
    pl.Series(name='User_id', values=encoded[:, 0].astype(int)),
    pl.Series(name='Id', values=encoded[:, 1].astype(int)),
])

df_encoded = df_encoded.filter(
    (pl.col('User_id') != -1) & (pl.col('Id') != -1)
)

n_users = df_encoded.select(pl.col('User_id').max()).item() + 1
n_items = df_encoded.select(pl.col('Id').max()).item() + 1

print(f"n_users = {n_users}, n_items = {n_items}")


# 2. StandardScaler
#scaler = StandardScaler()

#scores = df_encoded.select('review/score').to_numpy()

#scores_scaled = scaler.fit_transform(scores)

#df_encoded = df_encoded.with_columns(
#    pl.Series(name="review/score_scaled", values=scores_scaled.flatten())
#)

# 3. LabelEncoder


authors = df_encoded["authors"].to_numpy()
categories = df_encoded["categories"].to_numpy()

enc_authors = LabelEncoder()
enc_categories = LabelEncoder()

authors_encoded = enc_authors.fit_transform(authors)
categories_encoded = enc_categories.fit_transform(categories)

# Se quiser colocar de volta no DataFrame
df_encoded = df_encoded.with_columns([
    pl.Series("authors", authors_encoded),
    pl.Series("categories", categories_encoded)
])




n_users = 1008961, n_items = 216014


In [13]:
df_encodeds = df_encoded.to_pandas()

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

user_score = df_encodeds.groupby("User_id")["review/score"].median().reset_index()

# 2. Stratified split 
train_users, temp_users = train_test_split(
    user_score,
    test_size=0.4,
    stratify=user_score["review/score"],
    random_state=42
)

val_users, test_users = train_test_split(
    temp_users,
    test_size=0.5,
    stratify=temp_users["review/score"],
    random_state=42
)

df_train = df_encodeds[df_encodeds["User_id"].isin(train_users["User_id"])]
df_val = df_encodeds[df_encodeds["User_id"].isin(val_users["User_id"])]
df_test = df_encodeds[df_encodeds["User_id"].isin(test_users["User_id"])]




In [15]:
df_train

Unnamed: 0,Id,Title,User_id,review/score,authors,categories,ratingsCount
0,127550,Its Only Art If Its Well Hung!,974357,4.0,64441,3348,0.0
4,72368,Dr. Seuss: American Icon,288503,4.0,93819,1887,0.0
5,72368,Dr. Seuss: American Icon,379349,4.0,93819,1887,0.0
6,72368,Dr. Seuss: American Icon,35079,5.0,93819,1887,0.0
7,72368,Dr. Seuss: American Icon,472538,5.0,93819,1887,0.0
...,...,...,...,...,...,...,...
2438009,205200,Ghost Story,816059,5.0,93247,5287,0.0
2438010,205200,Ghost Story,933817,5.0,93247,5287,0.0
2438013,205239,The Idea of History,875975,5.0,94976,6283,3.0
2438015,205239,The Idea of History,212393,4.0,94976,6283,3.0


In [16]:
df_test

Unnamed: 0,Id,Title,User_id,review/score,authors,categories,ratingsCount
2,72368,Dr. Seuss: American Icon,758564,5.0,93819,1887,0.0
3,72368,Dr. Seuss: American Icon,436241,4.0,93819,1887,0.0
14,37578,Whispers of the Wicked Saints,725367,1.0,118279,5287,0.0
16,37578,Whispers of the Wicked Saints,969929,1.0,118279,5287,0.0
23,37578,Whispers of the Wicked Saints,977535,5.0,118279,5287,0.0
...,...,...,...,...,...,...,...
2437999,42733,My Life,756185,2.0,53793,6957,25.0
2438000,45609,Better Homes and Gardens Casual Entertaining C...,903049,5.0,124097,4891,0.0
2438002,15587,Managing With Operational Research,186838,4.0,62307,10668,0.0
2438007,205200,Ghost Story,953897,5.0,93247,5287,0.0


To test the model working on an application, we are going to recommend books for a random user.


In [17]:
user_1 = df_test[df_test['User_id'] == 4]

In [18]:
df_test['User_id'].value_counts()

User_id
119378     933
108269     900
164796     800
542252     797
165515     780
          ... 
481323       1
914229       1
1005163      1
20706        1
47685        1
Name: count, Length: 201793, dtype: int64

In [19]:
user = df_test[df_test['User_id'] == 165515]

In [20]:
user

Unnamed: 0,Id,Title,User_id,review/score,authors,categories,ratingsCount
2055,160141,1001 Christmas Facts and Fancies,165515,5.0,3728,10668,0.0
4309,189137,The Gods of Mars,165515,5.0,31720,5287,26.0
6386,211531,Foundation,165515,5.0,64868,4635,0.0
8050,47893,King Rat,165515,5.0,51425,5287,19.0
8551,211532,Foundation,165515,5.0,64868,4635,0.0
...,...,...,...,...,...,...,...
2425329,24962,The Wicked Flea (Dog Lover's Mysteries),165515,5.0,111786,5287,0.0
2426095,209299,FOUNDATION,165515,5.0,64868,4635,0.0
2433432,181488,Dandelion Wine,165515,5.0,96234,5287,2.0
2433655,188570,Hemingway: The Paris Years,165515,5.0,83137,1887,4.0


In [21]:
user_items = user['Id']

In [22]:
all_items = df_test['Id'].unique()

In [38]:
items_to_predict = list(set(all_items) - set(user_items))

In [39]:
items_to_predict = pd.DataFrame(items_to_predict, columns=['Id'])

In [35]:
items_to_predict

Unnamed: 0,Ids
0,0
1,1
2,2
3,3
4,4
...,...
107022,216007
107023,216008
107024,216011
107025,216012


Using the items_to_predict we will create a dataframe to predict the ratings of this books to our user

The correct thing to do is to use the Database and merge the book data and use this merge in the model, but here we will create it using only pandas


In [47]:
items_to_predict

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4
...,...
107022,216007
107023,216008
107024,216011
107025,216012


In [43]:
books_data = df_encodeds.drop_duplicates(subset='Id')

In [45]:
resultado = pd.merge(books_data, items_to_predict, on='Id', how='inner')

In [49]:
resultado = resultado.drop('review/score', axis=1)
resultado['User_id'] = 165515

In [59]:
resultado

Unnamed: 0,Id,Title,User_id,authors,categories,ratingsCount
0,72368,Dr. Seuss: American Icon,165515,93819,1887,0.0
1,37578,Whispers of the Wicked Saints,165515,118279,5287,0.0
2,62852,The Church of Christ: A Biblical Ecclesiology ...,165515,36246,9198,5.0
3,75993,Muslim Women's Choices: Religious Belief and S...,165515,15279,9198,0.0
4,87087,Dramatica for Screenwriters,165515,8079,9169,0.0
...,...,...,...,...,...,...
107022,42733,My Life,165515,53793,6957,25.0
107023,45609,Better Homes and Gardens Casual Entertaining C...,165515,124097,4891,0.0
107024,15587,Managing With Operational Research,165515,62307,10668,0.0
107025,205200,Ghost Story,165515,93247,5287,0.0


In [74]:
class PredictDataset(Dataset):
    def __init__(self, dataframe):
        self.X = dataframe
        self.X = self.X.drop('Title', axis=1).to_numpy()

        self.X = torch.tensor(self.X, dtype=torch.long)

    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx]

dataset = PredictDataset(resultado)
loader = DataLoader(dataset, batch_size=2048, shuffle=True, drop_last=True)



In [149]:
import torch
import torch.nn as nn

class NeuMF(nn.Module):
    def __init__(self, n_users, n_items,n_genders, n_authors, n_factors=8):
        super().__init__()

        # Embeddings for GMF (Generalized Matrix Factorization) path
        self.user_gmf = nn.Embedding(n_users, n_factors)
        self.item_gmf = nn.Embedding(n_items, n_factors)

        # Embeddings for MLP path
        self.user_mlp = nn.Embedding(n_users, n_factors)
        self.item_mlp = nn.Embedding(n_items, n_factors)
        self.item_gender_emb = nn.Embedding(n_genders, n_factors)
        self.item_authors_emb = nn.Embedding(n_authors, n_factors)

        
        # Initialize embeddings with small uniform values
        self.user_gmf.weight.data.uniform_(0, 0.05)
        self.item_gmf.weight.data.uniform_(0, 0.05)
        self.user_mlp.weight.data.uniform_(0, 0.05)
        self.item_mlp.weight.data.uniform_(0, 0.05)

        # MLP input: user + item embedding + text embedding
        input_dim = n_factors * 4 + 1 

        # MLP: several layers with ReLU and Dropout to prevent overfitting
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(p=0.4),
            nn.Linear(256, 32)
        )

        # Final prediction layer: GMF + MLP outputs concatenated
        self.final_layer = nn.Linear(n_factors + 32, 1)

    def forward(self, data):
        users = data[:, 1]
        items = data[:, 0]
        genders = data[:, 3]
        authors = data[:, 2]
        ratingsCount = data[:, 4]



        # GMF path
        gmf_user = self.user_gmf(users)
        gmf_item = self.item_gmf(items)
        gmf_out = gmf_user * gmf_item  # element-wise product

        # Gender and authors 
        gender_emb = self.item_gender_emb(genders)
        authors_emb = self.item_authors_emb(authors)


        # MLP path
        mlp_user = self.user_mlp(users)
        mlp_item = self.item_mlp(items)
        mlp_items = torch.cat([mlp_item, gender_emb, authors_emb,ratingsCount.unsqueeze(1)], dim=1)

        # Concatenate user, item, and text embeddings
        mlp_input = torch.cat([mlp_user, mlp_items], dim=1)
       # print(mlp_input.shape)
        mlp_out = self.mlp(mlp_input)

        # Combine GMF and MLP paths and make final prediction
        final_input = torch.cat([gmf_out, mlp_out], dim=1)
        out = self.final_layer(final_input).squeeze(1)

        return out

In [150]:
import os 
recommind_model = torch.load(os.path.join(r"C:\Users\Lucas\Documents\GitHub\Recommind\models\ncf_model", 'recommind_model.pth'))
model = NeuMF(**recommind_model['config'])
model.load_state_dict(recommind_model['model_state_dict'])


<All keys matched successfully>

In [152]:
model.to(device='cuda')

NeuMF(
  (user_gmf): Embedding(1008961, 16)
  (item_gmf): Embedding(216014, 16)
  (user_mlp): Embedding(1008961, 16)
  (item_mlp): Embedding(216014, 16)
  (item_gender_emb): Embedding(10669, 16)
  (item_authors_emb): Embedding(124098, 16)
  (mlp): Sequential(
    (0): Linear(in_features=65, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): GELU(approximate='none')
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): GELU(approximate='none')
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=256, out_features=32, bias=True)
  )
  (final_layer): Linear(in_features=48, out_features=1, bias=True)
)

In [153]:
model.eval()
number_batch = 0
scores = []
with torch.no_grad():
    for X in loader:
        X = X.to(device)
        outputs = model(X)
        scores.append(outputs.cpu())



In [154]:
scores = np.array(scores)

In [155]:
scores = scores.flatten()
scores.shape

(106496,)

In [156]:
scores = torch.tensor(scores)

In [157]:
top_k = 10
top_indices = torch.topk(scores, top_k).indices

In [158]:
top_items = [items_to_predict.iloc[int(i)] for i in top_indices]

In [159]:

book_df = df_encodeds[['Id', 'Title']]

book_dict = dict(zip(book_df['Id'], book_df['Title']))

In [160]:
for idx in range(len(top_items)):
        book_id = top_items[idx]['Id']
        book = book_dict[book_id]
        print(f" Book: {book}")

 Book: A Yellow Journalist
 Book: Michael Collins: A Life
 Book: Forest Green Glass
 Book: Japanese Master Swordsmiths: The Gassan Tradition
 Book: Standard Catalog Of Firearms, 15th Edition (Standard Catalog of Firearms)
 Book: Motorcycle 101
 Book: The Dangerous Baron Leigh (Signet Regency Romance)
 Book: Place of Truth (Stone of Light)
 Book: Sparkledoll Always Into Something-2004 Edition
 Book: Short Stories: Five Decades (Phoenix Fiction)
