In [21]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
import math
from time import time
import matplotlib.pyplot as plt

import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torchvision 
import torch.utils.data as data
import torch.distributions as dist

## Data Preparation

In [6]:
df_books = pd.read_csv('books_autorec.csv')
df_books.sort_values(by='ratings_count', ascending=False, inplace=True)
#for aumenting books number change 
Books_number = 2000
df_books = df_books.iloc[:Books_number]
df_books.to_csv('Spatial_model_books.csv')
df_books['goodreads_book_id'] = df_books['goodreads_book_id'].astype(int)
book_ids = df_books['goodreads_book_id']


In [7]:
df_books = pd.read_csv('Spatial_model_books.csv')

In [8]:
df = pd.read_csv("books_autorec.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

df_ratings_with_clusters = df_ratings.merge(
    df[['goodreads_book_id', 'cluster']], 
    left_on='book_id', 
    right_on='goodreads_book_id', 
    how='left'
)

# create dictionary with user and ratings
sparse_users = {}
for user_id, group in df_ratings_with_clusters.groupby('user_id'):
    books_ratings_clusters = group[['book_id', 'rating', 'cluster']].values.tolist()
    sparse_users[user_id] = books_ratings_clusters

cluster_sizes = df_books['cluster'].value_counts().sort_index().values

filter_users = {
    user: [triplet for triplet in triplets if triplet[0] in book_ids]
    for user, triplets in sparse_users.items()
}
filter_users = {user: triplets for user, triplets in filter_users.items() if triplets}
#user taken
taken_users = 20000
filter_users = sorted(filter_users.items(), key=lambda x: len(x[1]), reverse=True)[:taken_users]
filter_users = dict(filter_users)


In [9]:
print(len(sparse_users))
print(len(filter_users))

53424
20000


In [10]:
mapping_pos_to_books = dict(zip(range(Books_number), book_ids))
mapping_books_to_pos = dict(zip(book_ids,range(Books_number)))
mapping_pos_to_users = dict(zip(range(taken_users), filter_users.keys()))
mapping_users_to_pos = dict(zip(filter_users.keys(),range(taken_users)))
print(len(mapping_pos_to_books))
print(len(mapping_books_to_pos))
print(len(mapping_pos_to_users))
print(len(mapping_users_to_pos))

2000
2000
20000
20000


In [11]:
n_books = len(mapping_books_to_pos)
user_vectors = []
for user_id, triplets in filter_users.items():
    vector = np.zeros(n_books)  # inizializza vettore di zeri

    for book_id, rating, _ in triplets:
        if book_id in mapping_books_to_pos:  # se il book_id è tra quelli mappati
            index = mapping_books_to_pos[book_id]
            vector[index] = rating  # inserisci il rating nella posizione giusta

    user_vectors.append(vector)
print(len(user_vectors))
print(len(user_vectors[0]))
for i in range(len(user_vectors)):
    user_vectors[i] = [0 if elem < 3 else 1 for elem in user_vectors[i]]
print(user_vectors[0])
df_input_data = pd.DataFrame(user_vectors)

20000
2000
[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Training

In [None]:
class Spatial_F_AE(nn.Module):
    def __init__(self,k):
        super(Spatial_F_AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(k,500),
            nn.ReLU(),
            nn.Linear(500,250),
            nn.ReLU(),
            nn.Linear(250,125),
            nn.ReLU(),
            nn.Linear(125,50),
            nn.ReLU(),
            nn.Linear(50,2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2,50),
            nn.ReLU(),
            nn.Linear(50,125),
            nn.ReLU(),
            nn.Linear(125,250),
            nn.ReLU(),
            nn.Linear(250,500),
            nn.ReLU(),
            nn.Linear(500,k),
            nn.Sigmoid(),
        )
    def forward(self,x):
        z = self.encoder(x)
        final = self.decoder(z)
        return final

In [None]:
class Spatial_F_AE2(nn.Module):
    def __init__(self,k):
        super(Spatial_F_AE2, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(k,750),
            nn.ReLU(),
            nn.Linear(750,500),
            nn.ReLU(),
            nn.Linear(500,250),
            nn.ReLU(),
            nn.Linear(250,125),
            nn.ReLU(),
            nn.Linear(125,75),
            nn.ReLU(),
            nn.Linear(75,50),
            nn.ReLU(),
            nn.Linear(50,25),
            nn.ReLU(),
            nn.Linear(25,10),
            nn.ReLU(),
            nn.Linear(10,2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2,10),
            nn.ReLU(),
            nn.Linear(10,25),
            nn.ReLU(),
            nn.Linear(25,50),
            nn.ReLU(),
            nn.Linear(50,75),
            nn.ReLU(),
            nn.Linear(75,125),
            nn.ReLU(),
            nn.Linear(125,250),
            nn.ReLU(),
            nn.Linear(250,500),
            nn.ReLU(),
            nn.Linear(500,750),
            nn.ReLU(),
            nn.Linear(750,k),
            nn.Sigmoid(),
        )
    def forward(self,x):
        z = self.encoder(x)
        final = self.decoder(z)
        return final

In [None]:
def train(model, dataloader, criterion, optimizer, num_epochs, scheduler=None, best_loss=float('inf')):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    losses = []
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, batch in enumerate(dataloader):
            inputs = batch[0]
            inputs = inputs.to(device)
            recon = model(inputs)
            loss = criterion(recon, inputs)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step(loss.item())
            running_loss += loss.item()
        losses.append(running_loss / (i + 1))
        if running_loss < best_loss:
            best_loss = running_loss
            torch.save(model.state_dict(), 'best_model.pth')
        print(f"Epoch {epoch+1}: Loss = {running_loss / (i + 1):.10f}")
    return losses

In [None]:
def loss_graph(tr_loss,n_epochs):
    plt.plot(range(n_epochs),tr_loss,label='tr_loss', c='black')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss over Epochs')
    plt.legend()
    plt.show()

In [12]:
#df_input_data = pd.read_csv('Spatial_model_inputs.csv')
tensor_data = torch.tensor(df_input_data.values, dtype=torch.float32)
dataset = torch.utils.data.TensorDataset(tensor_data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
model = Spatial_F_AE(Books_number)
criterion = nn.MSELoss()
N_Epochs = 20
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=False)

losses = train(model,dataloader,criterion, optimizer, N_Epochs, scheduler)
loss_graph(losses, N_Epochs)



In [None]:
model2 = Spatial_F_AE2(Books_number)
criterion = nn.MSELoss()
N_Epochs = 20
optimizer = torch.optim.Adam(model2.parameters(), lr = 0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=False)

losses = train(model,dataloader,criterion, optimizer, N_Epochs, scheduler)
loss_graph(losses, N_Epochs)

## Generate Recommendations

In [None]:
def recommendations(user, model):
    model.eval()
    recon = model(user)
    old_books= (user != 0).nonzero(as_tuple=True)[0].tolist()
    _, new_books = torch.topk(recon, 25)
    new_books = new_books.tolist()
    old_books_map = [mapping_pos_to_books[pos] for pos in old_books]
    new_books_map = [mapping_pos_to_books[pos] for pos in new_books]
    old_titles = [df_books[df_books['goodreads_book_id'] == id].values.tolist()[0][5] for id in old_books_map]
    new_titles = [df_books[df_books['goodreads_book_id'] == id].values.tolist()[0][5] for id in new_books_map]
    diff = list(set(new_titles) - set(old_titles))
    diff_id = list(set(new_books_map) - set(old_books_map))
    return diff, diff_id, old_books_map

In [None]:
rec, rec_id, old_id = recommendations(dataloader.dataset[678][0], model)
print("Recommendations for user 678: ")
print(rec)
print(rec_id)

Recomendations for user 678: 
['A Tale of Two Cities', 'Memoirs of a Geisha', 'Angels & Demons ', ' The Fellowship of the Ring', 'Un di Velt Hot Geshvign', 'Pippi Långstrump', 'The Da Vinci Code', 'The Curious Incident of the Dog in the Night-Time', 'Modern Romance', 'Of Mice and Men ', 'High Five', 'Freakonomics: A Rogue Economist Explores the Hidden Side of Everything', 'A Confederacy of Dunces', 'Ὀδύσσεια', 'O Alquimista', 'The Tragicall Historie of Hamlet, Prince of Denmark', 'The Boston Girl', 'Beautiful Creatures']
[960, 1953, 930, 34, 865, 1381, 19302, 968, 22450859, 1420, 6304335, 1617, 1202, 1618, 310612, 6423, 23453112, 890]


Evaluate the model performance

In [49]:
def top_tags_for_user(df, books_id, top_n=-1):
    filtered_df = df[df['goodreads_book_id'].isin(books_id)]
    tags = filtered_df['tags_list']
    
    all_tags = []
    for t in tags:
        if isinstance(t, str):
            t = ast.literal_eval(t)
        all_tags.extend(t)

    tag_counts = Counter(all_tags)
    if top_n == -1:
        top_n = len(tag_counts)
    most_common = tag_counts.most_common(top_n)
    return most_common


def top_tags_for_user(df, books_id, top_n=-1):
    filtered_df = df[df['goodreads_book_id'].isin(books_id)]
    tags = filtered_df['tags_list']
    
    all_tags = []
    for t in tags:
        # Only try to parse if it's a string and not already a list
        if isinstance(t, str):
            try:
                t = ast.literal_eval(t)
            except Exception:
                # If parsing fails, skip this entry
                continue
        if isinstance(t, list):
            all_tags.extend(t)
        # If it's not a list, skip

    tag_counts = Counter(all_tags)
    if top_n == -1:
        top_n = len(tag_counts)
    most_common = tag_counts.most_common(top_n)
    return most_common

In [50]:
tags_recomendation = top_tags_for_user(df_books, rec_id)

print(f"\nBooks {rec_id}:")
for tag, count in tags_recomendation:
    print(f"  {tag}:  {count}")


Books [960, 1953, 1381, 19302, 22450859, 1420, 6304335, 1617, 1202, 310612, 6423, 23453112]:
  fiction:  12
  favorites:  12
  owned:  12
  books-i-own:  12
  owned-books:  12
  currently-reading:  12
  library:  12
  to-buy:  11
  audiobook:  11
  kindle:  11
  audiobooks:  11
  own-it:  11
  audio:  11
  default:  10
  adult:  10
  my-library:  10
  i-own:  10
  wish-list:  10
  eng:  10
  favourites:  9
  my-books:  9
  ebook:  9
  ebooks:  9
  contemporary:  8
  literature:  8
  book-club:  8
  re-read:  8
  novels:  8
  adult-fiction:  7
  books:  7
  all-time-favorites:  7
  classics:  7
  romance:  7
  abandoned:  7
  series:  6
  historical-fiction:  6
  historical:  6
  history:  6
  borrowed:  6
  shelfari-favorites:  6
  favorite-books:  6
  school:  6
  classic:  6
  unfinished:  6
  audible:  6
  novel:  5
  general-fiction:  5
  fantasy:  5
  english:  5
  favorite:  5
  american:  5
  home-library:  5
  high-school:  5
  to-re-read:  5
  did-not-finish:  5
  didn-t-fini

In [None]:
def percentage_different_tags(tags_user_preference, tags_recomendation):
    user_tags = set(tag for tag, _ in tags_user_preference)
    rec_tags = set(tag for tag, _ in tags_recomendation)
    diff_tags = rec_tags - user_tags
    if len(rec_tags) == 0:
        return 0.0
    percent_diff = len(diff_tags) / len(rec_tags) * 100
    return percent_diff

def evaluate_user_percentage(user, model):
    # get recommendations
    _, rec_id, old_id = recommendations(dataloader.dataset[user][0], model)
    tags_user_preference = top_tags_for_user(df_books, old_id)
    tags_recomendation = top_tags_for_user(df_books, rec_id)
    
    # get random books
    n_recom_books = len(rec_id)
    sample_books = df_books.sample(n=n_recom_books)['goodreads_book_id'].tolist()
    # get tags for recommended and random books 
    tags_random_books = top_tags_for_user(df_books, sample_books)
    recommended = percentage_different_tags(tags_user_preference, tags_recomendation)
    random = percentage_different_tags(tags_user_preference, tags_random_books)
    return recommended, random


In [52]:
print("Evaluating user 678:")
recommended, random = evaluate_user_percentage(678, model)
print(f"Percentage of different tags in recommendations: {recommended:.2f}%")
print(f"Percentage of different tags in random books: {random:.2f}%")

Evaluating user 678:
Percentage of different tags in recommendations: 65.78%
Percentage of different tags in random books: 69.62%


In [53]:
percentages = []
for user in range(len(dataloader.dataset)):
    recommended, random = evaluate_user_percentage(user, model)
    percentages.append(recommended - random)
np.mean(percentages)

-6.43575163446907

# Ranking