# Autoencoder Recommendation System

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from time import time
from collections import Counter
import ast

import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torchvision 
import torch.utils.data as data
import torch.distributions as dist

## Data Preparation

In [3]:
df_books = pd.read_csv('books_autorec.csv')
df_books.sort_values(by='ratings_count', ascending=False, inplace=True)
Books_number = 2000
df_books = df_books.iloc[:Books_number]
df_books.to_csv('Spatial_model_books.csv')
df_books['goodreads_book_id'] = df_books['goodreads_book_id'].astype(int)
book_ids = df_books['goodreads_book_id']

In [4]:
df_books = pd.read_csv('Spatial_model_books.csv')

In [None]:
df = pd.read_csv("books_autorec.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

df_ratings_with_clusters = df_ratings.merge(
    df[['goodreads_book_id', 'cluster']], 
    left_on='book_id', 
    right_on='goodreads_book_id', 
    how='left'
)

# create dictionary with user and ratings
sparse_users = {}
for user_id, group in df_ratings_with_clusters.groupby('user_id'):
    books_ratings_clusters = group[['book_id', 'rating', 'cluster']].values.tolist()
    sparse_users[user_id] = books_ratings_clusters

cluster_sizes = df_books['cluster'].value_counts().sort_index().values

filter_users = {
    user: [triplet for triplet in triplets if triplet[0] in book_ids]
    for user, triplets in sparse_users.items()
}
filter_users = {user: triplets for user, triplets in filter_users.items() if triplets}
#user taken
taken_users = 50000
filter_users = sorted(filter_users.items(), key=lambda x: len(x[1]), reverse=True)[:taken_users]
filter_users = dict(filter_users)

In [6]:
print(len(sparse_users))
print(len(filter_users))

53424
48939


In [7]:
mapping_pos_to_books = dict(zip(range(Books_number), book_ids))
mapping_books_to_pos = dict(zip(book_ids,range(Books_number)))
mapping_pos_to_users = dict(zip(range(taken_users), filter_users.keys()))
mapping_users_to_pos = dict(zip(filter_users.keys(),range(taken_users)))
print(len(mapping_pos_to_books))
print(len(mapping_books_to_pos))
print(len(mapping_pos_to_users))
print(len(mapping_users_to_pos))

2000
2000
48939
48939


In [9]:
n_books = len(mapping_books_to_pos)
user_vectors = []
for user_id, triplets in filter_users.items():
    vector = np.zeros(n_books)  # initialize vector with zeros

    for book_id, rating, _ in triplets:
        if book_id in mapping_books_to_pos:  # if book_id is in the mapping
            index = mapping_books_to_pos[book_id]
            vector[index] = rating  # insert rating in the correct position

    user_vectors.append(vector)
print(len(user_vectors))
print(len(user_vectors[0]))
for i in range(len(user_vectors)):
    user_vectors[i] = [0 if elem < 3 else 1 for elem in user_vectors[i]]
print(user_vectors[0])
df_input_data = pd.DataFrame(user_vectors)

48939
2000
[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Training

In [10]:
class Spatial_F_AE(nn.Module):
    def __init__(self,k):
        super(Spatial_F_AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(k,500),
            nn.ReLU(),
            nn.Linear(500,250),
            nn.ReLU(),
            nn.Linear(250,125),
            nn.ReLU(),
            nn.Linear(125,50),
            nn.ReLU(),
            nn.Linear(50,2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2,50),
            nn.ReLU(),
            nn.Linear(50,125),
            nn.ReLU(),
            nn.Linear(125,250),
            nn.ReLU(),
            nn.Linear(250,500),
            nn.ReLU(),
            nn.Linear(500,k),
            nn.Sigmoid(),
        )
    def forward(self,x):
        z = self.encoder(x)
        final = self.decoder(z)
        return final

In [15]:
def train(model, dataloader, criterion, optimizer, num_epochs, scheduler=None, best_loss=float('inf')):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    losses = []
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, batch in enumerate(dataloader):
            inputs = batch[0]
            inputs = inputs.to(device)
            recon = model(inputs)
            loss = criterion(recon, inputs)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step(loss.item())
            running_loss += loss.item()
        losses.append(running_loss / (i + 1))
        if running_loss < best_loss:
            best_loss = running_loss
            torch.save(model.state_dict(), 'best_model.pth')
        print(f"Epoch {epoch+1}: Loss = {running_loss / (i + 1):.10f}")
    return losses

In [16]:
def loss_graph(tr_loss,n_epochs):
    plt.plot(range(n_epochs),tr_loss,label='tr_loss', c='black')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss over Epochs')
    plt.legend()
    plt.show()

In [17]:
tensor_data = torch.tensor(df_input_data.values, dtype=torch.float32)
dataset = torch.utils.data.TensorDataset(tensor_data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
model = Spatial_F_AE(Books_number)
criterion = nn.MSELoss()
N_Epochs = 20
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=False)

losses = train(model,dataloader,criterion, optimizer, N_Epochs, scheduler)
loss_graph(losses, N_Epochs)



## Generate Recommendations

In [None]:
def recommendations(user, model):
    model.eval()
    recon = model(user)
    old_books= (user != 0).nonzero(as_tuple=True)[0].tolist()
    _, new_books = torch.topk(recon, 100) # top 100 recommendations
    new_books = new_books.tolist()
    old_books_map = [mapping_pos_to_books[pos] for pos in old_books]
    new_books_map = [mapping_pos_to_books[pos] for pos in new_books]
    old_titles = [df_books[df_books['goodreads_book_id'] == id].values.tolist()[0][5] for id in old_books_map]
    new_titles = [df_books[df_books['goodreads_book_id'] == id].values.tolist()[0][5] for id in new_books_map]
    diff = list(set(new_titles) - set(old_titles))
    diff_id = list(set(new_books_map) - set(old_books_map))
    return diff, diff_id, old_books_map

In [None]:
rec, rec_id, old_id = recommendations(dataloader.dataset[678][0], model)
print("Recommendations for user 678: ")
print(rec)
print(rec_id)

Recomendations for user 678: 
['A Tale of Two Cities', 'Memoirs of a Geisha', 'Angels & Demons ', ' The Fellowship of the Ring', 'Un di Velt Hot Geshvign', 'Pippi Långstrump', 'The Da Vinci Code', 'The Curious Incident of the Dog in the Night-Time', 'Modern Romance', 'Of Mice and Men ', 'High Five', 'Freakonomics: A Rogue Economist Explores the Hidden Side of Everything', 'A Confederacy of Dunces', 'Ὀδύσσεια', 'O Alquimista', 'The Tragicall Historie of Hamlet, Prince of Denmark', 'The Boston Girl', 'Beautiful Creatures']
[960, 1953, 930, 34, 865, 1381, 19302, 968, 22450859, 1420, 6304335, 1617, 1202, 1618, 310612, 6423, 23453112, 890]


# Ranking

# Evaluating

In [None]:
def user_refinding(model,user):
    rating_pos = user[user == 1.0]
    removed_pos = rating_pos[:int(len(rating_pos)/5)+1]
    removed_books = [mapping_pos_to_books[elem] for elem in removed_pos.tolist()]
    _, diff_id, _ = recommendations(user, model)
    diff = list(set(removed_books) - set(diff_id))
    return len(diff)/len(removed_books), len(removed_books)

In [None]:
sum = 0
for i in range(len(mapping_users_to_pos)):
    len_ratio, _ = user_refinding(model, dataloader.dataset[i][0])
    sum += len_ratio
print('On avg we refind: ', sum/len(mapping_users_to_pos), "% of the removed items")