In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

In [3]:
from sample_data import *
users, ratings, books = get_saved_samples()

In [4]:
books['Book-Title'] = books['Book-Title'].str.lower()
books['Book-Author'] = books['Book-Author'].str.lower()
# Eliminamos posibles duplicados
books.drop_duplicates(subset='ISBN', keep='first', inplace=True)

In [5]:
books['combined_features'] = books['Book-Title'] + ' ' + books['Book-Author']


In [6]:
# Vectorización de los títulos de los libros
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['combined_features'])


In [7]:
svd = TruncatedSVD(n_components=100)  # Ajustado a 100
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Cálculo de similitud del coseno por lotes
def batch_cosine_similarity(matrix, batch_size=10000):
    cosine_sim_matrix = np.zeros((matrix.shape[0], matrix.shape[0]))
    
    for start_row in range(0, matrix.shape[0], batch_size):
        end_row = min(start_row + batch_size, matrix.shape[0])
        batch = matrix[start_row:end_row]
        
        # Calcular la similitud con toda la matriz
        sim_scores = cosine_similarity(batch, matrix)
        
        # Asignar los resultados a la matriz de similitud
        cosine_sim_matrix[start_row:end_row] = sim_scores

    return cosine_sim_matrix

available_memory = 14e9  # 8 GB en bytes
memory_per_row = 800  # Asumiendo 100 componentes con float64
batch_size = available_memory // memory_per_row // 2  # Dividimos por 2 para dejar espacio para la operación y otros procesos
print(batch_size)

cosine_sim_matrix = batch_cosine_similarity(tfidf_matrix_reduced, int(batch_size))


8750000.0


In [8]:
def book_recommendations(title, cosine_sim_matrix=cosine_sim_matrix, books=books):
    # Intentamos encontrar el libro
    try:
        idx = books.index[books['Book-Title'] == title.lower()].tolist()[0]
    except IndexError:
        print(f"El libro con título '{title}' no se encuentra en el dataset.")
        return []

    # Puntajes de similitud en orden descendente
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Puntajes de los 10 libros más similares
    sim_scores = sim_scores[1:11]

    # Índices de libros
    book_indices = [i[0] for i in sim_scores]

    # Títulos de los 10 libros más similares
    return books['Book-Title'].iloc[book_indices]


In [9]:
book_title = "Clara Callan"
recommendations = book_recommendations(book_title)
print(recommendations)

El libro con título 'classical mythology' no se encuentra en el dataset.
[]
