# Importação de bibliotecas e carregamento da base de dados

In [1]:
import pandas as pd
import numpy as np

# Location of the Goodreads interactions CSV.
# NOTE(review): this is an absolute path on one developer's machine; point it
# at your own copy of the file (ideally a path relative to the project).
DATA_PATH = 'C:\\Users\\joao.pineda\\Downloads\\goodreads_interactions.csv'

# Load the user-book interactions.
df = pd.read_csv(DATA_PATH)

# First look at the data: sample rows, schema, and summary statistics.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" to the output).
df.info()
print(df.describe())


   user_id  book_id  is_read  rating  is_reviewed
0        0      948        1       5            0
1        0      947        1       5            1
2        0      946        1       5            0
3        0      945        1       5            0
4        0      944        1       5            0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype
---  ------       --------------    -----
 0   user_id      1048575 non-null  int64
 1   book_id      1048575 non-null  int64
 2   is_read      1048575 non-null  int64
 3   rating       1048575 non-null  int64
 4   is_reviewed  1048575 non-null  int64
dtypes: int64(5)
memory usage: 40.0 MB
None
            user_id       book_id       is_read        rating   is_reviewed
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06
mean   1.058027e+03  7.603916e+04  5.004544e-01  1.830852e+00  7.455356e-02
std    5.881581e+02  8

# Tratamento da Base de Dados

In [2]:
# Clean the interactions in a single chained expression so that `df` is rebound
# to a brand-new frame. Assigning columns on a boolean-filtered slice (the
# previous approach) can trigger pandas' SettingWithCopyWarning.
df = (
    df.drop_duplicates()                        # remove exact duplicate rows
      .loc[lambda frame: frame['rating'] != 0]  # drop rows whose rating is 0 (treated as "no rating")
      .astype({'user_id': str, 'book_id': str, 'rating': float})  # ids as strings, rating as float
)

In [4]:
from sklearn.model_selection import train_test_split

# Build the composite "<user_id>_<book_id>" key BEFORE splitting so that
# `train` and `test` carry the column as well; adding it to `df` after the
# split leaves both subsets without it. `assign` returns a new frame instead of
# mutating `df` in place.
df = df.assign(user_book=df['user_id'] + '_' + df['book_id'])

# Random 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE: this is a plain random split, not a stratified one; pass `stratify=`
# to train_test_split if stratification is actually wanted.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Engenharia de Atributos

In [5]:
# User x book rating matrix: one row per user, one column per book. Each cell
# holds the mean rating for that pair (pivot_table's default aggregation) and
# 0 where the user has no rating for the book.
# NOTE(review): the matrix is built from the full `df`, which also contains the
# rows held out in `test` - confirm this is intended before trusting the
# evaluation numbers.
user_item_matrix = pd.pivot_table(
    df,
    values='rating',
    index='user_id',
    columns='book_id',
    fill_value=0,
)

# Per-user interaction metrics: mean of `is_read` and number of ratings.
leitura_por_usuario = df.groupby('user_id')['is_read'].agg('mean')
avaliacao_por_usuario = df.groupby('user_id')['rating'].agg('count')

# Filtragem Colaborativa

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit a brute-force nearest-neighbour index over the rows (users) of the
# user-item matrix, using cosine distance between rating vectors.
# `fit` returns the fitted estimator, so construction and fitting are chained.
knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(user_item_matrix)

# Recommend books for a user from the ratings of their nearest neighbours.
def recommend_knn(user_id, n_recommendations=5, n_neighbors=10):
    """Recommend books for ``user_id`` via user-based collaborative filtering.

    ``knn`` is fitted on the rows of ``user_item_matrix``, so the indices it
    returns are *user* row positions. They must therefore be used to select
    neighbour rows, not to index the book columns. Each candidate book is
    scored by the similarity-weighted sum of the neighbours' ratings, and
    books the user has already rated are excluded.

    Args:
        user_id: Label of the user in ``user_item_matrix.index``.
        n_recommendations: Maximum number of book ids to return.
        n_neighbors: Number of similar users whose ratings are aggregated.

    Returns:
        A list of at most ``n_recommendations`` book ids (column labels of
        ``user_item_matrix``), best first. Empty when the user is unknown,
        when a non-positive count is requested, or when the neighbours have
        rated nothing new.
    """
    if (
        n_recommendations <= 0
        or n_neighbors <= 0
        or user_id not in user_item_matrix.index
    ):
        return []

    user_pos = user_item_matrix.index.get_loc(user_id)

    # Ask for one extra neighbour because the query user is normally returned
    # as their own nearest neighbour; never ask for more rows than were fitted.
    n_query = min(n_neighbors + 1, len(user_item_matrix))
    # Query with a one-row DataFrame so the column labels match the ones seen at fit time.
    distances, indices = knn.kneighbors(
        user_item_matrix.iloc[[user_pos]], n_neighbors=n_query
    )

    # Remove the query user by position instead of assuming they come first
    # (other users with identical rating vectors tie at distance 0).
    is_other_user = indices[0] != user_pos
    neighbor_pos = indices[0][is_other_user][:n_neighbors]
    # The index uses cosine distance, so 1 - distance is the cosine similarity.
    similarities = 1.0 - distances[0][is_other_user][:n_neighbors]

    # Similarity-weighted sum of the neighbours' ratings for every book.
    neighbor_ratings = user_item_matrix.iloc[neighbor_pos]
    scores = neighbor_ratings.mul(similarities, axis=0).sum(axis=0)

    # The matrix stores 0 for "no rating", so a positive cell means the user
    # has already rated that book.
    already_rated = user_item_matrix.iloc[user_pos] > 0
    candidates = scores[~already_rated]
    candidates = candidates[candidates > 0]

    return candidates.nlargest(n_recommendations).index.tolist()

# Usage example: top-5 recommendations for the first user in the matrix.
print(recommend_knn(user_item_matrix.index[0], 5))

# Avaliação do Modelo

In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model_func, test_df, k=5):
    """Compute mean Precision@k, Recall@k and a simplified RMSE over test users.

    Args:
        model_func: Callable invoked as ``model_func(user, n_recommendations=k)``
            that returns a list of recommended book ids.
        test_df: Held-out interactions with ``user_id``, ``book_id`` and
            ``rating`` columns.
        k: Number of recommendations requested per user; must be positive.

    Returns:
        Tuple ``(precision, recall, rmse)`` of per-user means. A metric with no
        contributing users is returned as NaN.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    # Mean known rating per (user, book) from the module-level `df`, computed
    # once up front instead of re-scanning the whole frame for every
    # recommended book of every user.
    known_ratings = df.groupby(['user_id', 'book_id'])['rating'].mean().to_dict()

    precisions, recalls, rmses = [], [], []
    # A single grouped pass replaces one boolean scan of `test_df` per user;
    # sort=False keeps users in order of first appearance.
    for user, user_rows in test_df.groupby('user_id', sort=False):
        true_books = user_rows['book_id'].tolist()
        recommended_books = model_func(user, n_recommendations=k)

        hits = len(set(recommended_books) & set(true_books))
        # Precision@K
        precisions.append(hits / k)
        # Recall@K (every group has at least one row, so no division by zero)
        recalls.append(hits / len(true_books))

        # Simplified RMSE: the "prediction" for a recommended book is the mean
        # rating already stored for that (user, book) pair, or 0 when absent.
        # NOTE(review): the two lists are paired by position, not by book, and
        # are only compared when their lengths happen to match - treat this
        # figure as a rough placeholder until the model returns predicted ratings.
        true_ratings = user_rows['rating']
        pred_ratings = [known_ratings.get((user, book), np.nan) for book in recommended_books]
        pred_ratings = [0 if np.isnan(r) else r for r in pred_ratings]
        if len(pred_ratings) == len(true_ratings):
            rmses.append(np.sqrt(mean_squared_error(true_ratings, pred_ratings)))

    def _mean_or_nan(values):
        # np.mean([]) returns NaN *and* emits a RuntimeWarning; return NaN quietly.
        return np.mean(values) if values else np.nan

    return _mean_or_nan(precisions), _mean_or_nan(recalls), _mean_or_nan(rmses)

# Score the hybrid recommender on the held-out split and report the metrics.
# NOTE(review): `hybrid_recommend` is not defined in any cell visible in this
# notebook - confirm it is defined before this cell is run.
precision, recall, rmse = evaluate_model(model_func=hybrid_recommend, test_df=test, k=5)
print(f'Precision@5: {precision:.3f}, Recall@5: {recall:.3f}, RMSE: {rmse:.3f}')


# Visualização de Resultados

In [None]:
import matplotlib.pyplot as plt

model_names = ['Colaborativo', 'Conteúdo', 'Híbrido']
# NOTE(review): the first two scores of each list are hard-coded literals, not
# values computed in this notebook - confirm their source or replace them with
# measured results. Only the last entry comes from the evaluation above.
precisions = [0.65, 0.60, precision]
recalls = [0.58, 0.55, recall]

# Draw the two metrics side by side for each model. Stacking recall on top of
# precision (bottom=precisions) would make each bar's height the sum of two
# unrelated metrics and hide the actual recall values.
positions = np.arange(len(model_names))
bar_width = 0.4

fig, ax = plt.subplots()
ax.bar(positions - bar_width / 2, precisions, width=bar_width, alpha=0.6, label='Precision@5')
ax.bar(positions + bar_width / 2, recalls, width=bar_width, alpha=0.6, label='Recall@5')
ax.set_xticks(positions)
ax.set_xticklabels(model_names)
ax.set_ylabel('Score')
ax.set_title('Comparação de Modelos de Recomendação')
ax.legend()
plt.show()
