In [1]:
import pandas as pd
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.preprocessing import LabelEncoder

In [5]:
# Construir la ruta del archivo
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
file_path = os.path.join(parent_dir, 'data', 'CIRCULA_23_TRANSF_S.xlsx')

# Cargar datos desde la hoja "CIRCULA_2023" utilizando la primera fila como encabezados
data = pd.read_excel(file_path, sheet_name="CIRCULA_2023", header=0)

# Mostrar las primeras filas del DataFrame
data.head()


Unnamed: 0,Biblioteca de Prestamo,Meson de Circulación,Cod Biblioteca origen Item,Biblioteca origen Item,Cod de Localizacion,Localización Item,Prestamos en Sala,Prestamos Externo,Barcode,MMS Id,...,Fecha de ingreso del Item,Categoría Socio,Identificador Socio,dewey,Dewey_1,Dewey3,Dewey4.1,Dewey4.1 - Copia,Dewey_2,TIPO MATERIAL
0,Biblioteca Luis Ángel Arango,Préstamo Externo Calle 12 - Solo socios,BLAA,Biblioteca Luis Ángel Arango,DC1,Depósito C1,0,1,29004006168206,991009934269707486,...,1997-12-30 00:00:00,Categoría Biblioteca Virtual,3900488,Co868.5 S45p1,2,868.5 S45p1,868,868.0,860,LIBRO
1,Biblioteca Luis Ángel Arango,Préstamo Externo Calle 12 - Solo socios,BLAA,Biblioteca Luis Ángel Arango,DD1,Depósito D1,0,1,29004005863906,991013769509707486,...,1997-02-24 00:00:00,Categoría Biblioteca Virtual,3900488,928.61 S45j1,0,928.61 S45j1,928,928.0,920,LIBRO
2,Biblioteca Luis Ángel Arango,Préstamo Externo Calle 12 - Solo socios,BLAA,Biblioteca Luis Ángel Arango,DD1,Depósito D1,0,1,29004007799355,991008897989707486,...,2000-05-22 00:00:00,Categoría Biblioteca Virtual,3900488,986.251 B82,0,986.251 B82,986,986.0,980,LIBRO
3,Biblioteca Luis Ángel Arango,Préstamo Externo Calle 12 - Solo socios,BLAA,Biblioteca Luis Ángel Arango,DD1,Depósito D1,0,1,29004017298190,991005656579707486,...,2005-03-22 00:00:00,Categoría Biblioteca Virtual,3900488,928.61 S45v2,0,928.61 S45v2,928,928.0,920,LIBRO
4,Biblioteca Luis Ángel Arango,Préstamo Externo Calle 12 - Solo socios,BLAA,Biblioteca Luis Ángel Arango,DD1,Depósito D1,0,1,29004022727563,991009904169707486,...,2010-12-15 00:00:00,Categoría Biblioteca Virtual,3900488,986.126 E75b,0,986.126 E75b,986,986.0,980,LIBRO


In [6]:
# Filtrar los registros donde "Préstamos Externo" es igual a 1
data = data[data['Prestamos Externo'] == 1]

In [7]:
# Eliminar libros que han sido leídos solo una vez
book_counts = data['Título'].value_counts()
books_to_keep = book_counts[book_counts > 1].index
data = data[data['Título'].isin(books_to_keep)]

In [8]:
# Eliminar usuarios que han leído solo un libro
user_counts = data['Identificador Socio'].value_counts()
users_to_keep = user_counts[user_counts > 1].index
data = data[data['Identificador Socio'].isin(users_to_keep)]

In [9]:
#Eliminar duplicados de libros por usuario
data = data.drop_duplicates(subset=['Identificador Socio', 'Título'])

In [10]:
# Codificación de usuarios y libros
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [11]:
data['user_id'] = user_encoder.fit_transform(data['Identificador Socio'])
data['item_id'] = item_encoder.fit_transform(data['Título'])

In [12]:
num_users = data['user_id'].nunique()
num_items = data['item_id'].nunique()

In [13]:
# Usaremos la última interacción de cada usuario para el conjunto de prueba

data['timestamp'] = pd.to_datetime(data['Fecha de préstamo'])
data = data.sort_values(by=['user_id', 'timestamp'])

In [14]:
# Obtener el último registro de cada usuario para el conjunto de prueba
test_data = data.groupby('user_id').tail(1)
train_data = data.drop(test_data.index)

Crear conjunto de datos para entrenamiento y prueba

In [15]:
# Marcar todas las interacciones como positivas
data['rating'] = 1

In [16]:
# Crear un conjunto de datos que simule interacciones negativas
# Para cada interacción positiva, crear N interacciones negativas
def create_negative_samples(df, num_negatives=4):
    users, items, labels = [], [], []
    user_item_set = set(zip(df['user_id'], df['item_id']))
    all_items = set(df['item_id'].unique())

    for (u, i) in user_item_set:
        users.append(u)
        items.append(i)
        labels.append(1)  # Interacción positiva
        for _ in range(num_negatives):
            negative_item = np.random.choice(list(all_items))
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(list(all_items))
            users.append(u)
            items.append(negative_item)
            labels.append(0)  # Interacción negativa

    return pd.DataFrame({'user_id': users, 'item_id': items, 'rating': labels})

train_data_neg = create_negative_samples(train_data)

In [17]:
# Definir el dataset para PyTorch
class LibraryDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_id'].values, dtype=torch.long)
        self.items = torch.tensor(df['item_id'].values, dtype=torch.long)
        self.labels = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

In [18]:
# Definir el modelo de recomendación basado en embeddings y FCNN
class NCF(pl.LightningModule):
    def __init__(self, num_users, num_items, embedding_dim=16):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        self.fc1 = nn.Linear(in_features=embedding_dim*2, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.loss_fn = nn.BCELoss()

    def forward(self, user_input, item_input):
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)
        vector = torch.cat([user_embedded, item_embedded], dim=-1)
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))
        pred = torch.sigmoid(self.output(vector))
        return pred

    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        preds = self(user_input, item_input)
        loss = self.loss_fn(preds.view(-1), labels)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [None]:
# Entrenar el modelo y optimizar hiperparámetros
embedding_dims = [16, 32, 64]
best_hit_ratio = 0
best_model = None

for embedding_dim in embedding_dims:
    model = NCF(num_users, num_items, embedding_dim=embedding_dim)
    trainer = pl.Trainer(max_epochs=5, accelerator='gpu', devices=1, enable_progress_bar=True)
    train_dataset = LibraryDataset(train_data_neg)
    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4)

    trainer.fit(model, train_loader)

    # Crear conjunto de prueba para cálculo de HIT RATIO@20
    user_item_set = set(zip(train_data['user_id'], train_data['item_id']))
    all_items = set(range(num_items))
    hits = []

    for idx, row in test_data.iterrows():
        u = row['user_id']
        true_item = row['item_id']
        interacted_items = set(train_data[train_data['user_id'] == u]['item_id'])
        non_interacted_items = all_items - interacted_items

        # Seleccionar 19 ítems negativos y agregar el ítem verdadero
        test_items = list(np.random.choice(list(non_interacted_items), 19, replace=False))
        test_items.append(true_item)

        # Realizar predicciones
        user_input = torch.tensor([u]*20, dtype=torch.long)
        item_input = torch.tensor(test_items, dtype=torch.long)

        preds = model(user_input, item_input).detach().numpy().reshape(-1)
        top_items = [test_items[i] for i in np.argsort(preds)[::-1][:20]]

        if true_item in top_items:
            hits.append(1)
        else:
            hits.append(0)

    hit_ratio = np.mean(hits)
    print(f'Embedding Dim: {embedding_dim}, HIT RATIO@20: {hit_ratio:.4f}')

    if hit_ratio > best_hit_ratio:
        best_hit_ratio = hit_ratio
        best_model = model

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4080 SUPER') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_

Training: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

Embedding Dim: 16, HIT RATIO@20: 1.0000


Training: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

Embedding Dim: 32, HIT RATIO@20: 1.0000


Training: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

Embedding Dim: 64, HIT RATIO@20: 1.0000


In [22]:
# Verificar si se cumple el criterio de aceptación
if best_hit_ratio >= 0.79:
    print(f'El mejor modelo alcanza un HIT RATIO@20 de {best_hit_ratio:.4f}, cumpliendo con el criterio.')
else:
    print(f'El mejor modelo alcanza un HIT RATIO@20 de {best_hit_ratio:.4f}, no cumple con el criterio.')

El mejor modelo alcanza un HIT RATIO@20 de 1.0000, cumpliendo con el criterio.
