In [25]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report

In [4]:
def read_idx2label(json_path: str) -> pd.Series:
    """This function read the json file and return a dictionary
    Args:
      json_path (str): path to the json file
     Returns:
      idx2label (dict): dictionary with the mapping"""
    with open(json_path) as f:
        idx2label = json.load(f)
    return idx2label

idx2label = read_idx2label(json_path="../topic_mapping_1.json")

In [5]:
def decode_labels_into_idx(labels: pd.Series, idx2label: dict) -> pd.Series:
    """This function decode the labels into idx
    Args:
      labels (pd.Series): series with the labels
      idx2label (dict): dictionary with the mapping
     Returns:
      labels (pd.Series): series with the labels decoded
    """
    return labels.map(idx2label)

In [6]:
df = pd.read_csv("data/tickets_inputs_eng_2.csv")

In [8]:
y_encoded = df["relevant_topics"]

In [9]:
label2idx = {value: key for key, value in idx2label.items()}
y = decode_labels_into_idx(labels=y_encoded, idx2label=label2idx)

In [12]:
len(label2idx)

3

In [10]:
X = df['processed_text']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cargar el modelo pre-entrenado BERT y el tokenizador
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label2idx))

# Tokenizar los textos y preparar los datasets para entrenamiento y prueba
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Assuming y_train and y_test are pandas Series
y_train_numeric = y_train.values.astype(np.float64)  # Convert to numpy array and float64
y_train_numeric = y_train_numeric.reshape(-1, 1)  # Reshape if necessary

y_test_numeric = y_test.values.astype(np.float64)  # Convert to numpy array and float64
y_test_numeric = y_test_numeric.reshape(-1, 1)  # Reshape if necessary

# Then create the datasets
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train_numeric)  # Use the converted numerical array
)

test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(y_test_numeric)  # Use the converted numerical array
)

In [29]:
batch_size = 16
epochs = 2
lr = 2e-5

# Crear DataLoaders para entrenamiento y prueba
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)



In [31]:
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Cambiando la función de pérdida

# Entrenamiento del modelo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Agregar tqdm para mostrar la barra de progreso
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)

    for batch in tqdm(progress_bar):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2].squeeze()}  # Assuming 'labels' are squeezed for binary classification

        optimizer.zero_grad()
        outputs = model(**inputs)
        
        # Ajustar las salidas y etiquetas para clasificación binaria
        logits = outputs.logits.squeeze(-1)  # Squeeze the logits to have size [batch_size]
        labels = inputs['labels'].float()  # Assuming 'labels' are already squeezed to [batch_size]

        loss = loss_fn(logits, labels)  # Usar BCEWithLogitsLoss para clasificación binaria
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()

        # Actualizar la barra de progreso
        progress_bar.set_postfix({'Training loss': total_loss / len(progress_bar)})

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss}')

  0%|          | 0/948 [14:01<?, ?it/s]           


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 3]))

In [None]:
model.eval()
all_preds = []
all_labels = []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1]}

    with torch.no_grad():
        outputs = model(**inputs)
    
    preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    labels = batch[2].cpu().numpy()
    all_preds.extend(preds)
    all_labels.extend(labels)