In [1]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import re

# Paso 1: Cargar y limpiar el dataset
data = pd.read_csv('dataset/Liar_Dataset.csv')

# Eliminar columnas innecesarias
columns_to_drop = ['[ID].json', 'barely true counts', 'false counts', 
                   'half true counts', 'mostly true counts', 'pants on fire counts']
liar_dataset_cleaned = data.drop(columns=columns_to_drop)

# Rellenar valores nulos y limpiar el texto
liar_dataset_cleaned['speaker\'s job title'].fillna('Unknown', inplace=True)
liar_dataset_cleaned['state info'].fillna('Unknown', inplace=True)
liar_dataset_cleaned['venue'].fillna('Unknown', inplace=True)

# Mapeo de etiquetas a 'TRUE' o 'FALSE'
label_mapping = {
    'TRUE': 'TRUE',
    'mostly-true': 'TRUE',
    'FALSE': 'FALSE',
    'barely-true': 'FALSE',
    'half-true': 'FALSE',
    'pants-fire': 'FALSE'
}
liar_dataset_cleaned['label'] = liar_dataset_cleaned['label'].map(label_mapping)

# Limpieza de texto
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remueve caracteres especiales
    text = re.sub(r'\s+', ' ', text).strip()  # Remueve espacios adicionales
    return text

liar_dataset_cleaned['statement'] = liar_dataset_cleaned['statement'].apply(clean_text)

# Paso 2: Codificación de etiquetas y división en entrenamiento/prueba
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
liar_dataset_cleaned['label_encoded'] = label_encoder.fit_transform(liar_dataset_cleaned['label'])

X = liar_dataset_cleaned['statement']
y = liar_dataset_cleaned['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Paso 3: Cargar DistilBERT y Tokenizar
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

def tokenize_texts(texts, tokenizer, max_length=64):
    return tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

train_encodings = tokenize_texts(X_train, tokenizer)
test_encodings = tokenize_texts(X_test, tokenizer)

# Paso 4: Configurar el optimizador y la pérdida
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Paso 5: Entrenar el modelo usando GradientTape
batch_size = 16
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    y_train
)).batch(batch_size)

epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in train_dataset:
        inputs, labels = batch
        with tf.GradientTape() as tape:
            logits = distilbert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).logits
            loss = loss_fn(labels, logits)
        gradients = tape.gradient(loss, distilbert_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, distilbert_model.trainable_variables))
    print(f"Loss after epoch {epoch + 1}: {loss.numpy()}")

# Paso 6: Evaluar el modelo en el conjunto de prueba
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
    y_test
)).batch(batch_size)

accuracy_metric = tf.keras.metrics.BinaryAccuracy()

for batch in test_dataset:
    inputs, labels = batch
    logits = distilbert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).logits
    predictions = tf.round(tf.nn.sigmoid(logits))  # Convertir logits a etiquetas binarias
    accuracy_metric.update_state(labels, predictions)

print(f"Accuracy en el conjunto de prueba: {accuracy_metric.result().numpy()}")

  from .autonotebook import tqdm as notebook_tqdm
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  liar_dataset_cleaned['speaker\'s job title'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  liar_dataset_cleaned['state info'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0

Epoch 1/3


2024-11-07 17:14:17.184447: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss after epoch 1: 0.5629598498344421
Epoch 2/3


2024-11-07 17:17:51.545302: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss after epoch 2: 0.2551004886627197
Epoch 3/3


2024-11-07 17:21:22.333382: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss after epoch 3: 0.0071550593711435795
Accuracy en el conjunto de prueba: 0.6161063313484192


2024-11-07 17:21:29.954807: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "True"

In [None]:
from tensorflow.keras.optimizers import AdamW

optimizer = AdamW(learning_rate=2e-5)
optimizer.build(distilbert_model.trainable_variables)

# Continuar con el entrenamiento
epochs = 5
batch_size = 32

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in train_dataset:
        inputs, labels = batch
        with tf.GradientTape() as tape:
            logits = distilbert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']).logits
            loss = loss_fn(labels, logits)
        gradients = tape.gradient(loss, distilbert_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, distilbert_model.trainable_variables))
    print(f"Loss after epoch {epoch + 1}: {loss.numpy()}")

Epoch 1/5


2024-11-07 17:32:18.683821: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss after epoch 1: 0.685687780380249
Epoch 2/5
