In [14]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# 1. Preparación de Datos
data = pd.read_csv('data/normalized_merged_data.csv')
data['text'] = data['text'].str.lower().str.replace('[^a-z\s]', '')

train, test = train_test_split(data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

train_encodings = tokenizer(train['text'].tolist(), truncation=True, padding='max_length', max_length=25, return_tensors='tf')
val_encodings = tokenizer(val['text'].tolist(), truncation=True, padding='max_length', max_length=25, return_tensors='tf')
test_encodings = tokenizer(test['text'].tolist(), truncation=True, padding='max_length', max_length=25, return_tensors='tf')

  data['text'] = data['text'].str.lower().str.replace('[^a-z\s]', '')


In [15]:
# 2. Definición del Modelo
base_model = TFGPT2LMHeadModel.from_pretrained("gpt2-medium")
input_layer = tf.keras.layers.Input(shape=(25,), dtype=tf.int32)
sequence_output = base_model(input_layer)[0]
cls_token = sequence_output[:, 0, :]
content_head = tf.keras.layers.Dense(1, activation='sigmoid', name='content')(cls_token)
wording_head = tf.keras.layers.Dense(1, activation='sigmoid', name='wording')(cls_token)
model = tf.keras.models.Model(inputs=input_layer, outputs=[content_head, wording_head])
model.compile(optimizer='adam', loss='mean_squared_error')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [16]:
# 3. Entrenamiento
model.fit(train_encodings['input_ids'], [train['normalized_content'], train['normalized_wording']], validation_data=(val_encodings['input_ids'], [val['normalized_content'], val['normalized_wording']]), epochs=5)

Epoch 1/5
 17/144 [==>...........................] - ETA: 1:35:32 - loss: 0.6729 - content_loss: 0.5251 - wording_loss: 0.1478


KeyboardInterrupt



In [None]:
# 3.5 guardar modelo
model.save('nlp_gpt2.h5')

In [None]:
# 4. Evaluación
losses = model.evaluate(test_encodings['input_ids'], [test['normalized_content'], test['normalized_wording']])

In [None]:
# 5. Predicción
new_text = ["The Third Wave was an experiment to see how people reacted to a new one leader government."]
new_encodings = tokenizer(new_text, truncation=True, padding='max_length', max_length=25, return_tensors='tf')
predicted_content, predicted_wording = model.predict(new_encodings['input_ids'])

In [None]:
# 6. Muestra de graficas evaluando el modelo

import matplotlib.pyplot as plt

# Supongamos que ya tienes el modelo entrenado y que has realizado predicciones en el conjunto de prueba
predicted_content, predicted_wording = model.predict(test_encodings['input_ids'])

# Obtener los valores reales del CSV
actual_content = test['normalized_content']
actual_wording = test['normalized_wording']

# Crear un scatter plot para comparar las predicciones con los valores reales
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.scatter(actual_content, predicted_content, alpha=0.5)
plt.title('Comparación de Content (Valores Reales vs. Predicciones)')
plt.xlabel('Valor Real')
plt.ylabel('Predicción')

plt.subplot(1, 2, 2)
plt.scatter(actual_wording, predicted_wording, alpha=0.5)
plt.title('Comparación de Wording (Valores Reales vs. Predicciones)')
plt.xlabel('Valor Real')
plt.ylabel('Predicción')

plt.tight_layout()

# Crear histogramas de errores
error_content = actual_content - predicted_content
error_wording = actual_wording - predicted_wording

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.hist(error_content, bins=20, color='blue', alpha=0.7)
plt.title('Histograma de Errores en Content')
plt.xlabel('Error')
plt.ylabel('Frecuencia')

plt.subplot(1, 2, 2)
plt.hist(error_wording, bins=20, color='green', alpha=0.7)
plt.title('Histograma de Errores en Wording')
plt.xlabel('Error')
plt.ylabel('Frecuencia')

plt.tight_layout()

plt.show()
