1. Preparación de los datos

In [2]:
import pandas as pd

# Cargar el archivo CSV en un DataFrame
data = pd.read_csv("data/merged_data.csv")

# Realizar el preprocesamiento necesario en las columnas de texto (por ejemplo, eliminación de signos de puntuación, tokenización)
# Puedes utilizar bibliotecas como nltk o spaCy para el preprocesamiento.

In [3]:
!pip install nltk




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Descargar recursos adicionales de NLTK (si aún no se han descargado)
nltk.download('stopwords')
nltk.download('punkt')

# Crear una lista de palabras de detención y definir un tokenizador
stop_words = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r'\w+')

# Función para preprocesar el texto
def preprocess_text(text):
    # Convierte el texto a minúsculas
    text = text.lower()
    
    # Elimina signos de puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenización
    words = word_tokenize(text)
    
    # Elimina palabras de detención
    words = [word for word in words if word not in stop_words]
    
    # Reconstruye el texto preprocesado
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text

# Aplica la función de preprocesamiento a la columna 'text' del DataFrame
data['preprocessed_text'] = data['text'].apply(preprocess_text)

# Muestra el DataFrame resultante con el texto preprocesado
print(data[['text', 'preprocessed_text']])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


                                                   text  \
0     The third wave was an experimentto see how peo...   
1     The Third Wave developed  rapidly because the ...   
2     The third wave only started as an experiment w...   
3     The experimen was orginally about how even whe...   
4     The third wave developed so quickly due to the...   
...                                                 ...   
7160  It has to be made on a complex storyline, with...   
7161  Aristotle descirbes an ideal tradgedy as being...   
7162  A tragedy should have a complex plan not a sim...   
7163  Aristotle believed that the ideal tradegy shou...   
7164  An ideal tragety has three elements that make ...   

                                      preprocessed_text  
0     third wave experimentto see people reacted new...  
1     third wave developed rapidly students genuinly...  
2     third wave started experiment within class slo...  
3     experimen orginally even terrible thngs happen...  
4

2. Representaciones de texto con BERT

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Cargar el tokenizador y el modelo BERT pre-entrenado
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Tokenizar tus textos y ajustar la longitud de las secuencias
input_ids = []
attention_masks = []

for text in data['text']:
    encoding = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3. Construcción del modelo de regresión

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# Definir un modelo de regresión basado en BERT
class BERTRegressionModel(nn.Module):
    def __init__(self, model):
        super(BERTRegressionModel, self).__init__()
        self.bert = model
        self.regression_head = nn.Linear(768, 1)  # Salida de BERT (768 dimensiones) a una sola neurona para regresión

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        pooled_output = outputs.pooler_output
        regression_output = self.regression_head(pooled_output)
        return regression_output

# Crear una instancia del modelo de regresión
regression_model = BERTRegressionModel(model)

# Definir la función de pérdida y el optimizador para regresión
criterion = nn.MSELoss()  # Error cuadrático medio
optimizer = optim.Adam(regression_model.parameters(), lr=0.001)


4. Entrenamiento del modelo

In [23]:
from torch.utils.data import DataLoader, TensorDataset

# Crear conjuntos de datos de PyTorch
X_train_tensor = torch.stack(input_ids)
mask_train_tensor = torch.stack(attention_masks)
y_content = torch.tensor(data['content'], dtype=torch.float32)  # Reemplaza 'content' con tu columna de contenido
y_wording = torch.tensor(data['wording'], dtype=torch.float32)  # Reemplaza 'wording' con tu columna de redacción

dataset_content = TensorDataset(X_train_tensor, mask_train_tensor, y_content)
dataset_wording = TensorDataset(X_train_tensor, mask_train_tensor, y_wording)

# Define un DataLoader para cargar datos de entrenamiento en lotes
batch_size = 32
dataloader_content = DataLoader(dataset_content, batch_size=batch_size)
dataloader_wording = DataLoader(dataset_wording, batch_size=batch_size)

# Función de entrenamiento
def train_regression_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        for inputs, masks, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs, masks)
            loss = criterion(outputs.view(-1), targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Entrenar el modelo para 'content'
train_regression_model(regression_model, dataloader_content, criterion, optimizer)

# Entrenar el modelo para 'wording'
train_regression_model(regression_model, dataloader_wording, criterion, optimizer)


RuntimeError: stack expects each tensor to be equal size, but got [1, 71] at entry 0 and [1, 263] at entry 1

5. Predicciones

In [None]:
model.eval()
with torch.no_grad():
    predictions = model(X_test, attention_masks_test)

6. Evaluación, generación de comentarios y presentación de resultados

In [None]:
# Evaluar el modelo, comparar las predicciones con los valores reales y generar comentarios.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Generar comentarios
for i in range(len(predictions)):
    if predictions[i] >= y_test[i]:
        print(f"Para el ejemplo {i+1}, el contenido es bueno.")
    else:
        print(f"Para el ejemplo {i+1}, el contenido se puede mejorar.")

# Muestra los comentarios junto con las predicciones para proporcionar retroalimentación completa.