1. Preparación de los datos

In [2]:
import pandas as pd

# Cargar el archivo CSV en un DataFrame
data = pd.read_csv("data/merged_data.csv")

# Realizar el preprocesamiento necesario en las columnas de texto (por ejemplo, eliminación de signos de puntuación, tokenización)
# Puedes utilizar bibliotecas como nltk o spaCy para el preprocesamiento.

In [3]:
!pip install nltk




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Descargar recursos adicionales de NLTK (si aún no se han descargado)
nltk.download('stopwords')
nltk.download('punkt')

# Crear una lista de palabras de detención y definir un tokenizador
stop_words = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r'\w+')

# Función para preprocesar el texto
def preprocess_text(text):
    # Convierte el texto a minúsculas
    text = text.lower()
    
    # Elimina signos de puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenización
    words = word_tokenize(text)
    
    # Elimina palabras de detención
    words = [word for word in words if word not in stop_words]
    
    # Reconstruye el texto preprocesado
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text

# Aplica la función de preprocesamiento a la columna 'text' del DataFrame
data['preprocessed_text'] = data['text'].apply(preprocess_text)

# Muestra el DataFrame resultante con el texto preprocesado
print(data[['text', 'preprocessed_text']])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


                                                   text  \
0     The third wave was an experimentto see how peo...   
1     The Third Wave developed  rapidly because the ...   
2     The third wave only started as an experiment w...   
3     The experimen was orginally about how even whe...   
4     The third wave developed so quickly due to the...   
...                                                 ...   
7160  It has to be made on a complex storyline, with...   
7161  Aristotle descirbes an ideal tradgedy as being...   
7162  A tragedy should have a complex plan not a sim...   
7163  Aristotle believed that the ideal tradegy shou...   
7164  An ideal tragety has three elements that make ...   

                                      preprocessed_text  
0     third wave experimentto see people reacted new...  
1     third wave developed rapidly students genuinly...  
2     third wave started experiment within class slo...  
3     experimen orginally even terrible thngs happen...  
4

2. Representaciones de texto con BERT

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Cargar el tokenizador y el modelo BERT pre-entrenado
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Tokenizar tus textos y ajustar la longitud de las secuencias
input_ids = []
attention_masks = []

for text in data['text']:
    encoding = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from torch.nn.utils.rnn import pad_sequence

# Define la longitud máxima deseada para tus secuencias (ajusta según tus necesidades)
max_sequence_length = 256

# Función para aplicar el relleno
def pad_and_create_mask(input_ids):
    padded_input_ids = []
    attention_masks = []

    for ids in input_ids:
        if len(ids) > max_sequence_length:
            padded_ids = ids[:max_sequence_length]
            mask = torch.tensor([1] * max_sequence_length)
        else:
            padded_ids = torch.cat((ids, torch.zeros(max_sequence_length - len(ids), dtype=torch.long)))
            mask = torch.cat((torch.ones(len(ids), dtype=torch.long)), torch.zeros(max_sequence_length - len(ids), dtype=torch.long))
        
        padded_input_ids.append(padded_ids)
        attention_masks.append(mask)

    return torch.stack(padded_input_ids), torch.stack(attention_masks)

# Aplicar el relleno y crear las máscaras de atención
input_ids, attention_masks = pad_and_create_mask(input_ids)


RuntimeError: Tensors must have same number of dimensions: got 2 and 1

3. Construcción del modelo de regresión

In [None]:
import torch.nn as nn
import torch.optim as optim

# Define un modelo de regresión que utiliza BERT como base
class Regressor(nn.Module):
    def __init__(self):
        super(Regressor, self).__init__()
        self.bert = model.bert
        self.linear = nn.Linear(768, 1)  # 768 es la dimensión de salida de BERT
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_masks):
        output = self.bert(input_ids, attention_mask=attention_masks)[0]
        output = output[:, 0, :]  # Tomar la representación [CLS] de BERT
        output = self.linear(output)
        output = self.relu(output)
        return output

model = Regressor()
criterion = nn.MSELoss()  # Error cuadrático medio como función de pérdida
optimizer = optim.Adam(model.parameters(), lr=0.001)

4. Entrenamiento del modelo

In [None]:
from sklearn.model_selection import train_test_split

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(input_ids, data['content'], test_size=0.2, random_state=42)
attention_masks_train, attention_masks_test, _, _ = train_test_split(attention_masks, data['content'], test_size=0.2, random_state=42)

# Entrenar el modelo
epochs = 5  # Número de épocas (ajusta según sea necesario)

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train, attention_masks_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()


5. Predicciones

In [None]:
model.eval()
with torch.no_grad():
    predictions = model(X_test, attention_masks_test)

6. Evaluación, generación de comentarios y presentación de resultados

In [None]:
# Evaluar el modelo, comparar las predicciones con los valores reales y generar comentarios.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Generar comentarios
for i in range(len(predictions)):
    if predictions[i] >= y_test[i]:
        print(f"Para el ejemplo {i+1}, el contenido es bueno.")
    else:
        print(f"Para el ejemplo {i+1}, el contenido se puede mejorar.")

# Muestra los comentarios junto con las predicciones para proporcionar retroalimentación completa.