In [1]:
#Importacion de librerias
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import pandas as pd
from sklearn.metrics import mean_squared_error
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


# Regresion de Texto

## Zero-Shot-Learning

In [2]:
# Importacion del dataset
sentiment140 = load_dataset("stanfordnlp/sentiment140",
cache_dir= "local-datasets", trust_remote_code=True                
)
sentiment140

DatasetDict({
    train: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query'],
        num_rows: 1600000
    })
    test: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query'],
        num_rows: 498
    })
})

In [100]:
sentiment140['train'].to_pandas()

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,TheWDB.com - Very cool to hear old Walt interv...,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,Are you ready for your MoJo Makeover? Ask me f...,Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,Happy 38th Birthday to my boo of alll time!!! ...,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [103]:
df = sentiment140['train'].to_pandas()

In [104]:
df['sentiment'].unique()

array([0, 4], dtype=int32)

In [3]:
# Seleccionar la división 'train'
sentiment140_train = sentiment140['train']

In [4]:
# Seleccionar solo las columnas relevantes
sentiment140_relevant = pd.DataFrame(
    sentiment140['train'].remove_columns([col for col in sentiment140['train'].features if col not in ['text', 'sentiment']])
)

# Mostrar los primeros registros
print(sentiment140_relevant.head())

                                                text  sentiment
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...          0
1  is upset that he can't update his Facebook by ...          0
2  @Kenichan I dived many times for the ball. Man...          0
3    my whole body feels itchy and like its on fire           0
4  @nationwideclass no, it's not behaving at all....          0


In [5]:
# Función para limpiar el texto
def clean_text(text):
    # Eliminar menciones y hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Eliminar URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Eliminar caracteres no alfabéticos y dejar solo letras y espacios
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convertir a minúsculas
    text = text.lower()
    return text

# Aplicar la limpieza al texto en el DataFrame relevante
sentiment140_relevant['text'] = sentiment140_relevant['text'].apply(clean_text)

# Verificar los primeros registros después de la limpieza
print(sentiment140_relevant[['text', 'sentiment']])

                                                      text  sentiment
0           a thats a bummer  you shoulda got david car...          0
1        is upset that he cant update his facebook by t...          0
2         i dived many times for the ball managed to sa...          0
3          my whole body feels itchy and like its on fire           0
4         no its not behaving at all im mad why am i he...          0
...                                                    ...        ...
1599995  just woke up having no school is the best feel...          4
1599996  thewdbcom  very cool to hear old walt intervie...          4
1599997  are you ready for your mojo makeover ask me fo...          4
1599998  happy th birthday to my boo of alll time tupac...          4
1599999                                         happy               4

[1600000 rows x 2 columns]


In [6]:
# Reducir el tamaño del dataset 
sentiment140_sample = sentiment140_relevant.sample(n=4000, random_state=42)
print(f"Tamaño del dataset reducido: {len(sentiment140_sample)}")

Tamaño del dataset reducido: 4000


In [109]:
sentiment140_sample['sentiment'].value_counts(normalize= True)

sentiment
0    0.5
4    0.5
Name: proportion, dtype: float64

In [7]:
# Extraer texto y etiquetas de la muestra
texts = sentiment140_sample['text']
true_labels = sentiment140_sample['sentiment']

In [57]:
# Cargar el modelo y el tokenizer
model_zero_regression = "t5-small"
tokenizer_zero_regression = T5Tokenizer.from_pretrained(model_zero_regression)
model_zero_regression = T5ForConditionalGeneration.from_pretrained(model_zero_regression)




In [60]:
def run_t5_model(texto_entrada, umbral):
    # Crear el prompt
    prompt_zero_regression = (f"Rate the sentiment of the following text from 0 to 4: {texto_entrada}")
    
    # Tokenizar y generar la respuesta
    inputs_zero_regression = tokenizer_zero_regression(prompt_zero_regression, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        output_zero_regression = model_zero_regression.generate(**inputs_zero_regression)
    
    # Decodificar y extraer la puntuación
    result_zero_regression = tokenizer_zero_regression.decode(output_zero_regression[0], skip_special_tokens=True).strip()
    match_zero_regression = re.search(r'\b[0-4]\b', result_zero_regression)  
    probabilidad_zero_regression = float(match_zero_regression.group()) if match_zero_regression else None
    
    # Aplicar filtro por umbral
    return "No estoy seguro" if probabilidad_zero_regression is None or probabilidad_zero_regression < umbral else probabilidad_zero_regression

In [8]:
predictions_zero_regression = []
umbral = 1

In [69]:
for text in texts:
    result_zero_regression = run_t5_model(text, umbral)  
    # Extraer la probabilidad de la respuesta si está en el resultado
    probabilidad_zero_regression = float(result_zero_regression) if isinstance(result_zero_regression, (int, float)) else None
    predictions_zero_regression.append(probabilidad_zero_regression)



In [70]:
# Filtrar predicciones válidas
filtered_true_labels_zero_regression = [true_labels.iloc[i] for i in range(len(predictions_zero_regression)) if i < len(true_labels) and predictions_zero_regression[i] is not None]
filtered_predictions_zero_regression = [pred for pred in predictions_zero_regression if pred is not None]

### Analisis con MSE

In [74]:
# Calcular MSE
if filtered_true_labels_zero_regression and filtered_predictions_zero_regression:
    mse = mean_squared_error(filtered_true_labels_zero_regression, filtered_predictions_zero_regression)
    print(f"Mean Squared Error: {mse}")
else:
    print("No se pudieron calcular métricas debido a predicciones insuficientes.")

Mean Squared Error: 5.333333333333333
