In [20]:
import os
import pandas as pd
import numpy as np
import language_tool_python
import spacy
from spacy.lang.es.stop_words import STOP_WORDS
import re
import nltk
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
import torch
import fasttext

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jescobarmora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
train_df_path = os.path.join(parent_dir, 'data', 'Sarcasmo_train.csv')
test_df_path = os.path.join(parent_dir, 'data', 'Sarcasmo_test.csv')

train_df = pd.read_csv(train_df_path, sep=';', encoding='utf-8')
test_df = pd.read_csv(train_df_path, sep=';', encoding='utf-8')

In [6]:
train_df.head()

Unnamed: 0,Locutor,Locución,Sarcasmo
0,archer,"No, era por saber si tenía que llevar un saco ...",Si
1,archer,"A quién le importa? No, lo pregunto de verdad",No
2,archer,Pero voy a dar por hecho que ha pedido refuerzos,No
3,malory,¿Por qué no te callas? Tengo que pensar,No
4,slater,"Sí, sospechábamos un poco",No


In [7]:
test_df.head()

Unnamed: 0,Locutor,Locución,Sarcasmo
0,archer,"No, era por saber si tenía que llevar un saco ...",Si
1,archer,"A quién le importa? No, lo pregunto de verdad",No
2,archer,Pero voy a dar por hecho que ha pedido refuerzos,No
3,malory,¿Por qué no te callas? Tengo que pensar,No
4,slater,"Sí, sospechábamos un poco",No


In [9]:
# Inicializar herramienta de corrección ortográfica
tool = language_tool_python.LanguageTool('es')

def correct_text(text):
    matches = tool.check(text)
    corrected = language_tool_python.utils.correct(text, matches)
    return corrected

# Aplicar corrección ortográfica
train_df['Locución'] = train_df['Locución'].apply(correct_text)
test_df['Locución'] = test_df['Locución'].apply(correct_text)

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:05<00:00, 43.9MB/s] 
Unzipping /tmp/tmpmh9k7puw.zip to /home/jescobarmora/.cache/language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /home/jescobarmora/.cache/language_tool_python.


In [None]:
# Cargar modelo de spaCy en español
nlp = spacy.load('es_core_news_lg')

# Unir stopwords de NLTK y spaCy
stop_nltk = stopwords.words('spanish')
stop_spacy = list(STOP_WORDS)
stop_words = set(stop_nltk + stop_spacy)

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha and token.text.lower() not in stop_words]
    return ' '.join(tokens)

# Aplicar preprocesamiento
train_df['processed_text'] = train_df['Locución'].apply(preprocess_text)
test_df['processed_text'] = test_df['Locución'].apply(preprocess_text)


In [14]:
def extract_pos_counts(text):
    doc = nlp(text)
    pos_counts = doc.count_by(spacy.attrs.POS)
    total_tokens = len(doc)
    features = {
        'nouns': pos_counts.get(nlp.vocab.strings['NOUN'], 0) / total_tokens,
        'verbs': pos_counts.get(nlp.vocab.strings['VERB'], 0) / total_tokens,
        'adjectives': pos_counts.get(nlp.vocab.strings['ADJ'], 0) / total_tokens,
        'adverbs': pos_counts.get(nlp.vocab.strings['ADV'], 0) / total_tokens,
        # Agrega más categorías si lo deseas
    }
    return pd.Series(features)

# Aplicar extracción de características
train_pos_features = train_df['Locución'].apply(extract_pos_counts)
test_pos_features = test_df['Locución'].apply(extract_pos_counts)

In [17]:
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
model = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().numpy()
    return cls_embedding.flatten()

# Obtener embeddings
train_df['bert_embedding'] = train_df['processed_text'].apply(get_bert_embedding)
test_df['bert_embedding'] = test_df['processed_text'].apply(get_bert_embedding)

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [19]:
# Guardar textos en archivos temporales
train_df['processed_text'].to_csv('train_texts.txt', index=False, header=False)
test_df['processed_text'].to_csv('test_texts.txt', index=False, header=False)

# Entrenar modelo FastText
ft_model = fasttext.train_unsupervised('train_texts.txt', model='skipgram')

def get_fasttext_embedding(text):
    words = text.split()
    word_embeddings = [ft_model.get_word_vector(word) for word in words if word in ft_model.words]
    if len(word_embeddings) == 0:
        return np.zeros(ft_model.get_dimension())
    else:
        return np.mean(word_embeddings, axis=0)

# Obtener embeddings
train_df['fasttext_embedding'] = train_df['processed_text'].apply(get_fasttext_embedding)
test_df['fasttext_embedding'] = test_df['processed_text'].apply(get_fasttext_embedding)


Read 0M words
Number of words:  119
Number of labels: 0
Progress: 100.0% words/sec/thread:  258126 lr:  0.000000 avg.loss:  4.128316 ETA:   0h 0m 0s


In [21]:
# Codificar etiquetas
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['Sarcasmo'])
test_df['label'] = label_encoder.transform(test_df['Sarcasmo'])

# Convertir listas de embeddings a matrices
train_bert_embeddings = np.stack(train_df['bert_embedding'].values)
test_bert_embeddings = np.stack(test_df['bert_embedding'].values)

train_fasttext_embeddings = np.stack(train_df['fasttext_embedding'].values)
test_fasttext_embeddings = np.stack(test_df['fasttext_embedding'].values)

# Convertir características gramaticales a matrices
train_pos_features = train_pos_features.reset_index(drop=True)
test_pos_features = test_pos_features.reset_index(drop=True)

In [22]:
from sklearn.metrics import roc_auc_score, accuracy_score

def evaluate_model(model, X_test, y_test):
    y_probs = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_probs)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return roc_auc, accuracy

Modelo 1: BERT sin Ingeniería de Características

In [23]:
from sklearn.linear_model import LogisticRegression

# Entrenar modelo
model1 = LogisticRegression(max_iter=1000)
model1.fit(train_bert_embeddings, train_df['label'])

# Evaluar modelo
roc_auc1, accuracy1 = evaluate_model(model1, test_bert_embeddings, test_df['label'])
print(f'Modelo 1 - BERT sin ingeniería de características: ROC AUC = {roc_auc1:.4f}, Accuracy = {accuracy1:.4f}')


Modelo 1 - BERT sin ingeniería de características: ROC AUC = 0.9929, Accuracy = 0.9970


Modelo 2: BERT con Ingeniería de Características

In [24]:
from sklearn.preprocessing import StandardScaler

# Concatenar embeddings y características
X_train_model2 = np.hstack([train_bert_embeddings, train_pos_features.values])
X_test_model2 = np.hstack([test_bert_embeddings, test_pos_features.values])

# Escalar características
scaler = StandardScaler()
X_train_model2 = scaler.fit_transform(X_train_model2)
X_test_model2 = scaler.transform(X_test_model2)

# Entrenar modelo
model2 = LogisticRegression(max_iter=1000)
model2.fit(X_train_model2, train_df['label'])

# Evaluar modelo
roc_auc2, accuracy2 = evaluate_model(model2, X_test_model2, test_df['label'])
print(f'Modelo 2 - BERT con ingeniería de características: ROC AUC = {roc_auc2:.4f}, Accuracy = {accuracy2:.4f}')

Modelo 2 - BERT con ingeniería de características: ROC AUC = 0.9996, Accuracy = 0.9985


Modelo 3: FastText sin Ingeniería de Características

In [25]:
# Entrenar modelo
model3 = LogisticRegression(max_iter=1000)
model3.fit(train_fasttext_embeddings, train_df['label'])

# Evaluar modelo
roc_auc3, accuracy3 = evaluate_model(model3, test_fasttext_embeddings, test_df['label'])
print(f'Modelo 3 - FastText sin ingeniería de características: ROC AUC = {roc_auc3:.4f}, Accuracy = {accuracy3:.4f}')

Modelo 3 - FastText sin ingeniería de características: ROC AUC = 0.7176, Accuracy = 0.9285


In [26]:
# Concatenar embeddings y características
X_train_model4 = np.hstack([train_fasttext_embeddings, train_pos_features.values])
X_test_model4 = np.hstack([test_fasttext_embeddings, test_pos_features.values])

# Escalar características
X_train_model4 = scaler.fit_transform(X_train_model4)
X_test_model4 = scaler.transform(X_test_model4)

# Entrenar modelo
model4 = LogisticRegression(max_iter=1000)
model4.fit(X_train_model4, train_df['label'])

# Evaluar modelo
roc_auc4, accuracy4 = evaluate_model(model4, X_test_model4, test_df['label'])
print(f'Modelo 4 - FastText con ingeniería de características: ROC AUC = {roc_auc4:.4f}, Accuracy = {accuracy4:.4f}')

Modelo 4 - FastText con ingeniería de características: ROC AUC = 0.8873, Accuracy = 0.9434


In [None]:
# Crear DataFrame con los resultados
results = pd.DataFrame({
    'Modelo': [
        'BERT sin ingeniería de características',
        'BERT con ingeniería de características',
        'FastText sin ingeniería de características',
        'FastText con ingeniería de características'
    ],
    'ROC AUC': [roc_auc1, roc_auc2, roc_auc3, roc_auc4],
    'Accuracy': [accuracy1, accuracy2, accuracy3, accuracy4]
})

# Ordenar por ROC AUC
results = results.sort_values(by='ROC AUC', ascending=False)
print(results)

# Guardar resultados en CSV
results.to_csv('resultados_modelos.csv', index=False)

                                       Modelo   ROC AUC  Accuracy
1      BERT con ingeniería de características  0.999632  0.998510
0      BERT sin ingeniería de características  0.992894  0.997019
3  FastText con ingeniería de características  0.887256  0.943368
2  FastText sin ingeniería de características  0.717596  0.928465


: 