In [2]:
#Lectura de documentos
import pandas as pd
def leer_json(archivo):
    """Load a JSON-lines dataset and return its texts and labels.

    Parameters
    ----------
    archivo : path or file-like object accepted by ``pandas.read_json``;
        it is read with ``lines=True`` (one JSON record per line).

    Returns
    -------
    tuple of two numpy arrays: the ``'text'`` column and the ``'klass'``
    column, in that order.

    Raises
    ------
    KeyError if either column is missing from the file.
    """
    tabla = pd.read_json(archivo, lines=True)
    textos = tabla['text'].to_numpy()
    clases = tabla['klass'].to_numpy()
    return textos, clases


In [4]:
import nltk
# Fetch the NLTK stopword corpus used later by stopwords.words("spanish").
# NOTE(review): word_tokenize (used in mi_preprocesamiento) presumably needs
# an additional NLTK tokenizer resource that is not downloaded here - confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leopa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import re

def eliminar_emojis(texto):
    """Return ``texto`` with every run of characters in the emoji ranges removed.

    The character class is assembled from the code-point ranges listed
    below. Note that the last range (U+24C2 to U+1F251) is very wide, so it
    also strips any non-emoji character whose code point falls inside it.
    """
    rangos = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols and pictographs
        "\U0001F680-\U0001F6FF",  # transport and map symbols
        "\U0001F1E0-\U0001F1FF",  # flags
        "\U00002702-\U000027B0",  # other symbols
        "\U000024C2-\U0001F251",
    )
    patron = re.compile("[" + "".join(rangos) + "]+", flags=re.UNICODE)
    return patron.sub('', texto)

In [13]:
import unicodedata
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
# Spanish Snowball stemmer shared by mi_tokenizador_con_stemming.
stemmer = SnowballStemmer("spanish")

# Stored as a set because the tokenizers test membership once per token:
# a set lookup is O(1) whereas the original list was scanned linearly.
# Extend it with _STOPWORDS.add("palabra") / _STOPWORDS.update([...]).
# NOTE(review): normaliza_texto strips accents by default, so stopwords that
# carry accents in the NLTK list may never match normalized tokens - confirm.
_STOPWORDS = set(stopwords.words("spanish"))

# Characters that normaliza_texto drops when punct=False.
PUNCTUACTION = ";:,.\\-\"'/"
SYMBOLS = "()[]¿?¡!{}~<>|"
# Characters that normaliza_texto drops when num=False.
NUMBERS= "0123456789"
SKIP_SYMBOLS = set(PUNCTUACTION + SYMBOLS)
SKIP_SYMBOLS_AND_SPACES = set(PUNCTUACTION + SYMBOLS + '\t\n\r ')

def normaliza_texto(input_str,
                    punct=False,
                    accents=False,
                    num=False,
                    max_dup=2):
    """Normalize a string character by character.

    The input is decomposed with Unicode NFKD and then filtered:

    punct   -- False removes the characters in SKIP_SYMBOLS; True keeps them.
    accents -- False removes combining marks (accents); True keeps them.
    num     -- False removes the digits in NUMBERS; True keeps them.
    max_dup -- maximum number of consecutive repetitions kept for any
               character (with the default, ``rrrrr`` becomes ``rr``).

    Removed characters do not reset the repetition counter, so repetitions
    are counted over the characters that are actually kept.

    Returns the filtered text, stripped, with every whitespace run collapsed
    to a single space. The result stays in NFKD (decomposed) form.
    """
    kept = []
    last_char = ''
    repeats = 0
    for ch in unicodedata.normalize('NFKD', input_str):
        drop_digit = (not num) and ch in NUMBERS
        drop_symbol = (not punct) and ch in SKIP_SYMBOLS
        drop_accent = (not accents) and unicodedata.combining(ch)
        if drop_digit or drop_symbol or drop_accent:
            continue
        if ch == last_char:
            repeats += 1
            if repeats >= max_dup:
                # Already kept max_dup copies of this character.
                continue
        else:
            repeats = 0
        kept.append(ch)
        last_char = ch
    joined = unicodedata.normalize('NFKD', "".join(kept))
    return re.sub(r'(\s)+', r' ', joined.strip(), flags=re.IGNORECASE)


def mi_preprocesamiento(texto):
    """Clean a raw document before vectorization.

    Steps, in order: lowercase, remove emojis, remove URLs, remove
    @mentions, tokenize with ``word_tokenize`` and re-join the tokens with
    single spaces, then apply ``normaliza_texto`` with its defaults.

    Returns the cleaned string.
    """
    # Lowercase first so the later steps see a uniform casing.
    texto = texto.lower()
    # Bug fix: the result of eliminar_emojis used to be discarded, so emojis
    # were never actually removed from the text.
    texto = eliminar_emojis(texto)
    texto = re.sub(r"http\S+|www\S+|https\S+", "", texto)  # URLs
    texto = re.sub(r"@\w+", "", texto)  # mentions
    tokens = word_tokenize(texto)
    texto = " ".join(tokens)
    return normaliza_texto(texto)


def mi_tokenizador_sin_stopwords(texto):
    """Lowercase ``texto``, split it on whitespace and drop stopwords.

    Returns the list of remaining tokens, in their original order.
    """
    tokens = []
    for palabra in texto.lower().split():
        if palabra in _STOPWORDS:
            continue
        tokens.append(palabra)
    return tokens


def mi_tokenizador_con_stemming(texto):
    """Lowercase ``texto``, split on whitespace, drop stopwords and stem.

    Stemming uses the module-level ``stemmer`` object and is applied only to
    the tokens that survive stopword filtering.

    Returns the list of stems, in the original token order.
    """
    return [
        stemmer.stem(palabra)
        for palabra in texto.lower().split()
        if palabra not in _STOPWORDS
    ]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorizar_countvectorizer(docs, procesador, tokenizador, rango_ngram):
    """Fit a word-level CountVectorizer on ``docs``.

    procesador  -- callable passed as ``preprocessor``.
    tokenizador -- callable passed as ``tokenizer`` (may be None).
    rango_ngram -- tuple passed as ``ngram_range``.

    Returns ``(matrix, vectorizer)`` where ``matrix`` is the document-term
    matrix converted to a dense array with ``toarray()`` and ``vectorizer``
    is the fitted CountVectorizer.
    """
    opciones = dict(
        analyzer="word",
        preprocessor=procesador,
        tokenizer=tokenizador,
        ngram_range=rango_ngram,
    )
    vec = CountVectorizer(**opciones)
    matriz = vec.fit_transform(docs)
    return matriz.toarray(), vec

def vectorizar_tfidfvectorizer(docs, procesador, tokenizador, rango_ngram):
    """Fit a word-level TfidfVectorizer on ``docs``.

    procesador  -- callable passed as ``preprocessor``.
    tokenizador -- callable passed as ``tokenizer`` (may be None).
    rango_ngram -- tuple passed as ``ngram_range``.

    Returns ``(matrix, vectorizer)`` where ``matrix`` is the TF-IDF matrix
    converted to a dense array with ``toarray()`` and ``vectorizer`` is the
    fitted TfidfVectorizer.
    """
    opciones = dict(
        analyzer="word",
        preprocessor=procesador,
        tokenizer=tokenizador,
        ngram_range=rango_ngram,
    )
    vec_tfidf = TfidfVectorizer(**opciones)
    matriz = vec_tfidf.fit_transform(docs)
    return matriz.toarray(), vec_tfidf


In [5]:
## version cupy
import cupy as cp 
import math

# Sigmoid activation function
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^(-x)) using ``cp.exp``."""
    exp_neg = cp.exp(-x)
    return 1 / (1 + exp_neg)

# Derivative of the sigmoid
def sigmoid_derivative(x):
    """Return ``x * (1 - x)``.

    This equals the sigmoid derivative only when ``x`` is already a sigmoid
    output (an activation), not a pre-activation value.
    """
    complement = 1 - x
    return x * complement

# Sets the seed used for random number generation
def seed(random_state=33):
    """Seed CuPy's global random generator with ``random_state`` (default 33)."""
    cp.random.seed(random_state)

def xavier_initialization(input_size, output_size):
    """Draw an ``(input_size, output_size)`` weight matrix uniformly from
    ``[-limit, limit]`` with ``limit = sqrt(6 / (input_size + output_size))``.
    """
    fan_total = input_size + output_size
    # Bound of the uniform distribution
    bound = cp.sqrt(6 / fan_total)
    return cp.random.uniform(-bound, bound, (input_size, output_size))

# Initialization drawn from a normal distribution (standard normal by default)
def normal_initialization(input_size, output_size, mean=0.0, std=1.0):
    """Draw an ``(input_size, output_size)`` weight matrix from a normal
    distribution with the given ``mean`` and ``std``.
    """
    shape = (input_size, output_size)
    return cp.random.normal(mean, std, shape)

def create_minibatches(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` pairs covering all samples.

    The rows of ``X`` and ``y`` are reordered with one shared random
    permutation and then split with ``cp.array_split`` into
    ``ceil(n_samples / batch_size)`` chunks, so chunks are of nearly equal
    size and never larger than ``batch_size``.
    """
    total = X.shape[0]
    orden = cp.random.permutation(total)
    X_mezclado = X[orden]
    y_mezclado = y[orden]

    partes = math.ceil(total / batch_size)

    # Pair up the matching chunks of features and labels
    yield from zip(cp.array_split(X_mezclado, partes),
                   cp.array_split(y_mezclado, partes))

    
class MLP_TODO:
    """Perceptron with one sigmoid hidden layer and a sigmoid output layer,
    trained with mini-batch gradient descent on an MSE loss using CuPy.

    The methods are stateful and must be called in this order for each
    batch: ``forward`` (caches the input and activations),
    ``loss_function_MSE`` (caches predictions and targets), ``backward``
    (computes gradients from the cached values) and ``update``.
    """

    def __init__(self, num_entradas, num_neuronas_ocultas, num_salidas, epochs,fun_init = None, batch_size=32, learning_rate=0.5,tolerancia = 1e-4, patience= 20, min_delta=1e-4,random_state=42):
        """Create the network and initialize its parameters.

        num_entradas         -- number of input features.
        num_neuronas_ocultas -- number of hidden units.
        num_salidas          -- number of output units.
        epochs               -- maximum number of training epochs.
        fun_init             -- callable ``f(rows, cols)`` returning a weight
                                matrix; ``None`` falls back to
                                ``xavier_initialization``.
        batch_size           -- mini-batch size used by ``train``.
        learning_rate        -- gradient descent step size.
        tolerancia           -- training stops once the mean epoch error
                                drops below this value.
        patience             -- epochs without significant improvement
                                tolerated before stopping.
        min_delta            -- minimum decrease of the error that counts as
                                an improvement.
        random_state         -- seed passed to ``seed`` before initializing.
        """
        seed(random_state)

        # Bug fix: the default fun_init=None used to be called directly and
        # raised "TypeError: 'NoneType' object is not callable".
        if fun_init is None:
            fun_init = xavier_initialization

        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

        # Weights are stored as (units, inputs); forward() multiplies by W.T.
        self.W1 = fun_init(num_neuronas_ocultas, num_entradas)
        self.b1 = cp.zeros((1, num_neuronas_ocultas))
        self.W2 = fun_init(num_salidas, num_neuronas_ocultas)
        self.b2 = cp.zeros((1, num_salidas))

        self.tolerancia = tolerancia
        self.min_delta = min_delta
        self.patience = patience

    def forward(self, X):
        """Run the forward pass on ``X`` and return the output activations.

        Caches ``X``, the pre-activations and the hidden activations on the
        instance for use by ``backward``.
        """
        self.X = X
        self.z_c1 = X @ self.W1.T + self.b1
        self.a_c1 = sigmoid(self.z_c1)

        self.z_c2 = self.a_c1 @ self.W2.T + self.b2
        y_pred = sigmoid(self.z_c2)
        return y_pred

    def loss_function_MSE(self, y_pred, y):
        """Return ``0.5 * mean((y_pred - y) ** 2)``.

        Also caches ``y_pred`` and ``y`` on the instance for ``backward``.
        """
        self.y_pred = y_pred
        self.y = y
        error = 0.5 * cp.mean((y_pred - y) ** 2)
        return error

    def backward(self):
        """Compute the gradients from the values cached by ``forward`` and
        ``loss_function_MSE`` and store them as ``dE_dW1``, ``dE_db1``,
        ``dE_dW2`` and ``dE_db2``.
        """
        # Output layer: error times sigmoid derivative (taken on activations)
        dE_dy_pred = self.y_pred - self.y
        d_y_pred_d_zc2 = sigmoid_derivative(self.y_pred)
        delta_c2 = dE_dy_pred * d_y_pred_d_zc2

        # Hidden layer: propagate the output delta back through W2
        d_zc2_d_a_c1 = delta_c2 @ self.W2
        delta_c1 = d_zc2_d_a_c1 * sigmoid_derivative(self.a_c1)

        # NOTE(review): weight gradients are summed over the batch while bias
        # gradients are averaged, so the effective weight step grows with the
        # batch size. Left unchanged because tuned learning rates depend on it.
        self.dE_dW2 = delta_c2.T @ self.a_c1
        self.dE_db2 = cp.mean(delta_c2, axis=0, keepdims=True)
        self.dE_dW1 = delta_c1.T @ self.X
        self.dE_db1 = cp.mean(delta_c1, axis = 0, keepdims=True)

    def update(self):
        """Apply one gradient descent step using the stored gradients."""
        # Output layer
        self.W2 -= self.learning_rate * self.dE_dW2
        self.b2 -= self.learning_rate * self.dE_db2

        # Hidden layer
        self.W1 -= self.learning_rate * self.dE_dW1
        self.b1 -= self.learning_rate * self.dE_db1

    def predict(self, X):
        """Return binary predictions: 1 where the output is >= 0.5, else 0."""
        y_pred = self.forward(X)
        y_pred = cp.where(y_pred >= 0.5, 1, 0)
        return y_pred

    def train(self, X, Y):
        """Train for up to ``self.epochs`` epochs with early stopping.

        ``X`` and ``Y`` are expected to already be CuPy arrays. Training
        stops early when the mean epoch error falls below
        ``self.tolerancia``, or when it has not improved by more than
        ``self.min_delta`` for ``self.patience`` consecutive epochs.

        Returns the mean batch error of the last epoch that was run (0 if no
        epoch ran).
        """
        best_error = float('inf')  # lowest mean epoch error seen so far
        epochs_no_improve = 0      # consecutive epochs without improvement

        final_epoch_error = 0

        for epoch in range(self.epochs):
            epoch_error = 0
            num_batch = 0

            for X_batch, y_batch in create_minibatches(X, Y, self.batch_size):
                y_pred = self.forward(X_batch)
                error = self.loss_function_MSE(y_pred, y_batch)

                epoch_error += error.get()  # .get() copies the value to the CPU

                self.backward()
                self.update()
                num_batch += 1

            if num_batch > 0:
                avg_epoch_error = epoch_error / num_batch
            else:
                avg_epoch_error = 0

            final_epoch_error = avg_epoch_error

            # Stop condition 1: the error is already below the tolerance.
            if avg_epoch_error < self.tolerancia:
                print(f"--- DETENCIÓN TEMPRANA (Tolerancia alcanzada) ---")
                print(f"Error ({avg_epoch_error:.8f}) < umbral ({self.tolerancia}) en época {epoch + 1}.")
                break

            # Stop condition 2: patience exhausted without real improvement.
            if avg_epoch_error < best_error - self.min_delta:
                best_error = avg_epoch_error
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= self.patience:
                print(f"--- DETENCIÓN TEMPRANA (Paciencia agotada) ---")
                print(f"No hubo mejora significativa en el error durante {self.patience} épocas.")
                print(f"Último error: {avg_epoch_error:.8f}, Mejor error: {best_error:.8f}")
                break

        return final_epoch_error

In [18]:

def guardar_resultados(datos, archivo):
    """Write predictions to a CSV file with columns ``id`` and ``klass``.

    datos   -- predictions, one per row (for example an N x 1 array); they
               become the ``klass`` column.
    archivo -- destination path passed to ``DataFrame.to_csv``.

    ``id`` is a 1-based row number. The file is written without the pandas
    index, and a confirmation message naming the file is printed.
    """
    df = pd.DataFrame(datos, columns=['klass'])

    # 1-based identifier for each prediction
    df['id'] = df.index + 1

    df = df[['id', 'klass']]

    df.to_csv(archivo, index=False)

    # Bug fix: the message used to print a hard-coded file name
    # ('output_pandas_Nx1.csv') instead of the file that was written.
    print(f"¡Archivo '{archivo}' guardado exitosamente!")

In [6]:
# Load the saved hyperparameter-search results (one row per configuration,
# with its metrics) and display them.
df_hateval_es = pd.read_csv('Resultados_temporales_pract2_hateval_es.csv')
df_hateval_es

Unnamed: 0,Nombre,Neuronas_ocultas,Init_func,Pesado_terminos,Terminos,Prepocesamiento,Learnin_rate,Batch_size,Precision_score,Recall_score,F1_score,Accuracy
0,hateval_es,1024,normal_initialization,vectorizar_countvectorizer,"(2, 2)",mi_tokenizador_sin_stopwords,0.10,16,0.706713,0.650091,0.640752,0.678
1,hateval_es,1024,xavier_initialization,vectorizar_countvectorizer,"(1, 2)",mi_tokenizador_con_stemming,0.50,64,0.222000,0.500000,0.307479,0.444
2,hateval_es,256,normal_initialization,vectorizar_countvectorizer,"(2, 2)",mi_tokenizador_sin_stopwords,0.50,16,0.677778,0.645181,0.640624,0.668
3,hateval_es,64,xavier_initialization,vectorizar_countvectorizer,"(2, 2)",,0.01,64,0.745192,0.733035,0.735428,0.744
4,hateval_es,512,xavier_initialization,vectorizar_tfidfvectorizer,"(2, 2)",mi_tokenizador_sin_stopwords,0.50,32,0.222000,0.500000,0.307479,0.444
...,...,...,...,...,...,...,...,...,...,...,...,...
295,hateval_es,64,xavier_initialization,vectorizar_countvectorizer,"(1, 2)",mi_tokenizador_con_stemming,0.50,32,0.222000,0.500000,0.307479,0.444
296,hateval_es,64,normal_initialization,vectorizar_tfidfvectorizer,"(2, 2)",,0.50,32,0.278000,0.500000,0.357326,0.556
297,hateval_es,256,normal_initialization,vectorizar_tfidfvectorizer,"(1, 2)",,0.50,16,0.222000,0.500000,0.307479,0.444
298,hateval_es,256,normal_initialization,vectorizar_tfidfvectorizer,"(1, 1)",,0.10,32,0.702634,0.697955,0.699233,0.706


In [7]:
# Show the five configurations with the highest precision.
df_hateval_es.sort_values(by='Precision_score', ascending=False).head(5)

Unnamed: 0,Nombre,Neuronas_ocultas,Init_func,Pesado_terminos,Terminos,Prepocesamiento,Learnin_rate,Batch_size,Precision_score,Recall_score,F1_score,Accuracy
100,hateval_es,512,xavier_initialization,vectorizar_tfidfvectorizer,"(1, 1)",mi_tokenizador_sin_stopwords,0.01,64,0.78252,0.518018,0.395822,0.572
143,hateval_es,64,xavier_initialization,vectorizar_countvectorizer,"(1, 2)",,0.1,16,0.779193,0.769055,0.771709,0.778
237,hateval_es,64,xavier_initialization,vectorizar_countvectorizer,"(1, 2)",,0.1,32,0.776819,0.767256,0.769809,0.776
46,hateval_es,512,xavier_initialization,vectorizar_countvectorizer,"(1, 1)",mi_tokenizador_sin_stopwords,0.01,64,0.775466,0.777675,0.776187,0.778
136,hateval_es,128,xavier_initialization,vectorizar_countvectorizer,"(1, 2)",mi_tokenizador_con_stemming,0.1,16,0.775407,0.774046,0.774646,0.778


In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import ast
import pandas as pd
le = LabelEncoder()

# Stratified 5-fold cross-validation of the five best configurations
# (ranked by F1 in df_hateval_es) on the humor training set.
X, Y =  leer_json('../Datasets/dataset_humor_train.json')
Y_encoded = le.fit_transform(Y)

# The ranking does not depend on the fold, so compute it once up front
# instead of re-sorting the table inside every fold.
df_mejores = df_hateval_es.sort_values(by='F1_score', ascending=False).head(5)

# Number of folds
skf = StratifiedKFold(n_splits=5)
for k, (index_train, index_test) in enumerate(skf.split(X, Y_encoded), start=1):

    # index_train / index_test hold the sample indices of this fold
    X_train_cv, X_test_cv = X[index_train], X[index_test]

    Y_train_cv, Y_test_cv = Y_encoded[index_train], Y_encoded[index_test]

    experimentos = 1

    for _, filas in df_mejores.iterrows():
        neuronas_ocultas = filas['Neuronas_ocultas']
        func_init = filas['Init_func']
        pes_term = filas['Pesado_terminos']
        terminos = filas['Terminos']
        prepo = filas['Prepocesamiento']
        lr = filas['Learnin_rate']
        batch_size = filas['Batch_size']
        salidas = 1
        epocas = 101
        # The n-gram range is stored as text such as "(1, 2)"
        terminos = ast.literal_eval(terminos)
        semilla = 42

        # Map the names stored in the table back to the actual callables
        init_func = xavier_initialization if func_init == 'xavier_initialization' else normal_initialization
        vectorizador = vectorizar_countvectorizer if pes_term == 'vectorizar_countvectorizer' else vectorizar_tfidfvectorizer
        # Any other value (including a missing one) leaves the tokenizer as None
        prepocesamiento = None
        if prepo == 'mi_tokenizador_sin_stopwords':
            prepocesamiento = mi_tokenizador_sin_stopwords
        if prepo == 'mi_tokenizador_con_stemming':
            prepocesamiento = mi_tokenizador_con_stemming

        try:
            X_train_vectorizado, vec_train = vectorizador(X_train_cv, normaliza_texto, prepocesamiento, terminos)
            X_test_vectorizado = vec_train.transform(X_test_cv).toarray()
            entradas = X_train_vectorizado.shape[1]

            # float16 keeps the dense matrices small
            X_train_vectorizado = X_train_vectorizado.astype('float16')
            X_test_vectorizado = X_test_vectorizado.astype('float16')

            mlp = MLP_TODO(entradas, neuronas_ocultas, salidas, epocas, fun_init= init_func, batch_size=min(batch_size, X_train_vectorizado.shape[0]), learning_rate=lr, random_state=semilla)

            X_train_vectorizado = cp.asarray(X_train_vectorizado)
            X_test_vectorizado = cp.asarray(X_test_vectorizado)
            Y_train = cp.asanyarray(Y_train_cv)
            # The network expects targets as a column
            if Y_train.ndim == 1:
                Y_train = Y_train.reshape(-1, 1)

            error = mlp.train(X_train_vectorizado, Y_train)
            print(f"{experimentos} - último error: {error}")

            y_predicha = mlp.predict(X_test_vectorizado)
            y_predicha = y_predicha.get()

            print(f'{experimentos} - P_score: {precision_score(Y_test_cv, y_predicha)}')
            print(f'{experimentos} - F_score: {f1_score(Y_test_cv, y_predicha)}')
            print(f'{experimentos} - Recall_score: {recall_score(Y_test_cv, y_predicha)}')
            print(f'{experimentos} - Accuracy: {accuracy_score(Y_test_cv, y_predicha)}')
            print(neuronas_ocultas, func_init, pes_term, terminos,prepo,lr, batch_size)

            experimentos += 1

        except Exception as e:
            # Report the failing configuration and continue with the next one
            print(f"Fallo la configuración: {e}")
            print(neuronas_ocultas, func_init, pes_term, terminos,prepo,lr, batch_size)

        finally:
            # Drop every reference to this experiment's GPU arrays and return
            # the cached blocks to the device, so the next configuration does
            # not allocate on top of memory that is no longer needed.
            mlp = X_train_vectorizado = X_test_vectorizado = Y_train = y_predicha = None
            cp.get_default_memory_pool().free_all_blocks()
        
    
    



KeyboardInterrupt: 

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import ast
import pandas as pd
le = LabelEncoder()
from sklearn.model_selection import train_test_split

# Train the three best configurations (ranked by F1 in df_hateval_es) on the
# full humor training set and save their predictions for the test set.
X, Y =  leer_json('../Datasets/dataset_humor_train.json')
# The labels of the test file are ignored; only its texts are used.
X_test, _ = leer_json('../Datasets/dataset_humor_test.json')
df_mejores = df_hateval_es.sort_values(by='F1_score', ascending=False).head(3)


experimentos = 1

for _, filas in df_mejores.iterrows():
    neuronas_ocultas = filas['Neuronas_ocultas']
    func_init = filas['Init_func']
    pes_term = filas['Pesado_terminos']
    terminos = filas['Terminos']
    prepo = filas['Prepocesamiento']
    lr = filas['Learnin_rate']
    batch_size = filas['Batch_size']
    salidas = 1
    epocas = 256
    # The n-gram range is stored as text such as "(1, 2)"
    terminos = ast.literal_eval(terminos)
    semilla = 42

    # Map the names stored in the table back to the actual callables
    init_func = xavier_initialization if func_init == 'xavier_initialization' else normal_initialization
    vectorizador = vectorizar_countvectorizer if pes_term == 'vectorizar_countvectorizer' else vectorizar_tfidfvectorizer
    # Any other value (including a missing one) leaves the tokenizer as None
    prepocesamiento = None
    if prepo == 'mi_tokenizador_sin_stopwords':
        prepocesamiento = mi_tokenizador_sin_stopwords
    if prepo == 'mi_tokenizador_con_stemming':
        prepocesamiento = mi_tokenizador_con_stemming

    try:
        print(neuronas_ocultas, func_init, pes_term, terminos,prepo,lr, batch_size)
        X_train_vectorizado, vec_train = vectorizador(X, normaliza_texto, prepocesamiento, terminos)
        X_test_vectorizado = vec_train.transform(X_test).toarray()

        entradas = X_train_vectorizado.shape[1]

        # float16 keeps the dense matrices small
        X_train_vectorizado = X_train_vectorizado.astype('float16')
        X_test_vectorizado = X_test_vectorizado.astype('float16')

        mlp = MLP_TODO(entradas, neuronas_ocultas, salidas, epocas, fun_init= init_func, batch_size=min(batch_size, X_train_vectorizado.shape[0]), learning_rate=lr, random_state=semilla)

        X_train_vectorizado = cp.asarray(X_train_vectorizado)
        X_test_vectorizado = cp.asarray(X_test_vectorizado)
        # NOTE(review): labels are used as read from the file (not encoded
        # with LabelEncoder); presumably they are already 0/1 - confirm.
        Y_train = cp.asanyarray(Y)
        # The network expects targets as a column
        if Y_train.ndim == 1:
            Y_train = Y_train.reshape(-1, 1)

        # One output file per configuration, named after its hyperparameters
        nombre_archivo =f'{neuronas_ocultas}_{func_init}_{pes_term}_{terminos}_{prepo}_{lr}_{batch_size}.csv'

        error = mlp.train(X_train_vectorizado, Y_train)
        print(f"{experimentos} - último error: {error}")

        y_predicha = mlp.predict(X_test_vectorizado)
        y_predicha = y_predicha.get()
        guardar_resultados(y_predicha,f'../Resultados_Leo/{nombre_archivo}')
        print(y_predicha)
        print(f'sum:{y_predicha.sum()}')
        print(neuronas_ocultas, func_init, pes_term, terminos,prepo,lr, batch_size)


        experimentos += 1

    except Exception as e:
        # Report the failing configuration and continue with the next one
        print(f"Fallo la configuración: {e}")
        print(neuronas_ocultas, func_init, pes_term, terminos,prepo,lr, batch_size)

    finally:
        # Drop every reference to this experiment's GPU arrays and return the
        # cached blocks to the device before the next configuration starts;
        # keeping them alive adds to the memory pressure behind the
        # "Out of memory" failure seen when running several configurations.
        mlp = X_train_vectorizado = X_test_vectorizado = Y_train = y_predicha = None
        cp.get_default_memory_pool().free_all_blocks()

512 xavier_initialization vectorizar_countvectorizer (1, 1) mi_tokenizador_sin_stopwords 0.01 64




1 - último error: 0.003730449677598128
¡Archivo 'output_pandas_Nx1.csv' guardado exitosamente!
[[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [1]]
sum:1707
512 xavier_initialization vectorizar_countvectorizer (1, 1) mi_tokenizador_sin_stopwords 0.01 64
128 xavier_initialization vectorizar_countvectorizer (1, 2) mi_tokenizador_sin_stopwords 0.01 32


  return _core.array(a, dtype, False, order, blocking=blocking)


Fallo la configuración: Out of memory allocating 1,694,430,720 bytes (allocated so far: 10,239,311,360 bytes).
128 xavier_initialization vectorizar_countvectorizer (1, 2) mi_tokenizador_sin_stopwords 0.01 32
64 xavier_initialization vectorizar_countvectorizer (1, 2) mi_tokenizador_sin_stopwords 0.01 64
--- DETENCIÓN TEMPRANA (Paciencia agotada) ---
No hubo mejora significativa en el error durante 20 épocas.
Último error: 0.00086490, Mejor error: 0.00095382
2 - último error: 0.0008648976714958546
¡Archivo 'output_pandas_Nx1.csv' guardado exitosamente!
[[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [1]]
sum:1522
64 xavier_initialization vectorizar_countvectorizer (1, 2) mi_tokenizador_sin_stopwords 0.01 64
