# Arquitecturas aplicadas a clasificación de texto

Utilizaremos el dataset 20 Newsgroups para probar los modelos.

In [1]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.metrics import F1Score

2025-03-30 19:37:39.652111: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# cargamos 20 Newsgroups
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'))

In [3]:
# descargamos los embeddings de palabras de Fasttext para inglés y descomprimimos el archivo.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip

zsh:1: command not found: wget
unzip:  cannot find or open wiki-news-300d-1M.vec.zip, wiki-news-300d-1M.vec.zip.zip or wiki-news-300d-1M.vec.zip.ZIP.


In [4]:
# cargamos los embeddings de palabras
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

loading word embeddings...


FileNotFoundError: [Errno 2] No such file or directory: 'wiki-news-300d-1M.vec'

In [None]:
print(newsgroups_train.data[16])

In [None]:
# instanciamos el tokenizador
token = Tokenizer(num_words=30000,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
                char_level=False,
                oov_token="UNK",
                document_count=0)

In [None]:
# fiteamos el tokenizador
token.fit_on_texts(newsgroups_train.data)

In [None]:
reverse_dictionary = token.index_word
dictionary = dict([(value, key) for (key, value) in reverse_dictionary.items()])

In [None]:
# cargamos en una matriz los embeddings de las palabras
# presentes en el vocabulario
embed_dim=300
num_words=len(dictionary)+1
embedding_matrix=np.zeros([num_words,embed_dim])
for word, idx in dictionary.items():
  if idx <= num_words and word in embeddings_index:
    embedding_matrix[idx,:]=embeddings_index[word]

In [None]:
# se tokenizan los textos
train_sequences=token.texts_to_sequences(newsgroups_train.data)
test_sequences=token.texts_to_sequences(newsgroups_test.data)

In [None]:
train_sequences[0]

In [None]:
embedding_matrix.shape

En este punto seleccionamos el tamaño de contexto a procesar en la variable `max_len`

In [None]:
max_len=500
train_sequences=pad_sequences(train_sequences,maxlen=max_len)
test_sequences=pad_sequences(test_sequences,maxlen=max_len)

In [None]:
train_sequences[0]

In [None]:
dictionary

In [None]:
token.index_word

# Suma de embeddings + MLP

In [None]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D,\
                         Dropout, Dense, Lambda, Concatenate, Input
from keras.models import Sequential, Model
from keras import optimizers
import tensorflow.keras.backend as K
import keras

ImportError: cannot import name 'clip_to_image_size' from 'keras.src.layers.preprocessing.image_preprocessing.bounding_boxes.converters' (/opt/anaconda3/envs/venv312/lib/python3.12/site-packages/keras/src/layers/preprocessing/image_preprocessing/bounding_boxes/converters.py)

In [None]:
import re

import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Activation, Dropout, Dense, Flatten, LSTM, GlobalMaxPooling1D, Embedding, Input, Concatenate, Bidirectional
from sklearn.model_selection import train_test_split

ImportError: cannot import name 'clip_to_image_size' from 'keras.src.layers.preprocessing.image_preprocessing.bounding_boxes.converters' (/opt/anaconda3/envs/venv312/lib/python3.12/site-packages/keras/src/layers/preprocessing/image_preprocessing/bounding_boxes/converters.py)

In [None]:

class F1Callback(keras.callbacks.Callback):

    '''
    Este callback es una solución ad-hoc para calcular al final de cada epoch de
    entrenamiento la métrica de Perplejidad sobre un conjunto de datos de validación.
    La perplejidad es una métrica cuantitativa para evaluar la calidad de la generación de secuencias.
    Además implementa la finalización del entrenamiento (Early Stopping)
    si la perplejidad no mejora después de `patience` epochs.
    '''

    def __init__(self, X_val,y_val,num_classes, history_f1, patience=5):
      # El callback lo inicializamos con secuencias de validación sobre las cuales
      # mediremos la perplejidad
      self.X_val = X_val
      self.y_val = y_val

      self.max_score = 0
      self.num_classes = num_classes
      self.epsilon = 10E-8
      self.patience = patience
      self.patience_counter = 0


    def on_epoch_end(self, epoch, logs=None):

        predictions = self.model.predict(self.X_val,verbose=0)

        y_pred = np.argmax(predictions,axis=1)

        counter = np.zeros((self.num_classes,self.num_classes))

        for idx_pred,idx_true in zip(y_pred,self.y_val):
          counter[idx_pred,idx_true] += 1

        # sea calcula TP, FN y FP
        TP = np.diag(counter)
        FN = counter.sum(axis=0)-TP
        FP = counter.sum(axis=1)-TP

        precision = TP/(TP+FP+self.epsilon)
        recall = TP/(TP+FN+self.epsilon)

        # se calcula el F1-sscore
        f1 = 2*precision*recall/(precision+recall+self.epsilon)

        current_score = np.mean(f1)

        history_f1.append(current_score)

        print(f'\n f1 macro: {current_score} \n')

        # chequeamos si tenemos que  detener el entrenamiento
        if current_score > self.max_score:
          self.max_score = current_score
          self.model.save("my_model.keras")
          print("Saved new model!")
          self.patience_counter = 0
        else:
          self.patience_counter += 1
          if self.patience_counter == 5:
            print("Stopping training...")
            self.model.stop_training = True


In [None]:
nb_words=num_words

model = Sequential()

model.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Lambda(lambda x: K.sum(x, axis=1)))

model.add(Dense(32, activation='relu'))
model.add(Dense(20, activation='softmax'))  #multi-label (k-hot encoding)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=64,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Entrenamiento
epoch_count = range(1, len(history_f1) + 1)
sns.lineplot(x=epoch_count,  y=history_f1)
plt.show()

## Clasificador Embeddings + CNN

In [None]:
nb_words=num_words
num_filters=64

model = Sequential()

model.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))

model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))

model.add(Conv1D(num_filters*2, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dense(20, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=128,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])

## Clasificación con TextCNN


In [None]:

nb_words=num_words
num_filters=64

input_layer = Input(shape=(max_len,))
embedding_layer=Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)

conv4=Conv1D(num_filters, 4, activation='relu', padding='same')(embedding_layer)
conv3=Conv1D(num_filters, 3, activation='relu', padding='same')(embedding_layer)
conv2=Conv1D(num_filters, 2, activation='relu', padding='same')(embedding_layer)
pool4=GlobalMaxPooling1D()(conv4)
pool3=GlobalMaxPooling1D()(conv3)
pool2=GlobalMaxPooling1D()(conv2)
added = Concatenate()([pool4, pool3, pool2])

dense1=Dense(32, activation='relu')(added)
dense2=Dense(20, activation='softmax')(dense1)

model=Model(input_layer , dense2)

# adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:

history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=128,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])

# MLP + Embeddings + Attention

In [None]:
from keras.layers import Dot,RepeatVector,TimeDistributed,Multiply,Lambda,Flatten,Activation,Reshape
from keras.activations import softmax

In [None]:

def softMaxOverTime(x):
    return softmax(x,axis=1)

key_dim=50
nb_words=num_words
num_filters=64

input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)

dense_input = Dense(key_dim, activation="tanh")(embedding_layer)
ulog_attention = Dense(1,activation="linear")(dense_input)

attention = Activation(softMaxOverTime)(ulog_attention)

repeated_attention = TimeDistributed(RepeatVector(embed_dim))(attention)

repeated_attention = Reshape([max_len,embed_dim])(repeated_attention)

weighted_embeddings = Multiply()([repeated_attention,embedding_layer])
embedding_sum = Lambda(lambda x: K.sum(x, axis=1),output_shape=(300,))(weighted_embeddings)

dense1=Dense(32, activation='relu')(embedding_sum)
dense2=Dense(20, activation='softmax')(dense1)

model=Model(input_layer , dense2)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=128,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])

# MLP + Embeddings + Attention + CNN

In [None]:

value_dim=100

def softMaxOverTime(x):
    return softmax(x,axis=1)


nb_words=num_words
num_filters=64

input_layer = Input(shape=(max_len,))
embedding_layer=Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)

conv_out=Conv1D(value_dim,8,padding="same")(embedding_layer)
conv_out=Activation("relu")(conv_out)
#conv_out=Conv1D(value_dim,8,activation="relu",padding="same")(conv_out)
conv_out=Conv1D(value_dim,8,activation="tanh",padding="same")(conv_out)

ulog_attention=Dense(1,activation="linear")(conv_out)
attention=Activation(softMaxOverTime)(ulog_attention)
repeated_attention=TimeDistributed(RepeatVector(value_dim))(attention)
repeated_attention=Reshape([max_len,value_dim])(repeated_attention)
weighted_embeddings=Multiply()([repeated_attention,conv_out])
embedding_sum = Lambda(lambda x: K.sum(x, axis=1),output_shape=(100,))(weighted_embeddings)

dense1=Dense(100, activation='relu')(embedding_sum)
dense2=Dense(20, activation='softmax')(dense1)

model=Model(input_layer , dense2)

# adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [None]:
history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=128,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])

# Bidir RNN + Attention

In [None]:
from keras.layers import Bidirectional, LSTM

value_dim=100

def softMaxOverTime(x):
    return softmax(x,axis=1)

nb_words=num_words
num_filters=64

input_layer = Input(shape=(max_len,))
embedding_layer=Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)
# lstm_out=Bidirectional(LSTM(value_dim, return_sequences=True))(embedding_layer)
# lstm_out=Bidirectional(LSTM(value_dim, return_sequences=True))(lstm_out)
lstm_out=Bidirectional(LSTM(value_dim, return_sequences=True,activation="relu"),merge_mode="sum")(embedding_layer)

ulog_attention=Dense(1,activation="linear")(lstm_out)
attention=Activation(softMaxOverTime)(ulog_attention)
repeated_attention=TimeDistributed(RepeatVector(value_dim))(attention)
repeated_attention=Reshape([max_len,value_dim])(repeated_attention)
weighted_embeddings=Multiply()([repeated_attention,lstm_out])
embedding_sum = Lambda(lambda x: K.sum(x, axis=1),output_shape=(None,value_dim))(weighted_embeddings)

dense1=Dense(100, activation='relu')(embedding_sum)
dense2=Dense(20, activation='softmax')(dense1)

model=Model(input_layer , dense2)

# adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='sparse_categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
# model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])


model.summary()

NameError: name 'num_words' is not defined

In [None]:
history_f1 = []
model.fit(train_sequences, newsgroups_train.target,batch_size=128,epochs=40,callbacks=[F1Callback(test_sequences,newsgroups_test.target,20,history_f1)])