# Ejemplo y procesamiento de la data

Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

Carga de datos

In [None]:
# Import the required libraries
from sklearn.datasets import fetch_20newsgroups

# Load the training and test splits; `remove` asks sklearn to strip headers,
# footers and quoted replies from every post
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Check that the data was loaded by reporting the size of each split
print(f'Número de documentos en el conjunto de entrenamiento: {len(newsgroups_train.data)}')
print(f'Número de documentos en el conjunto de prueba: {len(newsgroups_test.data)}')

# Show one example document from the training split
print("\nEjemplo de documento del conjunto de entrenamiento:")
print(newsgroups_train.data[0])

Número de documentos en el conjunto de entrenamiento: 11314
Número de documentos en el conjunto de prueba: 7532

Ejemplo de documento del conjunto de entrenamiento:
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [None]:
# Instantiate a TF-IDF vectorizer with its default settings.
# See the sklearn documentation for the available constructor parameters.
tfidfvect = TfidfVectorizer()

In [None]:
# The raw text of each document is available through the `data` attribute
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [None]:
# Using the usual sklearn interface, fit the vectorizer (learn the vocabulary and compute the IDF vector) and transform the data in a single call
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` can be referred to as the document-term matrix

In [None]:
# It is handy to have the inverse dictionary, mapping column indices back to terms
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}

In [None]:
# `y_train` holds the targets, which are integer class ids
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [None]:
# There are 20 classes, one per newsgroup (ids and names are read from the test split here)
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Let's look at document similarity. Pick some document as the query
idx = 4811
print(newsgroups_train.data[idx])

In [None]:
# Measure the cosine similarity between the query and every training document;
# `[0]` takes the single row of the resulting 1 x n_documents array
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [None]:
# Similarity values sorted from highest to lowest
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [None]:
# ...and the indices of the documents those values belong to
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [None]:
# The 5 most similar documents. Position 0 of the ranking is skipped on the
# assumption that it is the query document itself (similarity 1.0)
mostsim = np.argsort(cossim)[::-1][1:6]

In [None]:
# The query document belongs to the class:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [None]:
# ...and its 5 most similar documents belong to the classes:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


Modelo de clasificación Naïve Bayes

In [None]:
# Instantiating and training a Naïve Bayes classifier is straightforward with sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# With the vectorizer already fitted on train, vectorize the test texts
# (`transform` only, so the vocabulary and IDF weights come from the training split)
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)


In [None]:
# F1-score is a suitable metric for reporting classification performance because
# it is robust to class imbalance. 'macro' averaging is the mean of the per-class
# F1-scores. 'micro' averaging is equivalent to accuracy, which is not a good
# metric when the dataset is imbalanced
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

# Desafío 1

## Vectorizar documentos
Tomar 5 documentos al azar y medir similaridad con el resto de los documentos. Estudiar los 5 documentos más similares de cada uno y analizar si tiene sentido la similaridad según el contenido del texto y la etiqueta de clasificación.

In [None]:
# Draw 5 random document indices from the training split
np.random.seed(42)  # Fixed seed for reproducibility
random_indices = np.random.choice(len(newsgroups_train.data), size=5, replace=False)

# Collect the corresponding texts and integer labels
random_documents = [newsgroups_train.data[idx] for idx in random_indices]
random_labels = [newsgroups_train.target[idx] for idx in random_indices]

# Print each selected text together with its class name
for i, text in enumerate(random_documents):
    print(f"Documento {i+1}, Etiqueta: {newsgroups_train.target_names[random_labels[i]]}\n")
    print(text)
    print("-------------------------------------------------------------\n")


Documento 1, Etiqueta: misc.forsale

For Sale:

    Roland TR-606 Drum Machine
    Near Mint Condition (no scratches, fully operational).
    Sorry no Manuals.
    Asking $200 US + shipping

    Mirage Rack Mount Sampler
    Minor Scratches around rack ear screws
    with Advanced Sampling Option, 32 Disks
      and both manuals
    It's a long story, but I *may* have the Turtle Beach Vision, sample
        editing software for the IBM PC.
    Asking $400 US + shipping

Send all e-mail requests to:  barsz@bnr.ca

Regards,
-------------------------------------------------------------

Documento 2, Etiqueta: comp.windows.x

I posted this a while ago and didn't recieve one reply, and now we
have another bug report on the same subject. Can anybody help me out?

How can you ensure that accelerators work the same independent of
case?  What I want is Ctrl+O and Ctrl+o to both be accelerators on one
menu entry.

In ORA Vol. 6, in the section on accelerators it says "For information
on how to s

Calcular similaridad con el resto de los documentos

In [None]:
# For each randomly selected document, print its label and text followed by the
# similarity score and label of its 5 nearest training documents.
# `similarities` collects one (query index, neighbor indices, neighbor scores)
# tuple per query so the rankings can be reused after the loop.
similarities = []
for i, idx in enumerate(random_indices):
    print(f"Documento {i+1} (Etiqueta: {newsgroups_train.target_names[random_labels[i]]})\n")
    print(newsgroups_train.data[idx])
    print("\nDocumentos más similares:\n")

    # Cosine similarity between this document and every training document
    cossim = cosine_similarity(X_train[idx], X_train).flatten()
    # Rank by descending similarity and drop the query document by index rather
    # than by position: an empty document (all-zero vector) or an exact duplicate
    # does not guarantee that the query itself is ranked first.
    ranking = np.argsort(cossim)[::-1]
    sorted_indices = ranking[ranking != idx][:5]
    similarities.append((idx, sorted_indices, cossim[sorted_indices]))

    for sim_idx in sorted_indices:
        print(f"Similitud: {cossim[sim_idx]}")
        print(f"Etiqueta: {newsgroups_train.target_names[y_train[sim_idx]]}")
        # Uncomment the next two lines to also print the text of each neighbor
        #print(newsgroups_train.data[sim_idx])
        #print("-------------------------------------------------------------\n")

Documento 1 (Etiqueta: misc.forsale)

For Sale:

    Roland TR-606 Drum Machine
    Near Mint Condition (no scratches, fully operational).
    Sorry no Manuals.
    Asking $200 US + shipping

    Mirage Rack Mount Sampler
    Minor Scratches around rack ear screws
    with Advanced Sampling Option, 32 Disks
      and both manuals
    It's a long story, but I *may* have the Turtle Beach Vision, sample
        editing software for the IBM PC.
    Asking $400 US + shipping

Send all e-mail requests to:  barsz@bnr.ca

Regards,

Documentos más similares:

Similitud: 0.1971248842399524
Etiqueta: misc.forsale
Similitud: 0.19347705705696125
Etiqueta: misc.forsale
Similitud: 0.1618054234829718
Etiqueta: misc.forsale
Similitud: 0.15496775940587007
Etiqueta: misc.forsale
Similitud: 0.14591636102214595
Etiqueta: misc.forsale
Documento 2 (Etiqueta: comp.windows.x)

I posted this a while ago and didn't recieve one reply, and now we
have another bug report on the same subject. Can anybody help me out

## Clasificación Naïve Bayes
Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación (f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial y ComplementNB.

In [None]:
def train_and_evaluate(vectorizer_params, model_type, model_params):
    """Fit a TF-IDF + Naïve Bayes pipeline and return its macro F1 on the test split.

    Parameters
    ----------
    vectorizer_params : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.
    model_type : str
        Either ``'MultinomialNB'`` or ``'ComplementNB'``.
    model_params : dict
        Keyword arguments forwarded to the chosen Naïve Bayes class.

    Returns
    -------
    float
        Macro-averaged F1-score of the predictions on ``newsgroups_test``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two supported names.
    """
    # Fit the vectorizer on the training texts only, then reuse it for the test texts
    tfidf = TfidfVectorizer(**vectorizer_params)
    train_matrix = tfidf.fit_transform(newsgroups_train.data)
    test_matrix = tfidf.transform(newsgroups_test.data)

    # Reject unknown model names before building the classifier
    if model_type != 'MultinomialNB' and model_type != 'ComplementNB':
        raise ValueError("Invalid model type. Choose 'MultinomialNB' or 'ComplementNB'.")
    estimator_cls = MultinomialNB if model_type == 'MultinomialNB' else ComplementNB
    classifier = estimator_cls(**model_params)

    # Train on the training split and score the test-split predictions
    classifier.fit(train_matrix, newsgroups_train.target)
    predictions = classifier.predict(test_matrix)
    return f1_score(newsgroups_test.target, predictions, average='macro')

Diferentes parámetros del vectorizador

In [None]:
# Vectorizer configurations to try
vectorizer_params_list = [
    {'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english'},
    {'max_df': 0.7, 'min_df': 5, 'ngram_range': (1, 2), 'stop_words': 'english'},
    {'max_df': 0.8, 'min_df': 3, 'ngram_range': (1, 1), 'stop_words': None},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'stop_words': 'english'}
]

# Naïve Bayes configurations to try (a list of keyword-argument dicts)
model_params = [
    {},
    {'alpha': 0.5},
    {'alpha': 1.0, 'fit_prior': False}
]

# Best macro F1 seen so far and the (vectorizer params, model type, model params)
# combination that produced it
best_f1_score = 0
best_params = None

# Exhaustive search over every vectorizer/model combination. Each call refits the
# vectorizer from scratch. NOTE: the score comes from the test split, so the
# winning value is an optimistic estimate rather than an unbiased one.
for vectorizer_params in vectorizer_params_list:
    for params in model_params:
        for model_type in ['MultinomialNB', 'ComplementNB']:
            f1 = train_and_evaluate(vectorizer_params, model_type, params)
            if f1 > best_f1_score:
                best_f1_score = f1
                best_params = (vectorizer_params, model_type, params)

print(f"Mejor f1-score: {best_f1_score}")
print(f"Mejores parámetros: {best_params}")


Mejor f1-score: 0.7072715088603252
Mejores parámetros: ({'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'stop_words': 'english'}, 'ComplementNB', {'alpha': 0.5})


# Matriz documento-término
Transponer la matriz documento-término. De esa manera se obtiene una matriz término-documento que puede ser interpretada como una colección de vectorización de palabras.

In [None]:
# Vectorization: refit a default TF-IDF vectorizer on the training texts
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data)

# Transpose the matrix so that rows index terms and columns index documents
X_train_transposed = X_train.transpose()

# Report the type and shape of the matrix before and after transposing
print(f'Tipo de X_train: {type(X_train)}')
print(f'Tipo de X_train_transposed: {type(X_train_transposed)}')
print(f'Shape de X_train: {X_train.shape}')
print(f'Shape de X_train_transposed: {X_train_transposed.shape}')


Tipo de X_train: <class 'scipy.sparse._csr.csr_matrix'>
Tipo de X_train_transposed: <class 'scipy.sparse._csc.csc_matrix'>
Shape de X_train: (11314, 101631)
Shape de X_train_transposed: (101631, 11314)


In [None]:
# Look up the row index of a particular word (-1 is the "not found" sentinel)
word = 'car'
word_index = tfidfvect.vocabulary_.get(word, -1)

if word_index != -1:
    # Extract the row for this word: one TF-IDF weight per training document
    word_vector = X_train_transposed.getrow(word_index).toarray().flatten()
    print(f'Vector de la palabra "{word}":\n', word_vector)

    # Show the 5 documents in which this word has the highest weight
    top_docs_indices = word_vector.argsort()[::-1][:5]
    for idx in top_docs_indices:
        print(f'\nDocumento {idx} (Importancia: {word_vector[idx]}):')
        print(newsgroups_train.data[idx])
else:
    print(f'La palabra "{word}" no se encuentra en el vocabulario.')

Vector de la palabra "car":
 [0.40464699 0.         0.         ... 0.         0.         0.        ]

Documento 8266 (Importancia: 0.5287256881150304):

Definitely!

Safety is an important criterium for me when buying a car. I won't buy a 
small car like a Civic or whatever.

Great = Safety + Handling + Speed  -  for me

Seems to me that you would be more "dead" in a small car than a large car 
after an accident.

Documento 8013 (Importancia: 0.46008568969406555):

If you don't already know it, you should call the bank/credit union/
finance company that holds the loan on your present car and get the
current payoff cost.

If you are trading in your current car on the new car, subtract the
payoff amount from the trade-in the dealer is giving you.  (If this
turns out to be a negative number, you need to reconsider the deal.)
Subtract this difference from the price of the new car.  This is the
size of the loan you will need for the new car.

The dealer will take care of paying off the loan

In [None]:
# Transpose the document-term matrix to analyze similarity between words
X_train_transposed = X_train.transpose()

def analyze_word_similarity(X, vocabulary, words, top_n=5):
    """Print, for each word, the vocabulary terms most similar to it.

    Every row of ``X`` is treated as the vector of one term (its weight in each
    document). For each requested word the cosine similarity between its row and
    every other row is computed, and the ``top_n`` closest terms are printed.

    Parameters
    ----------
    X : scipy sparse matrix
        Term-document matrix: one row per term, one column per document.
    vocabulary : dict
        Mapping from term to its row index in ``X``.
    words : iterable of str
        Words to analyze. Words missing from ``vocabulary`` are reported and skipped.
    top_n : int, default 5
        Number of similar terms printed per word.

    Returns
    -------
    None
        Results are written to standard output.
    """
    # Inverse mapping built from the argument itself, so the function does not
    # depend on a notebook-global dictionary being defined beforehand.
    index_to_term = {index: term for term, index in vocabulary.items()}

    # Euclidean norm of every term row, computed once and reused for all words
    row_norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())

    for word in words:
        word_index = vocabulary.get(word, -1)
        if word_index == -1:
            print(f'La palabra "{word}" no se encuentra en el vocabulario.')
            continue

        word_row = X.getrow(word_index)
        print(f'Vector de la palabra "{word}":\n', word_row.toarray().flatten())

        # The positions of a term row are documents, so sorting the row itself
        # ranks documents. To rank terms, compare this row against all rows.
        dot_products = np.asarray((X @ word_row.T).toarray()).ravel()
        scale = row_norms * row_norms[word_index]
        # Rows with zero norm get similarity 0 instead of a division by zero
        similarity = np.divide(dot_products, scale,
                               out=np.zeros(len(scale)), where=scale > 0)

        # Descending order, excluding the word itself
        ranking = np.argsort(similarity)[::-1]
        top_terms = ranking[ranking != word_index][:top_n]
        for term_index in top_terms:
            term = index_to_term[term_index]
            print(f'\nPalabra {term} (Similaridad: {similarity[term_index]}):')

# Build the index -> word dictionary from the fitted vocabulary
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}

# Pick 5 words to analyze
words_to_analyze = ['car', 'computer', 'space', 'health', 'government']

# Run the word-similarity analysis on the term-document matrix
analyze_word_similarity(X_train_transposed, tfidfvect.vocabulary_, words_to_analyze)

Vector de la palabra "car":
 [0.40464699 0.         0.         ... 0.         0.         0.        ]

Palabra 4km (Importancia: 0.5287256881150304):

Palabra 496 (Importancia: 0.46008568969406555):

Palabra 4xl (Importancia: 0.44656404990398263):

Palabra 48us (Importancia: 0.4425021341721087):

Palabra 00 (Importancia: 0.4046469916999256):
Vector de la palabra "computer":
 [0.         0.         0.04674622 ... 0.         0.         0.        ]

Palabra 4trt (Importancia: 0.3716229236610804):

Palabra 0s9 (Importancia: 0.3307479091216156):

Palabra 4d50 (Importancia: 0.3221025403419712):

Palabra 1200x900 (Importancia: 0.3122356589269776):

Palabra 5967 (Importancia: 0.3102918554114527):
Vector de la palabra "space":
 [0.         0.         0.         ... 0.         0.26016747 0.        ]

Palabra 636s (Importancia: 0.5824354403360952):

Palabra 4100 (Importancia: 0.5612189836679569):

Palabra 150multidisk (Importancia: 0.508380770881835):

Palabra 1663 (Importancia: 0.4397458203249955

Slack
* https://join.slack.com/t/ceaiworkspace/shared_invite/zt-2l9un8yte-yWrXdu7msfCwr32VG6RIgQ

Github
* https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/blob/main/clase_1/ejercicios/Desafio_1.ipynb

Drive
* https://drive.google.com/drive/u/0/folders/1joS44xgaoCKapc44WmkjcZyjWJh4tQz0

Colab TP1
* https://colab.research.google.com/github/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/blob/main/clase_1/ejercicios/Desafio_1.ipynb
