# Vectorización de texto y modelo de clasificación Naïve Bayes

Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

Carga de datos

In [9]:
# Loader for the 20 newsgroups corpus
from sklearn.datasets import fetch_20newsgroups

# Both splits are requested with the same `remove` setting, so it is named once
removed_parts = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=removed_parts)
newsgroups_test = fetch_20newsgroups(subset='test', remove=removed_parts)

# Sanity check: report how many documents each split contains
n_train_docs = len(newsgroups_train.data)
n_test_docs = len(newsgroups_test.data)
print(f'Número de documentos en el conjunto de entrenamiento: {n_train_docs}')
print(f'Número de documentos en el conjunto de prueba: {n_test_docs}')

# Show the first training document as an example
first_train_doc = newsgroups_train.data[0]
print("\nEjemplo de documento del conjunto de entrenamiento:")
print(first_train_doc)

Número de documentos en el conjunto de entrenamiento: 11314
Número de documentos en el conjunto de prueba: 7532

Ejemplo de documento del conjunto de entrenamiento:
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [10]:
# Instantiate a vectorizer.
# See the sklearn documentation for the available constructor parameters.
tfidfvect = TfidfVectorizer()

In [11]:
# The raw text of each document is available through the `data` attribute
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [12]:
# Using the usual sklearn interface, fit the vectorizer (learn the vocabulary and compute the IDF vector) and transform the data in one step
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` can be called the document-term matrix

In [13]:
# Reverse lookup (column index -> term), the opposite direction of `vocabulary_`
idx2word = {column: term for term, column in tfidfvect.vocabulary_.items()}

In [14]:
# `y_train` holds the targets, which are integers
y_train = newsgroups_train.target
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [15]:
# There are 20 classes, one per newsgroup
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [16]:
# Let's look at document similarity. Pick some document
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [17]:
# Measure the cosine similarity of the chosen document against every training document
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [18]:
# The similarity values sorted from highest to lowest
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [19]:
# ...and the documents those values correspond to
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [20]:
# The 5 most similar documents.
# The query document is dropped by its index rather than by assuming it sorts
# first: with a tie at the top (e.g. a duplicated post) positional slicing
# could leave `idx` inside its own neighbour list.
ranked_docs = np.argsort(cossim)[::-1]
mostsim = ranked_docs[ranked_docs != idx][:5]

In [21]:
# The original document belongs to the class:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [22]:
# ...and the 5 most similar documents belong to the classes:
for neighbor in mostsim:
    neighbor_class = newsgroups_train.target_names[y_train[neighbor]]
    print(neighbor_class)

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


Modelo de clasificación Naïve Bayes

In [23]:
# Instantiating and training a Naïve Bayes classifier is straightforward with sklearn
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [24]:
# With the vectorizer already fitted on train, vectorize the texts
# of the test set and predict their classes
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)


In [25]:
# The F1-score is a suitable metric for reporting classification performance:
# it is robust to class imbalance. The 'macro' average is the mean of the
# per-class F1-scores. The 'micro' average is equivalent to accuracy, which is
# not a good metric when datasets are imbalanced
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

# Desafío 1

## Vectorizar documentos
Tomar 5 documentos al azar y medir su similaridad con el resto de los documentos. Estudiar los 5 documentos más similares de cada uno y analizar si la similaridad tiene sentido según el contenido del texto y la etiqueta de clasificación.

In [26]:
# Draw 5 distinct random document indices
np.random.seed(42)  # fixed seed so the same documents are drawn on every run
random_indices = np.random.choice(len(newsgroups_train.data), size=5, replace=False)

# Collect the matching texts and integer labels
random_documents = []
random_labels = []
for picked in random_indices:
    random_documents.append(newsgroups_train.data[picked])
    random_labels.append(newsgroups_train.target[picked])

# Print each selected text together with its class name
for doc_number, (text, label) in enumerate(zip(random_documents, random_labels), start=1):
    print(f"Documento {doc_number}, Etiqueta: {newsgroups_train.target_names[label]}\n")
    print(text)
    print("-------------------------------------------------------------\n")


Documento 1, Etiqueta: comp.sys.mac.hardware

Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward" 
-------------------------------------------------------------

Documento 2, Etiqueta: comp.os.ms-windows.misc



     Don't bother if you have CPBackup or Fastback.  They all offer options 
not available in the stripped-down MS version (FROM CPS!).  Examples - no 
proprietary format (to save space), probably no direct DMA access, and no 
tape drive!
-------------------------------------------------------------

Documento 3, Etiqueta: misc.forsale

5.25" Internal Low density disk drive.

Monochrome monitor

8088 motherboard, built in parallel and serial ports, built in mono and
color output, 7Mhz.

Libertarian, atheist, semi-anarchal Techno-Rat.
-------------------

Calcular similaridad con el resto de los documentos

In [27]:
# For each sampled document, print its text followed by the similarity and
# class of its 5 nearest neighbours in the TF-IDF space.
# Loop variables use their own names so the notebook-level `idx` and `cossim`
# defined by earlier cells are not overwritten.
for doc_number, (doc_idx, doc_label) in enumerate(zip(random_indices, random_labels), start=1):
    print(f"Documento {doc_number} (Etiqueta: {newsgroups_train.target_names[doc_label]})\n")
    print(newsgroups_train.data[doc_idx])
    print("\nDocumentos más similares:\n")

    # Cosine similarity of this document against every training document
    doc_scores = cosine_similarity(X_train[doc_idx], X_train).flatten()

    # Rank by descending similarity and drop the document itself by index.
    # Slicing off position 0 would only be correct if the document always
    # ranked first, which a tie at the top does not guarantee.
    ranked_docs = np.argsort(doc_scores)[::-1]
    nearest_docs = ranked_docs[ranked_docs != doc_idx][:5]

    for neighbor_idx in nearest_docs:
        print(f"Similitud: {doc_scores[neighbor_idx]}")
        print(f"Etiqueta: {newsgroups_train.target_names[y_train[neighbor_idx]]}")

Documento 1 (Etiqueta: comp.sys.mac.hardware)

Could someone please post any info on these systems.

Thanks.
BoB
-- 
---------------------------------------------------------------------- 
Robert Novitskey | "Pursuing women is similar to banging one's head
rrn@po.cwru.edu  |  against a wall...with less opportunity for reward" 

Documentos más similares:

Similitud: 0.6665262187597427
Etiqueta: comp.sys.mac.hardware
Similitud: 0.34759575958927097
Etiqueta: comp.sys.ibm.pc.hardware
Similitud: 0.1798616722409978
Etiqueta: comp.sys.mac.hardware
Similitud: 0.1546515862768033
Etiqueta: misc.forsale
Similitud: 0.14143450876745123
Etiqueta: comp.sys.mac.hardware
Documento 2 (Etiqueta: comp.os.ms-windows.misc)



     Don't bother if you have CPBackup or Fastback.  They all offer options 
not available in the stripped-down MS version (FROM CPS!).  Examples - no 
proprietary format (to save space), probably no direct DMA access, and no 
tape drive!

Documentos más similares:

Similitud: 0.204049

## Clasificación Naïve Bayes
Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación (f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros de instanciación del vectorizador y de los modelos, y probar modelos de Naïve Bayes Multinomial y ComplementNB.

In [28]:
def train_and_evaluate(vectorizer_params, model_type, model_params):
    """Fit a TF-IDF + Naïve Bayes classifier and score it on the test split.

    Args:
        vectorizer_params: keyword arguments forwarded to ``TfidfVectorizer``.
        model_type: ``'MultinomialNB'`` or ``'ComplementNB'``.
        model_params: keyword arguments forwarded to the chosen model class.

    Returns:
        The macro-averaged F1-score of the predictions on
        ``newsgroups_test``.

    Raises:
        ValueError: if ``model_type`` is not one of the two supported names.
    """
    # The vectorizer is fitted on the training texts only; the test texts are
    # transformed with that fitted vectorizer.
    tfidf = TfidfVectorizer(**vectorizer_params)
    train_matrix = tfidf.fit_transform(newsgroups_train.data)
    test_matrix = tfidf.transform(newsgroups_test.data)

    # Resolve the model name to its class
    nb_classes = {'MultinomialNB': MultinomialNB, 'ComplementNB': ComplementNB}
    nb_class = nb_classes.get(model_type)
    if nb_class is None:
        raise ValueError("Invalid model type. Choose 'MultinomialNB' or 'ComplementNB'.")
    classifier = nb_class(**model_params)

    # Train on the training split and predict the test split
    classifier.fit(train_matrix, newsgroups_train.target)
    predictions = classifier.predict(test_matrix)

    return f1_score(newsgroups_test.target, predictions, average='macro')

Diferentes parámetros del vectorizador

In [29]:
# Vectorizer configurations to try
vectorizer_params_list = [
    {'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english'},
    {'max_df': 0.7, 'min_df': 5, 'ngram_range': (1, 2), 'stop_words': 'english'},
    {'max_df': 0.8, 'min_df': 3, 'ngram_range': (1, 1), 'stop_words': None},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'stop_words': 'english'}
]

# Naïve Bayes configurations to try
model_params = [
    {},
    {'alpha': 0.5},
    {'alpha': 1.0, 'fit_prior': False}
]

# Every (vectorizer config, model name, model config) combination, iterated
# as: vectorizer config -> model config -> model name
search_space = [
    (vec_kwargs, nb_name, nb_kwargs)
    for vec_kwargs in vectorizer_params_list
    for nb_kwargs in model_params
    for nb_name in ['MultinomialNB', 'ComplementNB']
]

best_f1_score = 0
best_params = None

for candidate in search_space:
    candidate_f1 = train_and_evaluate(*candidate)
    # Strict comparison: on a tie the earliest combination is kept
    if candidate_f1 > best_f1_score:
        best_f1_score, best_params = candidate_f1, candidate

print(f"Mejor f1-score: {best_f1_score}")
print(f"Mejores parámetros: {best_params}")


Mejor f1-score: 0.7072715088603252
Mejores parámetros: ({'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'stop_words': 'english'}, 'ComplementNB', {'alpha': 0.5})


# Matriz documento-término
Transponer la matriz documento-término. De esa manera se obtiene una matriz término-documento que puede ser interpretada como una colección de vectores de palabras.

In [30]:
# Fit a default TF-IDF vectorizer on the training texts
tfidfvect = TfidfVectorizer()
X_train = tfidfvect.fit_transform(newsgroups_train.data)

# Swap rows and columns: terms become the rows, documents the columns
X_train_transposed = X_train.T

# Report the type and shape of both matrices
print(f'Tipo de X_train: {type(X_train)}')
print(f'Tipo de X_train_transposed: {type(X_train_transposed)}')
print(f'Shape de X_train: {X_train.shape}')
print(f'Shape de X_train_transposed: {X_train_transposed.shape}')


Tipo de X_train: <class 'scipy.sparse._csr.csr_matrix'>
Tipo de X_train_transposed: <class 'scipy.sparse._csc.csc_matrix'>
Shape de X_train: (11314, 101631)
Shape de X_train_transposed: (101631, 11314)


In [31]:
# Look up the row index of a particular word (-1 when it is not in the vocabulary)
word = 'car'
word_index = tfidfvect.vocabulary_.get(word, -1)

if word_index == -1:
    print(f'La palabra "{word}" no se encuentra en el vocabulario.')
else:
    # Row of the transposed matrix for this word, as a dense 1-D array
    word_vector = X_train_transposed.getrow(word_index).toarray().ravel()
    print(f'Vector de la palabra "{word}":\n', word_vector)

    # Show the 5 documents in which this word has the highest weight
    top_docs_indices = np.argsort(word_vector)[::-1][:5]
    for doc_idx in top_docs_indices:
        print(f'\nDocumento {doc_idx} (Importancia: {word_vector[doc_idx]}):')
        print(newsgroups_train.data[doc_idx])

Vector de la palabra "car":
 [0.40464699 0.         0.         ... 0.         0.         0.        ]

Documento 8266 (Importancia: 0.5287256881150304):

Definitely!

Safety is an important criterium for me when buying a car. I won't buy a 
small car like a Civic or whatever.

Great = Safety + Handling + Speed  -  for me

Seems to me that you would be more "dead" in a small car than a large car 
after an accident.

Documento 8013 (Importancia: 0.46008568969406555):

If you don't already know it, you should call the bank/credit union/
finance company that holds the loan on your present car and get the
current payoff cost.

If you are trading in your current car on the new car, subtract the
payoff amount from the trade-in the dealer is giving you.  (If this
turns out to be a negative number, you need to reconsider the deal.)
Subtract this difference from the price of the new car.  This is the
size of the loan you will need for the new car.

The dealer will take care of paying off the loan

In [32]:
# Transpose the document-term matrix and analyze similarity between words
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Transpose X_train so that words are the rows
X_train_transposed = X_train.transpose()

def analyze_word_similarity(X, vocabulary, words, top_n=5):
    """Print, for each word, the terms whose row vectors are most similar to it.

    Args:
        X: sparse matrix with one row per term, indexed as in ``vocabulary``;
            rows are compared with cosine similarity.
        vocabulary: mapping from term to its row index in ``X``.
        words: iterable of terms to analyze.
        top_n: number of similar terms to print per word (default 5).

    Returns:
        None. Results are printed; a word missing from ``vocabulary`` is
        reported and skipped.
    """
    # Build the index -> term lookup from the vocabulary that was passed in,
    # so the printed terms always match the rows of `X` instead of depending
    # on a notebook-level dictionary that may belong to another vectorizer.
    index_to_term = {index: term for term, index in vocabulary.items()}

    for word in words:
        word_index = vocabulary.get(word, -1)
        if word_index == -1:
            print(f'La palabra "{word}" no se encuentra en el vocabulario.')
            continue

        # Dense (1, n_columns) vector for the query word
        word_vector = X.getrow(word_index).toarray()

        # Cosine similarity between the query word and every row of X
        similarities = cosine_similarity(word_vector, X).flatten()

        # Rank by descending similarity and drop the query word by index.
        # Terms with an identical row also score 1.0, so the query word is
        # not guaranteed to be the first element of the ranking.
        ranked = similarities.argsort()[::-1]
        top_words_indices = ranked[ranked != word_index][:top_n]

        print(f'\nPalabra: "{word}"')
        for idx in top_words_indices:
            term = index_to_term[idx]
            print(f'Palabra similar: "{term}" (Similitud de coseno: {similarities[idx]:.4f})')

# Build the index-to-word dictionary (inverse of `vocabulary_`)
idx2word = {v: k for k, v in tfidfvect.vocabulary_.items()}

# Pick 5 words to analyze
words_to_analyze = ['car', 'computer', 'space', 'health', 'government']

# Analyze word similarity
analyze_word_similarity(X_train_transposed, tfidfvect.vocabulary_, words_to_analyze)



Palabra: "car"
Palabra similar: "cars" (Similitud de coseno: 0.1797)
Palabra similar: "criterium" (Similitud de coseno: 0.1770)
Palabra similar: "civic" (Similitud de coseno: 0.1748)
Palabra similar: "owner" (Similitud de coseno: 0.1689)
Palabra similar: "dealer" (Similitud de coseno: 0.1681)

Palabra: "computer"
Palabra similar: "decwriter" (Similitud de coseno: 0.1563)
Palabra similar: "harkens" (Similitud de coseno: 0.1522)
Palabra similar: "deluged" (Similitud de coseno: 0.1522)
Palabra similar: "shopper" (Similitud de coseno: 0.1443)
Palabra similar: "the" (Similitud de coseno: 0.1361)

Palabra: "space"
Palabra similar: "nasa" (Similitud de coseno: 0.3304)
Palabra similar: "seds" (Similitud de coseno: 0.2966)
Palabra similar: "shuttle" (Similitud de coseno: 0.2928)
Palabra similar: "enfant" (Similitud de coseno: 0.2803)
Palabra similar: "seti" (Similitud de coseno: 0.2465)

Palabra: "health"
Palabra similar: "ohip" (Similitud de coseno: 0.3304)
Palabra similar: "provincial" (Simi

Slack
* https://join.slack.com/t/ceaiworkspace/shared_invite/zt-2l9un8yte-yWrXdu7msfCwr32VG6RIgQ

Github
* https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/blob/main/clase_1/ejercicios/Desafio_1.ipynb

Drive
* https://drive.google.com/drive/u/0/folders/1joS44xgaoCKapc44WmkjcZyjWJh4tQz0

Colab TP1
* https://colab.research.google.com/github/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/blob/main/clase_1/ejercicios/Desafio_1.ipynb
