### Vectorización de texto y modelo de clasificación Naïve Bayes con el dataset 20 newsgroups

In [2]:
import html

import numpy as np
from sklearn.base import clone
# 20newsgroups por ser un dataset clásico de NLP ya viene incluido y formateado
# en sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, ComplementNB

## Carga de datos

In [3]:
# Load the data (sklearn provides it already split into train and test subsets).
# The `remove` argument asks for headers, footers and quotes to be stripped.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Vectorización

In [4]:
# Instantiate a TF-IDF vectorizer with its default settings.
# See the sklearn documentation for the available constructor parameters.
tfidfvect = TfidfVectorizer()

In [5]:
# The raw text of each document is accessed through the `data` attribute.
newsgroups_train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [6]:
# With the usual sklearn interface we can fit the vectorizer
# (learn the vocabulary and compute the IDF vector)
# and transform the data in a single call.
X_train = tfidfvect.fit_transform(newsgroups_train.data)
# `X_train` is what we call the document-term matrix.

In [7]:
# Remember that count-based vectorizations are sparse,
# so sklearn conveniently returns the document vectors
# as sparse matrices.
# Rows of X_train are documents and columns are vocabulary terms.
print(type(X_train))
print(f'shape: {X_train.shape}')
print(f'cantidad de documentos: {X_train.shape[0]}')
print(f'tamaño del vocabulario (dimensionalidad de los vectores): {X_train.shape[1]}')

<class 'scipy.sparse._csr.csr_matrix'>
shape: (11314, 101631)
cantidad de documentos: 11314
tamaño del vocabulario (dimensionalidad de los vectores): 101631


In [8]:
# Once the vectorizer is fitted we can access attributes such as the learned
# vocabulary. It is a dictionary that maps terms to indices.
# The index is the term's position in the document vector.
tfidfvect.vocabulary_['car']

25775

In [9]:
# It is very handy to also have the inverse dictionary, mapping indices back to terms.
idx2word = {v: k for k,v in tfidfvect.vocabulary_.items()}

In [10]:
# `y_train` holds the targets, which are integer class ids.
y_train = newsgroups_train.target
# Peek at the first ten labels.
y_train[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [11]:
# There are 20 classes, one per newsgroup.
# NOTE: the class ids and names shown here are read from the test split.
print(f'clases {np.unique(newsgroups_test.target)}')
newsgroups_test.target_names

clases [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Similaridad de documentos

In [12]:
# Let's look at document similarity. Pick some document by its index.
idx = 4811
print(newsgroups_train.data[idx])

THE WHITE HOUSE

                  Office of the Press Secretary
                   (Pittsburgh, Pennslyvania)
______________________________________________________________
For Immediate Release                         April 17, 1993     

             
                  RADIO ADDRESS TO THE NATION 
                        BY THE PRESIDENT
             
                Pittsburgh International Airport
                    Pittsburgh, Pennsylvania
             
             
10:06 A.M. EDT
             
             
             THE PRESIDENT:  Good morning.  My voice is coming to
you this morning through the facilities of the oldest radio
station in America, KDKA in Pittsburgh.  I'm visiting the city to
meet personally with citizens here to discuss my plans for jobs,
health care and the economy.  But I wanted first to do my weekly
broadcast with the American people. 
             
             I'm told this station first broadcast in 1920 when
it reported that year's presidential elec

In [13]:
# Measure the cosine similarity between the chosen document and every training document.
# The result has a single row (one query document), so [0] extracts it as a 1-D array.
cossim = cosine_similarity(X_train[idx], X_train)[0]

In [14]:
# Similarity values sorted from highest to lowest
# (np.sort is ascending, hence the [::-1] reversal).
np.sort(cossim)[::-1]

array([1.        , 0.70930477, 0.67474953, ..., 0.        , 0.        ,
       0.        ])

In [15]:
# ...and the indices of the documents those values correspond to, most similar first.
np.argsort(cossim)[::-1]

array([ 4811,  6635,  4253, ...,  1534, 10055,  4750])

In [16]:
# The 5 most similar documents: positions 1..5 of the descending ranking.
# Position 0 is skipped — presumably it is the query document itself (similarity 1.0).
mostsim = np.argsort(cossim)[::-1][1:6]

In [17]:
# The original document belongs to this class:
newsgroups_train.target_names[y_train[idx]]

'talk.politics.misc'

In [18]:
# ...and the 5 most similar documents belong to these classes:
for i in mostsim:
  print(newsgroups_train.target_names[y_train[i]])

talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc
talk.politics.misc


### Modelo de clasificación Naïve Bayes

In [19]:
# Instantiating and training a Naive Bayes classifier with sklearn is straightforward.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [20]:
# With the vectorizer already fitted on train, vectorize the texts
# of the test set (transform only, no re-fitting).
X_test = tfidfvect.transform(newsgroups_test.data)
y_test = newsgroups_test.target
y_pred =  clf.predict(X_test)

In [21]:
# The F1-score is a suitable metric for reporting classification performance;
# it is robust to class imbalance. 'macro' averaging is the mean of the
# per-class F1-scores. 'micro' averaging is equivalent to accuracy, which
# is not a good metric when datasets are imbalanced.
f1_score(y_test, y_pred, average='macro')

0.5854345727938506

### Consigna del desafío 1

**1**. Vectorizar documentos. Tomar 5 documentos al azar y medir similaridad con el resto de los documentos.
Estudiar los 5 documentos más similares de cada uno y analizar si tiene sentido
la similaridad según el contenido del texto y la etiqueta de clasificación.



In [22]:
# Start by choosing the 5 documents to study.
# The indices are hard-coded, so the same documents are used on every run.
idx_1 = 2517
idx_2 = 4962
idx_3 = 6851
idx_4 = 7585
idx_5 = 10515


In [23]:
# Document 1: print the raw text of the first selected document.
print(newsgroups_train.data[idx_1])




     Depends on what you mean by classy. From what I've heard about
him, he was about as classy as Harold Ballard.  Only difference was
that back then almost all the owners were like that, so he seemed okay
by comparison.  Read the book "Net Worth" for one view of what Smythe
(and Norris and Adams and Campbell) were like. 



     Even more specifically, I think what Roger was saying (and I said
it previously too) is that these are NOT the people who made the
league great, so why should divisions, conferences etc. be named after
them instead of Morenz, Vezina, Howe, Orr etc., the people who DID
make it great.  Instead, the NHL has chosen to immortalize the men who
got rich off of the men who made the game great.  



In [24]:
# Document 2: print the raw text of the second selected document.
print(newsgroups_train.data[idx_2])





In most cases information you come by properly is yours to use as you wish,
but there are certainly exceptions.  If you write a paper which includes
sufficiently detailed information on how to build a nuclear weapon, it is
classified.  As I understand the law, nuclear weapons design is
_automatically_ classified even if you do the work yourself.  I believe you
are then not allowed to read your own paper.

A less serious example: if you tell drivers about a speed trap they are
about to run into, you can be fined, even though you might argue that you
broke no law when you discovered the location of the policeman.  The charge
is interfering with a police officer, which is quite similar what you would
be doing by reverse engineering the Clipper chip.

Don't tell me that you think this violates the Constitution -- find some
court cases which have struck down such laws.  Many people would not be
comforted by the fact that the government violated their rights when it
imprisoned them.



In [25]:
# Document 3: print the raw text of the third selected document.
print(newsgroups_train.data[idx_3])


I'd like to remind people of the withering of the fig tree and Jesus
driving the money changers et. al. out of the temple.  I think those
were two instances of Christ showing anger (as part of His human side).


In [26]:
# Document 4: print the raw text of the fourth selected document.
print(newsgroups_train.data[idx_4])

Note: I am cross-posting (actually, emailing) this to 
bit.listserv.catholic while main posting goes to 
soc.religion.christian.

[Quotations omitted.  This is in response to a question about
the Immaculate Conception.  I explained it, but left justification
up to our Catholic readers.  --clh]

There is no direct reference in the Holy Scripture except for the
mention of Mary's _blessedness_/full of grace in the "Annunciation" by
Angel Gabriel in Luke 1:26-28

 And in the 6th month the angel Gabriel was sent from God unto
 a city of Galilee, named Nazareth.  To a virgin espoused to a
 man whose name was Joseph, of the house of David; and the virgin's
 name was Mary.  And the angel came unto her and said, _"Hail,
 thou that art highly favoured, the Lord is with thee: blessed
 art thou among women."_

Now, now, hold that line of thought - "the Lord is with Mary &
blessed art thou among women" - while you read on....

In the book, "First Lady of the World, A Popular History of
Devotion to 

In [27]:
# Document 5: print the raw text of the fifth selected document.
print(newsgroups_train.data[idx_5])




I can see high-voltage type display devices being vulnerable (CRTs,
plasma displays, etc.)  But Jason beat me to this question.  What
about EM radiation from low-voltage items like LCD displays?


### Ahora revisamos los detalles requeridos para cada documento:

In [28]:
from itertools import count
# For each selected document, print the classes of its 5 most similar
# training documents (by cosine similarity over the document-term matrix).
for doc_id in [idx_1, idx_2, idx_3, idx_4, idx_5]:
  print(f'Documento {doc_id}')
  # Cosine similarity between this document and every training document;
  # the result has a single row, extracted with [0].
  cossim = cosine_similarity(X_train[doc_id], X_train)[0]

  # Rank all documents once, most similar first, and keep positions 1..5.
  # Position 0 is skipped — presumably the query document itself (similarity 1.0).
  # The previous version recomputed this ranking three times and passed it to
  # itertools.count, which builds an infinite counter rather than counting
  # elements; those unused intermediates were dropped.
  mostsim = np.argsort(cossim)[::-1][1:6]

  # Classes of the 5 most similar documents.
  for i in mostsim:
    print(newsgroups_train.target_names[y_train[i]])

Documento 2517
rec.sport.hockey
rec.sport.hockey
talk.politics.mideast
talk.politics.mideast
soc.religion.christian
Documento 4962
sci.crypt
talk.politics.mideast
talk.politics.mideast
talk.religion.misc
soc.religion.christian
Documento 6851
alt.atheism
talk.religion.misc
soc.religion.christian
soc.religion.christian
soc.religion.christian
Documento 7585
soc.religion.christian
soc.religion.christian
soc.religion.christian
alt.atheism
soc.religion.christian
Documento 10515
sci.crypt
sci.electronics
sci.electronics
comp.graphics
sci.space


In [29]:
# prompt: para cada documento arriba, chequear los 5 documentos más similares de cada uno analizar si tiene sentido la similaridad según el contenido del texto y la etiqueta de clasificación con el dataset de entrenamiento

from IPython.display import display, HTML

def display_table(doc_ids, newsgroups_train, X_train, y_train):
    """Render an HTML table with the nearest neighbours of each document.

    For every id in `doc_ids` the table shows the document's class, the ids
    of the 5 documents ranked right after the top hit by cosine similarity
    against all rows of `X_train`, and the classes of those documents.

    Args:
        doc_ids: iterable of row indices into `X_train` / `y_train`.
        newsgroups_train: object exposing `target_names` (class id -> name).
        X_train: document-term matrix; row `i` is document `i`.
        y_train: class id of each document.

    Returns:
        None. The table is shown through IPython's `display(HTML(...))`.
    """
    label_names = newsgroups_train.target_names
    pieces = ["<table><thead><tr><th>Document ID</th><th>Original Document Class</th><th>5 Most Similar Document IDs</th><th>5 Most Similar Classes</th></tr></thead><tbody>"]
    for doc_id in doc_ids:
        scores = cosine_similarity(X_train[doc_id], X_train)[0]
        # Descending ranking; slot 0 is skipped (presumably the document itself).
        neighbours = np.argsort(scores)[::-1][1:6]
        ids_cell = "".join(f"{n}<br>" for n in neighbours)
        classes_cell = "".join(f"{label_names[y_train[n]]}<br>" for n in neighbours)
        pieces.append(
            f"<tr><td>{doc_id}</td><td>{label_names[y_train[doc_id]]}</td><td>{ids_cell}</td><td>{classes_cell}</td></tr>"
        )
    pieces.append("</tbody></table>")
    display(HTML("".join(pieces)))

# Example usage: requires newsgroups_train, X_train and y_train to be defined already.
# The five document indices are re-assigned here with the same values chosen
# earlier in the notebook; change them to inspect other documents.
idx_1 = 2517
idx_2 = 4962
idx_3 = 6851
idx_4 = 7585
idx_5 = 10515

# Render the summary table (ids and classes of the nearest neighbours).
display_table([idx_1, idx_2, idx_3, idx_4, idx_5], newsgroups_train, X_train, y_train)

Document ID,Original Document Class,5 Most Similar Document IDs,5 Most Similar Classes
2517,rec.sport.hockey,1898 7087 9623 1292 5826,rec.sport.hockey rec.sport.hockey talk.politics.mideast talk.politics.mideast soc.religion.christian
4962,sci.crypt,9115 8726 4564 8754 10106,sci.crypt talk.politics.mideast talk.politics.mideast talk.religion.misc soc.religion.christian
6851,soc.religion.christian,913 7312 707 634 10754,alt.atheism talk.religion.misc soc.religion.christian soc.religion.christian soc.religion.christian
7585,soc.religion.christian,8222 5572 5099 913 10683,soc.religion.christian soc.religion.christian soc.religion.christian alt.atheism soc.religion.christian
10515,sci.crypt,6535 386 5067 7769 4535,sci.crypt sci.electronics sci.electronics comp.graphics sci.space


In [30]:
from IPython.display import HTML, display

def display_table(doc_ids, newsgroups_train, X_train, y_train):
    """Render an HTML table with text snippets of each document's neighbours.

    For every id in `doc_ids` the table shows the document's class and, for
    the 5 documents ranked right after the top hit by cosine similarity
    against all rows of `X_train`, their id, class and first 10 lines of text.

    The snippet text is HTML-escaped before being embedded: the posts are
    plain text and may contain `<`, `>` or `&` (quoted replies, e-mail
    addresses), which would otherwise be interpreted as markup and corrupt
    or hide parts of the rendered table.

    Args:
        doc_ids: iterable of row indices into `X_train` / `y_train`.
        newsgroups_train: object exposing `target_names` and `data`
            (raw text per document).
        X_train: document-term matrix; row `i` is document `i`.
        y_train: class id of each document.

    Returns:
        None. The table is shown through IPython's `display(HTML(...))`.
    """
    table_html = "<table border='1'><thead><tr><th>Document ID</th><th>Original Document Class</th><th>Top 5 Similar Documents (Text Snippets)</th></tr></thead><tbody>"
    for doc_id in doc_ids:
        cossim = cosine_similarity(X_train[doc_id], X_train)[0]
        # Descending ranking; slot 0 is skipped (presumably the document itself).
        mostsim = np.argsort(cossim)[::-1][1:6]
        original_class = newsgroups_train.target_names[y_train[doc_id]]
        snippets = []
        for i in mostsim:
            # Keep only the first 10 lines and escape each one so the raw
            # text is displayed literally; <br> is the only markup injected.
            lines = newsgroups_train.data[i].split('\n')[:10]
            text_snippet = "<br>".join(html.escape(line) for line in lines)
            snippets.append(
                f"<p>Document ID: {i}<br>Class: {newsgroups_train.target_names[y_train[i]]}<br>Text:<br>{text_snippet}</p>"
            )
        similar_docs_text = "".join(snippets)

        table_html += f"<tr><td>{doc_id}</td><td>{original_class}</td><td>{similar_docs_text}</td></tr>"
    table_html += "</tbody></table>"
    display(HTML(table_html))

# Example usage: the five document indices are re-assigned here with the same
# values chosen earlier in the notebook.
idx_1 = 2517
idx_2 = 4962
idx_3 = 6851
idx_4 = 7585
idx_5 = 10515

# Render the table with text snippets of the nearest neighbours.
display_table([idx_1, idx_2, idx_3, idx_4, idx_5], newsgroups_train, X_train, y_train)

Document ID,Original Document Class,Top 5 Similar Documents (Text Snippets)
2517,rec.sport.hockey,"Document ID: 1898 Class: rec.sport.hockey Text: 	I think that you are incorrect, Roger. Patrick, Smythe and Adams all played or coached in the league before becoming front office types. Hence, they did help build the league, although they were not great players themselves. I agree that a name is a name is a name, and if some peopleDocument ID: 7087 Class: rec.sport.hockey Text: Punch Imlach's contributions as a coach and GM were far greater than those of the above combined. Should we name a division or trophy after him? Smythe and Norris and the bunch were honoured purely because they were powerful owners. As owners they certainly did help to build the league but whether they developed the game is another question altogether. Are we going to honour those who contributed to the league's evolution or are we going to honour those who contributed to the glory of the sport itself? Document ID: 9623 Class: talk.politics.mideast Text: Accounts of Anti-Armenian Human Right Violations in Azerbaijan #012  Prelude to Current Events in Nagorno-Karabakh  +---------------------------------------------------------+  | |  | I saw a naked girl with her hair down. They were |  | dragging her. She kept falling because they were |  | pushing her and kicking her. She fell down, it was |  | muddy there, and later other witnesses who saw it from |  | their balconies told us, they seized her by the hair |Document ID: 1292 Class: talk.politics.mideast Text: Accounts of Anti-Armenian Human Right Violations in Azerbaijan #008 Part B  Prelude to Current Events in Nagorno-Karabakh  (Part B of #008)  +------------------------------------------------------------------+  | |  | ""Oh, yes, I just remembered. 
While they were raping me they |  | repeated quite frequently, ""Let the Armenian women have babies |  | for us, Muslim babies, let them bear Azerbaijanis for the |Document ID: 5826 Class: soc.religion.christian Text: A listmember (D Andrew Killie, I think) wrote, in response to the suggestion that genocide may sometimes be the will of God:  > Any God who works that way is indescribably evil,  > and unworthy of my worship or faith. Nobuya ""Higgy"" Higashiyama replied (as, in substance, did others):  > Where is your source of moral standards by which you judge God's  > behavior?"
4962,sci.crypt,"Document ID: 9115 Class: sci.crypt Text: It is incompetent, like almost anything you have posted here, so you'll be flamed, sorry. %/$( your ""20 years of background in two Fortune 50 companies""; I've lived 30 years under a totalitarian regime, and boy, I *can* recognize a totalitarian plot when I see one... Document ID: 8726 Class: talk.politics.mideast Text: [After a small refresh Hasan got on the track again.]  |> |> I get the impression Hasan realized he goofed and is now  |> |> trying to drop the thread. Let him. It might save some  |> |> miniscule portion of his sorry face.Document ID: 4564 Class: talk.politics.mideast Text: [ stuff deleted ]  |> Are you calling names, or giving me a title? If the first, read your |> paragraph above, if not I accept the title, in order to let you get into the  |> um, well, debate again.  Hasan replies:  I didnot know that ""Master of wisdom"" can be ""name clling"" too,  unless you consider yourself deserve-less ! Document ID: 8754 Class: talk.religion.misc Text: /(hudson) /If someone inflicts pain on themselves, whether they enjoy it or not, they /are hurting themselves. They may be permanently damaging their body. That is true. It is also none of your business. Some people may also reason that by reading the bible and being a Xtian you are permanently damaging your brain. By your logic, it would be OK for them to come into your home, take away your bible, and send you offDocument ID: 10106 Class: soc.religion.christian Text: [In looking through my files this weekend, I ran across some lyrics from various rock groups that have content. Here are two from Black Sabbath's ""Master of Reality"". I'll say this much for the music of the '60's and early '70's, at least they asked questions of significance. Jethro Tull is another to asked and wrote about things that caused one to wonder. --Rex] AFTER FOREVER Have you ever thought about your soul--  can it be saved?"
6851,soc.religion.christian,"Document ID: 913 Class: alt.atheism Text: The recent rise of nostalgia in this group, combined with the  incredible level of utter bullshit, has prompted me to comb  through my archives and pull out some of ""The Best of Alt.Atheism""  for your reading pleasure. I'll post a couple of these a day  unless group concensus demands that I stop, or I run out of good  material.  I haven't been particularly careful in the past about saving  attributions. I think the following comes from John A. Johnson,  but someone correct me if I'm wrong. This is probably the longestDocument ID: 7312 Class: talk.religion.misc Text: DROPLET VOL 1, No 11, Part 3 D R O P L E T From The Vast Ocean Of The Miraculous Qur'an Translations from the Arabic and Turkish Writings of Bediuzzaman Said Nursi, The Risale-i Noor VOL 1, No 11, Part 3 ------------------------------------------------------------------Document ID: 707 Class: soc.religion.christian Text: I differ with our moderator on this. I thought the whole idea of God coming down to earth to live as one of us ""subject to sin and death"" (as one of the consecration prayers in the Book of Common Prayer (1979) puts it) was that Jesus was tempted, but did not succumb. If sin is not part of the basic definition of humanity, then Jesus ""fully human"" (Nicea) would not be ""subject to sin"", but then the Resurrection loses some of its meaning, because we encounter our humanity most powerfully when we sin. To distinguish between ""human"" and ""fallen human"" makes Jesus less like one of us at the time we need him most. Document ID: 634 Class: soc.religion.christian Text: The existence of repeated earth lives and destiny (karma) does not mean that everything that happens is predetermined by past deeds. There is an oriental view of it that tends in that direction, but I did not subscribe to that view. 
God may choose one individual over another as the fit instrument for his plans, but that does not preclude that the development of that individual into what he is in this earthly life is not the result of a longer course of development. I do not, and Rudolf Steiner did not, subscribe to the oriental viewDocument ID: 10754 Class: soc.religion.christian Text: [ Much deletion. He is trying to explain the Immaculate Conception and the Assumption of Mary.] If this is true than why in the Genesis story is God concerned that Adam and Eve might also eat from the Tree of Life and live forever and be like gods? Eating of the tree of life would not take away the effects of eating of the Tree of Knowledge. Is there any reason to assume that they had already eaten of the Tree of Life and so had"
7585,soc.religion.christian,"Document ID: 8222 Class: soc.religion.christian Text: Biblical basis for the Immaculate Conception: 1) ""I will put enmity between you [the Serpent] and the woman, and between your seed and her seed, she [can also be read he] shall crush your head and you shall bruise her [or his] heel.""  -Genesis 3.15 2) ""He who commits sin is of the devil ...""  -1 John 3.8 Document ID: 5572 Class: soc.religion.christian Text: A few points about Mary's being taken into heaven at the end of her life on earth: One piece of evidence for Mary's assumption into heaven is the fact that no Christian church ever claimed to be the sight where she was buried. Some Christian churches claimed to be located at the final resting places of Peter, Mark, and other saints, but no one ever claimed to possess the body of Mary, the greatest of the saints. Why? Because everyone knew that she had been taken up into heaven. Document ID: 5099 Class: soc.religion.christian Text: It was a gift from God. I think basically the reasoning was that the tradition in the Church held that Mary was also without sin as was Jesus. As the tenets of faith developed, particularly with Augustine, sin was more and more equated with sex, and thus Mary was assumed to be a virgin for life (since she never sinned, and since she was the spouse of God, etc.) Since we also had this notion of original sin, ie. that man is born with a predisposition to sin, and since Mary did not have this predisposition because she did not ever sin, she didn't have original sin. When science discovered the process of conception, the next step was to assume that Mary was conceived without original sin, the Immaculate Conception.Document ID: 913 Class: alt.atheism Text: The recent rise of nostalgia in this group, combined with the  incredible level of utter bullshit, has prompted me to comb  through my archives and pull out some of ""The Best of Alt.Atheism""  for your reading pleasure. 
I'll post a couple of these a day  unless group concensus demands that I stop, or I run out of good  material.  I haven't been particularly careful in the past about saving  attributions. I think the following comes from John A. Johnson,  but someone correct me if I'm wrong. This is probably the longestDocument ID: 10683 Class: soc.religion.christian Text: The argument for Luke's genealogy being that of Mary is very weak. According to Luke 3:23  And when he began his ministry, Jesus himself was about thirty years of age, being supposedly the son of Joseph,  the son of Eli, Aside from the fact that Mary is not mentioned, there are two possible"
10515,sci.crypt,"Document ID: 6535 Class: sci.crypt Text: are LCD displays vulnerable to tempest? i'll second that. jason Document ID: 386 Class: sci.electronics Text: Sci.E(E) netters: I am setting out to build and market a small electronic device that requires an LCD display. All of the analog electronics are working fine, I have ordered a PIC ICE (not vice versa) since the PICs are so cheap and low-power, but I am having a devil of a time finding any LCD displays in the 6-8 digit range that are priced as low as I need. I am looking for somthing in the range of $1 in quantities of about 1000-10,000. Document ID: 5067 Class: sci.electronics Text: Since your MOSFET is a 1972 vintage, it's probably not a very good one by today's standards. If you have an idea about its voltage and current ratings, e.g. 60VDC @ 6A, you can probably get away with replacing it with anything with better specs. Early MOSFETS had a gate-source voltage rating of approximately +/- 20 VDCmax, and they would usually turn completely ""ON"" at +10VDC. Otherwise, MOSFETS are not really mysterious -- they're more or less voltage controlled current sources. If the MOSFET in your circuit is Document ID: 7769 Class: comp.graphics Text: I require BGI drivers for Super VGA Displays and Super XVGA Displays. Does anyone know where I could obtain the relevant drivers ? (FTP sites ??) 	Regards Document ID: 4535 Class: sci.space Text: It wasn't especially prominent, as I recall. However, quite possibly it's no longer on display; NASM, like most museums, has much more stuff than it can display at once, and does rotate the displays occasionally."


#### Conclusión sobre los documentos similares:
##### La similaridad entre los documentos tiene sentido según el contenido del texto y la etiqueta de clasificación, como se puede observar arriba en el cuadro comparativo entre el texto de origen y los documentos similares.


**2**. Entrenar modelos de clasificación Naïve Bayes para maximizar el desempeño de clasificación
(f1-score macro) en el conjunto de datos de test. Considerar cambiar parámetros
de instanciación del vectorizador y los modelos y probar modelos de Naïve Bayes Multinomial
y ComplementNB.



### Modelo de clasificación Naïve Bayes

In [31]:
# Candidate vectorizers: TF-IDF and raw counts, each with default settings,
# with unigrams+bigrams, and with document-frequency pruning.
vectorizers = [
    TfidfVectorizer(),
    TfidfVectorizer(ngram_range=(1, 2)),  # also try bigrams
    TfidfVectorizer(max_df=0.90, min_df=2),
    CountVectorizer(),
    CountVectorizer(ngram_range=(1, 2)),
    CountVectorizer(max_df=0.90, min_df=2)
]

# Candidate Naive Bayes models and smoothing parameters.
# These instances are only templates: each one is cloned before fitting.
naive_bayes_models = [
    MultinomialNB(),
    MultinomialNB(alpha=0.2),  # lighter smoothing to look for an improvement
    ComplementNB(),
    ComplementNB(alpha=0.2)
]

best_f1_score = 0
best_vectorizer = None
best_model = None

# NOTE(review): the best combination is selected on the test set, as the
# exercise statement asks; the reported score is therefore optimistic.
for vectorizer in vectorizers:
    # Loop-local names so the notebook-level X_train / X_test built by
    # earlier cells are not silently replaced by the last candidate's matrices.
    X_train_vec = vectorizer.fit_transform(newsgroups_train.data)
    X_test_vec = vectorizer.transform(newsgroups_test.data)

    for model_template in naive_bayes_models:
        # Fit a fresh copy for every (vectorizer, model) pair. Re-fitting the
        # shared instance would leave `best_model` pointing at an estimator
        # trained on a later vectorizer's feature space.
        model = clone(model_template)
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        f1 = f1_score(y_test, y_pred, average='macro')
        print(f"Vectorizer: {vectorizer}, Model: {model}, F1-score: {f1}")

        if f1 > best_f1_score:
            best_f1_score = f1
            best_vectorizer = vectorizer
            best_model = model

print(f"\nBest F1-score: {best_f1_score}")
print(f"Best Vectorizer: {best_vectorizer}")
print(f"Best Model: {best_model}")

Vectorizer: TfidfVectorizer(), Model: MultinomialNB(), F1-score: 0.5854345727938506
Vectorizer: TfidfVectorizer(), Model: MultinomialNB(alpha=0.2), F1-score: 0.6424801988698626
Vectorizer: TfidfVectorizer(), Model: ComplementNB(), F1-score: 0.692953349950875
Vectorizer: TfidfVectorizer(), Model: ComplementNB(alpha=0.2), F1-score: 0.6997383695489467
Vectorizer: TfidfVectorizer(ngram_range=(1, 2)), Model: MultinomialNB(), F1-score: 0.5398024946338422
Vectorizer: TfidfVectorizer(ngram_range=(1, 2)), Model: MultinomialNB(alpha=0.2), F1-score: 0.6040818249095631
Vectorizer: TfidfVectorizer(ngram_range=(1, 2)), Model: ComplementNB(), F1-score: 0.6816839649266335
Vectorizer: TfidfVectorizer(ngram_range=(1, 2)), Model: ComplementNB(alpha=0.2), F1-score: 0.7032223083285436
Vectorizer: TfidfVectorizer(max_df=0.9, min_df=2), Model: MultinomialNB(), F1-score: 0.5970494639319617
Vectorizer: TfidfVectorizer(max_df=0.9, min_df=2), Model: MultinomialNB(alpha=0.2), F1-score: 0.655655758634977
Vectorize

**3**. Transponer la matriz documento-término. De esa manera se obtiene una matriz
término-documento que puede ser interpretada como una colección de vectorización de palabras.
Estudiar ahora similaridad entre palabras tomando 5 palabras y estudiando sus 5 más similares. **La elección de palabras no debe ser al azar para evitar la aparición de términos poco interpretables, elegirlas "manualmente"**.




In [34]:
# Instantiate a vectorizer: we pick the second-best CountVectorizer-type vectorizer obtained in the previous step (the best one takes too much memory and crashes).
# NOTE(review): despite the `tfidfvect_custom` name this is a CountVectorizer, and max_df=0.98 differs from the max_df=0.90 used in the earlier comparison — confirm which value is intended.
tfidfvect_custom = CountVectorizer(max_df=0.98, min_df=2)

# NOTE: this rebinds the notebook-level X_train to the count-based document-term matrix.
X_train = tfidfvect_custom.fit_transform(newsgroups_train.data)

In [40]:
# Transpose the document-term matrix: each row is now a term described by
# the documents it appears in. CSR layout keeps row selection cheap.
X_train_transposed = X_train.T.tocsr()

# Vocabulary learned by the vectorizer (term -> row index in the transposed
# matrix) and its inverse (index -> term) for constant-time reverse lookups.
vocabulary = tfidfvect_custom.vocabulary_
custom_idx2word = {index: term for term, index in vocabulary.items()}

# Manually chosen words; any word missing from the vocabulary is skipped.
words = ['book', 'religion', 'player', 'market', 'god']
word_indices = [vocabulary[word] for word in words if word in vocabulary]

# Cosine similarity of the chosen terms against every term. Only these rows
# are computed: the full term-by-term matrix is dense and grows with the
# square of the vocabulary size. Row k corresponds to word_indices[k].
if word_indices:
    term_similarity = cosine_similarity(X_train_transposed[word_indices], X_train_transposed)
else:
    term_similarity = np.empty((0, X_train_transposed.shape[0]))

# Show the 5 most similar words for each chosen word.
for row, word_index in enumerate(word_indices):
    word = custom_idx2word[word_index]
    ranking = np.argsort(term_similarity[row])[::-1]
    # Drop the word itself explicitly rather than assuming it sorts first
    # (another term with an identical document profile would tie at 1.0).
    similar_word_indices = ranking[ranking != word_index][:5]
    similar_words = [custom_idx2word[i] for i in similar_word_indices]
    print(f"Palabra: {word}")
    print(f"Palabras similares: {similar_words}")
    print("-" * 20)

Palabra: book
Palabras similares: ['books', 'published', 'anania', 'shirak', 'diplomats']
--------------------
Palabra: religion
Palabras similares: ['religious', 'theists', 'atheists', 'atheist', 'atheism']
--------------------
Palabra: player
Palabras similares: ['team', 'hockey', 'nhl', 'league', 'ihl']
--------------------
Palabra: market
Palabras similares: ['mdc', 'condor', 'ventures', 'venture', 'launch']
--------------------
Palabra: god
Palabras similares: ['christ', 'bible', 'atheist', 'salvation', 'wicked']
--------------------


#### Con esto concluimos que fue posible identificar palabras similares y que el cálculo de similitud funciona como se esperaba.