# Definindo o embedding do Top2Vec

## Carregando os dados

In [None]:
import json
import string

def lerDataSet(caminho="/content/biologia_train_dataset.json"):
    """Load the training dataset and split it into parallel columns.

    The JSON file is expected to hold a list of records; the records are
    transposed with ``zip`` and unpacked into three columns.

    Args:
        caminho: Path of the JSON dataset. Defaults to the location that was
            originally hard-coded in this notebook.

    Returns:
        A tuple ``(textos, titulos, keywords)`` of three tuples. An empty
        dataset yields three empty tuples.

    Raises:
        ValueError: If the records do not transpose into exactly three
            columns.
    """
    with open(caminho, "r", encoding="utf-8") as file:
        docs = json.load(file)

    # zip(*[]) yields nothing, so the 3-way unpacking below would fail with a
    # confusing "not enough values to unpack" error on an empty dataset.
    if not docs:
        return ((), (), ())

    textos, titulos, keywords = zip(*docs)

    return (textos, titulos, keywords)

In [None]:
def preprocess_text(textos, nlp_model):
    """Lowercase each text, tokenize it and drop uninformative tokens.

    A token is discarded when the pipeline flags it as a stop word,
    punctuation or digit, or when its text contains any ASCII punctuation
    character. The surviving tokens are joined with single spaces.

    Args:
        textos: Iterable of strings.
        nlp_model: Callable mapping a string to an iterable of tokens that
            expose ``is_stop``, ``is_punct`` and ``is_digit``.

    Returns:
        A list with one cleaned string per input text, in input order.
    """
    simbolos = set(string.punctuation)

    def manter(token):
        # Pipeline flags are checked first; the character scan only runs for
        # tokens that pass all of them.
        if token.is_stop or token.is_punct or token.is_digit:
            return False
        return simbolos.isdisjoint(str(token))

    resultado = []
    for texto in textos:
        tokens_validos = [
            str(token) for token in nlp_model(texto.lower()) if manter(token)
        ]
        resultado.append(' '.join(tokens_validos))
    return resultado

In [None]:
!pip install spacy


In [None]:
!python3 -m spacy download pt_core_news_lg

In [None]:
import spacy

# Portuguese spaCy pipeline used by preprocess_text for tokenization and the
# stop-word / punctuation / digit flags.
nlp = spacy.load("pt_core_news_lg")

# lerDataSet() returns (textos, titulos, keywords), in that order.
dataSet = lerDataSet()
textos = preprocess_text(dataSet[0], nlp)
# Fixed: these used indices 0 and 1, which bound the raw texts to `titulos`
# and the titles to `keywords`.
titulos = dataSet[1]
keywords = dataSet[2]

In [None]:
import json

def salvarTextosProcessados(
    textos: list,
    caminho: str = "/content/drive/MyDrive/Textos/textosBiologiaNoLematize.json",
):
    """Serialize the preprocessed texts to a JSON file.

    Args:
        textos: JSON-serializable list of preprocessed texts.
        caminho: Destination file. Defaults to the Drive location that was
            originally hard-coded in this notebook.
    """
    # Explicit encoding keeps the file readable regardless of the platform
    # default; json.dump streams straight to the file handle.
    with open(caminho, "w", encoding="utf-8") as file:
        json.dump(textos, file)

def abrirTextos(
    caminho: str = "/content/drive/MyDrive/Textos/textosBiologiaNoLematize.json",
):
    """Load the preprocessed texts from a JSON file.

    Args:
        caminho: File to read. Defaults to the Drive location that was
            originally hard-coded in this notebook.

    Returns:
        The decoded JSON content of the file.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(caminho, "r", encoding="utf-8") as file:
        return json.load(file)

In [None]:
# Persist the preprocessed corpus to the JSON file that abrirTextos() reads.
salvarTextosProcessados(textos)

In [None]:
# Reload the preprocessed corpus from the JSON file written by
# salvarTextosProcessados().
textos = abrirTextos()

## Avaliação

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence encoder used by similaridade() to embed the predicted topics and
# the expected labels before comparing them.
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
import pandas as pd

def similaridade(topicoModelado, topicoCorreto):
    """Return the cosine similarity between two texts as a plain float.

    Both inputs are embedded with the module-level ``model`` and compared
    with ``util.cos_sim``; entry ``[0][0]`` of the result is returned.

    Args:
        topicoModelado: Text produced by the topic model.
        topicoCorreto: Expected (reference) text.

    Returns:
        The similarity score converted to a Python ``float``.
    """
    vetor_obtido = model.encode(topicoModelado)
    vetor_esperado = model.encode(topicoCorreto)

    matriz = util.cos_sim(vetor_obtido, vetor_esperado)
    primeira_celula = matriz[0][0]
    return float(primeira_celula.item())

def similaridades(outputModelo, labels):
    """Score each predicted topic against its expected label.

    The two sequences are walked in lockstep with ``zip``, so extra items in
    the longer one are ignored.

    Args:
        outputModelo: Predicted topic strings.
        labels: Expected topic strings, aligned with ``outputModelo``.

    Returns:
        A list with one ``similaridade`` score per pair.
    """
    return [
        similaridade(obtido, esperado)
        for obtido, esperado in zip(outputModelo, labels)
    ]

def salvarAnalise(outputModelo, labels, similaridadeCosseno, nome):
    """Write the evaluation results to a CSV file on Drive.

    The table has one row per evaluated item with the predicted topic, the
    expected topic and their similarity. The mean similarity is written at
    row label 0 of an extra ``media_similaridade`` column.

    Args:
        outputModelo: Predicted topic strings.
        labels: Expected topic strings.
        similaridadeCosseno: Similarity score of each pair.
        nome: Suffix appended to the output file name.
    """
    colunas = {
        'topicos obtidos': outputModelo,
        'topicos esperados': labels,
        'similaridade': similaridadeCosseno,
    }
    tabela = pd.DataFrame(colunas)

    media = tabela['similaridade'].mean()
    tabela.at[0, 'media_similaridade'] = media

    destino = f'/content/drive/MyDrive/ModelosNLP/Analises/resultadosAnaliseNolematize{nome}.csv'
    tabela.to_csv(destino, index=False)

In [None]:
# Explicit UTF-8, matching lerDataSet, so the accented Portuguese text is
# decoded the same way regardless of the platform's default encoding.
with open("/content/avaliacaoBio.json", "r", encoding="utf-8") as file:
    avaliacao = json.load(file)

import spacy

nlp = spacy.load("pt_core_news_lg")

# Questions and manual labels go through the same preprocessing applied to
# the training corpus.
questoes = preprocess_text(avaliacao["questoes"], nlp)
labelsManuais = preprocess_text(avaliacao["labels"], nlp)

## Top2Vec paraphrase-multilingual-MiniLM-L12-v2

In [None]:
!pip install top2vec

In [None]:
!pip install top2vec[sentence_transformers]

In [None]:
from top2vec import Top2Vec

# Keyword arguments forwarded to Top2Vec as `umap_args`.
# random_state is pinned, presumably for reproducible runs (confirm).
umapArgs = {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0, "metric": 'cosine', "random_state": 100}

# Keyword arguments forwarded to Top2Vec as `hdbscan_args`.
# NOTE(review): this name would shadow the `hdbscan` package if it were ever
# imported in this notebook.
hdbscan = {"min_cluster_size": 10, "min_samples" : 10, "metric":'euclidean', "prediction_data": True}

# Train Top2Vec on the preprocessed corpus using the multilingual MiniLM
# sentence encoder as the embedding model.
topic2Vec_multilingual = Top2Vec(documents = textos,
                                 embedding_model='paraphrase-multilingual-MiniLM-L12-v2',
                                 min_count=25,
                                 verbose = True,
                                 umap_args = umapArgs,
                                 hdbscan_args=hdbscan)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): earlier cells already read and write under /content/drive, so
# this cell presumably has to be executed before them (confirm run order).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the trained multilingual model to Drive ("noLematize" variant).
topic2Vec_multilingual.save("/content/drive/MyDrive/ModelosNLP/topic2Vec_multilingual_biologia_noLematize")

In [None]:
from top2vec import Top2Vec
# Load the model this notebook saves (the "_noLematize" file). The previous
# path dropped that suffix, so the evaluation below would have scored a
# different model than the one trained above.
topic2Vec_multilingual = Top2Vec.load("/content/drive/MyDrive/ModelosNLP/topic2Vec_multilingual_biologia_noLematize")

In [None]:
def preverTopicos(questoes, topic_model, num_palavras=6):
    """Describe each question by the leading words of its first returned topic.

    Args:
        questoes: Iterable of query texts.
        topic_model: Object exposing ``query_topics(texto, num_topics=...)``
            whose first return value is a sequence of per-topic word
            sequences.
        num_palavras: How many leading topic words to keep. Defaults to 6,
            the value that used to be hard-coded.

    Returns:
        A list with one string per question. Each kept word is preceded by a
        single space, so a non-empty result starts with a space (the format
        produced by the original implementation).
    """
    outputs_modelo = []
    for questao in questoes:
        topics_words, _word_scores, _topic_scores, _topic_nums = topic_model.query_topics(
            questao, num_topics=5
        )

        # Only the first returned topic is used (presumably the closest match
        # to the query; confirm against the Top2Vec documentation).
        # Slicing instead of indexing avoids an IndexError when the topic has
        # fewer than `num_palavras` words.
        palavras = topics_words[0][:num_palavras]
        outputs_modelo.append("".join(f" {palavra}" for palavra in palavras))
    return outputs_modelo

In [None]:
# Predict a topic description for every evaluation question.
outputTop2Vec = preverTopicos(questoes, topic2Vec_multilingual)

# Score each prediction against its manually assigned label.
similaridadeTop2VecMultilingual = similaridades(outputTop2Vec, labelsManuais)

# Save predictions, labels and scores to the analysis CSV for this model.
salvarAnalise(outputTop2Vec, labelsManuais, similaridadeTop2VecMultilingual, "Top2VecMultilingual")

2023-07-02 17:15:30,052 - top2vec - INFO - Downloading paraphrase-multilingual-MiniLM-L12-v2 model
INFO:top2vec:Downloading paraphrase-multilingual-MiniLM-L12-v2 model


### Resultado

O Top2Vec (paraphrase-multilingual-MiniLM-L12-v2) se mostrou menos acurado que o BERTopic, obtendo uma média de 0.635 pontos (média da similaridade de cosseno).

## Top2Vec com Doc2Vec

In [None]:
from top2vec import Top2Vec

# Same UMAP / HDBSCAN settings as the multilingual experiment above, so only
# the embedding model differs between the two runs.
umapArgs = {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0, "metric": 'cosine', "random_state": 100}

hdbscan = {"min_cluster_size": 10, "min_samples" : 10, "metric":'euclidean', "prediction_data": True}

# NOTE(review): despite the "word2vec" variable name, the embedding model
# requested here is 'doc2vec'.
topic2Vec_word2vec = Top2Vec(documents = textos,
                                 embedding_model='doc2vec',
                                 min_count=25,
                                 verbose = True,
                                 umap_args = umapArgs,
                                 hdbscan_args=hdbscan)

In [None]:
# Persist the trained doc2vec-based model to Drive ("noLematize" variant).
topic2Vec_word2vec.save("/content/drive/MyDrive/ModelosNLP/topic2Vec_word2vec_noLematize")

In [None]:
from top2vec import Top2Vec
# Load the model this notebook saves (the "_noLematize" file). The previous
# path dropped that suffix, so the evaluation below would have scored a
# different model than the one trained above.
topic2Vec_word2vec = Top2Vec.load("/content/drive/MyDrive/ModelosNLP/topic2Vec_word2vec_noLematize")

In [None]:
# Predict a topic description for every evaluation question with the
# doc2vec-based model.
outputTop2VecDoc2Vec = preverTopicos(questoes, topic2Vec_word2vec)

# Score each prediction against its manually assigned label.
similaridadeTop2VecWord2Vec = similaridades(outputTop2VecDoc2Vec, labelsManuais)

# Save predictions, labels and scores to the analysis CSV for this model.
salvarAnalise(outputTop2VecDoc2Vec, labelsManuais, similaridadeTop2VecWord2Vec, "Top2Vec_Word2Vec")

### Resultado
Utilizando o embedding Doc2Vec, o desempenho caiu, obtendo 0.487 de pontuação, muito abaixo do experimento anterior.

## Escolha de embedding

Comparando os resultados do modelo Top2Vec com os dois embeddings, observa-se que o sentence encoder do BERTopic apresenta uma pontuação maior que o Doc2Vec (0.635 contra 0.487). Desta forma, a partir daqui será utilizado o embedding paraphrase-multilingual-MiniLM-L12-v2 com o Top2Vec.