# Definindo o embedding do BERTopic

## pré-processamento

In [None]:
%%capture
# Install BERTopic (the %%capture magic above suppresses this cell's output).
!pip install bertopic

# Install spaCy and download the large Portuguese pipeline loaded later as "pt_core_news_lg".
!pip install spacy
!python3 -m spacy download pt_core_news_lg

In [None]:
import json
import string

def lerDataSet(caminho="/content/biologia_train_dataset.json"):
    """Load the training dataset and split it into parallel columns.

    The JSON file is expected to hold a list of 3-item records; each record
    is unpacked positionally as ``(texto, titulo, keywords)``.

    Args:
        caminho: Path to the JSON file. Defaults to the location used in
            the original Colab session.

    Returns:
        A tuple ``(textos, titulos, keywords)`` of three equally sized
        tuples. An empty dataset yields three empty tuples.

    Raises:
        ValueError: If transposing the records does not produce exactly
            three columns.
    """
    with open(caminho, "r", encoding='utf-8') as file:
        docs = json.load(file)

    # zip(*[]) yields nothing, so the 3-way unpacking below would fail
    # on an empty dataset; return empty columns instead.
    if not docs:
        return ((), (), ())

    textos, titulos, keywords = zip(*docs)

    return (textos, titulos, keywords)

In [None]:
def preprocess_text(textos, nlp_model):
    """Lower-case and tokenise each text, keeping only content tokens.

    Args:
        textos: Iterable of strings to clean.
        nlp_model: Callable that receives a string and returns an iterable
            of tokens exposing ``is_stop``, ``is_punct`` and ``is_digit``.

    Returns:
        A list with one string per input text: the surviving tokens joined
        by single spaces. A token is dropped when it is a stop word,
        punctuation, a digit, or contains any ASCII punctuation character.
    """
    pontuacao = set(string.punctuation)
    textos_processados = []

    for texto in textos:
        mantidos = []
        for token in nlp_model(texto.lower()):
            if token.is_stop or token.is_punct or token.is_digit:
                continue
            palavra = str(token)
            # Reject tokens that carry punctuation inside them (e.g. "co-fator").
            if not pontuacao.isdisjoint(palavra):
                continue
            mantidos.append(palavra)
        textos_processados.append(' '.join(mantidos))

    return textos_processados

In [None]:
import spacy

# Load the large Portuguese spaCy pipeline passed to preprocess_text.
nlp = spacy.load("pt_core_news_lg")

# lerDataSet() returns (textos, titulos, keywords), in that order, so each
# column is taken from its own index (titles are index 1, keywords index 2).
dataSet = lerDataSet()
textos = preprocess_text(dataSet[0], nlp)
titulos = dataSet[1]
keywords = dataSet[2]

In [None]:
def abrirTextos(caminho="/content/drive/MyDrive/Textos/textosBiologia.json"):
    """Load the texts stored as JSON and return the decoded content.

    Args:
        caminho: Path to the JSON file. Defaults to the Drive location used
            in the original Colab session.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    # Explicit UTF-8 keeps accented Portuguese text intact regardless of the
    # platform default encoding (same convention as lerDataSet).
    with open(caminho, "r", encoding="utf-8") as file:
        return json.load(file)

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Rebind `textos` to the texts stored on Drive; this replaces the value
# produced earlier by preprocess_text on the training dataset.
textos = abrirTextos()

## Funções de avaliação

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by similaridade() to score obtained vs. expected topics.
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

In [None]:
import pandas as pd

def similaridade(topicoModelado, topicoCorreto):
    """Return the cosine similarity between two texts as a Python float.

    Both arguments are encoded with the module-level ``model`` and compared
    with ``util.cos_sim``; the first entry of the resulting matrix is
    returned.
    """
    vetor_obtido = model.encode(topicoModelado)
    vetor_esperado = model.encode(topicoCorreto)

    matriz = util.cos_sim(vetor_obtido, vetor_esperado)
    primeiro = matriz[0][0]

    return float(primeiro.item())

def similaridades(outputModelo, labels):
    """Score each (obtained, expected) pair with similaridade().

    The two sequences are walked in lockstep with ``zip``, so the result has
    as many entries as the shorter of the two.
    """
    return [
        similaridade(obtido, esperado)
        for obtido, esperado in zip(outputModelo, labels)
    ]

def salvarAnalise(outputModelo, labels, similaridadeCosseno, nome,
                  pasta='/content/drive/MyDrive/ModelosNLP/Analises'):
    """Write the evaluation table of one model to a CSV file.

    The file is named ``resultadosAnaliseNoLematize{nome}.csv`` and contains
    one row per evaluated item plus a ``media_similaridade`` column whose
    first row holds the mean similarity.

    Args:
        outputModelo: Topic strings obtained from the model.
        labels: Expected topic strings, aligned with ``outputModelo``.
        similaridadeCosseno: Similarity score of each pair.
        nome: Suffix appended to the file name to identify the model.
        pasta: Directory that receives the CSV. Defaults to the Drive folder
            used in the original Colab session.
    """
    df = pd.DataFrame({
        'topicos obtidos': outputModelo,
        'topicos esperados': labels,
        'similaridade': similaridadeCosseno
    })

    # Store the overall mean once, in row 0 of a column that is otherwise empty.
    df.at[0, 'media_similaridade'] = df['similaridade'].mean()

    df.to_csv(f'{pasta}/resultadosAnaliseNoLematize{nome}.csv', index=False)

In [None]:
# Load the evaluation file; explicit UTF-8 keeps accented Portuguese text
# intact regardless of the platform default (same convention as lerDataSet).
with open("/content/avaliacaoBio.json", "r", encoding="utf-8") as file:
    avaliacao = json.load(file)

import spacy

nlp = spacy.load("pt_core_news_lg")

# Questions and expected labels go through the same cleaning function.
questoes = preprocess_text(avaliacao["questoes"], nlp)
labelsManuais = preprocess_text(avaliacao["labels"], nlp)

## Parâmetros comuns

In [None]:
# UMAP: dimensionality-reduction model handed to both BERTopic instances below.
from umap import UMAP
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)  # fixed seed

# HDBSCAN: clustering model handed to both BERTopic instances below.
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples = 10, metric='euclidean', prediction_data=True)

# c-TF-IDF: topic-representation model, with BM25 weighting and frequent-word reduction enabled.
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Vectorizer model: counts unigrams and bigrams.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

## Modelo BERTopic com embedding ricardo-filho/bert-base-portuguese-cased-nli-assin-2

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Portuguese BERT sentence-embedding checkpoint (bert-base-portuguese-cased, NLI / ASSIN-2)
bert_base_portuguese_cased = SentenceTransformer("ricardo-filho/bert-base-portuguese-cased-nli-assin-2")

# Topic model combining this embedding with the shared UMAP / HDBSCAN /
# c-TF-IDF / vectorizer objects defined in the common-parameters cell.
# NOTE(review): an explicit hdbscan_model is passed, so min_topic_size may
# have no effect here — confirm against the BERTopic documentation.
topic_unsupervised_bbpc = BERTopic(embedding_model = bert_base_portuguese_cased,
                              hdbscan_model = hdbscan_model,
                              umap_model = umap_model,
                              verbose=True,
                              min_topic_size=25,
                              top_n_words=5,
                              calculate_probabilities = True,
                              ctfidf_model = ctfidf_model,
                              vectorizer_model = vectorizer_model)

# Fit the model on `textos`; keeps the two values returned by fit_transform.
topics_bbpc, prob_bbpc = topic_unsupervised_bbpc.fit_transform(textos)

In [None]:
# Mount Google Drive at /content/drive; the model files below are saved under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the fitted model to Drive; no serialization argument is given, so BERTopic's default format is used.
topic_unsupervised_bbpc.save("/content/drive/MyDrive/ModelosNLP/topic_unsupervised_biologia_ricardo_filho")

In [None]:
# Safetensors export of the same model. With this serialization,
# save_embedding_model is expected to be the Hugging Face model name (a
# string pointer), not the loaded SentenceTransformer object, so the
# checkpoint id used to build the embedding model is passed instead.
topic_unsupervised_bbpc.save("/content/drive/MyDrive/ModelosNLP/topic_unsupervised_biologia_ricardo_filho_spacylarge",
                             serialization="safetensors",
                             save_ctfidf=True,
                             save_embedding_model="ricardo-filho/bert-base-portuguese-cased-nli-assin-2")

### Resultados do modelo

In [None]:
from bertopic import BERTopic

# Reload the model from the path written by the first (default-format) save call.
topic_unsupervised_bbpc = BERTopic.load("/content/drive/MyDrive/ModelosNLP/topic_unsupervised_biologia_ricardo_filho")

In [None]:
def preverTopicos(questoes, topic_model):
    """Describe the best-matching topic of each question as a word string.

    For every question the model's ``find_topics`` is queried (``top_n=5``)
    and only the first returned topic is used. The words of that topic, as
    returned by ``get_topic``, are concatenated with a space *before* each
    word, so every output string starts with a space.

    Args:
        questoes: Iterable of question strings.
        topic_model: Object providing ``find_topics`` and ``get_topic``.

    Returns:
        A list with one topic-word string per question.
    """
    outputs_modelo = []

    for questao in questoes:
        candidatos, _pontuacoes = topic_model.find_topics(questao, top_n=5)
        melhor = candidatos[0]

        # get_topic yields (word, weight) pairs; only the words are kept.
        palavras, _pesos = zip(*topic_model.get_topic(melhor))

        outputs_modelo.append("".join(" " + palavra for palavra in palavras))

    return outputs_modelo

In [None]:
# Topic-word string of the best-matching topic for each evaluation question.
outputModeloRicardo = preverTopicos(questoes, topic_unsupervised_bbpc)

In [None]:
# Similarity between each obtained topic string and its manually written label.
similiaridadeRicardo = similaridades(outputModeloRicardo, labelsManuais)

In [None]:
# Write the per-question results and the mean similarity to a CSV whose name ends in "ModeloRicardo".
salvarAnalise(outputModeloRicardo, labelsManuais, similiaridadeRicardo, "ModeloRicardo")

#ficou com 625MB

## Modelo Bertopic com embedding paraphrase-multilingual-MiniLM-L12-v2

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Multilingual paraphrase sentence-embedding model (MiniLM-L12-v2)
embedding_multilingual = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Same configuration as the ricardo-filho model; only the embedding model differs.
# NOTE(review): umap_model and hdbscan_model are the very same objects given
# to the previous BERTopic instance — confirm that re-fitting them here does
# not affect the earlier in-memory model.
topic_unsupervised_multilingual = BERTopic(embedding_model = embedding_multilingual,
                              hdbscan_model = hdbscan_model,
                              umap_model = umap_model,
                              verbose=True,
                              min_topic_size=25,
                              top_n_words=5,
                              calculate_probabilities = True,
                              ctfidf_model = ctfidf_model,
                              vectorizer_model = vectorizer_model)

# Fit the model on `textos`; keeps the two values returned by fit_transform.
topics_multilingual, prob_multilingual = topic_unsupervised_multilingual.fit_transform(textos)

2023-07-02 17:52:51,544 - BERTopic - Reduced dimensionality
2023-07-02 17:54:54,513 - BERTopic - Clustered reduced embeddings


In [None]:
# Persist the fitted multilingual model to Drive; no serialization argument is given, so BERTopic's default format is used.
topic_unsupervised_multilingual.save("/content/drive/MyDrive/ModelosNLP/topic_unsupervised_multilingual_complete_noLematize")

#Ficou com 599MB (em pytorch ficou com 20MB)

  self._set_arrayXarray(i, j, x)


In [None]:
from bertopic import BERTopic

# NOTE(review): this path ends in "_complete", while the save call earlier in
# this section writes to "..._complete_noLematize" — confirm that loading a
# different file than the one just saved is intentional.
topic_unsupervised_multilingual = BERTopic.load("/content/drive/MyDrive/ModelosNLP/topic_unsupervised_multilingual_complete")

### Resultados do modelo

In [None]:
# Topic-word string of the best-matching topic for each evaluation question.
outputModeloMultilingual = preverTopicos(questoes, topic_unsupervised_multilingual)

# Similarity between each obtained topic string and its manually written label.
similiaridadeMultilingual = similaridades(outputModeloMultilingual, labelsManuais)

# Write the per-question results and the mean similarity to a CSV whose name ends in "ModeloMultilingual".
salvarAnalise(outputModeloMultilingual, labelsManuais, similiaridadeMultilingual, "ModeloMultilingual")

## Resultados

O modelo que utilizou o embedding RicardoFilho obteve pontuação média de 0.547 (média da similaridade de cosseno entre os tópicos encontrados e os tópicos esperados).
Já o modelo que utilizou o embedding Multilingual obteve pontuação média de 0.605. Dessa forma, foi escolhido o embedding "paraphrase-multilingual-MiniLM-L12-v2".