# Carregamento dos dados e ferramentas de análise

## Recuperando dados para o treinamento e para a análise

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and model
# paths used throughout this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import string

def lerDataSet(caminho="/content/drive/MyDrive/ModelosNLP/Dados/train_dataset.json"):
    """Load the training dataset and split it into three parallel tuples.

    The JSON file must contain a list of records; each record is unpacked
    positionally into (text, title, keywords).

    Args:
        caminho: Path of the JSON file to read. Defaults to the training
            dataset on the mounted Drive.

    Returns:
        A tuple ``(textos, titulos, keywords)`` made of three tuples. When
        the file holds an empty list, three empty tuples are returned.

    Raises:
        ValueError: If the records do not transpose into exactly three
            columns.
    """
    with open(caminho, "r", encoding='utf-8') as file:
        docs = json.load(file)

    # zip(*[]) yields nothing, so the three-way unpacking below would fail
    # on an empty dataset; return empty columns instead.
    if not docs:
        return ((), (), ())

    textos, titulos, keywords = zip(*docs)

    return (textos, titulos, keywords)

In [None]:
def preprocess_text(textos, nlp_model):
    """Lower-case and tokenise each text, dropping unwanted tokens.

    A token is discarded when the pipeline flags it as a stop word,
    punctuation or a digit, or when its text contains any character from
    ``string.punctuation``. The surviving tokens are joined with single
    spaces.

    Args:
        textos: Iterable of strings.
        nlp_model: Callable that takes a string and returns an iterable of
            tokens exposing ``is_stop``, ``is_punct`` and ``is_digit``.

    Returns:
        A list with one processed string per input text.
    """
    simbolos = set(string.punctuation)

    def _manter(token):
        # Reject on the pipeline's own flags first, then on any embedded
        # punctuation character (e.g. hyphenated words).
        if token.is_stop or token.is_punct or token.is_digit:
            return False
        return simbolos.isdisjoint(str(token))

    resultado = []
    for texto in textos:
        palavras = [str(token) for token in nlp_model(texto.lower()) if _manter(token)]
        resultado.append(' '.join(palavras))
    return resultado

In [None]:
!pip install spacy
!python3 -m spacy download pt_core_news_lg

In [None]:
import spacy

# Load the spaCy pipeline "pt_core_news_lg"; preprocess_text relies on its
# token flags (is_stop / is_punct / is_digit).
nlp = spacy.load("pt_core_news_lg")

# Read the raw dataset and preprocess only the texts (first column); titles
# and keywords are kept exactly as loaded.
dataSet = lerDataSet()
textos = preprocess_text(dataSet[0], nlp)
titulos = dataSet[1]
keywords = dataSet[2]

In [None]:
def salvarTextosProcessados(textos: list, caminho: str = "/content/drive/MyDrive/ModelosNLP/Dados/processedNoLematize_train_dataset.json"):
  """Write the preprocessed texts to disk as a JSON array.

  Args:
    textos: JSON-serialisable list of preprocessed texts.
    caminho: Destination file. Defaults to the path this notebook has
      always written to on the mounted Drive.
  """
  # Explicit UTF-8 matches the encoding used when the raw dataset is read and
  # keeps the file independent of the platform's default encoding.
  with open(caminho, "w", encoding="utf-8") as file:
    json.dump(textos, file)

def abrirTextos(caminho: str = "/content/drive/MyDrive/ModelosNLP/Dados/processed_train_dataset.json"):
  """Load previously preprocessed texts from a JSON file.

  NOTE(review): the default file ("processed_train_dataset.json") is not the
  one salvarTextosProcessados writes by default
  ("processedNoLematize_train_dataset.json"). Pass `caminho` explicitly to
  reload that file; confirm which of the two is intended here.

  Args:
    caminho: File to read. Defaults to the path this notebook has always
      read from on the mounted Drive.

  Returns:
    The decoded JSON content of the file.
  """
  with open(caminho, "r", encoding="utf-8") as file:
    return json.load(file)

In [None]:
# Persist the preprocessed texts (presumably so the spaCy pass does not have
# to be repeated in later sessions).
salvarTextosProcessados(textos)

In [None]:
# Rebinds `textos` to whatever abrirTextos() reads from disk. NOTE(review):
# that default file name differs from the one salvarTextosProcessados writes,
# so this may not be the data produced by the cells above; confirm.
textos = abrirTextos()

In [None]:
# Load the evaluation set: a JSON object read below through its "questoes"
# and "labels" entries.
with open("/content/drive/MyDrive/ModelosNLP/Dados/dicionarioAvalicao2.json", "r") as file:
  avaliacao = json.load(file)


# spaCy is imported and the pipeline loaded again here, repeating an earlier
# cell (presumably so this cell can run when that one was skipped; confirm).
import spacy

nlp = spacy.load("pt_core_news_lg")

# Questions and labels go through the same preprocessing as the training texts.
questoes = preprocess_text(avaliacao["questoes"], nlp)
labelsManuais = preprocess_text(avaliacao["labels"], nlp)

## Análise de similaridade

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by the similarity helpers, which read it as
# the notebook-global `model`.
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
import pandas as pd

def similaridade(topicoModelado, topicoCorreto):
  """Return the cosine similarity between the embeddings of two inputs.

  Both arguments are encoded with the notebook-global `model`; the result is
  entry [0][0] of `util.cos_sim`, converted to a Python float.
  """
  vetorObtido = model.encode(topicoModelado)
  vetorEsperado = model.encode(topicoCorreto)
  matrizCosseno = util.cos_sim(vetorObtido, vetorEsperado)
  primeiraEntrada = matrizCosseno[0][0]
  return float(primeiraEntrada.item())

def similaridades(outputModelo, labels):
  """Compute `similaridade` for each (model output, label) pair.

  Pairs are formed with zip, so extra items in the longer sequence are
  ignored. Returns a list of floats in input order.
  """
  return [
      similaridade(obtido, esperado)
      for obtido, esperado in zip(outputModelo, labels)
  ]

def salvarAnalise(outputModelo, labels, similaridadeCosseno, nome,
                  diretorio='/content/drive/MyDrive/ModelosNLP/Analises'):
    """Save the per-question evaluation results to a CSV file.

    Args:
        outputModelo: Topic strings produced by the model, one per question.
        labels: Expected topic strings, aligned with `outputModelo`.
        similaridadeCosseno: Similarity score for each pair.
        nome: Suffix appended to the file name, which becomes
            ``resultadosAnalise2{nome}.csv``.
        diretorio: Directory that receives the CSV. Defaults to the analysis
            folder on the mounted Drive, the location previously hard-coded.
    """
    df = pd.DataFrame({
        'topicos obtidos': outputModelo,
        'topicos esperados': labels,
        'similaridade': similaridadeCosseno
    })

    # The overall mean goes into an extra column that is filled on the first
    # row only; every other row is left empty in that column.
    df.at[0, 'media_similaridade'] = df['similaridade'].mean()

    df.to_csv(f'{diretorio}/resultadosAnalise2{nome}.csv', index=False)

# BERTopic

In [None]:
%%capture
!pip install bertopic

In [None]:
# UMAP model handed to BERTopic as `umap_model`; random_state is fixed.
from umap import UMAP
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# HDBSCAN model handed to BERTopic as `hdbscan_model`.
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples = 10, metric='euclidean', prediction_data=True)

# c-TF-IDF model handed to BERTopic as `ctfidf_model`.
from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Vectorizer model: counts unigrams and bigrams.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Paraphrase-multilingual sentence-embedding model used by BERTopic.
embedding_multilingual = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# NOTE(review): a custom `hdbscan_model` is supplied, so `min_topic_size=25`
# may have no effect (cluster size would then come from that model's
# min_cluster_size). Confirm against the BERTopic documentation.
BERTopic_unsupervised_multilingual = BERTopic(embedding_model = embedding_multilingual,
                              hdbscan_model = hdbscan_model,
                              umap_model = umap_model,
                              verbose=True,
                              min_topic_size=25,
                              top_n_words=5,
                              calculate_probabilities = True,
                              ctfidf_model = ctfidf_model,
                              vectorizer_model = vectorizer_model)

# Fit on the preprocessed texts and keep both values returned by fit_transform.
bertopics_multilingual, prob_multilingual = BERTopic_unsupervised_multilingual.fit_transform(textos)

Batches:   0%|          | 0/2635 [00:00<?, ?it/s]

2023-07-02 18:46:15,437 - BERTopic - Transformed documents to Embeddings
2023-07-02 18:48:43,796 - BERTopic - Reduced dimensionality


In [None]:
# Save the fitted model with BERTopic's default serialization (the "_pickle"
# suffix in the path suggests pickle; confirm for the installed version).
BERTopic_unsupervised_multilingual.save("/content/drive/MyDrive/ModelosNLP/berttopic_unsupervised_multilingual_pickle")

In [None]:
# Second copy of the fitted model in safetensors format, including the
# c-TF-IDF representation.
# `save_embedding_model` expects a string pointer to the embedding model (or a
# bool), not a loaded SentenceTransformer object, so the model's hub name is
# passed to make sure the reference is stored with the saved model.
BERTopic_unsupervised_multilingual.save("/content/drive/MyDrive/ModelosNLP/bertopic_unsupervised_multilingual_safetensors",
                             serialization="safetensors",
                             save_ctfidf=True,
                             save_embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
from bertopic import BERTopic

# Reload the model from the "_pickle" path it was saved to earlier.
# NOTE(review): if this file is a pickle, load it only from a trusted
# location, because unpickling can execute arbitrary code.
BERTopic_unsupervised_multilingual = BERTopic.load("/content/drive/MyDrive/ModelosNLP/berttopic_unsupervised_multilingual_pickle")

In [None]:
def preverTopicosBert(questoes, topic_model):
  """Return, for each question, the words of its best-matching topic.

  For every question the model is asked for its 5 most similar topics; only
  the first one is used. The words of that topic are concatenated with a
  space before each word, so every result starts with a space.

  Args:
    questoes: Iterable of question strings.
    topic_model: Object exposing `find_topics(text, top_n=...)` and
      `get_topic(topic_id)`, the latter returning (word, score) pairs.

  Returns:
    A list with one topic string per question.
  """
  def _palavrasDoTopico(questao):
    candidatos, _pontuacoes = topic_model.find_topics(questao, top_n=5)
    palavras, _pesos = zip(*topic_model.get_topic(candidatos[0]))
    return "".join(" " + palavra for palavra in palavras)

  return [_palavrasDoTopico(questao) for questao in questoes]

In [None]:
# Predict a topic string for every evaluation question with the BERTopic model.
outputBertMultilingual = preverTopicosBert(questoes, BERTopic_unsupervised_multilingual)

# Cosine similarity between each predicted topic string and its label.
similiaridadeMultilingual = similaridades(outputBertMultilingual, labelsManuais)

# Write the per-question results and their mean similarity to a CSV file.
salvarAnalise(outputBertMultilingual, labelsManuais, similiaridadeMultilingual, "BERTopic_unsupervised_multilingual")

# Top2Vec

In [None]:
!pip install top2vec

Installing collected packages: top2vec
Successfully installed top2vec-1.0.29


In [None]:
from top2vec import Top2Vec

# UMAP and HDBSCAN settings carry the same values as the models configured
# for BERTopic earlier in this notebook.
umapArgs = {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0, "metric": 'cosine', "random_state": 100}

hdbscan = {"min_cluster_size": 10, "min_samples" : 10, "metric":'euclidean', "prediction_data": True}

# NOTE(review): in Top2Vec, `min_count` is presumably a minimum word frequency
# rather than a minimum topic size. Confirm it is meant to mirror the
# min_topic_size=25 passed to BERTopic.
topic2Vec_multilingual = Top2Vec(documents = textos,
                                 embedding_model='paraphrase-multilingual-MiniLM-L12-v2',
                                 min_count=25,
                                 verbose = True,
                                 umap_args = umapArgs,
                                 hdbscan_args=hdbscan)

In [None]:
# Persist the trained Top2Vec model to Drive.
topic2Vec_multilingual.save("/content/drive/MyDrive/ModelosNLP/Modelos/topic2Vec_multilingual_Final_noLematize")

In [None]:
from top2vec import Top2Vec
# Reload the Top2Vec model from the same path it is saved to in this notebook.
topic2Vec_multilingual = Top2Vec.load("/content/drive/MyDrive/ModelosNLP/Modelos/topic2Vec_multilingual_Final_noLematize")

In [None]:
def preverTopicos2vec(questoes, topic_model, num_palavras=6):
  """Return, for each question, the leading words of its best-matching topic.

  For every question the model is queried for 5 topics; only the first one is
  used. Its first `num_palavras` words are concatenated with a space before
  each word, so every non-empty result starts with a space.

  Args:
    questoes: Iterable of question strings.
    topic_model: Object exposing `query_topics(text, num_topics=...)` that
      returns (topics_words, word_scores, topic_scores, topic_nums).
    num_palavras: How many words of the topic to keep. Defaults to 6, the
      value that was previously hard-coded.

  Returns:
    A list with one topic string per question.
  """
  outputs_modelo = []
  for questao in questoes:
    topics_words, _word_scores, _topic_scores, _topic_nums = topic_model.query_topics(questao, num_topics=5)

    # Slicing, unlike indexing 0..num_palavras-1, does not raise IndexError
    # when the topic has fewer words than requested.
    palavras = topics_words[0][:num_palavras]

    outputs_modelo.append("".join(" " + palavra for palavra in palavras))
  return outputs_modelo

In [None]:
# Predict a topic string for every evaluation question with the Top2Vec model.
outputTop2Vec = preverTopicos2vec(questoes, topic2Vec_multilingual)

# Cosine similarity between each predicted topic string and its label.
similaridadeTop2VecMultilingual = similaridades(outputTop2Vec, labelsManuais)

# Write the per-question results and their mean similarity to a CSV file.
salvarAnalise(outputTop2Vec, labelsManuais, similaridadeTop2VecMultilingual, "Top2VecMultilingual")

In [None]:
import json

# Load the full question set. NOTE: this rebinds `questoes`, which earlier in
# the notebook holds the preprocessed evaluation questions; the items loaded
# here are later accessed through their "enunciado" and "topicos" keys.
with open("/content/todasQuestoes2.json", "r") as file:
  questoes = json.load(file)

In [None]:
# Attach the first six words of the best-matching Top2Vec topic to every
# question. NOTE: `questoes` is mutated in place, so re-running this cell
# appends the same words again.
for questao in questoes:
  topics_words, word_scores, topic_scores, topic_nums = topic2Vec_multilingual.query_topics(questao["enunciado"], num_topics=5)
  topico = topics_words[0]
  # Slice instead of indexing 0..5 so that a topic with fewer than six words
  # cannot raise IndexError after some words were already appended.
  questao["topicos"].extend(topico[:6])

In [None]:
# Write the questions, including their "topicos" lists, to a new JSON file.
with open("/content/todasQuestoesComTopicos.json", "w") as file:
  x = json.dumps(questoes)
  file.write(x)