In [None]:
!pip install gensim
!pip install bertopic
!pip install wordcloud
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud

In [6]:
import os

# Export TOKENIZERS_PARALLELISM=false into this kernel's environment.
# NOTE(review): presumably read by the Hugging Face tokenizers library used by
# the embedding backend, to avoid its fork/parallelism warning — confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [7]:
def ler_arquivo(arquivo, encoding=None):
    """Read a text file and return its lines without line terminators.

    Args:
        arquivo: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        list[str]: One entry per line, as split by ``str.splitlines``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even if read() fails.
    with open(arquivo, "r", encoding=encoding) as handle:
        return handle.read().splitlines()

In [8]:
# Input corpus: one document per line of the selected file.
# Alternative inputs kept for reference (presumably per-region subsets — confirm):
#   "./centro-oeste.txt", "./sul.txt", "./nordeste.txt", "./sudeste.txt", "./norte.txt"
ARQUIVO_DOCS = "./dados-filtrados.txt"
docs = ler_arquivo(ARQUIVO_DOCS)

In [None]:
# Coherence sweep: for every candidate topic count, fit BERTopic
# `num_executions` times on `docs`, score each fit with gensim's c_v coherence,
# then plot the mean score per topic count.
num_topics_range = range(2, 20)
num_executions = 10

# topic count -> list of c_v scores, one per execution
coherence_scores_per_topic = {num_topics: [] for num_topics in num_topics_range}

# `_` is deliberately not bound at cell level here: IPython uses `_` for the
# last cell output, so the loop variable and the unused values get real names.
for _run in range(num_executions):
    for num_topics in num_topics_range:
        topic_model = BERTopic(nr_topics=num_topics, language="multilingual")
        topic_model.fit_transform(docs)

        # Tokenise the documents with the model's own preprocessing and
        # vectorizer analyzer, then build the gensim dictionary/corpus from them.
        # NOTE(review): tokens/dictionary/corpus are rebuilt from the same `docs`
        # on every iteration; they look loop-invariant if the default
        # preprocessing/analyzer do not depend on the fitted model — confirm
        # before hoisting them out of the loops.
        cleaned_docs = topic_model._preprocess_text(docs)
        analyzer = topic_model.vectorizer_model.build_analyzer()
        tokens = [analyzer(doc) for doc in cleaned_docs]
        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

        # Topic -1 is excluded (presumably BERTopic's outlier bucket — confirm).
        # Filtering the ids avoids mutating the mapping returned by get_topics().
        topic_ids = [topic_id for topic_id in topic_model.get_topics() if topic_id != -1]
        topic_words = [
            [word for word, _score in topic_model.get_topic(topic_id) if word != ""]
            for topic_id in topic_ids
        ]

        coherence_model = CoherenceModel(topics=topic_words,
                                         texts=tokens,
                                         corpus=corpus,
                                         dictionary=dictionary,
                                         coherence='c_v')
        coherence_scores_per_topic[num_topics].append(coherence_model.get_coherence())

average_coherence_per_topic = {
    num_topics: np.mean(scores) for num_topics, scores in coherence_scores_per_topic.items()
}

topic_counts = list(average_coherence_per_topic)
fig, ax = plt.subplots()
ax.plot(topic_counts, [average_coherence_per_topic[n] for n in topic_counts], marker='o')
ax.set_xticks(topic_counts)  # one integer tick per evaluated topic count
ax.set_xlabel("Número de Tópicos")
ax.set_ylabel("Média de Coerência")
# The title follows num_executions so it cannot drift from the actual run count.
ax.set_title(f"Média de coerência por tópico em {num_executions} execuções")
plt.show()


In [None]:
def descobre_topicos_if(texto, nr_topics):
    """Fit a BERTopic model on ``texto`` and score its topics with c_v coherence.

    Args:
        texto: Iterable of document strings (one tweet per item).
        nr_topics: Value forwarded to ``BERTopic(nr_topics=...)``.

    Returns:
        tuple: ``(topics, topic_tweets, coherence_score)`` where

        * ``topics`` maps each topic id except ``-1`` to the list of
          ``(word, score)`` pairs reported by ``get_topics()``;
        * ``topic_tweets`` maps every topic id assigned by ``fit_transform``
          (including ``-1`` if present) to the documents assigned to it;
        * ``coherence_score`` is the result of
          ``CoherenceModel.get_coherence_per_topic()`` for ``topics``.
    """
    texto = list(texto)  # iterated more than once below

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    topic_model = BERTopic(nr_topics=nr_topics, language="multilingual", top_n_words=30,
                           n_gram_range=(1, 1), ctfidf_model=ctfidf_model)
    # Per-document topic assignment; kept under its own name so it is not
    # confused with the topic -> words mapping built further down.
    doc_topics, _ = topic_model.fit_transform(texto)

    # Tokenise with the model's own preprocessing and vectorizer analyzer, then
    # build the gensim dictionary/corpus that the coherence model requires.
    cleaned_docs = topic_model._preprocess_text(texto)
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Topic -1 is left out of the representation (presumably BERTopic's outlier
    # bucket — confirm). A filtered copy avoids mutating the model's mapping.
    topics = {
        topic_id: pairs
        for topic_id, pairs in topic_model.get_topics().items()
        if topic_id != -1
    }
    topic_words = [
        [word for word, _ in pairs if word != ""] for pairs in topics.values()
    ]

    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence_per_topic()

    # Group the documents by the topic each one was assigned to.
    topic_tweets = {}
    for tweet, topic_id in zip(texto, doc_topics):
        topic_tweets.setdefault(topic_id, []).append(tweet)

    return topics, topic_tweets, coherence_score


def printa_topicos_if(topic_words, num_colunas=2):
    """Draw one word cloud per topic on a grid of subplots and show the figure.

    Args:
        topic_words: Either a mapping ``topic_id -> [(word, score), ...]`` or a
            sequence of such ``(word, score)`` lists (positions are then used as
            topic ids). Empty-string words are ignored.
        num_colunas: Number of subplot columns. The number of rows is derived
            from the number of topics, so any topic count fits on the grid.

    Raises:
        ValueError: If ``num_colunas`` is smaller than 1.
    """
    if num_colunas < 1:
        raise ValueError("num_colunas must be at least 1")

    # Normalise both accepted input shapes to (topic_id, pairs) entries so ids
    # do not have to be contiguous or start at 0.
    if hasattr(topic_words, "items"):
        entradas = list(topic_words.items())
    else:
        entradas = list(enumerate(topic_words))
    if not entradas:
        return  # nothing to draw

    num_linhas = -(-len(entradas) // num_colunas)  # ceiling division
    # 6x4 inches per cell keeps the 12x8 figure for the 2x2 (four-topic) case.
    # squeeze=False always yields a 2-D axes array, even for a single row/column.
    fig, axs = plt.subplots(num_linhas, num_colunas,
                            figsize=(6 * num_colunas, 4 * num_linhas),
                            squeeze=False)
    fig.subplots_adjust(hspace=0.5)

    eixos = list(axs.flat)
    # Hide the grid cells that have no topic to display.
    for ax in eixos[len(entradas):]:
        ax.axis('off')

    for ax, (topic_id, pares) in zip(eixos, entradas):
        words = [word for word, _ in pares if word != ""]
        if words:  # an empty string would be passed to WordCloud.generate otherwise
            wordcloud = WordCloud(width=400, height=200, background_color='white').generate(' '.join(words))
            ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f"Tópico {topic_id+1}")

    plt.show()

In [None]:
# Run topic discovery on the loaded documents (passing nr_topics=4) and plot
# one word cloud per returned topic.
topics, topic_tweets, coherence_score = descobre_topicos_if(texto=docs, nr_topics=4)
printa_topicos_if(topic_words=topics)