In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import itertools

In [2]:
class NBTMWE:
    """Biterm topic model for short texts, fitted with collapsed Gibbs sampling.

    Every unordered pair of tokens inside a document (a "biterm") is assigned
    one topic; topics are then described by their most frequent words.

    Word2Vec embeddings are trained during ``fit`` and kept on the instance
    (``word2vec``, ``embedding_matrix``), but the sampler in this class does not
    read them.

    Parameters
    ----------
    num_topics : int
        Number of topics to infer.
    alpha : float
        Symmetric Dirichlet prior on the topic proportions.
    beta : float
        Symmetric Dirichlet prior on the per-topic word distributions.
    num_iterations : int
        Number of full Gibbs sweeps over all biterms.
    random_state : int or None, optional
        Seed for a private random generator. When ``None`` (the default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed.
    """

    def __init__(self, num_topics, alpha, beta, num_iterations, random_state=None):
        self.num_topics = num_topics
        self.alpha = alpha
        self.beta = beta
        self.num_iterations = num_iterations
        self.random_state = random_state
        # A private RandomState makes runs reproducible without touching the
        # global NumPy stream; the module itself exposes the same methods.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def preprocess(self, df, column_name):
        """Tokenize every text in ``df[column_name]`` with NLTK's ``word_tokenize``.

        No stopword removal or any other filtering is applied here; the tokens
        are returned exactly as the tokenizer produced them.

        Returns
        -------
        list[list[str]]
            One token list per row, in row order.
        """
        return [word_tokenize(text) for text in df[column_name]]

    def train_embeddings(self, tokenized_documents, size=100, window=5, min_count=1):
        """Train Word2Vec on the tokenized corpus and keep the vector matrix.

        Sets ``self.word2vec`` and ``self.embedding_matrix``.
        """
        shared_kwargs = dict(
            sentences=tokenized_documents,
            window=window,
            min_count=min_count,
            workers=4,
        )
        try:
            # gensim >= 4.0 calls the dimensionality argument `vector_size`.
            self.word2vec = Word2Vec(vector_size=size, **shared_kwargs)
        except TypeError:
            # gensim < 4.0 only knows the older `size` keyword. Any unrelated
            # TypeError will surface again from this second call.
            self.word2vec = Word2Vec(size=size, **shared_kwargs)
        self.embedding_matrix = self.word2vec.wv.vectors

    def build_vocabulary(self, tokenized_documents):
        """Fit a ``CountVectorizer`` on pre-tokenized documents.

        Sets ``self.vectorizer`` and ``self.word_index`` (token -> column index).
        """
        # The identity analyzer tells scikit-learn the input is already tokenized.
        self.vectorizer = CountVectorizer(analyzer=lambda x: x, min_df=1)
        self.vectorizer.fit(tokenized_documents)
        self.word_index = self.vectorizer.vocabulary_

    def initialize_topic_assignments(self):
        """Give every biterm a uniformly random topic and build the count tables.

        Reads ``self.biterms`` and ``self.word_index``. Sets
        ``self.topic_word_count`` (topics x vocabulary), ``self.topic_count``
        (biterms per topic) and ``self.biterm_topic_assignments`` (one list of
        topic ids per document, parallel to ``self.biterms``).
        """
        self.topic_word_count = np.zeros((self.num_topics, len(self.word_index)))
        self.topic_count = np.zeros(self.num_topics)
        self.biterm_topic_assignments = []

        for doc in self.biterms:
            doc_topics = []
            for w1, w2 in doc:
                w1_idx, w2_idx = self.word_index[w1], self.word_index[w2]

                topic = self._rng.randint(self.num_topics)
                doc_topics.append(topic)

                self.topic_word_count[topic, w1_idx] += 1
                self.topic_word_count[topic, w2_idx] += 1
                self.topic_count[topic] += 1
            self.biterm_topic_assignments.append(doc_topics)

    def _topic_posterior(self, w1_idx, w2_idx):
        """Return the normalized conditional P(topic | biterm, all other assignments).

        Implements the biterm topic model update
        ``(n_z + alpha) * (n_w1|z + beta) * (n_w2|z + beta) / (sum_w n_w|z + V*beta)**2``
        using the current count tables, which must already exclude the biterm
        being resampled.
        """
        vocab_size = len(self.word_index)
        topic_weight = self.topic_count + self.alpha
        word_weight = (
            (self.topic_word_count[:, w1_idx] + self.beta)
            * (self.topic_word_count[:, w2_idx] + self.beta)
        )
        # Each biterm adds two word tokens to its topic, so the number of word
        # tokens in a topic is twice its biterm count.
        normaliser = (2.0 * self.topic_count + vocab_size * self.beta) ** 2
        weights = topic_weight * word_weight / normaliser
        return weights / np.sum(weights)

    def gibbs_sampling(self):
        """Run ``num_iterations`` collapsed Gibbs sweeps over every biterm.

        Updates ``self.biterm_topic_assignments``, ``self.topic_word_count`` and
        ``self.topic_count`` in place.
        """
        for _ in range(self.num_iterations):
            for doc_idx, doc in enumerate(self.biterms):
                for biterm_idx, (w1, w2) in enumerate(doc):
                    w1_idx, w2_idx = self.word_index[w1], self.word_index[w2]

                    # Take the biterm out of the counts before resampling it.
                    current_topic = self.biterm_topic_assignments[doc_idx][biterm_idx]
                    self.topic_word_count[current_topic, w1_idx] -= 1
                    self.topic_word_count[current_topic, w2_idx] -= 1
                    self.topic_count[current_topic] -= 1

                    probabilities = self._topic_posterior(w1_idx, w2_idx)
                    new_topic = self._rng.choice(self.num_topics, p=probabilities)

                    # Put the biterm back under its newly sampled topic.
                    self.biterm_topic_assignments[doc_idx][biterm_idx] = new_topic
                    self.topic_word_count[new_topic, w1_idx] += 1
                    self.topic_word_count[new_topic, w2_idx] += 1
                    self.topic_count[new_topic] += 1

    def fit(self, df, column_name):
        """Tokenize ``df[column_name]``, build the vocabulary and biterms, then sample.

        The number of biterms grows quadratically with document length, since
        every pair of token positions in a document becomes one biterm.
        """
        tokenized_documents = self.preprocess(df, column_name)
        self.train_embeddings(tokenized_documents)
        self.build_vocabulary(tokenized_documents)

        # combinations() yields (doc[i], doc[j]) for every i < j, in index order.
        self.biterms = [list(itertools.combinations(doc, 2)) for doc in tokenized_documents]

        self.initialize_topic_assignments()
        self.gibbs_sampling()

    def get_topics(self, num_words=10):
        """Return, for each topic, its ``num_words`` most frequent words.

        Returns
        -------
        list[list[str]]
            One list per topic, ordered from most to least frequent.
        """
        # Invert token -> index once instead of asking the vectorizer for its
        # feature names on every lookup; this also avoids depending on
        # `get_feature_names`, which newer scikit-learn releases no longer have.
        index_to_word = sorted(self.word_index, key=self.word_index.get)

        topics = []
        for topic_idx in range(self.num_topics):
            # Reverse first, then truncate, so that num_words=0 yields an empty
            # list rather than the whole vocabulary.
            ranked = self.topic_word_count[topic_idx].argsort()[::-1][:num_words]
            topics.append([index_to_word[i] for i in ranked])

        return topics

In [None]:
# Load the tweet texts into a single-column dataframe named 'texto'.
# Alternative input (full processed dataset):
#   'datasets/(processado-final)textos_tuitesPt_2020.csv.gz'
df = pd.read_csv(
    'datasets/(processado)textos_tuitesPt_2020_0.csv',
    names=['texto'],
)

# A float value shows up in the column for reasons not yet understood; the
# model cannot handle non-string rows, so keep only the rows that are str.
is_text_row = df['texto'].apply(lambda value: isinstance(value, str))
df = df[is_text_row]

# Tally of value types after filtering (not displayed: it is not the cell's
# last expression).
df['texto'].apply(type).value_counts()

df

In [None]:
# Fit a 10-topic model on the 'texto' column, then collect each topic's top words.
nbtmwe = NBTMWE(
    num_topics=10,
    alpha=1,
    beta=0.01,
    num_iterations=50,
)
nbtmwe.fit(df, column_name='texto')
topics = nbtmwe.get_topics(num_words=10)

In [None]:
# Print one line per topic, numbered from 1, with its words comma-separated.
for topic_number, topic_words in enumerate(topics, start=1):
    print(f"Topic {topic_number}: {', '.join(topic_words)}")