In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from concurrent.futures import ThreadPoolExecutor

In [2]:
class DMM:
    """Dirichlet Multinomial Mixture topic model (one topic per document).

    The model is fitted with a collapsed Gibbs sampler: every document is
    repeatedly removed from its topic, the posterior over topics given all
    other assignments is computed, and a new topic is drawn from it.

    Parameters
    ----------
    n_topics : int
        Number of topics (clusters).
    n_iter : int
        Number of full Gibbs sweeps over the corpus.
    alpha : float
        Dirichlet prior on the topic proportions.
    beta : float
        Dirichlet prior on the per-topic word distributions.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. When ``None`` (the
        default) the global ``numpy.random`` state is used, so callers that
        seed NumPy globally keep their behaviour.

    Attributes set by ``fit``
    -------------------------
    vocab : ndarray of words, indexed by column of the count matrix.
    doc_topics : ndarray (n_docs,), current topic of every document.
    doc_counts_by_topic : ndarray (n_topics,), documents per topic.
    word_counts_by_topic : ndarray (n_topics, n_words), word counts per topic.
    word_totals_by_topic : ndarray (n_topics,), total tokens per topic.
    conditional_probs : ndarray (n_docs, n_topics), the posterior each
        document was last sampled from.
    """

    def __init__(self, n_topics, n_iter, alpha, beta, random_state=None):
        self.n_topics = n_topics
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta
        self.random_state = random_state
        # Keep None (rather than the numpy.random module) so the instance
        # stays picklable; the global generator is resolved lazily.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    def _get_rng(self):
        """Return the private generator, or the global ``numpy.random`` module."""
        rng = getattr(self, "_rng", None)
        return np.random if rng is None else rng

    @staticmethod
    def _doc_terms(X, d):
        """Return ``(word_indices, counts)`` for the non-zero terms of document ``d``.

        ``X`` may be a SciPy sparse matrix or a dense 2-D array of term counts.
        """
        # Fast path: slice the CSR buffers directly instead of densifying a
        # vocabulary-sized row. Requires canonical format (no duplicate
        # column indices) so that fancy-indexed updates are exact.
        if getattr(X, "format", None) == "csr" and X.has_canonical_format:
            lo, hi = X.indptr[d], X.indptr[d + 1]
            return X.indices[lo:hi], X.data[lo:hi]

        row = X[d]
        if hasattr(row, "toarray"):
            row = row.toarray()
        counts = np.asarray(row).ravel()
        idx = np.flatnonzero(counts)
        return idx, counts[idx]

    def fit(self, X, vocab=None, batch_size=1000, n_jobs=4):
        """Fit the model.

        Parameters
        ----------
        X : sparse matrix / 2-D array of term counts, or iterable of str
            Document-term count matrix. If ``vocab`` is ``None``, ``X`` is
            instead treated as raw text documents and vectorized with
            ``CountVectorizer``.
        vocab : array-like of str, optional
            Word for every column of ``X``.
        batch_size : int, default 1000
            Documents are visited in consecutive chunks of this size.
        n_jobs : int, default 4
            Accepted for backward compatibility and ignored: every draw of the
            collapsed Gibbs sampler depends on the counts left by the previous
            one, and updating the shared count arrays from several threads
            would corrupt them.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if vocab is None:
            # Raw-text entry point: build the count matrix ourselves.
            vectorizer = CountVectorizer()
            X = vectorizer.fit_transform(X)
            # get_feature_names() was removed in scikit-learn 1.2.
            if hasattr(vectorizer, "get_feature_names_out"):
                vocab = vectorizer.get_feature_names_out()
            else:
                vocab = vectorizer.get_feature_names()

        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if hasattr(X, "tocsr"):
            X = X.tocsr()

        # get_topics() indexes the vocabulary with an index array.
        self.vocab = np.asarray(vocab)
        self._initialize(X)

        n_docs = X.shape[0]
        for i in range(self.n_iter):
            print(f"Iteration {i + 1}/{self.n_iter}")
            for batch_start in range(0, n_docs, batch_size):
                batch_end = min(batch_start + batch_size, n_docs)
                self._iteration_batch(X, range(batch_start, batch_end))

    def _initialize(self, X):
        """Assign every document to a random topic and build the count tables."""
        n_docs, n_words = X.shape

        # Random initial topic for each document.
        self.doc_topics = self._get_rng().randint(0, self.n_topics, size=n_docs)

        # Documents per topic and word counts per topic.
        self.doc_counts_by_topic = np.bincount(
            self.doc_topics, minlength=self.n_topics
        ).astype(float)
        self.word_counts_by_topic = np.zeros((self.n_topics, n_words))
        for d in range(n_docs):
            idx, cnt = self._doc_terms(X, d)
            self.word_counts_by_topic[self.doc_topics[d], idx] += cnt

        # Total tokens per topic, kept in sync so that the sampler does not
        # have to re-sum a (n_topics, n_words) matrix for every document.
        self.word_totals_by_topic = self.word_counts_by_topic.sum(axis=1)

        # Posterior each document was last sampled from.
        self.conditional_probs = np.zeros((n_docs, self.n_topics))

    def _sample_document(self, X, d):
        """Resample the topic of document ``d`` from its collapsed posterior."""
        idx, cnt = self._doc_terms(X, d)
        doc_len = cnt.sum()

        # Remove the document from its current topic.
        current_topic = self.doc_topics[d]
        self.doc_counts_by_topic[current_topic] -= 1
        self.word_counts_by_topic[current_topic, idx] -= cnt
        self.word_totals_by_topic[current_topic] -= doc_len

        # log p(z_d = k | rest) up to a constant:
        #   log(m_k + alpha)
        #   + sum_w sum_{j < N_d^w} log(n_k^w + beta + j)
        #   - sum_{i < N_d}         log(n_k + V * beta + i)
        log_post = np.log(self.alpha + self.doc_counts_by_topic)

        topic_word_counts = self.word_counts_by_topic[:, idx]
        max_count = int(cnt.max()) if cnt.size else 0
        for j in range(max_count):
            repeated = cnt > j  # words occurring more than j times in the document
            log_post += np.log(topic_word_counts[:, repeated] + self.beta + j).sum(axis=1)

        vocab_size = self.word_counts_by_topic.shape[1]
        offsets = np.arange(int(doc_len))
        log_post -= np.log(
            self.word_totals_by_topic[:, np.newaxis] + vocab_size * self.beta + offsets
        ).sum(axis=1)

        # Softmax with the maximum subtracted for numerical stability.
        probs = np.exp(log_post - np.max(log_post))
        probs /= probs.sum()
        self.conditional_probs[d] = probs

        # Draw the new topic and add the document back.
        new_topic = self._get_rng().choice(self.n_topics, p=probs)
        self.doc_topics[d] = new_topic
        self.doc_counts_by_topic[new_topic] += 1
        self.word_counts_by_topic[new_topic, idx] += cnt
        self.word_totals_by_topic[new_topic] += doc_len

    def _iteration(self, X):
        """Run one full Gibbs sweep over every document of ``X``."""
        self._iteration_batch(X, range(X.shape[0]))

    def _iteration_batch(self, X, batch_indices):
        """Resample, in order, the documents whose indices are in ``batch_indices``."""
        for d in batch_indices:
            self._sample_document(X, d)

    def get_topics(self, n_words):
        """Return, for every topic, its ``n_words`` most probable words.

        Returns a list with one array of words per topic, ordered from the
        most to the least probable word under the smoothed topic-word
        distribution.
        """
        vocab_size = self.word_counts_by_topic.shape[1]
        totals = self.word_counts_by_topic.sum(axis=1, keepdims=True)

        # Smoothed word distribution of every topic.
        word_distribution_by_topic = (self.word_counts_by_topic + self.beta) / (
            totals + self.beta * vocab_size
        )

        topics = []
        for k in range(self.n_topics):
            # Reverse first, then truncate, so that n_words == 0 yields no words.
            top_word_indices = word_distribution_by_topic[k].argsort()[::-1][:n_words]
            topics.append(self.vocab[top_word_indices])

        return topics

In [None]:
# Load the tweets; names=['texto'] labels the column and makes pandas treat
# every row of the file as data.
df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
# Alternative input file:
#df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Some rows come back as float instead of str (presumably NaN produced by
# empty lines -- TODO confirm). The vectorizer only accepts strings, so keep
# only the rows whose text really is a str.
df = df[df['texto'].map(lambda text: isinstance(text, str))]

# Fail early, with a clear message, if the filter removed every row.
assert not df.empty, "no text rows left after filtering out non-string values"

df

In [4]:
# Vectorize the documents into a document-term count matrix.
vectorizer = CountVectorizer(stop_words=None)
X = vectorizer.fit_transform(df['texto'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.array(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())

In [None]:
# DMM draws its random topic assignments from NumPy's global generator, so
# seed it to make the fitted topics reproducible across kernel restarts.
np.random.seed(42)

# Create and fit the DMM model, passing the batch size and worker count.
dmm = DMM(n_topics=20, n_iter=50, alpha=0.1, beta=0.01)
dmm.fit(X, vocab, batch_size=1000, n_jobs=4)

In [6]:
# Collect the 10 highest-ranked words of every topic.
topics = dmm.get_topics(n_words=10)

In [None]:
# Print one line per topic: its index followed by its words.
topic_lines = (f"Topico {i}: {' '.join(topic)}" for i, topic in enumerate(topics))
for line in topic_lines:
    print(line)