In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Preparação dos dados de entrada
def prepare_input_data(dataframe, column_name):
    """Turn one text column of a dataframe into a bag-of-words count matrix.

    Args:
        dataframe: Object indexable by ``column_name`` that yields an
            iterable of text documents.
        column_name: Name of the column holding the documents.

    Returns:
        A ``(data_matrix, vectorizer)`` tuple: the result of
        ``CountVectorizer.fit_transform`` on the column, and the fitted
        vectorizer itself (needed later to map word indices back to terms).
    """
    texts = dataframe[column_name]
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

In [3]:
# Classe MTM
class MTM:
    """Count-based topic model trained by repeated stochastic topic assignment.

    The model keeps four count tables (created by ``initialize``):

    * ``topic_word_counts``      -- (n_topics, n_words) counts of word per topic
    * ``topic_counts``           -- (n_topics,) row sums of ``topic_word_counts``
    * ``document_topic_counts``  -- (n_docs, n_topics) counts of topic per document
    * ``document_counts``        -- (n_docs,) row sums of ``document_topic_counts``

    Every sweep (``update``) walks over each word that occurs in each document,
    samples how its occurrences split across topics, and adds the sampled
    counts to the tables. Counts accumulate across sweeps.

    Args:
        data_matrix: Document-term count matrix of shape (n_docs, n_words).
            Either a SciPy sparse matrix (anything exposing ``tocsr``) or a
            dense array-like with non-negative integer counts.
        n_topics: Number of topics; must be at least 1.
        n_iters: Number of sweeps performed by ``fit``.
        random_state: Optional seed. When ``None`` (default) the global
            ``np.random`` state is used, so ``np.random.seed`` still controls
            the run; otherwise a private ``np.random.RandomState`` seeded with
            this value is created at every ``initialize`` call.

    Raises:
        ValueError: If ``n_topics`` is smaller than 1.
    """

    def __init__(self, data_matrix, n_topics, n_iters, random_state=None):
        if n_topics < 1:
            raise ValueError("n_topics must be a positive integer")
        self.data_matrix = data_matrix
        self.n_docs, self.n_words = data_matrix.shape
        self.n_topics = n_topics
        self.n_iters = n_iters
        self.random_state = random_state

    def initialize(self):
        """Create the random generator and reset all count tables."""
        if self.random_state is None:
            # Fall back to the module-level generator so existing callers that
            # rely on np.random.seed(...) keep working unchanged.
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(self.random_state)

        # Strictly positive random start so every topic/word ratio is defined.
        self.topic_word_counts = self._rng.randint(1, 10, size=(self.n_topics, self.n_words))
        self.topic_counts = np.sum(self.topic_word_counts, axis=1)
        self.document_topic_counts = np.zeros((self.n_docs, self.n_topics))
        self.document_counts = np.zeros(self.n_docs)

    def _iter_document_words(self):
        """Yield ``(doc, word_indices, word_counts)`` for the positive entries of each row.

        Word indices are produced in ascending order, i.e. the same order a
        dense scan over all columns would visit them.
        """
        matrix = self.data_matrix
        if hasattr(matrix, "tocsr"):
            csr = matrix.tocsr()
            if not csr.has_canonical_format:
                # Work on a copy so the caller's matrix is left untouched.
                csr = csr.copy()
                csr.sum_duplicates()
            for d in range(self.n_docs):
                start, stop = csr.indptr[d], csr.indptr[d + 1]
                cols = csr.indices[start:stop]
                counts = csr.data[start:stop]
                positive = counts > 0  # skip explicitly stored zeros
                yield d, cols[positive], counts[positive]
        else:
            dense = np.asarray(matrix)
            for d in range(self.n_docs):
                row = dense[d]
                cols = np.flatnonzero(row > 0)
                yield d, cols, row[cols]

    def update(self):
        """Run one sweep over every (document, word) pair with a positive count."""
        uniform_doc_prior = np.full(self.n_topics, 1.0 / self.n_topics)
        for d, word_indices, word_counts in self._iter_document_words():
            for i, word_count in zip(word_indices, word_counts):
                word_given_topic = self.topic_word_counts[:, i] / self.topic_counts
                if self.document_counts[d] > 0:
                    topic_given_doc = self.document_topic_counts[d] / self.document_counts[d]
                else:
                    # Nothing assigned to this document yet: use a flat prior
                    # instead of dividing 0 by 0, so the word likelihood still
                    # drives the very first draw.
                    topic_given_doc = uniform_doc_prior

                topic_prob = word_given_topic * topic_given_doc
                # Keep every topic reachable, then renormalise to a distribution.
                topic_prob = np.clip(topic_prob, 1e-10, 1 - 1e-10)
                topic_prob /= np.sum(topic_prob)

                new_topic_count = self._rng.multinomial(word_count, topic_prob)
                self.document_topic_counts[d] += new_topic_count
                self.document_counts[d] += word_count
                self.topic_word_counts[:, i] += new_topic_count
                self.topic_counts += new_topic_count

    def fit(self):
        """Reset the model and run ``n_iters`` sweeps."""
        self.initialize()
        for _ in range(self.n_iters):
            self.update()

    def get_topics(self, vectorizer, n_top_words=10):
        """Return the highest-count words of every topic.

        Args:
            vectorizer: Object exposing ``get_feature_names_out()`` whose
                result maps word index to term.
            n_top_words: How many words to report per topic.

        Returns:
            List of ``(topic_index, words)`` tuples, words ordered from the
            highest to the lowest count.
        """
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic_idx, topic in enumerate(self.topic_word_counts):
            ranked = topic.argsort()[::-1][:n_top_words]
            topics.append((topic_idx, [feature_names[i] for i in ranked]))
        return topics

In [4]:
# Função para treinar o modelo e extrair os tópicos
def train_and_extract_topics(dataframe, column_name, n_topics, n_iters=100, n_top_words=10):
    """Vectorize a text column, fit an ``MTM`` on it and return its topics.

    Args:
        dataframe: Data holding the documents.
        column_name: Column of ``dataframe`` containing the texts.
        n_topics: Number of topics passed to ``MTM``.
        n_iters: Number of training sweeps passed to ``MTM``.
        n_top_words: Number of words reported per topic.

    Returns:
        The value of ``MTM.get_topics``: a list of ``(topic_index, words)``
        tuples.
    """
    counts, fitted_vectorizer = prepare_input_data(dataframe, column_name)
    model = MTM(counts, n_topics, n_iters)
    model.fit()
    return model.get_topics(fitted_vectorizer, n_top_words)

In [None]:
# Load the tweet texts into a single-column dataframe.
# Alternative input (final processed dataset):
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Keep only rows whose text is an actual string. A float value shows up in the
# column -- presumably NaN produced by read_csv for an empty/NA-like field
# (TODO confirm) -- and the vectorizer cannot process it.
df = df[df['texto'].apply(lambda x: isinstance(x, str))]

# Show the remaining value types. A bare expression in the middle of a cell is
# never rendered, so the check has to be printed explicitly.
print(df['texto'].apply(type).value_counts())

df

In [None]:
n_topics = 10
n_iters = 50
n_top_words = 10
random_seed = 42

# Training samples from NumPy's global random generator; seed it so that a
# fresh "Restart & Run All" reproduces the same topics.
np.random.seed(random_seed)

topics = train_and_extract_topics(df, 'texto', n_topics, n_iters, n_top_words)

In [None]:
# Print each discovered topic: a header line followed by its top words.
for topic_idx, top_words in topics:
    print(f"Topic {topic_idx}:", ", ".join(top_words), sep="\n")