In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize

In [2]:
def generalized_polya_urn(num_colors, initial_counts, iterations, poisson_lambda):
    """Simulate an urn whose drawn color is reinforced by a Poisson number of balls.

    On every iteration one color index is drawn with probability proportional
    to its current ball count, and a ``Poisson(poisson_lambda)`` number of
    extra balls is added to that color.

    Args:
        num_colors: Number of color indices to draw from; handed to
            ``np.random.choice`` together with one probability per entry of
            ``initial_counts``.
        initial_counts: Starting ball count per color (converted to an
            integer array).
        iterations: Number of draws to perform.
        poisson_lambda: Mean of the Poisson draw that decides how many balls
            are added after each draw.

    Returns:
        list: Ball count per color after the last iteration.
    """
    balls = np.array(initial_counts, dtype=int)

    for _step in range(iterations):
        # Draw first, then sample the reinforcement size (same RNG order every step).
        draw_probs = balls / balls.sum()
        picked = np.random.choice(num_colors, p=draw_probs)
        balls[picked] += np.random.poisson(poisson_lambda)

    return balls.tolist()

In [3]:
def _log_or_neg_inf(values):
    """Element-wise natural log that maps non-positive entries to ``-inf``."""
    values = np.asarray(values, dtype=float)
    logs = np.full(values.shape, -np.inf)
    np.log(values, out=logs, where=values > 0)
    return logs


def _topic_probabilities(doc_word_counts, num_topics, alpha, poisson_lambda, epsilon):
    """Turn per-topic counts of one document's tokens into sampling probabilities.

    The unnormalized weight of topic ``k`` is
    ``(sum_w counts[k, w] + alpha) * prod_w (counts[k, w] + poisson_lambda)``
    taken over the document's tokens. Any factor shared by all topics cancels
    during normalization, so only the topic-dependent terms are evaluated.
    """
    # Work in log space: the raw product over tokens overflows float64 (or
    # wraps around for integer inputs) once documents get long or counts large.
    log_weights = (
        _log_or_neg_inf(doc_word_counts.sum(axis=1) + alpha)
        + _log_or_neg_inf(doc_word_counts + poisson_lambda).sum(axis=1)
    )
    peak = log_weights.max()
    if np.isfinite(peak):
        # Rescale so the most likely topic has weight 1, then floor the rest
        # at epsilon so every topic keeps a strictly positive probability.
        weights = np.maximum(np.exp(log_weights - peak), epsilon)
    else:
        # No topic has a usable weight (e.g. all terms are zero): sample uniformly.
        weights = np.ones(num_topics)
    return weights / weights.sum()


def dirichlet_mixture_model(tokenized_docs, num_topics, alpha, poisson_lambda, iterations):
    """Assign one topic per document by repeated resampling of topic labels.

    Every document starts with a uniformly random topic. Then, ``iterations``
    times, each document in turn has its tokens removed from its topic's
    word counts, receives a new topic sampled from weights computed on the
    remaining counts, and has its tokens added to the new topic's counts.

    Args:
        tokenized_docs: Sequence of documents, each a sequence of word tokens.
            Empty documents are allowed.
        num_topics: Number of topics to sample from.
        alpha: Constant added to each topic's summed token count.
        poisson_lambda: Constant added to each per-token count before the
            per-token terms are multiplied together.
        iterations: Number of full passes over all documents.

    Returns:
        tuple: ``(topic_assignments, vocab, word_counts_by_topic)`` where
        ``topic_assignments`` is an integer array with one topic index per
        document, ``vocab`` is the sorted list of distinct tokens, and
        ``word_counts_by_topic`` is an integer array of shape
        ``(num_topics, len(vocab))`` holding token counts per topic.
    """
    num_docs = len(tokenized_docs)
    vocab = sorted({word for doc in tokenized_docs for word in doc})
    word_indices = {word: idx for idx, word in enumerate(vocab)}

    # Token -> vocabulary-index lookup does not change between passes, so it
    # is done once per document. The explicit integer dtype keeps empty
    # documents usable as index arrays.
    doc_word_indices = [
        np.array([word_indices[word] for word in doc], dtype=np.intp)
        for doc in tokenized_docs
    ]

    word_counts_by_topic = np.zeros((num_topics, len(vocab)), dtype=int)
    topic_assignments = np.random.randint(0, num_topics, size=num_docs)
    epsilon = 1e-9  # Relative probability floor applied before normalization

    # Initialize word_counts_by_topic. np.add.at accumulates repeated indices,
    # which a plain fancy-indexed "+=" would not do.
    for topic, word_idxs in zip(topic_assignments, doc_word_indices):
        np.add.at(word_counts_by_topic[topic], word_idxs, 1)

    # Main loop
    for _ in range(iterations):
        for doc_idx, word_idxs in enumerate(doc_word_indices):
            # Remove the current document's tokens from its topic's counts.
            current_row = word_counts_by_topic[topic_assignments[doc_idx]]
            np.subtract.at(current_row, word_idxs, 1)

            # Counts of this document's tokens (repeats included) in every topic.
            doc_word_counts = word_counts_by_topic[:, word_idxs]
            topic_probabilities = _topic_probabilities(
                doc_word_counts, num_topics, alpha, poisson_lambda, epsilon
            )

            # Sample a new topic and put the document's tokens back under it.
            new_topic = np.random.choice(num_topics, p=topic_probabilities)
            topic_assignments[doc_idx] = new_topic
            np.add.at(word_counts_by_topic[new_topic], word_idxs, 1)

    return topic_assignments, vocab, word_counts_by_topic

In [4]:
def preprocess_data(data):
    """Tokenize the ``'texto'`` column and return the tokens as a list of lists.

    Each text is lowercased, split with ``word_tokenize``, and only tokens for
    which ``str.isalnum()`` is true are kept. The per-row token lists are also
    stored on ``data`` itself under the ``'tokens'`` column, so the argument
    is modified.

    Args:
        data: Table-like object with a ``'texto'`` column whose values
            support ``.lower()``.

    Returns:
        list: One list of tokens per row of ``data``.
    """
    def _alnum_tokens(text):
        lowered = text.lower()
        return [token for token in word_tokenize(lowered) if token.isalnum()]

    data['tokens'] = data['texto'].apply(_alnum_tokens)
    return data['tokens'].tolist()

In [5]:
def train_gpu_pdmm(data, num_topics, alpha, poisson_lambda, iterations):
    """Tokenize ``data`` and run the topic model on the resulting documents.

    Args:
        data: Object handed to ``preprocess_data`` to obtain the tokenized
            documents.
        num_topics: Forwarded to ``dirichlet_mixture_model``.
        alpha: Forwarded to ``dirichlet_mixture_model``.
        poisson_lambda: Forwarded to ``dirichlet_mixture_model``.
        iterations: Forwarded to ``dirichlet_mixture_model``.

    Returns:
        The return value of ``dirichlet_mixture_model``, passed through
        unchanged (a ``(topic_assignments, vocab, word_counts_by_topic)``
        tuple, not just the assignments).
    """
    return dirichlet_mixture_model(
        preprocess_data(data), num_topics, alpha, poisson_lambda, iterations
    )

In [6]:
def get_top_words_by_topic(word_counts_by_topic, vocab, num_top_words=10):
    """List the highest-count words of every topic, most frequent first.

    Args:
        word_counts_by_topic: 2-D array with one row of word counts per topic;
            column ``j`` corresponds to ``vocab[j]``.
        vocab: Sequence of words indexed by column position.
        num_top_words: Maximum number of words to return per topic. Zero or a
            negative value yields an empty list for every topic.

    Returns:
        list: One list of words per topic, ordered by descending count and
        holding at most ``num_top_words`` entries.
    """
    # Clamp so that 0 really means "no words": a "[-0:]" slice would select
    # the whole row instead of nothing.
    limit = max(num_top_words, 0)

    top_words_by_topic = []
    for topic_counts in word_counts_by_topic:
        # argsort is ascending, so reverse before taking the first `limit`.
        ranked_indices = np.argsort(topic_counts)[::-1]
        top_words_by_topic.append([vocab[word_idx] for word_idx in ranked_indices[:limit]])
    return top_words_by_topic

In [None]:
# Load the tweet texts into a single-column DataFrame
# (alternative input, currently disabled: the gzip-compressed dataset below)
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# A float value shows up in the DataFrame (for unknown reasons); the
# algorithm cannot handle it, so keep only the rows whose text is a string
df = df[df['texto'].apply(lambda x: isinstance(x, str))]

# Sanity check: only string values may remain after the filter
assert all(isinstance(text, str) for text in df['texto']), "non-string rows remain in df['texto']"

df

In [8]:
# Tokenize the documents
def tokenize(text):
    """Return the word tokens of ``text`` after lowercasing it.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); everything else acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# One token list per row of the 'texto' column, in row order
tokenized_docs = list(map(tokenize, df['texto']))

In [9]:
# Run the GPU-PDMM algorithm
random_seed = 42  # fixed seed so that re-running the notebook reproduces the same topics
num_topics = 10
alpha = 1.0
poisson_lambda = 1.0
iterations = 10

# dirichlet_mixture_model draws from NumPy's global random state, so seed it
# right before the call to make the sampled topics reproducible
np.random.seed(random_seed)
topic_assignments, vocab, word_counts_by_topic = dirichlet_mixture_model(tokenized_docs, num_topics, alpha, poisson_lambda, iterations)

In [10]:
# Collect the most frequent words of each topic
num_top_words = 10
top_words_by_topic = get_top_words_by_topic(
    word_counts_by_topic,
    vocab,
    num_top_words=num_top_words,
)

In [None]:
# Show each topic (numbered from 1) with its most frequent words
for topic_idx, top_words in enumerate(top_words_by_topic):
    topic_line = f"Topic {topic_idx + 1}: {', '.join(top_words)}"
    print(topic_line)