In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def initialize_parameters(data, n_topics, n_terms, n_meta_features, random_state=None):
    """Draw random starting values for the three parameter matrices.

    Every row of each returned matrix is one sample from a symmetric
    Dirichlet distribution with all concentration parameters equal to 1,
    so each row is non-negative and sums to 1.

    Parameters
    ----------
    data : object with a ``shape`` attribute
        Only ``data.shape[0]`` (the number of rows) is used.
    n_topics : int
        Number of columns of ``theta`` and number of rows of ``beta``/``gamma``.
    n_terms : int
        Number of columns of ``beta``.
    n_meta_features : int
        Number of columns of ``gamma``.
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (the default) draws from NumPy's global random state, so
        existing callers and ``np.random.seed`` behave exactly as before.
        Any other value is handed to ``np.random.default_rng`` to obtain a
        dedicated, reproducible generator.

    Returns
    -------
    theta : ndarray of shape (data.shape[0], n_topics)
    beta : ndarray of shape (n_topics, n_terms)
    gamma : ndarray of shape (n_topics, n_meta_features)
    """
    # The legacy ``np.random`` module and ``Generator`` both expose
    # ``dirichlet(alpha, size)``, so a single code path serves both cases.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Draw order (theta, beta, gamma) is kept so seeded legacy runs are unchanged.
    theta = rng.dirichlet(alpha=np.ones(n_topics), size=data.shape[0])
    beta = rng.dirichlet(alpha=np.ones(n_terms), size=n_topics)
    gamma = rng.dirichlet(alpha=np.ones(n_meta_features), size=n_topics)
    return theta, beta, gamma

In [3]:
def _log_power_product(exponents, probs):
    """Return ``log(prod_j probs[k, j] ** exponents[d, j])`` for every (d, k).

    The result has shape (exponents.shape[0], probs.shape[0]). Zero entries
    of ``probs`` follow the convention of ``np.power``: ``0 ** 0 == 1``
    contributes nothing, while ``0 ** positive`` makes the whole product
    zero (``-inf`` in log space). Exponents are assumed to be non-negative.
    """
    exponents = np.asarray(exponents, dtype=float)
    probs = np.asarray(probs, dtype=float)

    is_zero = probs == 0
    with np.errstate(divide="ignore", invalid="ignore"):
        # Zeros are swapped for 1 (log 1 == 0) so that 0 * log(0) never
        # produces NaN; their effect is applied explicitly below.
        log_probs = np.log(np.where(is_zero, 1.0, probs))

    result = exponents @ log_probs.T

    # A (d, k) pair is impossible when some positive exponent meets a zero probability.
    hits_zero = ((exponents > 0).astype(float) @ is_zero.astype(float).T) > 0
    result[hits_zero] = -np.inf
    return result


def E_step(data, theta, beta, gamma, meta_info):
    """Compute the normalised per-row topic weights ``phi``.

    For row ``d`` and topic ``k`` the unnormalised weight is

        theta[d, k] * prod_w beta[k, w] ** data[d, w]
                    * prod_m gamma[k, m] ** meta_info[d, m]

    and each row of ``phi`` is divided by its sum. The computation is carried
    out in log space and shifted by the row maximum before exponentiating:
    the direct product underflows to 0 once a row has many terms, which made
    the normalisation 0 / 0 and filled ``phi`` with NaN.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_terms)
        Used as exponents of ``beta``; assumed non-negative.
    theta : array-like, shape (n_rows, n_topics)
    beta : array-like, shape (n_topics, n_terms)
    gamma : array-like, shape (n_topics, n_meta_features)
    meta_info : array-like, shape (n_rows, n_meta_features)
        Used as exponents of ``gamma``; assumed non-negative.

    Returns
    -------
    phi : ndarray of shape (n_rows, n_topics)
        Rows sum to 1. A row whose weight is exactly zero for every topic
        (previously a NaN row) falls back to a uniform distribution.
    """
    theta = np.asarray(theta, dtype=float)
    with np.errstate(divide="ignore"):
        log_theta = np.log(theta)  # log(0) == -inf is the intended value here

    log_phi = (
        log_theta
        + _log_power_product(data, beta)
        + _log_power_product(meta_info, gamma)
    )

    # Subtracting the row maximum leaves the normalised result unchanged but
    # guarantees that the largest entry of every row exponentiates to 1.
    row_max = np.max(log_phi, axis=1, keepdims=True, initial=-np.inf)
    no_support = row_max == -np.inf
    weights = np.exp(log_phi - np.where(no_support, 0.0, row_max))

    # Rows with zero weight everywhere cannot be normalised; use uniform weights.
    weights[no_support[:, 0]] = 1.0

    return weights / weights.sum(axis=1, keepdims=True)

In [4]:
def _normalise_rows_into(target, weights):
    """Overwrite rows of ``target`` with the rows of ``weights`` scaled to sum to 1.

    Rows of ``weights`` whose sum is exactly zero cannot be normalised; the
    matching rows of ``target`` are left untouched instead of becoming NaN.
    """
    totals = weights.sum(axis=1, keepdims=True)
    usable = totals[:, 0] != 0
    target[usable] = weights[usable] / totals[usable]


def M_step(data, phi, theta, beta, gamma, meta_info):
    """Re-estimate ``theta``, ``beta`` and ``gamma`` from the weights ``phi``.

    * ``theta[d, :] = (phi[d, :] + 1) / (sum(phi[d, :]) + n_topics)``
    * ``beta[k, w]`` is ``sum_d data[d, w] * phi[d, k]`` normalised over ``w``
    * ``gamma[k, m]`` is ``sum_d meta_info[d, m] * phi[d, k]`` normalised over ``m``

    The normalisers used to be ``np.sum(data * phi)`` and
    ``np.sum(meta_info * phi)``, which multiply a (rows x terms) array by a
    (rows x topics) array. That fails to broadcast unless the two widths
    happen to match, and is not a per-topic total when they do. Each topic
    row is now divided by its own sum, so rows of ``beta`` and ``gamma`` stay
    distributions like the Dirichlet-initialised ones.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_terms)
    phi : ndarray, shape (n_rows, n_topics)
    theta : ndarray, shape (n_rows, n_topics)
    beta : ndarray, shape (n_topics, n_terms)
    gamma : ndarray, shape (n_topics, n_meta_features)
    meta_info : array-like, shape (n_rows, n_meta_features)

    Returns
    -------
    (theta, beta, gamma)
        The same array objects that were passed in; they are updated in
        place, as before.
    """
    data = np.asarray(data, dtype=float)
    meta_info = np.asarray(meta_info, dtype=float)
    n_topics = beta.shape[0]

    # Add-one smoothed row normalisation of phi.
    theta[...] = (phi + 1) / (phi.sum(axis=1, keepdims=True) + n_topics)

    # phi.T @ X gives, for every topic k, sum_d phi[d, k] * X[d, :].
    _normalise_rows_into(beta, phi.T @ data)
    _normalise_rows_into(gamma, phi.T @ meta_info)

    return theta, beta, gamma

In [5]:
def MIGA(data, meta_info, n_topics, max_iter=100, tol=1e-6):
    """Fit ``theta``, ``beta`` and ``gamma`` by alternating E and M steps.

    Parameters are initialised with ``initialize_parameters`` and then refined
    by calling ``E_step`` followed by ``M_step``. The loop ends after
    ``max_iter`` rounds, or earlier as soon as the mean absolute change of
    ``theta`` between two consecutive rounds drops below ``tol``.

    Parameters
    ----------
    data : array with at least two dimensions
        ``data.shape[1]`` is taken as the number of terms.
    meta_info : array with at least two dimensions
        ``meta_info.shape[1]`` is taken as the number of meta features.
    n_topics : int
    max_iter : int, default 100
    tol : float, default 1e-6

    Returns
    -------
    (theta, beta, gamma) as produced by the last ``M_step`` call, or by the
    initialisation when ``max_iter`` is 0.
    """
    vocabulary_size = data.shape[1]
    meta_width = meta_info.shape[1]
    theta, beta, gamma = initialize_parameters(data, n_topics, vocabulary_size, meta_width)

    for _ in range(max_iter):
        # Snapshot first: a copy stays valid even if M_step writes into theta itself.
        previous_theta = theta.copy()

        responsibilities = E_step(data, theta, beta, gamma, meta_info)
        theta, beta, gamma = M_step(data, responsibilities, theta, beta, gamma, meta_info)

        mean_shift = np.mean(np.abs(theta - previous_theta))
        if mean_shift < tol:
            break

    return theta, beta, gamma

In [6]:
def display_topics(beta, vocab, n_top_words=10):
    """Print one line per row of ``beta`` naming its highest-valued columns.

    Parameters
    ----------
    beta : iterable of 1-D arrays
        Each row must provide ``argsort()``.
    vocab : indexable
        Looked up as ``vocab[column_index]`` to obtain the word to print.
    n_top_words : int, default 10
        How many of the largest entries of each row are listed, in
        descending order of value.

    Output lines have the form ``Topic #<1-based row number>: w1, w2, ...``.
    """
    for row_number, weights in enumerate(beta, start=1):
        ascending = weights.argsort()
        # Walk backwards from the largest entry, taking n_top_words indices.
        chosen = ascending[:-n_top_words - 1:-1]

        words = []
        for column in chosen:
            words.append(vocab[column])

        listing = ', '.join(words)
        print(f"Topic #{row_number}: {listing}")

In [None]:
# Load the tweet texts into a dataframe with a single column named 'texto'.
# Alternative input (kept for reference):
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Keep only the rows whose 'texto' value is a str. The author observed a float
# value in this column that the algorithm does not accept, so it is filtered out
# (presumably a NaN coming from a missing value -- TODO confirm).
df = df.loc[df['texto'].apply(lambda value: isinstance(value, str))]
# Count of the Python types remaining in the column. This is not the cell's
# last expression, so under default notebook settings its result is not shown.
df['texto'].apply(type).value_counts()

df

In [None]:
# Convert the inputs to NumPy matrices.
# MIGA reads data.shape[1] as the number of terms and E_step uses data[d, :]
# as per-term exponents, so the raw tweet strings cannot be passed directly:
# they are first turned into a dense document-term count matrix. The dense
# copy can be large for a big corpus.
data_np = CountVectorizer().fit_transform(df['texto']).toarray()
# NOTE(review): `meta_info` is not defined in any cell visible here; it has to
# be created before this cell runs -- confirm where it comes from.
meta_info_np = meta_info.to_numpy()

In [None]:
# Run the MIGA algorithm.
# The parameters are initialised from NumPy's global random state, so the
# seed is fixed first; otherwise every run of this cell yields different topics.
SEED = 42
np.random.seed(SEED)

n_topics = 10
theta, beta, gamma = MIGA(data_np, meta_info_np, n_topics)

In [None]:
# Build the document-term count matrix from the filtered tweet texts.
# `df['texto']` is used because no variable named `data` exists at notebook level.
vectorizer = CountVectorizer()
data_transformed = vectorizer.fit_transform(df['texto'])

# vocabulary_ maps each term to its column index; invert it so that a column
# index can be used to look up the term.
vocab = {index: term for term, index in vectorizer.vocabulary_.items()}

In [None]:
# Show every topic together with its top-ranked words.
n_top_words = 10
display_topics(beta, vocab, n_top_words=n_top_words)