In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [2]:
class NQTM:
    """Iterative topic model alternating document-topic and topic-word updates.

    Every iteration (see ``fit_transform``) recomputes the document-topic
    matrix from the current topic-word matrix, re-estimates the topic-word
    matrix from only the ``n_samples`` highest-scoring documents of each
    topic, and finally binarises the topic-word matrix.

    Parameters
    ----------
    n_topics : int
        Number of topics (columns of the document-topic matrix).
    n_samples : int
        Number of top-scoring documents per topic that are kept when the
        topic-word matrix is re-estimated; all other documents are treated
        as negatives and excluded.
    max_iter : int, default 100
        Maximum number of iterations.
    tol : float, default 1e-4
        Iteration stops early once the Frobenius norm of the change in the
        document-topic matrix drops below this value.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness for the initial matrices. ``None`` uses NumPy's
        global generator (so an outer ``np.random.seed(...)`` still applies);
        an int seeds a private generator, making the fit reproducible.

    Attributes
    ----------
    Q : numpy.ndarray of shape (n_topics, n_words), or None before fitting
        Topic-word matrix. After every iteration its entries are 0.0 or 1.0,
        so it marks which words belong to a topic but does not rank them.
    """

    def __init__(self, n_topics, n_samples, max_iter=100, tol=1e-4, random_state=None):
        self.n_topics = n_topics
        self.n_samples = n_samples
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.Q = None

    def _make_rng(self):
        """Return the object whose ``rand`` draws the initial matrices."""
        if self.random_state is None:
            # Global generator: keeps the behaviour callers had before
            # `random_state` existed, including honouring np.random.seed().
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def _negative_sampling(self, P):
        """Return a copy of ``P`` in which each topic's top documents are zeroed.

        For every topic column, the ``n_samples`` rows with the largest
        values are set to 0 and all remaining rows keep their value, so
        ``P - result`` retains only each topic's top ``n_samples`` documents.
        """
        neg_P = np.zeros_like(P)
        for t in range(self.n_topics):
            # Rows ordered by descending score for topic t.
            ranked_rows = np.argsort(-P[:, t])
            negative_rows = ranked_rows[self.n_samples:]
            neg_P[negative_rows, t] = P[negative_rows, t]
        return neg_P

    def _quantization(self, P):
        """Binarise ``P``: 1.0 where the entry is strictly positive, else 0.0."""
        return (P > 0).astype(float)

    def fit_transform(self, X):
        """Fit the model on ``X`` and return the document-topic matrix.

        Parameters
        ----------
        X : array-like of shape (n_docs, n_words)
            Document-term matrix. Only ``X.shape``, ``X @ ...`` and
            ``X.T @ ...`` are used.

        Returns
        -------
        numpy.ndarray of shape (n_docs, n_topics)
            Document-topic matrix from the last iteration, rows L1-normalised
            by ``normalize``. ``self.Q`` holds the fitted topic-word matrix.
        """
        n_docs, n_words = X.shape
        rng = self._make_rng()

        # Random, row-normalised starting points. P is replaced in the first
        # iteration and only serves as the baseline of the first convergence
        # check; it is still drawn first so the random stream is unchanged.
        P = rng.rand(n_docs, self.n_topics)
        P /= P.sum(axis=1, keepdims=True)

        self.Q = rng.rand(self.n_topics, n_words)
        self.Q /= self.Q.sum(axis=1, keepdims=True)

        for _ in range(self.max_iter):
            P_old = P.copy()

            # E-step: score every document against every topic.
            P = X @ self.Q.T
            P = normalize(P, axis=1, norm='l1')

            # M-step: rebuild topic-word weights from each topic's top
            # documents only (P - neg_P zeroes everything else), then binarise.
            neg_P = self._negative_sampling(P)
            self.Q = normalize((X.T @ (P - neg_P)).T, axis=1, norm='l1')
            self.Q = self._quantization(self.Q)

            if np.linalg.norm(P - P_old) < self.tol:
                break

        return P

In [None]:
# Load the tweet texts into a single-column dataframe.
# Alternative input file, kept for reference:
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df_raw = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Keep only rows whose text is an actual string. Float values show up in the
# column (presumably NaN produced by read_csv for empty or NA-like fields --
# TODO confirm) and the downstream vectorizer cannot process them.
is_text = df_raw['texto'].map(lambda value: isinstance(value, str))
df = df_raw[is_text]

# Sanity check. The previous mid-cell `value_counts()` expression was never
# displayed (a cell only shows its last expression), so assert instead.
assert all(isinstance(text, str) for text in df['texto']), "non-string rows remain in 'texto'"

df

In [4]:
# Plain Python list of tweet texts, in dataframe order, for the vectorizer.
documents = list(df["texto"])

In [5]:
# Vectorize the documents into a bag-of-words count matrix.
vectorizer = CountVectorizer(
    stop_words=None,  # no stop-word list is applied
    min_df=2,         # document-frequency lower cut-off
    max_df=0.95,      # document-frequency upper cut-off
)
X = vectorizer.fit_transform(documents)

In [None]:
# Train the NQTM model.
n_topics = 10
n_samples = 10

# NQTM draws its initial matrices from NumPy's global generator, so seed it
# here; otherwise every Restart & Run All produces different topics.
np.random.seed(42)

nqtm = NQTM(n_topics, n_samples)

# Pass the document-term matrix as returned by the vectorizer. fit_transform
# only uses X.shape, X @ ... and X.T @ ..., so the dense copy made by
# X.toarray() is unnecessary and can exhaust memory on a large corpus.
topic_matrix = nqtm.fit_transform(X)

In [None]:
# Show the ten highest-weighted words of every topic.
# NOTE(review): each training iteration ends by binarising nqtm.Q (entries are
# 0.0 or 1.0), so among equal weights the words printed here are picked by
# argsort's tie order rather than by importance.
topic_word_matrix = nqtm.Q
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(topic_word_matrix):
    print(f"Topic #{topic_idx + 1}:")
    # Indices sorted by descending weight, truncated to the first ten.
    top_word_ids = topic.argsort()[::-1][:10]
    print(" ".join(feature_names[i] for i in top_word_ids))