In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from concurrent.futures import ThreadPoolExecutor

In [2]:
class PLSA:
    """Probabilistic Latent Semantic Analysis trained with EM (asymmetric form).

    The model factorises the conditional term distribution of every document as

        P(w | d) = sum_z P(w | z) * P(z | d)

    Fitted attributes:
        prob_term_topic: array (n_terms, n_topics); column ``k`` is P(w | z=k)
            and sums to 1 over terms.
        prob_doc_topic: array (n_docs, n_topics); row ``j`` is P(z | d=j) and
            sums to 1 over topics.
        prob_topic: array (n_topics,); corpus-level share of token mass assigned
            to each topic. It is reported for inspection only and is NOT a factor
            of the model: multiplying P(z | d) by P(z) mixes the symmetric and
            asymmetric parameterisations and breaks the EM guarantees.
        log_likelihoods_: list with the log-likelihood after each EM iteration
            (set by ``fit``).

    Input orientation: every method expects a *term-document* matrix of shape
    (n_terms, n_docs). scikit-learn's ``CountVectorizer.fit_transform`` returns
    the transpose (documents x terms), so transpose its output before fitting.

    Limitation: the E-step materialises a dense (n_terms, n_docs, n_topics)
    tensor, so memory grows with the product of the three sizes.

    Args:
        n_topics: number of latent topics.
        n_iterations: maximum number of EM iterations.
        tol: stop when the absolute log-likelihood change drops below this value.
        n_workers: accepted for backward compatibility; unused because the
            E-step is vectorised instead of thread-based.
        random_state: optional seed for reproducible initialisation. When None,
            NumPy's global random state is used (so ``np.random.seed`` still works).
    """

    def __init__(self, n_topics, n_iterations=100, tol=1e-4, n_workers=None, random_state=None):
        self.n_topics = n_topics
        self.n_iterations = n_iterations
        self.tol = tol
        self.n_workers = n_workers
        self.random_state = random_state

    @staticmethod
    def _as_dense(term_document_matrix):
        """Return the input (SciPy sparse or array-like) as a 2-D float ndarray."""
        if sp.issparse(term_document_matrix):
            dense = np.asarray(term_document_matrix.toarray(), dtype=float)
        else:
            dense = np.asarray(term_document_matrix, dtype=float)
        if dense.ndim != 2:
            raise ValueError("term_document_matrix must be 2-dimensional (n_terms, n_docs)")
        return dense

    @staticmethod
    def _normalize(values, axis):
        """Scale ``values`` to sum to 1 along ``axis``.

        Slices whose total is zero (e.g. an empty document) become uniform
        instead of turning into NaN through a 0/0 division.
        """
        totals = values.sum(axis=axis, keepdims=True)
        has_mass = totals > 0
        safe_totals = np.where(has_mass, totals, 1.0)
        return np.where(has_mass, values / safe_totals, 1.0 / values.shape[axis])

    def initialize(self, term_document_matrix):
        """Randomly initialise all parameters for a (n_terms, n_docs) matrix."""
        n_terms, n_docs = term_document_matrix.shape

        if self.random_state is None:
            rng = np.random  # legacy global state, same source the class always used
        else:
            rng = np.random.RandomState(self.random_state)

        # Draw order (term, doc, topic) is kept so globally seeded runs are stable.
        self.prob_term_topic = self._normalize(rng.rand(n_terms, self.n_topics), axis=0)
        self.prob_doc_topic = self._normalize(rng.rand(n_docs, self.n_topics), axis=1)
        self.prob_topic = self._normalize(rng.rand(self.n_topics), axis=0)

    def e_step(self, term_document_matrix):
        """Compute the topic posterior P(z | w, d) for every (term, document) pair.

        Args:
            term_document_matrix: kept for interface compatibility; the posterior
                depends only on the current parameters.

        Returns:
            ndarray of shape (n_terms, n_docs, n_topics) that sums to 1 over the
            last axis.
        """
        joint = self.prob_term_topic[:, np.newaxis, :] * self.prob_doc_topic[np.newaxis, :, :]
        return self._normalize(joint, axis=2)

    def m_step(self, term_document_matrix, prob_term_doc_topic):
        """Re-estimate the parameters from count-weighted posteriors.

        Args:
            term_document_matrix: counts, shape (n_terms, n_docs), dense or sparse.
            prob_term_doc_topic: posterior returned by ``e_step``.
        """
        counts = self._as_dense(term_document_matrix)
        # Expected number of tokens of term w in document d generated by topic z.
        expected = counts[:, :, np.newaxis] * np.asarray(prob_term_doc_topic)

        self.prob_term_topic = self._normalize(expected.sum(axis=1), axis=0)

        doc_topic_mass = expected.sum(axis=0)
        self.prob_doc_topic = self._normalize(doc_topic_mass, axis=1)
        self.prob_topic = self._normalize(doc_topic_mass.sum(axis=0), axis=0)

    def log_likelihood(self, term_document_matrix):
        """Return sum_{w,d} n(w, d) * log P(w | d) under the current parameters."""
        counts = self._as_dense(term_document_matrix)
        mixture = self.prob_term_topic @ self.prob_doc_topic.T
        # Only observed pairs contribute; skipping zero counts avoids 0 * log(0) = NaN.
        observed = counts > 0
        return float(np.sum(counts[observed] * np.log(mixture[observed])))

    def fit(self, term_document_matrix):
        """Run EM until convergence or ``n_iterations`` and return ``self``.

        Args:
            term_document_matrix: counts of shape (n_terms, n_docs); a SciPy
                sparse matrix or any dense array-like.
        """
        # Densify once up front; every step below works on the same ndarray.
        counts = self._as_dense(term_document_matrix)
        self.initialize(counts)

        self.log_likelihoods_ = []
        prev_ll = -np.inf
        for _ in range(self.n_iterations):
            posterior = self.e_step(counts)
            self.m_step(counts, posterior)

            ll = self.log_likelihood(counts)
            self.log_likelihoods_.append(ll)
            if np.abs(ll - prev_ll) < self.tol:
                break

            prev_ll = ll

        return self

In [None]:
# Load the tweet texts; the file is read without a header row, so the single
# column is named explicitly.
# Alternative (compressed) dataset:
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# A float value shows up in the column (presumably an empty field parsed as
# NaN - TODO confirm). The algorithm only accepts strings, so keep only the
# rows whose value is a str.
df = df[df['texto'].apply(lambda x: isinstance(x, str))]

# Sanity check: only `str` should remain. Printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(df['texto'].apply(type).value_counts())

# Preview the result instead of dumping the whole frame.
df.head()

In [4]:
# Toy corpus: five short Portuguese sentences used to smoke-test the model.
documents = [
    'O gato correu atrás do rato',
    'O rato comeu o queijo',
    'O cachorro latiu para o gato',
    'O gato comeu o peixe',
    'O rato e o gato brincaram juntos'
]

In [5]:
# Build the count matrix with CountVectorizer.
# fit_transform returns a document-term matrix of shape (n_documents, n_terms),
# while PLSA unpacks `.shape` as (n_terms, n_docs). Transpose so that rows are
# terms and columns are documents, matching the variable name and the model.
vectorizer = CountVectorizer()
term_document_matrix = vectorizer.fit_transform(documents).T

In [6]:
# Store the matrix in CSR (Compressed Sparse Row) format.
# NOTE(review): the original comment says this is done "for optimization";
# whether the input is already CSR at this point (making this a no-op
# conversion) depends on the upstream cell - confirm.
term_document_matrix = sp.csr_matrix(term_document_matrix)

In [None]:
# Build the PLSA model with the chosen number of topics and fit it on the
# term-document matrix.
n_topics = 10
plsa = PLSA(n_topics=n_topics)
plsa.fit(term_document_matrix)

In [None]:
# Show the term-topic probabilities: one row of `prob_term_topic` per
# vocabulary term.
term_topic_matrix = plsa.prob_term_topic

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for term_index, term in enumerate(feature_names):
    print(f"{term}: {term_topic_matrix[term_index]}")

In [None]:
# Print each document's row of the document-topic probability matrix.
doc_topic_matrix = plsa.prob_doc_topic
for doc_index in range(len(documents)):
    topic_row = doc_topic_matrix[doc_index]
    print(f"Documento {doc_index}: {topic_row}")