In [1]:
import numpy as np
import pandas as pd
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigsh
from concurrent.futures import ThreadPoolExecutor

In [2]:
def preprocess_and_tokenize(text):
    # Tokenize
    tokens = word_tokenize(text)

    # Lowercase
    tokens = [token.lower() for token in tokens]

    # Remove punctuation and non-alphabetic characters
    tokens = [token for token in tokens if token.isalpha()]

    return tokens

In [3]:
def tokenize_dataframe(df, text_column):
    df['tokens'] = df[text_column].apply(preprocess_and_tokenize)
    return df

In [4]:
def create_corpus_and_dictionary(df, text_column):
    documents = df[text_column].tolist()
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
    return corpus, dictionary

In [5]:
def compute_cosine_similarity(doc1, doc2, dictionary):
    vec1 = np.zeros(len(dictionary))
    vec2 = np.zeros(len(dictionary))

    for idx, freq in doc1:
        vec1[idx] = freq

    for idx, freq in doc2:
        vec2[idx] = freq

    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [6]:
def compute_row_similarity(i, corpus, dictionary, similarity_threshold):
    num_docs = len(corpus)
    row = np.zeros(num_docs)

    for j in range(i, num_docs):
        if i == j:
            row[j] = 1
        else:
            similarity = compute_cosine_similarity(corpus[i], corpus[j], dictionary)
            if similarity >= similarity_threshold:
                row[j] = -similarity

    row[i] = -row.sum()
    return row

In [7]:
def compute_laplacian_matrix(corpus, dictionary, similarity_threshold=0.9):
    num_docs = len(corpus)
    laplacian = lil_matrix((num_docs, num_docs))

    with ThreadPoolExecutor() as executor:
        rows = list(executor.map(lambda i: compute_row_similarity(i, corpus, dictionary, similarity_threshold), range(num_docs)))

    laplacian = np.array(rows)
    laplacian_sparse = csr_matrix(laplacian)

    return laplacian_sparse

In [8]:
def laplacian_dmm(df, text_column, num_topics, similarity_threshold=0.9):
    corpus, dictionary = create_corpus_and_dictionary(df, text_column)
    laplacian = compute_laplacian_matrix(corpus, dictionary, similarity_threshold)
    laplacian_eigenvalues, laplacian_eigenvectors = eigsh(laplacian, k=num_topics, which='SM')

    eigenvectors_to_use = laplacian_eigenvectors[:, :num_topics]
    document_topic_distribution = eigenvectors_to_use / eigenvectors_to_use.sum(axis=1)[:, None]

    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

    return lda_model, document_topic_distribution

In [9]:
# Leitura dos dados do dataframe
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Elimina um valor flutuante que aparece no dataframe (por razões misteriosas)
# o algoritmo não aceita o valor flutuante, que precisa ser filtrado
df = df[df['texto'].apply(lambda x: isinstance(x, str))]
df['texto'].apply(type).value_counts()

df

Unnamed: 0,texto
0,coronavirus aceitar braco abrir
1,achar eleitor bolsonaro medo coronavirus febre...
2,fome coronavirus entrar fila
3,trancar replies twetr sala coronavirus
4,caso coronavirus confirmar Brasil mundo querer...
...,...
99996,preciso comando vermelho decretar toque recolh...
99997,urgente reporter confirmar segundo morte coron...
99998,informativo elaborar equipe viano azevedo advo...
99999,vivo Paulo confirmar primeiro morte covid19 Br...


In [10]:
df = tokenize_dataframe(df, 'texto')
df

Unnamed: 0,texto,tokens
0,coronavirus aceitar braco abrir,"[coronavirus, aceitar, braco, abrir]"
1,achar eleitor bolsonaro medo coronavirus febre...,"[achar, eleitor, bolsonaro, medo, coronavirus,..."
2,fome coronavirus entrar fila,"[fome, coronavirus, entrar, fila]"
3,trancar replies twetr sala coronavirus,"[trancar, replies, twetr, sala, coronavirus]"
4,caso coronavirus confirmar Brasil mundo querer...,"[caso, coronavirus, confirmar, brasil, mundo, ..."
...,...,...
99996,preciso comando vermelho decretar toque recolh...,"[preciso, comando, vermelho, decretar, toque, ..."
99997,urgente reporter confirmar segundo morte coron...,"[urgente, reporter, confirmar, segundo, morte,..."
99998,informativo elaborar equipe viano azevedo advo...,"[informativo, elaborar, equipe, viano, azevedo..."
99999,vivo Paulo confirmar primeiro morte covid19 Br...,"[vivo, paulo, confirmar, primeiro, morte, bras..."


In [None]:
num_topics = 10
lda_model, document_topic_distribution = laplacian_dmm(df, 'tokens', num_topics)

# Print the topic-word distribution
for topic_id in range(num_topics):
    print(f"Topic {topic_id}:")
    print(lda_model.show_topic(topic_id))

# Print the document-topic distribution
for doc_id, doc_topic_dist in enumerate(document_topic_distribution):
    print(f"Document {doc_id}: Topic distribution: {doc_topic_dist}")

  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
