In [1]:
import nltk
import itertools
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from gensim import corpora, models

In [2]:
# Tokenization
def tokenize(text):
    """Split ``text`` into a list of tokens using ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text)

In [3]:
def add_edges_in_chunks(graph, tfidf_matrix, chunk_size=1000, threshold=0.2):
    """Add weighted similarity edges to ``graph``, one block of documents at a time.

    The rows of ``tfidf_matrix`` are split into consecutive chunks of at most
    ``chunk_size`` rows. For every pair of chunks (upper-triangular order, so
    each unordered pair of chunks is visited once) the cosine similarity
    between their rows is computed, and an edge is added for every pair of
    distinct documents whose similarity is strictly greater than ``threshold``.

    Args:
        graph: Object exposing ``add_edge(u, v, weight=...)`` (for example a
            ``networkx.Graph``). It is modified in place.
        tfidf_matrix: 2-D matrix with one row per document; it must support
            row slicing and be accepted by ``cosine_similarity``.
        chunk_size: Maximum number of rows per block.
        threshold: Minimum (exclusive) cosine similarity for an edge.

    Returns:
        None. Edges are keyed by *row position* in ``tfidf_matrix`` (plain
        Python ints) and carry the similarity as a Python float in the
        ``weight`` attribute.
    """
    n_docs = tfidf_matrix.shape[0]
    for row_start in range(0, n_docs, chunk_size):
        row_stop = min(row_start + chunk_size, n_docs)
        # Slice the row block once; it is reused for every column block.
        row_block = tfidf_matrix[row_start:row_stop]

        for col_start in range(row_start, n_docs, chunk_size):
            col_stop = min(col_start + chunk_size, n_docs)

            # Similarities between the current row block and column block
            similarities = np.asarray(
                cosine_similarity(row_block, tfidf_matrix[col_start:col_stop])
            )

            above = similarities > threshold
            if row_start == col_start:
                # Diagonal block: the matrix is a block compared with itself.
                # Keeping only the strict upper triangle drops self-similarity
                # and the mirrored (b, a) duplicate of each (a, b) pair.
                above = np.triu(above, k=1)

            # Only pairs above the threshold reach the Python-level loop.
            rows, cols = np.nonzero(above)
            sources = (rows + row_start).tolist()
            targets = (cols + col_start).tolist()
            weights = similarities[rows, cols].tolist()
            for source, target, weight in zip(sources, targets, weights):
                graph.add_edge(source, target, weight=weight)

In [4]:
# Load the data into the dataframe
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Drop rows whose text is not a string: a float value shows up in the data
# (presumably NaN from an empty field -- TODO confirm) and the tokenizer
# cannot handle it.
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so that index
# labels coincide with row positions; the similarity graph identifies
# documents by their row position in the TF-IDF matrix.
df = df[df['texto'].apply(lambda x: isinstance(x, str))].reset_index(drop=True)

df['tokens'] = df['texto'].apply(tokenize)

# Show a small preview only (a bare expression is displayed only when it is
# the last statement of the cell).
df.head()

In [5]:
# TF-IDF vectorization.
# The notebook's own `tokenize` helper is used so the TF-IDF vocabulary is
# produced by the same tokenizer as df['tokens'].
# token_pattern=None: the default pattern is unused when a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a warning.
# max_df / min_df are given as fractions of the documents: terms present in
# more than 90% or fewer than 10% of them are ignored.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,
    stop_words=None,
    max_df=0.9,
    min_df=0.1,
)
tfidf_matrix = vectorizer.fit_transform(df['texto'])

In [None]:
# Build the document graph: one node per dataframe index label, then one
# weighted edge per pair of documents whose cosine similarity exceeds the
# threshold.
# NOTE(review): add_edges_in_chunks identifies documents by row position in
# tfidf_matrix, so df.index labels are expected to equal row positions.
graph = nx.Graph()
graph.add_nodes_from(df.index)
add_edges_in_chunks(
    graph,
    tfidf_matrix,
    chunk_size=1000,
    threshold=0.2,
)

In [None]:
# Document aggregation: repeatedly take the highest-degree node, merge its
# tokens with those of its neighbours, and remove the whole group.
# NOTE: this empties `graph` in place; rebuild the graph before re-running
# this cell.
aggregated_docs = []
while graph.number_of_nodes() > 0:
    # Find the node with the highest degree among those still in the graph
    max_degree_node = max(graph.nodes, key=graph.degree)

    # Collect all neighbours of that node (materialised before any removal)
    neighbors = list(graph.neighbors(max_degree_node))

    # Merge the documents of the highest-degree node and its neighbours.
    # Tokens are read from `df` (the notebook defines no `data` object);
    # node ids are assumed to be valid df index labels.
    aggregated_doc = []
    for node in neighbors + [max_degree_node]:
        aggregated_doc.extend(df.loc[node, 'tokens'])
        graph.remove_node(node)

    aggregated_docs.append(aggregated_doc)

In [None]:
# Topic modelling with LDA
dictionary = corpora.Dictionary(aggregated_docs)
corpus = [dictionary.doc2bow(doc) for doc in aggregated_docs]

# LDA training is stochastic; a fixed random_state makes the topics
# reproducible across kernel restarts.
lda_model = models.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

In [None]:
# Print the topics: up to 10 topics, 5 highest-weighted words each
topic_summaries = lda_model.print_topics(num_topics=10, num_words=5)
for i, topic in topic_summaries:
    print(f"Topic {i}: {topic}")