In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx

In [None]:
# Load the tweet texts into a single-column dataframe named 'texto'.
# Alternative input file, kept for reference:
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Drop a float value that shows up in the dataframe (presumably a NaN coming
# from an empty field -- TODO confirm). The algorithm applied later does not
# accept float entries, so only rows holding a str are kept.
# .copy() makes the filtered frame an independent object, so adding a column to
# it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['texto'].apply(lambda x: isinstance(x, str))].copy()

# Tally of the value types left in the column. This is not the last expression
# of the cell, so its result is computed but not displayed.
df['texto'].apply(type).value_counts()

df

In [4]:
# Learn a bag-of-words vocabulary from the texts and build X, the
# document-term count matrix returned by CountVectorizer.fit_transform.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['texto'])

In [5]:
def co_occurrence_matrix(X):
    """Return the term-by-term co-occurrence matrix of a document-term matrix.

    Every positive entry of ``X`` is first clipped to 1, so a document counts
    at most once per term. For non-negative input, entry ``(i, j)`` of the
    result is the number of rows in which columns ``i`` and ``j`` are both
    positive, and the diagonal holds the number of rows containing each term.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Matrix with one row per document and one column per term.
        It is left unmodified.

    Returns
    -------
    Square matrix with one row and one column per column of ``X``.
    """
    # Binarize a copy: assigning into X itself would silently overwrite the
    # caller's counts.
    presence = X.copy()
    presence[presence > 0] = 1
    # `@` dispatches to the operand's own matrix product, which works for both
    # dense arrays and scipy sparse containers; np.dot is not sparse-aware.
    return presence.T @ presence

# Build the term-by-term co-occurrence matrix from the document-term matrix X.
co_matrix = co_occurrence_matrix(X)

In [None]:
def word_network(co_matrix, labels, threshold=0):
    """Build an undirected word graph from a co-occurrence matrix.

    Every label becomes a node. An edge ``(labels[i], labels[j])`` with
    attribute ``weight=co_matrix[i, j]`` is added for every entry strictly
    greater than ``threshold``.

    Parameters
    ----------
    co_matrix : numpy.ndarray or scipy.sparse matrix
        Square matrix with one row/column per label.
    labels : iterable
        Node labels; position ``i`` names row/column ``i`` of ``co_matrix``.
    threshold : number, default 0
        Entries must be strictly greater than this value to become edges.

    Returns
    -------
    networkx.Graph

    Notes
    -----
    Diagonal entries above the threshold produce self-loops, exactly as in the
    nested-loop version this replaces.
    NOTE(review): confirm self-loops are wanted before computing centralities.
    """
    # Function-scope import: scipy is not among the notebook's top-level imports.
    from scipy import sparse

    labels = list(labels)
    G = nx.Graph()
    G.add_nodes_from(labels)

    if threshold < 0:
        # Zero entries also pass a negative threshold, so every cell has to be
        # inspected; fall back to a dense scan (row-major order).
        dense = co_matrix.toarray() if sparse.issparse(co_matrix) else np.asarray(co_matrix)
        rows, cols = np.nonzero(dense > threshold)
        weights = dense[rows, cols]
    else:
        # With threshold >= 0 only non-zero entries can qualify, so visiting the
        # stored entries is enough. This replaces a Python loop over all
        # len(labels)**2 pairs. CSR -> COO keeps the entries in row order.
        stored = sparse.csr_matrix(co_matrix).tocoo()
        keep = stored.data > threshold
        rows, cols, weights = stored.row[keep], stored.col[keep], stored.data[keep]

    G.add_weighted_edges_from(
        (labels[i], labels[j], w) for i, j, w in zip(rows, cols, weights)
    )

    return G

# Build the word graph: nodes are the vectorizer's feature names, edges come
# from co_matrix using word_network's default threshold.
word_graph = word_network(co_matrix, vectorizer.get_feature_names_out())

In [None]:
# Eigenvector centrality of every node (word) in the graph.
# NOTE(review): no `weight` argument is passed; per the networkx docs the
# default is weight=None, which would mean the co-occurrence weights stored on
# the edges are ignored -- confirm this is intended.
eigenvector_centrality = nx.eigenvector_centrality(word_graph)

In [None]:
def word_clusters(eigenvector_centrality, n_topics):
    """Deal the words into ``n_topics`` lists in round-robin order.

    Words are ranked by descending centrality score (ties keep their original
    order); the word at rank ``r`` goes to list ``r % n_topics``.

    Parameters
    ----------
    eigenvector_centrality : dict
        Maps each word to its centrality score.
    n_topics : int
        Number of lists to produce.

    Returns
    -------
    list of list
        ``n_topics`` lists of words, each ordered by descending score.
    """
    ranking = list(eigenvector_centrality)
    ranking.sort(key=lambda term: eigenvector_centrality[term], reverse=True)

    buckets = [[] for _ in range(n_topics)]
    for rank, term in enumerate(ranking):
        target = buckets[rank % n_topics]
        target.append(term)

    return buckets

# Number of word lists (topics) to split the vocabulary into.
n_topics = 5
# topics[k] is the list of words assigned to topic k by word_clusters.
topics = word_clusters(eigenvector_centrality, n_topics)

In [None]:
def assign_topics(df, topics, eigenvector_centrality, text_column='texto'):
    """Assign each text to the topic whose words carry the most centrality.

    A text is split on whitespace. For every topic, the centrality scores of
    the tokens belonging to that topic are summed (repeated tokens count each
    time), and the topic with the highest sum wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    topics : list of list of str
        Word lists, one per topic.
    eigenvector_centrality : dict
        Maps each topic word to its centrality score.
    text_column : str, default 'texto'
        Name of the column of ``df`` that holds the texts.

    Returns
    -------
    list
        One topic index (position in ``topics``) per row of ``df``. A text
        with no token in any topic scores zero everywhere and therefore gets
        index 0, since ``np.argmax`` returns the first maximum.
    """
    # Sets give O(1) membership tests; scanning the topic lists once per token
    # is prohibitively slow on a full vocabulary.
    topic_vocab = [set(topic) for topic in topics]
    topic_assignments = []

    for text in df[text_column]:
        tokens = text.split()
        topic_scores = np.zeros(len(topic_vocab))

        for i, vocab in enumerate(topic_vocab):
            for word in tokens:
                if word in vocab:
                    topic_scores[i] += eigenvector_centrality[word]

        topic_assignments.append(np.argmax(topic_scores))

    return topic_assignments

# Store the topic index returned for each text in a new 'topic' column.
df['topic'] = assign_topics(df, topics, eigenvector_centrality)

In [None]:
def display_topics(topics):
    """Print every topic as a numbered header followed by its words.

    Topics are numbered from 1. Each topic's words are printed on one line,
    joined by ", ", and followed by a blank line.

    Parameters
    ----------
    topics : list of list of str
        Word lists, one per topic.
    """
    for number, words in enumerate(topics, start=1):
        print(f"Tópico {number}:")
        # end="\n\n" closes the word line and emits the blank separator line.
        print(", ".join(words), end="\n\n")

# Print the word list of every topic.
display_topics(topics)
