In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
def create_pseudo_documents(df, column, num_pseudo_docs, num_top_words=None):
    """Pool consecutive rows of a text column into longer pseudo-documents.

    The rows of ``df[column]`` are split, in their existing order, into at
    most ``num_pseudo_docs`` contiguous groups of near-equal size (sizes
    differ by at most one), and the texts of each group are joined with a
    single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    column : str
        Name of the column with the texts; every value must be a ``str``.
    num_pseudo_docs : int
        Desired number of pseudo-documents. If the frame has fewer rows than
        this, one pseudo-document per row is produced instead.
    num_top_words : optional
        Unused here; accepted only so existing callers that pass it keep
        working.

    Returns
    -------
    list of str
        The pseudo-documents; an empty list when the frame has no rows.

    Raises
    ------
    ValueError
        If ``num_pseudo_docs`` is smaller than 1.
    """
    if num_pseudo_docs < 1:
        raise ValueError("num_pseudo_docs must be a positive integer")

    # Work on a plain list so slicing is purely positional, whatever index
    # the frame carries after filtering.
    texts = df[column].tolist()
    if not texts:
        return []

    # Never ask for more groups than there are rows (the old integer-division
    # step became 0 in that case and range() raised).
    num_chunks = min(num_pseudo_docs, len(texts))

    # Spread the remainder over the first groups instead of emitting an extra,
    # tiny trailing pseudo-document.
    base_size, remainder = divmod(len(texts), num_chunks)

    pseudo_documents = []
    start = 0
    for chunk_idx in range(num_chunks):
        stop = start + base_size + (1 if chunk_idx < remainder else 0)
        pseudo_documents.append(' '.join(texts[start:stop]))
        start = stop

    return pseudo_documents

In [3]:
def ptm(df, column, num_topics, num_pseudo_docs, num_top_words):
    """Extract topics from short texts via pseudo-documents, TF-IDF and NMF.

    The texts in ``df[column]`` are pooled into pseudo-documents, vectorized
    with TF-IDF, and factorized with NMF. For each NMF component the
    ``num_top_words`` highest-weighted terms are printed and collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    column : str
        Name of the text column.
    num_topics : int
        Number of NMF components (topics) to extract.
    num_pseudo_docs : int
        Number of pseudo-documents to pool the rows into.
    num_top_words : int
        Number of top-weighted words reported per topic.

    Returns
    -------
    list of (int, list of str)
        One ``(topic_index, top_words)`` tuple per topic, words ordered from
        highest to lowest weight.
    """
    # Pool the short texts into longer pseudo-documents.
    pseudo_documents = create_pseudo_documents(df, column, num_pseudo_docs, num_top_words)

    # Vectorize the text with TF-IDF.
    # NOTE(review): the stop-word list is English; confirm it matches the
    # language of the corpus being analysed.
    vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')
    tfidf = vectorizer.fit_transform(pseudo_documents)

    # Factorize the TF-IDF matrix with NMF to obtain the topics
    # (fixed random_state keeps runs reproducible).
    nmf = NMF(n_components=num_topics, random_state=1).fit(tfidf)

    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when available and fall back on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Report each topic with its highest-weighted words.
    topics = []
    for topic_idx, topic in enumerate(nmf.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
        topics.append((topic_idx, top_words))
        print(f"Topic #{topic_idx}: {', '.join(top_words)}")

    return topics

In [None]:
# Load the texts into a dataframe. Passing names= makes pandas treat the
# first line of the file as data and label the single column 'texto'.
# Alternative input file, kept for reference:
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Drop the non-string (float) value that shows up in the dataframe; the
# downstream vectorizer only accepts strings, so it has to be filtered out.
# NOTE(review): the float is presumably NaN produced by read_csv's default
# NA parsing (e.g. a row whose whole text is "NA", "null" or "nan") — confirm.
is_text = df['texto'].apply(lambda x: isinstance(x, str))
df = df[is_text]

# Sanity check: show the value types left in the column. This was previously a
# bare expression in the middle of the cell, so its result was never displayed.
print(df['texto'].apply(type).value_counts())

df

In [None]:
# Tunable parameters for the topic-model run.
num_topics = 10        # NMF components to extract
num_pseudo_docs = 2    # pseudo-documents the rows are pooled into
num_top_words = 10     # words reported per topic
# NOTE(review): num_pseudo_docs is much smaller than num_topics, so NMF
# factorizes a matrix with far fewer rows than components — confirm intended.

topics = ptm(
    df,
    'texto',
    num_topics=num_topics,
    num_pseudo_docs=num_pseudo_docs,
    num_top_words=num_top_words,
)