In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import bitermplus as btm

In [None]:
def preprocess_data(df, column_name):
    """Build a bag-of-words representation of one text column.

    Args:
        df: DataFrame holding the documents.
        column_name: Name of the column in ``df`` whose values are the
            raw document strings.

    Returns:
        A ``(term_counts, vocabulary)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` and the feature names returned by
        ``CountVectorizer.get_feature_names_out``.
    """
    # NOTE(review): only the built-in English stop-word list is applied;
    # confirm this is intended if the corpus is not in English.
    counter = CountVectorizer(stop_words='english')
    documents = df[column_name]
    term_counts = counter.fit_transform(documents)
    return term_counts, counter.get_feature_names_out()

In [None]:
def biterm_topic_modeling(df, column_name, n_topics, n_iter, save_model=False, model_path=None):
    """Train a Biterm Topic Model (bitermplus) on one text column.

    Args:
        df: DataFrame holding the documents.
        column_name: Name of the column in ``df`` containing the texts.
        n_topics: Number of topics to learn (passed to ``btm.BTM`` as ``T``).
        n_iter: Number of fitting iterations.
        save_model: When true (and ``model_path`` is given), pickle the
            fitted model to ``model_path``.
        model_path: Destination file for the pickled model. Saving is
            skipped when this is empty or ``None``, even if ``save_model``
            is true.

    Returns:
        The fitted ``btm.BTM`` model.
    """
    import pickle  # local import: only used for the optional model dump

    X, vocab = preprocess_data(df, column_name)

    # bitermplus is fitted on biterms, which are extracted from documents
    # encoded as arrays of vocabulary indices, not on the count matrix itself.
    # CountVectorizer lowercases by default, so lowercase the texts here to
    # keep the whitespace-split tokens aligned with the vocabulary.
    texts = df[column_name].str.lower().tolist()
    docs_vec = btm.get_vectorized_docs(texts, vocab)
    biterms = btm.get_biterms(docs_vec)

    # The fitted object is named ``model`` so the ``btm`` module stays reachable.
    model = btm.BTM(X, vocab, T=n_topics)
    model.fit(biterms, iterations=n_iter)

    if save_model and model_path:
        # bitermplus models are serialized with pickle.
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    return model

In [None]:
def main():
    """Load the tweet texts, train a BTM model and print the top words per topic."""
    # Read the data into a single-column dataframe.
    # Alternative (full, compressed) dataset:
    #df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
    df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

    # The vectorizer only accepts strings, so drop any row whose value is not
    # a str (a float shows up in the data — presumably NaN from an empty cell;
    # TODO confirm).
    df = df[df['texto'].apply(lambda x: isinstance(x, str))]

    # Model parameters
    column_name = 'texto'
    n_topics = 10
    n_iter = 10

    # Train the BTM model. The result is bound to ``model`` (not ``btm``) so
    # the bitermplus module remains accessible below.
    model = biterm_topic_modeling(df, column_name, n_topics, n_iter)

    # Show topics: get_top_topic_words returns a DataFrame with one column of
    # words per topic, so iterate column by column.
    top_words = btm.get_top_topic_words(model, words_num=5)
    for i, (_, words) in enumerate(top_words.items()):
        print(f'Tópico {i}: {", ".join(words)}')

if __name__ == "__main__":
    main()