In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [2]:
def preprocess_data(df, column):
    """Return a copy of ``df`` whose ``column`` holds lower-cased token lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : label
        Column holding either raw text (``str``) or already-tokenized
        documents (an iterable of string tokens).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every value of ``column`` is a ``list`` of
        lower-cased tokens.
    """
    def to_tokens(document):
        # Iterating a str yields single characters, which would turn every
        # document into a bag of letters. Raw text is split on whitespace
        # instead (a plain split keeps this free of tokenizer data downloads).
        if isinstance(document, str):
            return document.lower().split()
        return [token.lower() for token in document]

    # Work on a copy: the caller's frame may be a filtered slice, and assigning
    # into it would both mutate notebook state and trigger pandas'
    # SettingWithCopyWarning.
    result = df.copy()
    result[column] = result[column].apply(to_tokens)
    return result

In [3]:
def train_word2vec(df, column, size=100, window=5, min_count=1, workers=4):
    """Train a gensim ``Word2Vec`` model on the documents in ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the corpus.
    column : label
        Column whose values are passed to ``Word2Vec`` as ``sentences``
        (each value is expected to be a list of tokens).
    size : int, default 100
        Dimensionality of the word vectors.
    window, min_count, workers
        Forwarded unchanged to ``Word2Vec``.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # gensim >= 4.0 renamed ``size`` to ``vector_size``. Forward it under the
    # new name so the requested dimensionality is actually used instead of
    # silently falling back to gensim's default.
    model = Word2Vec(
        sentences=df[column],
        vector_size=size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    return model

In [4]:
def build_word_embedding_matrix(word2vec_model, vocabulary):
    """Stack the Word2Vec vectors of ``vocabulary`` into one dense matrix.

    Row ``i`` holds the vector of ``vocabulary[i]``; words missing from
    ``word2vec_model.wv`` keep an all-zero row. The result has shape
    ``(len(vocabulary), word2vec_model.vector_size)``.
    """
    n_rows, n_dims = len(vocabulary), word2vec_model.vector_size
    matrix = np.zeros((n_rows, n_dims))
    vectors = word2vec_model.wv
    for row, token in enumerate(vocabulary):
        if token not in vectors:
            continue  # unknown word: leave its row at zero
        matrix[row] = vectors[token]
    return matrix

In [5]:
def _crftm_log_likelihood(dtm, doc_topic, word_topic, eps=1e-12):
    """Return the sum of ``dtm[d, w] * log(doc_topic[d] . word_topic[w])`` over non-zero ``dtm`` entries."""
    entries = dtm.tocoo()
    # Evaluate the topic mixture only at the observed (document, word) pairs
    # instead of materialising a dense n_docs x n_words probability matrix.
    probs = np.einsum('ij,ij->i', doc_topic[entries.row], word_topic[entries.col])
    return float(np.sum(entries.data * np.log(np.maximum(probs, eps))))


def crftm(dataframe, text_column, n_topics=10, vector_size=100, window=5, min_count=1, workers=4, max_iter=100, tol=1e-6, random_state=None):
    """Fit an embedding-augmented topic model on ``dataframe[text_column]``.

    The documents are preprocessed, a Word2Vec model is trained on them, and
    an L1-normalised document-term matrix is built. Document-topic and
    word-topic matrices are then updated alternately, with the word-embedding
    matrix added to the word-topic update, until the log-likelihood changes
    by less than ``tol`` or ``max_iter`` iterations have run.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the corpus.
    text_column : label
        Column with the documents.
    n_topics : int, default 10
        Number of topics.
    vector_size : int, default 100
        Word2Vec dimensionality. The embedding matrix is added directly to
        the word-topic matrix, so the trained model's vector size must equal
        ``n_topics``.
    window, min_count, workers
        Forwarded to ``train_word2vec``; ``min_count`` is also used as the
        ``min_df`` of the ``CountVectorizer``.
    max_iter : int, default 100
        Maximum number of update iterations.
    tol : float, default 1e-6
        Stop when the absolute log-likelihood change drops below this value.
    random_state : int or None, default None
        Seed for the random initialisation of the topic matrices. ``None``
        uses NumPy's global random state. Word2Vec training is not affected
        by this seed.

    Returns
    -------
    doc_topic : numpy.ndarray, shape (n_docs, n_topics)
        Rows sum to 1 (all-zero for documents with no vocabulary terms).
    word_topic : numpy.ndarray, shape (n_words, n_topics)
        Columns sum to 1; row ``i`` belongs to the ``i``-th feature name of
        the internal ``CountVectorizer``.
    word2vec_model
        The model returned by ``train_word2vec``.

    Raises
    ------
    ValueError
        If the embedding width differs from ``n_topics``.
    """
    df = preprocess_data(dataframe, text_column)

    word2vec_model = train_word2vec(df, text_column, size=vector_size, window=window, min_count=min_count, workers=workers)

    # Documents are already token lists, so the tokenizer is the identity.
    vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False, min_df=min_count)
    dtm = vectorizer.fit_transform(df[text_column])
    dtm = normalize(dtm, norm='l1', axis=1)

    # get_feature_names() was removed in scikit-learn 1.2.
    vocabulary = vectorizer.get_feature_names_out()
    embedding_matrix = build_word_embedding_matrix(word2vec_model, vocabulary)
    if embedding_matrix.shape[1] != n_topics:
        raise ValueError(
            "embedding width (%d) must equal n_topics (%d) because the embedding "
            "matrix is added to the word-topic matrix; pass vector_size=n_topics"
            % (embedding_matrix.shape[1], n_topics)
        )

    n_docs, n_words = dtm.shape
    # Keep drawing from NumPy's global state when no seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    word_topic = rng.rand(n_words, n_topics)
    doc_topic = rng.rand(n_docs, n_topics)

    eps = 1e-12
    prev_ll = None
    for _ in range(max_iter):
        word_topic /= word_topic.sum(axis=0)

        doc_topic = np.asarray(dtm @ word_topic)
        row_sums = doc_topic.sum(axis=1, keepdims=True)
        # Documents without any vocabulary term have an all-zero row; leave
        # them at zero instead of producing NaN through 0 / 0.
        row_sums[row_sums == 0] = 1.0
        doc_topic /= row_sums

        # np.dot is not sparse-aware; ``@`` performs the actual matrix product.
        word_topic = np.asarray(dtm.T @ doc_topic) + embedding_matrix
        # Word2Vec components can be negative; floor the entries so that every
        # column stays a valid probability distribution after normalisation.
        word_topic = np.maximum(word_topic, eps)
        word_topic /= word_topic.sum(axis=0)

        ll = _crftm_log_likelihood(dtm, doc_topic, word_topic, eps)
        # Skip the convergence test on the first pass: there is no previous
        # likelihood to compare against yet.
        if prev_ll is not None and abs(ll - prev_ll) < tol:
            break
        prev_ll = ll

    return doc_topic, word_topic, word2vec_model

In [6]:
def display_topics(word_topic, vocabulary, n_top_words=10):
    """Print the ``n_top_words`` highest-weighted words of every topic.

    ``word_topic`` has one column per topic and one row per entry of
    ``vocabulary``. For each topic a ``Topic #k:`` header (1-based) is
    printed, followed by its top words in descending weight, space-separated.
    """
    for topic_idx, weights in enumerate(word_topic.T):
        # argsort is ascending, so reverse it and keep the leading entries.
        descending = weights.argsort()[::-1]
        best = descending[:n_top_words]
        print(f"Topic #{topic_idx + 1}:")
        print(" ".join(vocabulary[i] for i in best))

In [None]:
# Read the data into the dataframe.
# `names=['texto']` labels the single column, so the first line of the file is
# read as data rather than as a header.
# Alternative input file:
#df = pd.read_csv('datasets/(processado-final)textos_tuitesPt_2020.csv.gz', names=['texto'])
df = pd.read_csv('datasets/(processado)textos_tuitesPt_2020_0.csv', names=['texto'])

# Remove the stray float value that shows up in the column: the algorithm only
# accepts text, so non-string rows are filtered out.
# NOTE(review): presumably this is a NaN that read_csv produced for a field it
# treats as missing — confirm against the raw file.
# `.copy()` detaches the filtered rows so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['texto'].apply(lambda x: isinstance(x, str))].copy()

# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the remaining value types explicitly.
print(df['texto'].apply(type).value_counts())

df

In [None]:
# Fit the topic model on the tweet texts. crftm adds the word-embedding matrix
# to the word-topic matrix, so the embedding width (vector_size) has to match
# n_topics.
doc_topic, word_topic, word2vec_model = crftm(
    df,
    'texto',
    n_topics=10,
    vector_size=10,
)

In [None]:
# crftm() keeps its CountVectorizer local, so no `vectorizer` exists in the
# notebook namespace. Rebuild the same vocabulary here, using the same
# preprocessing and the same vectorizer settings crftm uses with its default
# min_count=1, so that row i of word_topic lines up with vocabulary[i].
tokenized_texts = preprocess_data(df, 'texto')['texto']
vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False, min_df=1)
vocabulary = vectorizer.fit(tokenized_texts).get_feature_names_out()
assert len(vocabulary) == word_topic.shape[0], "vocabulary does not match the rows of word_topic"
display_topics(word_topic, vocabulary, n_top_words=10)