In [1]:
import pandas as pd

# Preprocessing and topic-modelling libraries (gensim)
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Visualization libraries
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm


def get_coherence_score(model, text, dictionary, coherence):
    """
    Compute the coherence of a fitted topic model.

    Parameters
    ----------
    model : gensim.models.ldamodel.LdaModel
        The fitted LDA model to evaluate.
    text : list
        The tokenized documents, forwarded to ``CoherenceModel`` as ``texts``.
    dictionary : gensim.corpora.dictionary.Dictionary
        The dictionary forwarded to ``CoherenceModel``.
    coherence : str
        Name of the coherence measure forwarded to ``CoherenceModel``
        (for example ``'c_v'``).

    Returns
    -------
    float
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    # Build the evaluator with the requested measure and return its score directly.
    evaluator = CoherenceModel(
        model=model,
        texts=text,
        dictionary=dictionary,
        coherence=coherence,
    )
    return evaluator.get_coherence()

def get_cross_validation_data(data, n_topics_range, min_dfs, max_dfs, alphas, coherence):
    """
    Grid-search LDA hyperparameters and record the coherence of every combination.

    One LDA model is fitted for each combination of number of topics,
    minimum document frequency, maximum document frequency and alpha, and
    its coherence is computed with ``get_coherence_score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"story"`` column of whitespace-separated tokens
        (each story is tokenized with ``str.split()``).
    n_topics_range : list
        The numbers of topics to be considered.
    min_dfs : list
        Minimum document frequencies to be considered, as fractions of the
        number of documents (converted to an absolute count internally).
    max_dfs : list
        Maximum document frequencies to be considered, as fractions of the
        number of documents (passed to ``filter_extremes`` as ``no_above``).
    alphas : list
        The alpha values to be considered.
    coherence : str
        Name of the coherence measure; also used as the name of the score
        column in the returned frame.

    Returns
    -------
    cross_validation_data : pandas.DataFrame
        One row per fitted model with columns ``no_topics``, ``min_df``,
        ``max_df``, ``alpha`` and ``<coherence>``, in grid iteration order.
    """

    columns = ['no_topics', 'min_df', 'max_df', 'alpha', coherence]
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas)

    # Tokenization does not depend on any hyperparameter, so do it once.
    tokens_list = data["story"].str.split().to_list()
    # Number of documents (rows). ``data.size`` would be rows * columns for a
    # DataFrame and would inflate the threshold if the frame had extra columns.
    n_documents = len(data)

    # Collect rows in a plain list and build the frame once at the end;
    # growing a DataFrame row by row is needlessly slow.
    records = []

    # The context manager closes the progress bar even if a model fit raises.
    with tqdm(total=total_iterations, desc='Cross Validation', unit='model') as progress_bar:
        # Grid search to find the best hyperparameters
        for no_topics in n_topics_range:
            for min_df in min_dfs:
                # Absolute document count corresponding to the min_df fraction.
                min_df_abs = min_df * n_documents

                for max_df in max_dfs:
                    # The dictionary and corpus depend only on (min_df, max_df),
                    # so build them once here rather than once per alpha.
                    # filter_extremes mutates in place, hence a fresh Dictionary.
                    dictionary = Dictionary(tokens_list)
                    dictionary.filter_extremes(no_below=min_df_abs, no_above=max_df)
                    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

                    for alpha in alphas:
                        # Fixed random_state keeps each fit reproducible.
                        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_topics, alpha=alpha, random_state=0)
                        coherence_score = get_coherence_score(lda, tokens_list, dictionary, coherence=coherence)

                        records.append([no_topics, min_df, max_df, alpha, coherence_score])

                        progress_bar.update(1)

    return pd.DataFrame(records, columns=columns)

In [2]:
# Load the lemmatized stories; the file is read without a header row and
# its column is named "story".
data_lemmatized = pd.read_csv("data/preprocessed_stories_lemmatized.csv", header=None, names=["story"])

# Hyperparameter grids. Values are produced by dividing integers rather than
# multiplying by a float step: e.g. 3*0.05 evaluates to 0.15000000000000002,
# which would be persisted to the CSV and can push the min_df * n_docs
# threshold across an integer boundary.
min_dfs = [i / 100 for i in range(1, 11)]        # 0.01 .. 0.10
max_dfs = [i / 100 for i in range(50, 10, -5)]   # 0.50 .. 0.15
alphas = [i / 20 for i in range(1, 21)]          # 0.05 .. 1.00
no_topics_range = list(range(2, 21))             # 2 .. 20

# Full grid: 19 * 10 * 8 * 20 = 30400 models, so this cell is very expensive.
cross_validation_data_cv = get_cross_validation_data(
    data_lemmatized, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v'
)

# Persist the grid-search results so they do not have to be recomputed.
cross_validation_data_cv.to_csv("data/cross_validation_data_lemmatized_cv.csv", index=False)

Cross Validation: 100%|██████████| 30400/30400 [17:51:46<00:00,  2.12s/model]  
