In [8]:
import pandas as pd
import random as r

# Prepocessing libraries
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Visualization libraries
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm


def get_coherence_score(model, text, dictionary, coherence):
    """
    Calculates the coherence score of a topic model.

    Parameters
    ----------
    model : gensim.models.ldamodel.LdaModel
        The LDA model.
    corpus : list
        The list of preprocesses stories.
    dictionary : gensim.corpora.dictionary.Dictionary
        The dictionary of the corpus.
    Returns
    -------
    coherence_score : float
        The coherence score of the topic model.
    """

    # get the word2vec score of the topic model
    coherence_model_lda = CoherenceModel(model=model, texts=text, dictionary=dictionary, coherence=coherence)
    coherence_score = coherence_model_lda.get_coherence()
    return coherence_score

def grid_search(data, n_topics_range, min_dfs, max_dfs, alphas, etas, coherence):
    """
    Performs a grid search to find the best hyperparameters for a topic model.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered.
    max_dfs : list
        The range of maximum document frequencies to be considered.
    alphas : list
        The range of alpha values to be considered.
    etas : list
        The range of eta values to be considered.
    coherence : str
        The coherence score to be used.
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data.
    """

    cross_validation_data = pd.DataFrame(columns=['no_topics', 'min_df', 'max_df', 'alpha', 'eta', coherence])
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas) * len(etas)
    progress_bar = tqdm(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics

    # Grid search to find the best hyperparameters
    for no_topics in n_topics_range:
        for min_df in min_dfs:
            for max_df in max_dfs:
                for alpha in alphas:
                    for eta in etas:
                        min_df_abs = min_df*data.size

                        tokens_list = data["story"].str.split().to_list()
                        dictionary = Dictionary(tokens_list)
                        dictionary.filter_extremes(no_below=min_df_abs,     no_above=max_df)
                        corpus = [dictionary.doc2bow(tokens) for tokens in  tokens_list]

                        lda = LdaModel(corpus=corpus, id2word=dictionary,   num_topics=no_topics, alpha=alpha, random_state=0, eta=eta)
                        coherence_score = get_coherence_score(lda, tokens_list,     dictionary, coherence=coherence)

                        cross_validation_data.loc[len(cross_validation_data)] =     [no_topics, min_df, max_df, alpha, eta, coherence_score]

                        progress_bar.update(1)
                    
    progress_bar.close()
    return cross_validation_data

def randomized_search(data, n_topics_range, min_dfs, max_dfs, alphas, etas, coherence, n_iter):
    """
    Performs a randomized search to find the best hyperparameters for a topic model.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered.
    max_dfs : list
        The range of maximum document frequencies to be considered.
    alphas : list
        The range of alpha values to be considered.
    etas : list
        The range of eta values to be considered.
    coherence : str
        The coherence score to be used.
    n_iter : int
        The number of iterations to be performed.
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data.
    """

    cross_validation_data = pd.DataFrame(columns=['no_topics', 'min_df', 'max_df', 'alpha', 'eta', coherence])
    total_iterations = n_iter
    progress_bar = tqdm(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics

    # Randomized search to find the best hyperparameters
    for i in range(n_iter):
        no_topics = r.choice(n_topics_range)
        min_df = r.choice(min_dfs)
        max_df = r.choice(max_dfs)
        alpha = r.choice(alphas)
        eta = r.choice(etas)

        min_df_abs = min_df*data.size

        tokens_list = data["story"].str.split().to_list()
        dictionary = Dictionary(tokens_list)
        dictionary.filter_extremes(no_below=min_df_abs,     no_above=max_df)
        corpus = [dictionary.doc2bow(tokens) for tokens in  tokens_list]

        lda = LdaModel(corpus=corpus, id2word=dictionary,   num_topics=no_topics, alpha=alpha, random_state=0, eta=eta)
        coherence_score = get_coherence_score(lda, tokens_list,     dictionary, coherence=coherence)

        cross_validation_data.loc[len(cross_validation_data)] =     [no_topics, min_df, max_df, alpha, eta, coherence_score]

        progress_bar.update(1)
                    
    progress_bar.close()
    return cross_validation_data

In [20]:
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names= ["story"])

min_dfs = [i*0.01 for i in range(1, 11)]
max_dfs = [i*0.01 for i in range(50, 10, -1)]
alphas = [i*0.01 for i in range(1, 51)]
etas = [i*0.01 for i in range(1, 31)]
no_topics_range = [i for i in range(2, 16)]

cross_validation_data = randomized_search(data, no_topics_range, min_dfs, max_dfs, alphas, etas, coherence='c_v', n_iter=5000)

cross_validation_data.to_csv("data/randomized_search_5000_iter.csv", index=False)
cross_validation_data

Cross Validation: 100%|██████████| 5000/5000 [2:40:30<00:00,  1.93s/model]  


Unnamed: 0,no_topics,min_df,max_df,alpha,eta,c_v
0,11.0,0.03,0.46,0.35,0.09,0.364727
1,14.0,0.04,0.17,0.16,0.24,0.322121
2,10.0,0.04,0.40,0.25,0.16,0.295055
3,13.0,0.10,0.49,0.15,0.05,0.325234
4,13.0,0.03,0.17,0.31,0.16,0.322238
...,...,...,...,...,...,...
4995,10.0,0.09,0.32,0.46,0.29,0.323228
4996,10.0,0.10,0.32,0.18,0.28,0.287306
4997,12.0,0.05,0.50,0.37,0.09,0.336565
4998,14.0,0.05,0.45,0.34,0.06,0.344310


In [22]:
cross_validation_data.c_v.max()

0.3867767142067709