In [4]:
import pandas as pd

# Prepocessing libraries
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Visualization libraries
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm


def get_coherence_score(model, text, dictionary, coherence):
    """
    Calculates the coherence score of a topic model.

    Parameters
    ----------
    model : gensim.models.ldamodel.LdaModel
        The LDA model.
    corpus : list
        The list of preprocesses stories.
    dictionary : gensim.corpora.dictionary.Dictionary
        The dictionary of the corpus.
    Returns
    -------
    coherence_score : float
        The coherence score of the topic model.
    """

    # get the word2vec score of the topic model
    coherence_model_lda = CoherenceModel(model=model, texts=text, dictionary=dictionary, coherence=coherence)
    coherence_score = coherence_model_lda.get_coherence()
    return coherence_score

def get_cross_validation_data(data, n_topics_range, min_dfs, max_dfs, alphas, etas, coherence):
    """
    Performs cross validation to find the best hyperparameters for the topic model.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered.
    max_dfs : list
        The range of maximum document frequencies to be considered.
    alphas : list
        The range of alpha values to be considered.
    etas : list
        The range of eta values to be considered.
    coherence : str
        The coherence score to be used.
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data.
    """

    cross_validation_data = pd.DataFrame(columns=['no_topics', 'min_df', 'max_df', 'alpha', 'eta', coherence])
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas) * len(etas)
    progress_bar = tqdm(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics

    # Grid search to find the best hyperparameters
    for no_topics in n_topics_range:
        for min_df in min_dfs:
            for max_df in max_dfs:
                for alpha in alphas:
                    for eta in etas:
                        min_df_abs = min_df*data.size

                        tokens_list = data["story"].str.split().to_list()
                        dictionary = Dictionary(tokens_list)
                        dictionary.filter_extremes(no_below=min_df_abs,     no_above=max_df)
                        corpus = [dictionary.doc2bow(tokens) for tokens in  tokens_list]

                        lda = LdaModel(corpus=corpus, id2word=dictionary,   num_topics=no_topics, alpha=alpha, random_state=0, eta=eta)
                        coherence_score = get_coherence_score(lda, tokens_list,     dictionary, coherence=coherence)

                        cross_validation_data.loc[len(cross_validation_data)] =     [no_topics, min_df, max_df, alpha, eta, coherence_score]

                        progress_bar.update(1)
                    
    progress_bar.close()
    return cross_validation_data

In [6]:
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names= ["story"])

min_dfs = [i*0.01 for i in range(1, 3)]
max_dfs = [i*0.01 for i in range(20, 10, -5)]
alphas = [i*0.05 for i in range(1, 3)]
etas = [i*0.05 for i in range(1, 3)]
no_topics_range = [i for i in range(2, 4)]

cross_validation_data = get_cross_validation_data(data, no_topics_range, min_dfs, max_dfs, alphas, etas, coherence='c_v')

cross_validation_data

Cross Validation:   0%|          | 0/32 [00:00<?, ?model/s]

Cross Validation: 100%|██████████| 32/32 [00:53<00:00,  1.67s/model]


Unnamed: 0,no_topics,min_df,max_df,alpha,eta,c_v
0,2.0,0.01,0.2,0.05,0.05,0.280628
1,2.0,0.01,0.2,0.05,0.1,0.280628
2,2.0,0.01,0.2,0.1,0.05,0.280628
3,2.0,0.01,0.2,0.1,0.1,0.280628
4,2.0,0.01,0.15,0.05,0.05,0.236178
5,2.0,0.01,0.15,0.05,0.1,0.236178
6,2.0,0.01,0.15,0.1,0.05,0.236829
7,2.0,0.01,0.15,0.1,0.1,0.236829
8,2.0,0.02,0.2,0.05,0.05,0.26105
9,2.0,0.02,0.2,0.05,0.1,0.26105


In [7]:
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names= ["story"])

min_dfs = [i*0.01 for i in range(1, 11)]
max_dfs = [i*0.01 for i in range(50, 10, -5)]
alphas = [i*0.05 for i in range(1, 21)]
etas = [i*0.05 for i in range(1, 21)]
no_topics_range = [i for i in range(2, 21)]

cross_validation_data_cv = get_cross_validation_data(data, no_topics_range,     min_dfs, max_dfs, alphas, etas, coherence='c_v')

cross_validation_data_cv.to_csv("data/cross_validation_data_cv.csv",    index=False)

TypeError: get_cross_validation_data() missing 1 required positional argument: 'etas'