In [None]:
import pandas as pd
import numpy as np

# Preprocessing libraries
import re
import string
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import itertools

# Visualization libraries
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm

In [None]:


def calculate_perplexities(data, no_topics, min_df):
    """
    DEPRECATED: Not used in the project.

    Calculates the perplexities of LDA models with different number of topics.

    One model is fitted for every topic count from 2 up to and including
    ``no_topics``; each model is scored on the same vectorized data it was
    fitted on.

    NOTE(review): ``CountVectorizer`` and ``LatentDirichletAllocation`` are not
    imported in the visible notebook cells (presumably the scikit-learn
    classes -- confirm). They must be brought into scope before this function
    is called, otherwise it raises NameError.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    no_topics : int
        The maximum number of topics to be considered.
    min_df : int
        The minimum number of documents a word should appear in to be
        considered as a feature.
    Returns
    -------
    perplexities : list
        The list of perplexities of LDA models with different numbers of
        topics. Entry ``k`` belongs to the model with ``k + 2`` topics; the
        list is empty when ``no_topics`` is smaller than 2.
    """

    topic_counts = range(2, no_topics + 1)
    vectorized_data = CountVectorizer(min_df=min_df).fit_transform(data)

    perplexities = []
    # The total is the number of models actually fitted (topic counts start at
    # 2), so the bar reaches 100%. The context manager closes the bar even if
    # fitting raises.
    with tqdm(total=len(topic_counts), desc='Calculating Perplexities', unit='model') as progress_bar:
        for n_components in topic_counts:
            lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
            lda.fit(vectorized_data)
            perplexities.append(lda.perplexity(vectorized_data))
            progress_bar.update(1)

    return perplexities

def display_topics(model, feature_names, no_top_words):
    """
    Display the topics of a topic model.

    TODO: implement this function. It is currently a placeholder that ignores
    all of its arguments and returns None.
    """
    return None
            

def get_coherence_score(model, text, dictionary, coherence):
    """
    Computes the coherence of a topic model with gensim's CoherenceModel.

    Parameters
    ----------
    model : gensim.models.ldamodel.LdaModel
        The LDA model.
    text : list
        The tokenized stories, passed to CoherenceModel as ``texts``.
    dictionary : gensim.corpora.dictionary.Dictionary
        The dictionary of the corpus.
    coherence : str
        The coherence measure to compute (e.g. 'c_v'); forwarded unchanged to
        CoherenceModel.
    Returns
    -------
    coherence_score : float
        The coherence score of the topic model.
    """

    scorer = CoherenceModel(
        model=model,
        texts=text,
        dictionary=dictionary,
        coherence=coherence,
    )
    return scorer.get_coherence()

def get_cross_validation_data(data, n_topics_range, min_dfs, max_dfs, alphas, coherence):
    """
    Performs a grid search to find the best hyperparameters for the topic model.

    For every combination of the given hyperparameters an LDA model is trained
    (with ``random_state=0``) and its coherence score is recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        The preprocessed stories. Must contain a "story" column of strings;
        each story is split on whitespace to obtain its tokens.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered, given as
        fractions of the number of documents. Each value is multiplied by the
        document count and passed to ``Dictionary.filter_extremes`` as
        ``no_below``.
    max_dfs : list
        The range of maximum document frequencies to be considered. Each value
        is passed to ``Dictionary.filter_extremes`` as ``no_above``.
    alphas : list
        The range of alpha values to be considered.
    coherence : str
        The coherence measure to compute (e.g. 'c_v'). It is also used as the
        name of the score column in the result.
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data: one row per hyperparameter combination with
        the columns 'no_topics', 'min_df', 'max_df', 'alpha' and the coherence
        score (column named after ``coherence``). Rows follow the nested
        iteration order topics -> min_df -> max_df -> alpha.
    """

    # Tokenization does not depend on any hyperparameter, so it is done once.
    tokens_list = data["story"].str.split().to_list()
    # min_df is a fraction of the documents, so count the stories themselves
    # (data.size would count rows * columns).
    n_documents = len(tokens_list)

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end instead of growing the frame one row at a time.
    records = []
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas)

    # The context manager closes the progress bar even if a model fails.
    with tqdm(total=total_iterations, desc='Cross Validation', unit='model') as progress_bar:
        for no_topics in n_topics_range:
            for min_df in min_dfs:
                for max_df in max_dfs:
                    # The filtered dictionary and the bag-of-words corpus only
                    # depend on (min_df, max_df): build them once here and
                    # reuse them for every alpha.
                    dictionary = Dictionary(tokens_list)
                    dictionary.filter_extremes(no_below=min_df * n_documents, no_above=max_df)
                    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

                    for alpha in alphas:
                        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_topics, alpha=alpha, random_state=0)
                        coherence_score = get_coherence_score(lda, tokens_list, dictionary, coherence=coherence)

                        records.append([no_topics, min_df, max_df, alpha, coherence_score])
                        progress_bar.update(1)

    cross_validation_data = pd.DataFrame(records, columns=['no_topics', 'min_df', 'max_df', 'alpha', coherence])
    return cross_validation_data

def get_cross_validation_data_optimized(data, n_topics_range, min_dfs, max_dfs, alphas, coherence):
    """
    Backward-compatible alias of ``get_cross_validation_data``.

    The previous body was a copy of that function's grid search written with
    ``itertools.product``: it visited the same hyperparameter combinations in
    the same order and did the same work for each of them. Delegating keeps a
    single implementation so the two functions cannot drift apart.

    Parameters
    ----------
    data : pandas.DataFrame
        The preprocessed stories, with a "story" column.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered.
    max_dfs : list
        The range of maximum document frequencies to be considered.
    alphas : list
        The range of alpha values to be considered.
    coherence : str
        The coherence measure to compute (e.g. 'c_v').
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data, exactly as returned by
        ``get_cross_validation_data``.
    """
    return get_cross_validation_data(data, n_topics_range, min_dfs, max_dfs, alphas, coherence)

In [None]:
# test: run the grid search on a small grid (two candidate values per
# hyperparameter, i.e. 16 models)

# Load the stories in this cell so it also runs on a fresh kernel; `data` was
# otherwise only defined in a later cell.
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])

# `coherence` is a required argument of get_cross_validation_data; 'c_v' is
# the measure used by the full run.
cross_validation_data = get_cross_validation_data(
    data, [10, 20], [0.01, 0.05], [0.99, 0.98], [0.01, 0.02], coherence='c_v'
)
cross_validation_data

In [None]:
# Hyperparameter grids shared by both runs below.
no_topics_range = list(range(2, 21))                     # 2 .. 20 topics
min_dfs = [0.01 * step for step in range(1, 11)]         # 0.01 .. 0.10
max_dfs = [0.01 * step for step in range(99, 89, -1)]    # 0.99 .. 0.90
alphas = [0.05 * step for step in range(1, 21)]          # 0.05 .. 1.00

# Grid search on the preprocessed stories; results are saved as CSV.
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])
cross_validation_data_cv = get_cross_validation_data(
    data, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v'
)
cross_validation_data_cv.to_csv("data/cross_validation_data_cv.csv", index=False)

# Same grid search on the lemmatized stories.
data_lemmatized = pd.read_csv("data/preprocessed_stories_lemmatized.csv", header=None, names=["story"])
cross_validation_data_lemmatized_cv = get_cross_validation_data(
    data_lemmatized, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v'
)
cross_validation_data_lemmatized_cv.to_csv("data/cross_validation_data_lemmatized_cv.csv", index=False)