# To do

- Remove the need for filter values so there are fewer things to specify

# Usage

This notebook fits gensim LDA models over the range of parameters described in the job settings file. It expects a job_config.json file in the same directory and the Explorer.ipynb notebook one directory up (if that copy is missing, it falls back to Interactive-LDA-Explorer.ipynb in the same directory).

[Click here](#display-results) to jump to the results graphs and tables.

# Importing Packages

In [None]:
import numpy as np
import pandas as pd
import multiprocessing as mp

In [None]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

In [None]:
import json
import os
from os.path import join as pjoin
import shutil

# Job Settings Import

In [None]:
# Load the job configuration. If the config file is missing, unreadable, or not
# valid JSON, fall back to an empty dict so every setting uses its default.
try:
    with open("job_config.json") as config_file:
        settings = json.load(config_file)
except (OSError, ValueError):
    # OSError covers a missing/unreadable file; ValueError covers malformed JSON
    # (json.JSONDecodeError is a ValueError subclass). Anything else, such as a
    # KeyboardInterrupt, is allowed to propagate instead of being swallowed.
    settings = {}

In [None]:
# Show the job config as a single column with one row per setting
pd.DataFrame([settings]).T

In [None]:
# Read each job setting from the config, falling back to a placeholder or default.

# Filepath to the data
data_path = settings.get("data_filepath", "Your data filepath here")
# Column containing the text to be analyzed
text_col = settings.get("text_column", "Your text column name here")
# Column to use as the index of the dataframe
index_col = settings.get("index_column", "Your index column name here")
# Column used for filtering the data
filter_col = settings.get("filter_column", "Your filter column here")
# Values accepted from the filter column
filter_vals = settings.get("acceptable_values", "Your filter values here")
# Whether to save the raw Gensim model. Not recommended
pickle_model = settings.get("pickle_model", False)
# Whether to delete the files associated with most runs. Important for saving disk space.
delete_bad_runs = settings.get("delete_bad_runs", True)

# Import and Vectorize Data

### Import Data

In [None]:
# Load only the three columns this job needs.
preprocessed_SES = pd.read_csv(
    data_path,
    usecols=[index_col, filter_col, text_col],
    index_col=False,
)
# The index is set in a separate step because specifying it in the import statement caused trouble.
preprocessed_SES = preprocessed_SES.set_index(index_col)
len(preprocessed_SES)

In [None]:
# Remove any missing rows. This is necessary since some people write "nan". Shouldn't be a problem with new preprocessing.
preprocessed_SES.dropna(inplace = True)
len(preprocessed_SES)

### Filter rows based on the filter_col

In [None]:
#Select responses from columns specified in the settings file
preprocessed_SES = preprocessed_SES[preprocessed_SES[filter_col].isin(filter_vals)]
len(preprocessed_SES)

In [None]:
# Preview the first rows of the filtered data
preprocessed_SES.head()

### Tokenize the text

In [None]:
#Split each document into a list of words (a 'text' as gensim calls it)
# Whitespace-split every document into its list of words (a 'text' in gensim terms)
tokenized_SES = preprocessed_SES[text_col].map(str.split)

### Define the word list for the texts

In [None]:
#Create Word to ID pairing for use with gensim models
# Create the word-to-id pairing for use with gensim models
SES_dictionary = Dictionary(tokenized_SES)
# Remove super common words. Not sure if necessary or helpful.
SES_dictionary.filter_extremes(no_above = .5)
# Collect all very short words (fewer than two characters)
short_words = [word for word in SES_dictionary.token2id if len(word) < 2]
short_ids = [SES_dictionary.token2id[word] for word in short_words]
# Collect any specified stop words that are actually in the dictionary.
# Membership is tested explicitly: filtering the ids on truthiness would
# also discard the valid token id 0 and leave that stop word in place.
stop_words = settings.get("stop_words",[])
stop_ids = [
    SES_dictionary.token2id[word]
    for word in stop_words
    if word in SES_dictionary.token2id
]
# Remove the short words and stop words in a single call
SES_dictionary.filter_tokens(
    bad_ids = short_ids + stop_ids
)
# Create corpus: one bag-of-words per document.
SES_corpus = [SES_dictionary.doc2bow(x) for x in tokenized_SES]

# Functions for scoring and extracting components from the fitted LDA Model

In [None]:
def score_lda(lda,corpus):
    '''Score a fitted gensim LdaModel.

    Returns a dictionary with the c_v coherence ("cv_score") and the
    perplexity on `corpus` ("perplexity").
    '''
    # Perplexity is 2 ** (-bound), with the bound taken from gensim's
    # log_perplexity, as per the gensim documentation and source code.
    per_word_bound = lda.log_perplexity(corpus)
    perplexity_score = 2 ** (-per_word_bound)
    # Build a coherence model over the topics and score it.
    coherence_model = CoherenceModel(
        model=lda,
        texts=tokenized_SES,
        dictionary=SES_dictionary,
        coherence='c_v',
        processes=1,
    )
    scores = {"cv_score": coherence_model.get_coherence()}
    scores["perplexity"] = perplexity_score
    return scores

In [None]:
def get_term_topic_matrix(lda):
    '''Takes a fitted gensim LDA model and returns a dataframe with words on the index and topics as columns.

    Column names are the topic numbers converted to strings.
    '''
    # First grab the matrix of word probabilities (one row per topic), then
    # transpose so that each row is a word id and each column a topic.
    term_topic_matrix = pd.DataFrame(lda.get_topics()).transpose()
    # Build the id -> word mapping from token2id rather than reading
    # Dictionary.id2token: gensim fills id2token lazily, so it may be empty or
    # stale here, which would silently leave integer ids on the index.
    id_to_word = {
        token_id: word for word, token_id in SES_dictionary.token2id.items()
    }
    term_topic_matrix.rename(
        index = id_to_word,
        columns = str, # Change columns to have string names
        inplace = True
        )
    return term_topic_matrix

In [None]:
def get_doc_topic_matrix(lda):
    '''Takes a fitted gensim LDA model and returns a dataframe of the estimated document distributions.

    Rows follow the index of preprocessed_SES. There is one column per topic
    of the model, named with the topic number as a string. Topics that gensim
    does not report for a document are given probability 0.
    '''
    # A real issue is that the results of get_document topics seem to be variable. For instance it can assign .36 to topic 1 or to topic 5
    # Use the fitted model on the dataset: each document yields (topic, probability) pairs.
    document_topic_matrix = pd.DataFrame(
        [dict(doc_tuple_list) for doc_tuple_list in lda[SES_corpus]])
    # Order the columns by topic number and make sure every topic has one,
    # even a topic that was never reported for any document (otherwise the
    # result could have fewer columns than the model has topics).
    document_topic_matrix = document_topic_matrix.reindex(columns = range(lda.num_topics))
    # Fill in missing values.
    document_topic_matrix = document_topic_matrix.fillna(0)
    # Change columns to have string names
    document_topic_matrix.rename(columns = str, inplace = True)
    # Introduce the actual document index
    document_topic_matrix.index = preprocessed_SES.index
    return document_topic_matrix

# Do the gridsearch


## Configure Runs


In [None]:
# Hyperparameter space: the topic counts to try
num_topics_list = settings.get("num_topics",[3,5,10])
# One run per topic count, with ids numbered from 1
runs = [
    {
        "run_id": run_id,
        "num_topics": n,
        "run_name": f"{n}-top-run-{run_id}", # This should be a nice directory name
    }
    for run_id, n in enumerate(num_topics_list, start=1)
]

In [None]:
num_docs = len(preprocessed_SES)
# Make sure the fitting looks at at least 100,000 documents:
# passes is 100000 / num_docs rounded up (integer ceiling division).
passes = -(-100000 // num_docs)
def execute_run(run):
    '''Fit, score, and save one LDA run.

    `run` is a dict with at least "num_topics" (number of topics to fit) and
    "run_name" (used as the name of the output folder). The function fits an
    LdaModel on SES_corpus, scores it, and fills the run folder with the
    explorer notebook, a copy of the settings file, and the doc-topic and
    term-topic matrices (plus the saved model when pickle_model is set).

    Returns a copy of `run` extended with the scores from score_lda.
    '''
    run_results = run.copy() # Make a copy of run to add onto
    num_topics = run["num_topics"] # Number of topics to use for fitting
    run_folder = run["run_name"] # Name to use for folder
    model_name = settings.get("job_name","lda") # Name to use for saved model files
    random_state = settings.get("random_state",333)
    iterations = settings.get("iterations",50)
    # Fit model
    lda = LdaModel(
        id2word = SES_dictionary,
        iterations = iterations, # Max number of iterations of model?
        passes = passes, # Number of times to go through the texts
        num_topics = num_topics,
        alpha = 'auto', # Learn possibly asymmetric alpha
        random_state = random_state
        )
    lda.update(SES_corpus)
    # Score model
    scores = score_lda(lda,corpus = SES_corpus)
    # Add scores to run results
    run_results.update(scores)
    # Create run folder
    os.makedirs(run_folder,exist_ok = True)
    # Save fitted LDA model in run folder if that option was selected
    if pickle_model:
        lda.save(
            pjoin(run_folder,model_name)
            )
    # Put explorer notebook in run folder. Prefer the copy one directory up
    # and fall back to the local notebook when that copy cannot be used.
    explorer_dst = pjoin(run_folder,"Explorer.ipynb")
    try:
        shutil.copyfile(
            src = "../Explorer.ipynb",
            dst = explorer_dst
        )
    except OSError:
        # Only file-system failures trigger the fallback; a bare except would
        # also have swallowed KeyboardInterrupt and SystemExit.
        shutil.copyfile(
            src = "Interactive-LDA-Explorer.ipynb",
            dst = explorer_dst
        )
    # Copy the settings file to the run folder
    shutil.copyfile(
        src = "job_config.json",
        dst = pjoin(run_folder,"settings.json")
    )
    # Save the document topic matrix and term topic matrices to the run folder
    get_doc_topic_matrix(lda).to_parquet(
        pjoin(run_folder,"doc-topic.parquet")
    )
    get_term_topic_matrix(lda).to_parquet(
        pjoin(run_folder,"term-topic.parquet")
    )
    return run_results

## Execute in Parallel

In [None]:
# Number of CPUs reported by multiprocessing; used as the size of the worker pool when the runs are executed
cpu_count = mp.cpu_count()
print(f"CPUs to be used: {cpu_count}")

In [None]:
# Fit all runs in a pool of worker processes; map blocks until every run has finished.
with mp.Pool(processes=cpu_count) as pool:
    results = pool.map(execute_run, runs)
# One row per run: the run settings plus its scores.
grid = pd.DataFrame(results)

## Delete run results for bad runs (if option is selected)

In [None]:
def delete_run_results(run):
    '''Remove the folder holding the results of the given run.

    `run` is indexed with "run_name" to obtain the folder to delete; the
    folder and everything inside it are removed and a confirmation is printed.
    '''
    target_dir = run["run_name"]
    shutil.rmtree(target_dir)
    print(f"Deleted run in {target_dir}")

### Determine which runs to save

In [None]:
# Find ranges of topic_num based on quantiles, then select the highest cv_score element from each.
# Split the num_topics values into bins bounded by quantiles, then keep the
# run with the highest cv_score from each bin.
quantile_borders = [
    int(border) # range() below needs plain ints, whatever dtype quantile returns
    for border in grid["num_topics"].quantile(
        q = [0, 1/5, 2/5, 3/5, 1],
        interpolation='nearest'
    )
]
print(quantile_borders)
# Bins are half-open [lower, upper), except the last one, which includes its
# upper border. Otherwise the largest num_topics would fall in no bin and that
# run could never be saved.
last_bin = len(quantile_borders) - 2
ranges_of_interest = [
    range(lower, upper + 1) if i == last_bin else range(lower, upper)
    for i, (lower, upper) in enumerate(zip(quantile_borders, quantile_borders[1:]))
]
idxs_to_save = []
for rang in ranges_of_interest:
    if len(rang) > 0: # A bin is empty when neighbouring quantiles are the same.
        in_bin = grid["num_topics"].isin(rang)
        idx_to_save = grid.loc[in_bin, "cv_score"].idxmax()
        idxs_to_save.append(idx_to_save)

### Delete the other runs

In [None]:
if delete_bad_runs:
    # Every run that was not selected for saving has its results folder deleted.
    runs_to_delete = grid.drop(idxs_to_save)
    for _, bad_run in runs_to_delete.iterrows():
        delete_run_results(bad_run)

# Display results

In [None]:
# Report which job these results belong to (the "long_name" setting, None when unset) and how many passes each fit used
print("Job results for:\n","\t",settings.get("long_name"))
print(f"Passes: {passes}")

In [None]:
# Rank the runs by coherence score, best first.
ranked_runs = grid[["run_id","num_topics","cv_score","perplexity"]].sort_values(
    by = "cv_score",
    ascending = False)
# Display the table without its index. Styler.hide_index() was removed in
# pandas 2.0 in favour of Styler.hide(axis="index"), so use whichever one the
# installed pandas provides.
results_style = ranked_runs.style
results_style.hide(axis="index") if hasattr(results_style, "hide") else results_style.hide_index()

I suspect that something is wrong, given that perplexity seems to be increasing with the number of topics. This also happened in the SKLearn implementation of LDA, and we aren't using perplexity for our model choice anyway.

In [None]:
# Line plot of perplexity against the number of topics
print("Lower perplexity is better")
grid.plot.line(
    x='num_topics',
    y='perplexity',
)

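The C_V score is giving some local maxima, which is nice. The idea behind the c_v score is that it grades topics based on how similar their top words are. Top words are defined by p(word|topic), and similarity is estimated from word co-occurrence statistics in a reference corpus (the original c_v work used Wikipedia text; in this notebook the CoherenceModel in score_lda is given the tokenized documents themselves).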

In [None]:
# Line plot of the c_v coherence score against the number of topics
print("Higher coherence score is better")
grid.plot.line(
    x='num_topics',
    y='cv_score',
)