In [2]:
import pandas as pd
import numpy as np
from gensim import models
import matplotlib.pyplot as plt
import logging
import re
import os
import tqdm
from gensim import corpora
### choose the callbacks classes to import
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric

In [3]:
def extract_lda_params(data_set):
    """
    :param data_set: pandas dataframe object , each entry is a document
    :return:
          texts_data: list of lists where each inner-list represent a single document : [[dog,cat,mouse],[..],[..]]
          corpus:Gensim corpus parameter for creating the LDA model
          id2word:Gensim dictionary parameter for creating the LDA model
    """
    texts_data = [str(x).split() for x in np.squeeze(data_set).values.tolist()]
    id2word = corpora.Dictionary(texts_data)
    # filter words which appear in less than 10 documents , or in more than 50% of the documents
    id2word.filter_extremes(no_below=10, no_above=0.5)
    corpus = [id2word.doc2bow(text) for text in texts_data]
    return texts_data, corpus, id2word


In [4]:
# The filename is the file that will be created with the log.
# If the file already exists, the log will continue rather than being overwritten.
from importlib import reload  # Not needed in Python 2
import logging
reload(logging)
logging.basicConfig(filename=r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\src\model_callbacks.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.NOTSET)

In [5]:
train_data_path = r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\data\clean_lda_train.csv'
training_set = pd.read_csv(train_data_path, encoding='utf8')
documents,corpus,dictionary = extract_lda_params(training_set)

In [7]:
# Set up the callbacks loggers
perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
convergence_logger = ConvergenceMetric(logger='shell')
coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'c_v', texts = documents)

In [None]:
# List of the different iterations to try
iterations = range(401,600,30)

# The number of passes to use - could change depending on requirements
passes = 1

for iteration in iterations:

    # Add text to logger to indicate new model
    logging.debug(f'Start of model: {iteration} iterations')

    # Create model - note callbacks argument uses list of created callback loggers
    model = models.ldamodel.LdaModel(corpus=corpus,
             id2word=dictionary,
             num_topics=36,
             eval_every=20,
             passes=1,
             iterations=iteration,
            random_state=42,
            callbacks=[convergence_logger, perplexity_logger, coherence_cv_logger])

    # Add text to logger to indicate end of this model
    logging.debug(f'End of model: {iteration} iterations')


In [108]:
# Function to detect relevant numbers in log
def find_doc_convergence(topic_num, iteration, log):
    # Regex to bookend log for iteration - choose last occurrence
    end_slice = re.compile(f"End of model: .*?{iteration} iterations")
    end_matches = [end_slice.findall(l) for l in open(log)]
    iteration_end = [i for i, x in enumerate(end_matches) if x]
    iteration_end = iteration_end[-1]
    start_slice = re.compile(f"Start of model: .*?{iteration} iterations")
    start_matches = [start_slice.findall(l) for l in open(log)]
    start_options = [i for i, x in enumerate(start_matches) if x]
    start_options = [item for item in start_options if item < iteration_end]
    iteration_start = max(start_options)
    iteration_bookends = [iteration_start, iteration_end]
    print("iteration_bookens is ")
    print(iteration_bookends)
    # Regex to find documents converged figures
    p = re.compile("\d+\.\d")
    matches = [ p.findall(l) for l in open(log)]
    print("matches before slice is \n")
    print(matches)
    matches = matches[iteration_bookends[0]:iteration_bookends[1]]
    print("matches after slice is \n")
    print(matches)
    matches = [m for m in matches if len(m) > 0]
    # Unlist internal lists and turn into numbers
    matches = [m for sublist in matches for m in sublist]
    matches = [float(m) for m in matches]
    return(matches)

In [109]:
iterations = [ 10,15,20]

all_metrics = pd.DataFrame()

for iteration in tqdm.tqdm(iterations):
    print(f"lda_{iteration}i50p/lda_{iteration}i50p.model")
    model = models.ldamodel.LdaModel.load(f"lda_{iteration}i50p/lda_{iteration}i50p.model")
    df = pd.DataFrame.from_dict(model.metrics)
    df['docs_converged'] = find_doc_convergence(5, iteration, r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\src\model_callbacks.log')
    df['iterations'] = iteration
    df['topics'] = 5

    df = df.reset_index().rename(columns={'index': 'pass_num'})

    all_metrics = pd.concat([all_metrics, df])

  0%|          | 0/3 [00:00<?, ?it/s]

lda_10i50p/lda_10i50p.model


  0%|          | 0/3 [00:05<?, ?it/s]

iteration_bookens is 
[1, 5]
matches before slice is 

[[], [], ['4.9'], ['203.2'], ['0.4'], [], [], ['4.9'], ['199.4'], ['0.4'], [], [], ['4.9'], ['199.4'], ['0.4'], []]
matches after slice is 

[[], ['4.9'], ['203.2'], ['0.4']]





ValueError: Length of values (3) does not match length of index (1)

In [None]:
for metric in ['Coherence', 'Perplexity', 'Convergence', 'docs_converged']:

    fig, axs = plt.subplots(1, 1, figsize=(20, 7))

    # Each plot to show results for all models with the same topic number
    for i, topic_number in enumerate([5]):
        filtered_topics = all_metrics[all_metrics['topics'] == topic_number]
        for label, df in filtered_topics.groupby(['iterations']):
            print(label)
            df.plot(x='pass_num', y=metric, ax=axs, label=label)

        axs.set_xlabel(f"Pass number")
        axs.legend()
        axs.set_ylim([all_metrics[metric].min() * 0.9, all_metrics[metric].max() * 1.1])

    if metric == 'docs_converged':
        fig.suptitle('Documents converged', fontsize=20)
    else:
        fig.suptitle(metric, fontsize=20)