In [1]:
import pandas as pd
import numpy as np
from gensim import models
import matplotlib.pyplot as plt
import logging
import re
import os
import tqdm
from gensim import corpora
### choose the callbacks classes to import
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric

In [2]:
def extract_lda_params(data_set):
    """
    Build the inputs Gensim needs to create an LDA model from a table of documents.

    Every entry is converted with ``str`` and split on whitespace, so a missing
    value (NaN) turns into the single token ``"nan"``.

    :param data_set: pandas dataframe object (normally one text column), each entry is a document
    :return:
          texts_data: list of lists where each inner list represents a single document : [[dog,cat,mouse],[..],[..]]
          corpus: Gensim corpus parameter for creating the LDA model (one ``doc2bow`` result per document)
          id2word: Gensim dictionary parameter for creating the LDA model
    """
    if isinstance(data_set, pd.DataFrame) and data_set.shape[1] == 1:
        # Pick the only column explicitly. np.squeeze would also collapse the row
        # axis of a one-row frame, leaving a scalar that has no ``.values``.
        documents = data_set.iloc[:, 0]
    else:
        # Every other input keeps the original squeeze-based handling.
        documents = np.squeeze(data_set)
    texts_data = [str(x).split() for x in documents.values.tolist()]
    id2word = corpora.Dictionary(texts_data)
    # filter words which appear in less than 10 documents, or in more than 50% of the documents
    id2word.filter_extremes(no_below=10, no_above=0.5)
    corpus = [id2word.doc2bow(text) for text in texts_data]
    return texts_data, corpus, id2word


In [3]:
# Send log records to the file named below (created if it does not exist yet).
# An existing file is appended to, not overwritten, so records from earlier
# runs stay in it.
from importlib import reload  # Not needed in Python 2
import logging

# Re-initialise the logging module before configuring it — presumably so that
# basicConfig takes effect even when logging was already set up in this kernel.
reload(logging)
logging.basicConfig(
    level=logging.NOTSET,
    format="%(asctime)s:%(levelname)s:%(message)s",
    filename=r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\src\model_callbacks.log',
)

In [4]:
# CSV file holding the training documents.
train_data_path = r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\data\clean_lda_train.csv'
# Read the CSV, then derive the token lists, the bag-of-words corpus and the dictionary from it.
training_set = pd.read_csv(filepath_or_buffer=train_data_path, encoding='utf8')
documents, corpus, dictionary = extract_lda_params(data_set=training_set)

In [5]:
# Callback that tracks the c_v coherence metric during training, computed from
# the tokenised documents and reported through the 'shell' logger option.
coherence_cv_logger = CoherenceMetric(
    texts=documents,
    corpus=corpus,
    coherence='c_v',
    logger='shell',
)

In [None]:
# Candidate values for the number of training passes: 1 through 9.
passes = range(1, 10)

# Train one LDA model per candidate value; every other setting stays fixed.
for i in passes:

    # Mark where this model's records begin in the log
    logging.debug(f'Start of model: {i} passes')

    # The callbacks argument takes the list of metric callbacks created beforehand
    model = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        callbacks=[coherence_cv_logger],
        num_topics=21,
        passes=i,
        iterations=250,
        chunksize=300,
        update_every=1,
        random_state=42,
    )

    # Mark where this model's records end in the log
    logging.debug(f'End of model: {i} passes')


In [None]:
import re
import matplotlib.pyplot as plt
import csv
import math


def _final_coherence_per_model(log_lines):
    """
    Pair every logged model with the last coherence value recorded for it.

    A model's records begin at a line containing ``Start of model: <n> passes``.
    Each later line containing ``Coherence`` is read as a coherence record whose
    last whitespace-separated token is the value. If several records follow one
    start marker, only the last one is kept, so each model contributes exactly one
    point and the two returned lists always have the same length. If the same
    ``<n>`` shows up more than once (e.g. the log holds several runs), the most
    recent value wins.

    :param log_lines: iterable of log lines (for example an open text file)
    :return:
          passes: the ``<n>`` values in ascending order, as ints
          scores: the matching coherence values as floats rounded to 3 decimals
    """
    start_marker = re.compile(r'Start of model: (\d+) passes')
    latest = {}
    current = None  # <n> of the model whose records are being read
    for line in log_lines:
        started = start_marker.search(line)
        if started:
            current = int(started.group(1))
        elif current is not None and 'Coherence' in line:
            try:
                latest[current] = round(float(line.split()[-1]), 3)
            except ValueError:
                # Mentions "Coherence" but does not end in a number: not a metric record.
                continue
    ordered = sorted(latest.items())
    return [n for n, _ in ordered], [score for _, score in ordered]


# Only coherence is parsed in this cell; these two lists are left empty.
perplexity=[]
convergence=[]

log_path = r'C:\Users\katac\PycharmProjects\NLP_project\TopicModeling\LDA\src\model_callbacks.log'
# The context manager closes the log file as soon as parsing is finished.
with open(log_path) as log_file:
    iteration, coherence = _final_coherence_per_model(log_file)

# Optional CSV export of the parsed values (disabled).
fields = ['iterations','cv_coherence']
# rows = zip(iteration,coherence)
# with open("conv.csv","w") as f :
#     writer = csv.writer(f)
#     writer.writerow(fields)
#     for row in rows:
#         writer.writerow(row)

# x values are the "<n> passes" numbers from the start markers; y values are the
# final coherence logged for each of those models.
print(iteration)
print(coherence)
plt.plot(iteration,coherence)
plt.xlabel("iterations")
plt.ylabel("c_v coherence value")
plt.show()