## Topic Modeling & Subject Headings

## gensim 

Kapadia, Shashank, "[Topic Modeling in Python: Latent Dirichlet Allocation (LDA)](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)," <i>towards data science</i>, Accessed 10/09/2020.

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, gensim, tqdm
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from scipy import stats

# Import NLTK packages.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# Import and append stopwords.
stop_words = stopwords.words("english")
stop_words.append('mr')

# Ignore warnings related to deprecated functions.
warnings.simplefilter("ignore", DeprecationWarning)

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/Data/"

# Read in file; select columns; drop rows with NA values (entries without a named person).
df = pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_Subjects-dataframe.txt',
                 sep = '\t') \
    .dropna()

df.head()

Unnamed: 0,file,entry,date,subject,text
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,"War of 1812,Foreign Relations",10. VI:45. Visits at my house from John Herkim...
10,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-11,1825-01-11,"Native Americans,Adams Family Finances,Foreign...","11. VII. Visitors, W. Plumer jr. Settled with ..."
11,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-12,1825-01-12,Native Americans,12. V:30.Thomas J. Hellen went at 6. in the St...
12,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-13,1825-01-13,"Elections, Presidential (1824),Slavery,Coloniz...",13. VI:15. I called this morning on James Barb...
31,"('JQADiaries-v49-1825-06-p849.xml',)",jqadiaries-v49-1825-06-17,1825-06-17,Recreation,17. IV:45. symbols 31 symbols 32 symbols Bath...


## Functions

In [2]:
%%time

def remove_stopwords(texts):
    return [[w for w in simple_preprocess(str(doc)) if w not in stop_words] for doc in texts]

def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc = True)) # deacc = True :: removes punctuation
        
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus = corpus, 
                                           id2word = id2word, 
                                           num_topics = 10, 
                                           random_state = 100, 
                                           chunksize = 100, 
                                           passes = 10, 
                                           alpha = a, 
                                           eta = b, 
                                           per_word_topics = True)
    
    coherence_model_lda = CoherenceModel(model = lda_model, 
                                         texts = data_words, 
                                         dictionary = id2word, 
                                         coherence = 'c_v')
    
    return coherence_model_lda.get_coherence()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


## Create Corpus Dictionary

In [3]:
%%time

# Convert text to list.
data = df['text'].values.tolist()
        
data_words = list(sent_to_words(data))

# Create corpus + remove stopwords.
texts = remove_stopwords(data_words)

# Create dictionary.
id2word = corpora.Dictionary(texts)

corpus = [id2word.doc2bow(text) for text in texts]

CPU times: user 2.69 s, sys: 33.2 ms, total: 2.72 s
Wall time: 2.76 s


## Evaluate Multiple Models

In [8]:
%%time

# Topics range.
grid = {}
grid['Validation_Set'] = {}

min_topics = 10
max_topics = 100
step_size = 10
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)

corpus_sets = [corpus]

corpus_title = ['100% corpus']

model_results = {'Validation_Set': [], 'Topics': [], 'Alpha': [], 'Beta': [], 'Coherence': []}

# Run model (can take a long time)
if 1 == 1:
    pbar = tqdm.tqdm(total = 30)
    
#     iterate through validation corpora
    for i in range(len(corpus_sets)):
        
#         iterate through number of topics.
        for k in topics_range:
            
#             Adjust alpha and beta once number of topics determined.
#             iterature through alpha values
            for a in alpha:
            
#                 iterature through beta values
                for b in beta:
                    
#                     Get coherence score for given parameters
                    cv = compute_coherence_values(corpus = corpus_sets[i], 
                                                  dictionary = id2word, 
                                                  k = k, 
                                                  a = a, 
                                                  b = b)
#                     Save model results.
                    model_results['Validation_Set'].append(corpus_sets[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
    pd.DataFrame(model_results).to_csv(abs_dir + 'Output/TopicModels/jqa_topics_tuning.csv', sep = ',', index = False)
    pbar.close()

  0%|          | 0/30 [20:44<?, ?it/s]
  0%|          | 0/30 [2:07:40<?, ?it/s]

CPU times: user 15min 9s, sys: 2min 36s, total: 17min 46s
Wall time: 2h 7min 40s





## Load results.

In [11]:
%%time

results = pd.read_csv(abs_dir + 'Output/TopicModels/jqa_topics_tuning.csv', sep = ',')

results.sort_values(by = ['Coherence'], ascending = False).head(5)

CPU times: user 2.11 s, sys: 227 ms, total: 2.34 s
Wall time: 2.36 s


Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
17,"[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, ...",10,0.91,0.61,0.345717
47,"[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, ...",20,0.91,0.61,0.343886
77,"[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, ...",30,0.91,0.61,0.343886
257,"[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, ...",90,0.91,0.61,0.343886
167,"[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, ...",60,0.91,0.61,0.343886
