# Topic Model — Kullback-Leibler Divergence

This notebook measures Kullback-Leibler Divergence (KLD).

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, gensim, tqdm
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from scipy import stats

# Import NLTK packages.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Import sklearn packages.
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import and append stopwords.
stop_words = stopwords.words("english")
stop_words.append('mr')

# Ignore warnings related to deprecated functions.
warnings.simplefilter("ignore", DeprecationWarning)

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/Data/"

# Read in file; select columns; drop rows with NA values (entries without a named person).
df = pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_Subjects-dataframe.txt', sep = '\t') \
    .drop(columns = ['subject']) \
    .dropna()

df.head()

Unnamed: 0,file,entry,date,text
0,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-01,1817-10-01,1. IV:30. Wednesday. Wrote a Letter to J. L. S...
1,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-02,1817-10-02,2. IV: Continued drafting instructions for Rus...
2,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-03,1817-10-03,3. IV: I had visits this morning from Mr Levet...
3,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-04,1817-10-04,4. IV: I waked before three and had afterwards...
4,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-05,1817-10-05,5. V: The Ladies went this morning to St. John...


## Cleaning & Preparation

In [2]:
%%time

# # Unnest subject headings.
# df['subject'] = df['subject'].str.split(',')
# df = df.explode('subject')

# Lowercase text field
df['text'] = df['text'].str.lower()

# Tokenize text field.
df['text'] = df['text'].apply(word_tokenize)

# Lemmatize and stem text field.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english", ignore_stopwords = True)

def lemma_and_stem(list_of_words):
    return [stemmer.stem(lemmatizer.lemmatize(w)) for w in list_of_words if w not in stop_words]

df['text'] = df['text'].apply(lemma_and_stem)

# Convert list of words to string for LDA model.
df['text'] = df['text'].apply(' '.join)

# print ('Number of unique subject headings:', len(df['subject'].unique()), '\n')

df.head()

CPU times: user 29 s, sys: 217 ms, total: 29.3 s
Wall time: 29.5 s


Unnamed: 0,file,entry,date,text
0,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-01,1817-10-01,1. iv:30. wednesday . wrote letter j. l. sulli...
1,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-02,1817-10-02,2. iv : continu draft instruct rush . subject ...
2,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-03,1817-10-03,"3. iv : visit morn levett harri , nours regist..."
3,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-04,1817-10-04,4. iv : wake three afterward sleep . inconveni...
4,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-05,1817-10-05,5. v : ladi went morn st. john ’ church ; retu...


## Train Model

In [3]:
%%time

# Remove duplicate text rows (caused from unnesting headings) by subsetting & de-duplicating.
topics = df[['entry', 'text']].drop_duplicates(subset = ['entry'])

# Initialise the vectorizer with English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed texts.
features = vectorizer.fit_transform(topics['text'])

# Set parameters (topics set to number of unique subject headings found).
number_topics = 20
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components = number_topics, n_jobs=-1)
lda.fit(features)

CPU times: user 2.27 s, sys: 837 ms, total: 3.1 s
Wall time: 13 s


LatentDirichletAllocation(n_components=20, n_jobs=-1)

## Document Topic Distribution

In [11]:
%%time

# Create a document-topic matrix.
doc_tops = lda.transform(features)

# Convert document-topic matrix to dataframe.
doc_tops = pd.DataFrame(doc_tops, index = topics.index)

# Join document-topic dataframe with metadata on shared indices.
doc_tops = pd.merge(df[['entry', 'date']],
                    doc_tops,
                    left_index = True, right_index = True)

CPU times: user 63.1 ms, sys: 139 ms, total: 202 ms
Wall time: 2.21 s


## Kullback-Leibler Divergence

In [21]:
%%time

matrix = doc_tops.drop(columns = ['entry', 'date'])

kld = stats.entropy(matrix.T, matrix.T)

kld

CPU times: user 6.22 ms, sys: 1.84 ms, total: 8.06 ms
Wall time: 6.95 ms


array([0., 0., 0., ..., 0., 0., 0.])

In [25]:
# matrix.T

# stats.entropy([0.000485, 0.000266, 0.000299], [0.302791, 0.146970, 0.061189])

max(kld)

0.0