In [1]:
# These notebooks are supplemental to my main notebooks: 'topic_modeling_lotr_complete' & 'word2vec_lotr_complete'
# Please see these notebooks for proper and comprehensive annotations. 
import os, glob, re, string
import numpy as np
import pandas as pd

In [2]:
with open('./the_lord_of_the_rings/the_two_towers.txt', 'r') as file:
    towers = file.read().replace('\n', '')

In [3]:
len(towers)

817496

In [4]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+')
tower_tokens = tokenizer.tokenize(towers)
tower_tokens[1971]

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

'Galen'

In [8]:
# Based on previous models, it's imperative to add some stop words. 
# add stopwords - changing this list can have a dramatic effect on results in the LDA model because it uses word counts

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [9]:
# Removes stopwords 
towers_clean = [word for word in tower_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(tower_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(towers_clean)} words')

Length of original list: 156474 words

Length of list after stopwords removal: 72425 words


In [10]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
towers_tokens_lems = [lemmatizer.lemmatize(i) for i in towers_clean]
len(towers_tokens_lems)

72425

In [11]:
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [12]:
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(towers_tokens_lems)

In [13]:
lda = LatentDirichletAllocation(n_components=11, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 2019)

In [14]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=11, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=2019, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
# Define helper function to print top words
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [16]:
n_top_words = 10
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:frodo master know thing night wall pippin ring cried wonder

Topic #1:like long hand looked shall tower slowly legolas step thjoden

Topic #2:eye men time light smjagol voice foot mountain suddenly end

Topic #3:good shadow passed aragorn left got grey nice hill evil

Topic #4:gollum day went saw let going man rest took small

Topic #5:away yes old think road deep black face stood water

Topic #6:sam hobbit tree turned mind white king darkness green want

Topic #7:way dark thought right lay high round fear boromir lord

Topic #8:gandalf little sword make sleep dead felt hour arm tale

Topic #9:great stone land place look heard fell air near gimli

Topic #10:far ore faramir say head tell word ground gate heart


In [20]:
# Use CountVectorizer to get total word counts in documents

vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

In [22]:
# Transform words with TfidfTransformer - This takes into account term frequency across and within documents

word_counts = vectorizer.fit_transform(tower_tokens)

tfidf_transform = TfidfTransformer(smooth_idf = False)

words_tfidf = tfidf_transform.fit_transform(word_counts)

# final_words = normalize(words_tfidf, norm = 'l1')

In [23]:
# Instantiate NMF model and fit to tfidf transformed documents

model = NMF(n_components = 10, init = 'nndsvd')

# Set W as the document by topic matrix
# Set H as the topic by word matrix

W = model.fit_transform(words_tfidf)
H = model.components_

In [24]:
# Function to assign topic indices back to feature names - takes model, feature names from vectorizer, 
# and n_top_words as arguments. n_top_words selects the number of keywords per topic

def print_top_words(model, feature_names, n_top_words):
    lst = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        lst.append(message)
    return lst

In [25]:
# Set 3rd argument to number of topic keywords that are desired

topics_nmf = (print_top_words(model, vectorizer.get_feature_names(), 10))

In [26]:
# Check nmf topic assignment

topics_nmf

['Topic #0: the zvram fruitless froglike from fronds front frost frothed frowned',
 'Topic #1: and is had have were zvram fruit from fronds front',
 'Topic #2: of his said not with for on now from no',
 'Topic #3: to they but we as at had have there were',
 'Topic #4: he his said not with for on were now all',
 'Topic #5: in they his said not with for on at there',
 'Topic #6: it his not said is with for at on have',
 'Topic #7: that they his said not with for at on them',
 'Topic #8: you they his said not with for at on all',
 'Topic #9: was they we as at on had were now all']