In [1]:
import os, glob, re, string
import numpy as np
import pandas as pd

### Import the Complete Lord of the Rings Text

In [2]:
# complete texts of Lord of the Rings can be located at 
# 'https://archive.org/details/TheLordOfTheRing1TheFellowshipOfTheRing'

In [3]:
with open('./the_lord_of_the_rings/Lord_of_the_Rings_complete.txt', 'r') as file:
    lotr = file.read().replace('\n', '')

In [4]:
# check the size of the string to double check
len(lotr)

2512368

In [5]:
# Leaving the corpus as one long string is optimal for NLP, LDA, and NMF

### Prepare Text for NLP (Tokenize, Punctuation Removal, Stopwords) 

In [6]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize


import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;

# Please see my 'preprocessing_stopwords_plotly.ipynb' notebook for complete annotated preprocessing steps

In [7]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+')
lotr_tokens = tokenizer.tokenize(lotr)

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

In [8]:
lotr_tokens[2019]

'Hobbits'

In [9]:
# Based on previous models, it's imperative to add some stop words. 
# add stopwords - changing this list can have a dramatic effect on results in the LDA model because it uses word counts

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [10]:
# Removes stopwords 
lotr_clean = [word for word in lotr_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(lotr_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(lotr_clean)} words')

Length of original list: 482056 words

Length of list after stopwords removal: 221549 words


In [23]:
lotr_clean[1891]

'flowers'

In [16]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
lotr_tokens_lems = [lemmatizer.lemmatize(i) for i in lotr_clean]

In [24]:
lotr_tokens_lems[1891]

'flower'

### Combined Lemmmatizing and CountVectorizer into one Class

In [25]:
# Graciously borrowed from 'https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial' 
# We have essentially inherited and subclassed the original Sklearn's CountVectorizer class 
# and overwritten the build_analyzer method by implementing the lemmatizer for each list in the raw text matrix.

lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [26]:
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(lotr_tokens_lems)

### Build Our LDA Model

In [30]:
import time

from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from sklearn.decomposition import LatentDirichletAllocation

# Using the SKlearn LDA and NMF models -- The Gensim topic models produced uninterpreable results

In [31]:
lda = LatentDirichletAllocation(n_components=21, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 2019)

In [33]:
lda.fit(tf)
print(time.time())

time.struct_time(tm_year=2019, tm_mon=5, tm_mday=16, tm_hour=15, tm_min=47, tm_sec=56, tm_wday=3, tm_yday=136, tm_isdst=0)


In [34]:
# Graciously borrowed from 'https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial'
# Define helper function to print top words

def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_): # enumerate keeps count of iterations 
        message = "\nTopic #{}:".format(index) 
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        # list comprehension inside .join function
        print(message)
        print("="*70) # prints 70 '=' signs as separators

In [35]:
n_top_words = 20
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:heard left took door enemy near fair darkness green began people son close short poor young bright unless room beast

Topic #1:hand road face head word look saruman evil lie tall star rock line evening elrond shape stream led standing bowed

Topic #2:merry city rider legolas rohan wish save boromir guess followed caught used running deed ship try strider tongue shagrat nearly

Topic #3:way little white soon fall mina help half taken run peril rising bank large forest escape huge afraid tunnel sighed

Topic #4:time looked tower lay thjoden hour pa grew sea answer bilbo valley come song stair hidden quickly big westward pain

Topic #5:high seen tell round felt better south captain looking silent earth trouble treebeard halted weary low bag bent father swift

Topic #6:far aragorn saw stone let suddenly east moment air river strength le asked cut cast kept worse tidings passing free

Topic #7:land night good hope set knew ground want walked told beregond be

### Build Our NMF Model

#### Non-Negative Matrix Factorization comparison to LDA

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

In [37]:
# Use CountVectorizer to get total word counts in documents

vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

In [38]:
# Transform words with TfidfTransformer - This takes into account term frequency across and within documents

word_counts = vectorizer.fit_transform(lotr_tokens_lems)

tfidf_transform = TfidfTransformer(smooth_idf = False)

words_tfidf = tfidf_transform.fit_transform(word_counts)

# final_words = normalize(words_tfidf, norm = 'l1')

In [42]:
# Instantiate NMF model and fit to tfidf transformed documents

model = NMF(n_components = 50, init = 'nndsvd')

# Set W as the document by topic matrix
# Set H as the topic by word matrix

W = model.fit_transform(words_tfidf)
H = model.components_

25967035.137980964


In [46]:
# Function to assign topic indices back to feature names - takes model, feature names from vectorizer, 
# and n_top_words as arguments. n_top_words selects the number of keywords per topic
# renders differently than our previous function essentially does the same thing 
# Thanks to Nick Gayliard for inspiration 

def print_top_words(model, feature_names, n_top_words):
    lst = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        lst.append(message)
    return lst

In [47]:
# List numbered topic and top 10 words in the topic 

topics_nmf = (print_top_words(model, vectorizer.get_feature_names(), 10))

In [27]:
# Check nmf topic assignment

topics_nmf

['Topic #0: frodo heard black good stone side left end passed made',
 'Topic #1: sam heard ring stone side left made though lord much',
 'Topic #2: great good shadow end think black foot left water king',
 'Topic #3: could think stood black passed left voice hill white first',
 'Topic #4: long shadow end heard foot water much lord gollum master',
 'Topic #5: would shadow stone black passed voice let side king made',
 'Topic #6: like heard hill king passed made let left going suddenly',
 'Topic #7: gandalf end ever black foot stone passed left hill much',
 'Topic #8: go good end stood shadow ever water side king lord',
 'Topic #9: one end stood much voice hill water first face gate',
 'Topic #10: back shadow stood think foot black side voice passed much',
 'Topic #11: away good ever think foot though left hill first word',
 'Topic #12: still ever heard think passed though hill much first white',
 'Topic #13: see behind stone voice passed let black king hill master',
 'Topic #14: many sh