In [1]:
import os, glob, re, string
import numpy as np
import pandas as pd

### Import the Complete Lord of the Rings Text

In [2]:
# complete texts of Lord of the Rings can be located at 
# 'https://archive.org/details/TheLordOfTheRing1TheFellowshipOfTheRing'

In [3]:
with open('./the_lord_of_the_rings_text/Lord_of_the_Rings_complete.txt', 'r') as file:
    lotr = file.read().replace('\n', '')

In [4]:
# check the size of the string to double check
len(lotr)

2512368

In [5]:
# Leaving the corpus as one long string is optimal for NLP, LDA, and NMF

### Prepare Text for NLP (Tokenize, Punctuation Removal, Stopwords) 

In [6]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize


import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;

# Please see my 'preprocessing_stopwords_plotly.ipynb' notebook for complete annotated preprocessing steps

In [7]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+')
lotr_tokens = tokenizer.tokenize(lotr)

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

In [8]:
lotr_tokens[2019]

'Hobbits'

In [9]:
# Based on previous models, it's imperative to add some stop words. 
# add stopwords - changing this list can have a dramatic effect on results in the LDA model because it uses word counts

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [10]:
# Removes stopwords 
lotr_clean = [word for word in lotr_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(lotr_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(lotr_clean)} words')

Length of original list: 482056 words

Length of list after stopwords removal: 221549 words


In [11]:
lotr_clean[1891]

'flowers'

In [12]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
lotr_tokens_lems = [lemmatizer.lemmatize(i) for i in lotr_clean]

In [13]:
lotr_tokens_lems[1891]

'flower'

### Combined Lemmmatizing and CountVectorizer into one Class

In [14]:
# Graciously borrowed from 'https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial' 
# We have essentially inherited and subclassed the original Sklearn's CountVectorizer class 
# and overwritten the build_analyzer method by implementing the lemmatizer for each list in the raw text matrix.

lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [15]:
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(lotr_tokens_lems)

### Build Our LDA Model

In [16]:
import time

from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from sklearn.decomposition import LatentDirichletAllocation

# Using the SKlearn LDA and NMF models -- The Gensim topic models produced uninterpreable results

In [17]:
lda = LatentDirichletAllocation(n_components=21, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 2019)

In [18]:
lda.fit(tf)
print(time.time())

1558095001.606483


In [19]:
# Graciously borrowed from 'https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial'
# Define helper function to print top words

def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_): # enumerate keeps count of iterations 
        message = "\nTopic #{}:".format(index) 
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        # list comprehension inside .join function
        print(message)
        print("="*70) # prints 70 '=' signs as separators

In [20]:
n_top_words = 20
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:heard left took door enemy near fair darkness green began people son close short poor young bright unless room beast

Topic #1:hand road face head word look saruman evil lie tall star rock line evening elrond shape stream led standing bowed

Topic #2:merry city rider legolas rohan wish save boromir guess followed caught used running deed ship try strider tongue shagrat nearly

Topic #3:way little white soon fall mina help half taken run peril rising bank large forest escape huge afraid tunnel sighed

Topic #4:time looked tower lay thjoden hour pa grew sea answer bilbo valley come song stair hidden quickly big westward pain

Topic #5:high seen tell round felt better south captain looking silent earth trouble treebeard halted weary low bag bent father swift

Topic #6:far aragorn saw stone let suddenly east moment air river strength le asked cut cast kept worse tidings passing free

Topic #7:land night good hope set knew ground want walked told beregond be

### Build Our NMF Model

#### Non-Negative Matrix Factorization comparison to LDA

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

In [22]:
# Use CountVectorizer to get total word counts in documents

vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

In [23]:
# Transform words with TfidfTransformer - This takes into account term frequency across and within documents

word_counts = vectorizer.fit_transform(lotr_tokens_lems)

tfidf_transform = TfidfTransformer(smooth_idf = False)

words_tfidf = tfidf_transform.fit_transform(word_counts)

# final_words = normalize(words_tfidf, norm = 'l1')

In [24]:
# Instantiate NMF model and fit to tfidf transformed documents

model = NMF(n_components = 50, init = 'nndsvd')

# Set W as the document by topic matrix
# Set H as the topic by word matrix

W = model.fit_transform(words_tfidf)
H = model.components_

In [25]:
# Function to assign topic indices back to feature names - takes model, feature names from vectorizer, 
# and n_top_words as arguments. n_top_words selects the number of keywords per topic
# renders differently than our previous function essentially does the same thing 
# Thanks to Nick Gayliard for inspiration 

def print_top_words(model, feature_names, n_top_words):
    lst = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        lst.append(message)
    return lst

In [26]:
# List numbered topic and top 10 words in the topic 

topics_nmf = (print_top_words(model, vectorizer.get_feature_names(), 10))

In [27]:
# Check nmf topic assignment

topics_nmf

['Topic #0: frodo behind end heard side passed though hill water first',
 'Topic #1: sam behind end think passed side left much first master',
 'Topic #2: great think end heard black side stone king though water',
 'Topic #3: could stone passed though left voice hill let first suddenly',
 'Topic #4: long think heard foot side passed water left hill lord',
 'Topic #5: would end behind black heard side water let made much',
 'Topic #6: like behind end think stood ever stone passed hill water',
 'Topic #7: gandalf foot black side left heard king much made white',
 'Topic #8: go behind end passed though black side left hill much',
 'Topic #9: one behind black passed left though much first white two',
 'Topic #10: back end ever heard black foot let water made lord',
 'Topic #11: away behind end think passed though side hill water first',
 'Topic #12: still heard foot passed though left white two word deep',
 'Topic #13: see behind stood end think ever word two suddenly deep',
 'Topic #14: m

### Conclusions: 
### Topic Modelling for works of fiction requires further research. My hypothesis is that given the assumptions made by both the LDA and NMF models easily interpretable results are both difficult to reach and hard to interpret. Close examination of these assumptions, word counts following a specific distribution, for example,  should be made before continuing with this line of questioning. 