### Import Complete Lord of the Rings Text -- No Book or Chapter Divisons

In [1]:
# complete texts of Lord of the Rings can be located at 
# 'https://archive.org/details/TheLordOfTheRing1TheFellowshipOfTheRing'

In [2]:
with open('./the_lord_of_the_rings/Lord_of_the_Rings_complete.txt', 'r') as file:
    lotr = file.read().replace('\n', '')

In [3]:
# check the size of the string to double check
# 2_512_368 characters 
len(lotr)

2512368

In [4]:
# Leaving the corpus as one long string is optimal for NLP

### Prepare Corpus for NLP (Tokenize, Punctuation Removal, Stopwords)

In [5]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer;
from nltk.tokenize import RegexpTokenizer;

In [6]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+') # Separating by word using Regular Expressions 
lotr_tokens = tokenizer.tokenize(lotr)

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

In [7]:
lotr_tokens[2019]

'Hobbits'

### Stopwords 

In [8]:
# Knowing how sensitive LDA models can be regarding word counts; I'm reticent to remove any additional stopwords. 

stopwords = nltk.corpus.stopwords.words('english')

In [9]:
# Some of these are lemma so I'll do a second stopword sweep during CountVectorization and Tf-Idf. 
# I'd rather add/subtract stopwords early in the process. 
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
# Removes stopwords and gives a count of words before and after removal

lotr_clean = [word for word in lotr_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(lotr_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(lotr_clean)} words')

Length of original list: 482056 words

Length of list after stopwords removal: 228086 words


In [11]:
lotr_clean[2019]

'workshops'

In [12]:
# Lemmatize tokens

lemmatizer = WordNetLemmatizer()
lotr_tokens_lems = [lemmatizer.lemmatize(i) for i in lotr_clean]

In [13]:
# 'Workshops' has been reduced to its root, or lemma, 'workshop'.

lotr_tokens_lems[2019]

'workshop'

In [14]:
import sklearn;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.feature_extraction import stop_words

In [15]:
# Graciously borrowed from 'https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial' 
# We have essentially inherited and subclassed the original Sklearn's CountVectorizer class 
# and overwritten the build_analyzer method by implementing the lemmatizer for each list in the raw text matrix.

lemm = WordNetLemmatizer() 
class LemmaCountVectorizer(CountVectorizer): # the parent class 
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer() # the child class
        # 'super' is short for 'superimpose'
        # super() helps to specifically call the Parent class method 
        # which has been overridden in the child class, from the child class.
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))    

In [16]:
print(stop_words.ENGLISH_STOP_WORDS)

frozenset({'too', 'fifteen', 'fill', 'again', 'therein', 'whither', 'per', 'such', 'interest', 'whom', 'nothing', 'yet', 'side', 'become', 'in', 'beforehand', 'de', 'an', 'some', 'four', 'no', 'nor', 'though', 'off', 'towards', 'hereupon', 'whereupon', 'are', 'among', 'five', 'wherein', 'until', 'much', 'fire', 'herself', 'see', 'whatever', 'upon', 'besides', 'afterwards', 'only', 'them', 'also', 'becomes', 'fifty', 'our', 'across', 'below', 'him', 'due', 'namely', 'himself', 'seems', 'few', 'through', 'his', 'sincere', 'above', 'be', 'anyway', 'hers', 'thereafter', 'what', 'already', 'former', 'mill', 'when', 'everywhere', 'except', 'thin', 'although', 'somewhere', 'etc', 'of', 'yours', 'my', 'around', 'part', 'all', 'whereby', 'being', 'else', 'ten', 'wherever', 'always', 'between', 'each', 'any', 'top', 'full', 'or', 'we', 'take', 'least', 'its', 'over', 'back', 'then', 'beyond', 'twelve', 'whence', 'latter', 'neither', 'show', 'must', 'well', 'nowhere', 'as', 'thence', 'if', 'among

In [17]:
sets = [frozenset(stop_words.ENGLISH_STOP_WORDS), set(stopwords)] # Combines both stopword lists into a master list 
 
master_stops = ([list(x) for x in sets])

In [18]:
# Our unaltered master list of stop words; including duplicates
print(master_stops)

[['too', 'fifteen', 'fill', 'again', 'therein', 'whither', 'per', 'such', 'interest', 'whom', 'nothing', 'yet', 'side', 'become', 'in', 'beforehand', 'de', 'an', 'some', 'four', 'no', 'nor', 'though', 'off', 'towards', 'hereupon', 'whereupon', 'are', 'among', 'five', 'wherein', 'until', 'much', 'fire', 'herself', 'see', 'whatever', 'upon', 'besides', 'afterwards', 'only', 'them', 'also', 'becomes', 'fifty', 'our', 'across', 'below', 'him', 'due', 'namely', 'himself', 'seems', 'few', 'through', 'his', 'sincere', 'above', 'be', 'anyway', 'hers', 'thereafter', 'what', 'already', 'former', 'mill', 'when', 'everywhere', 'except', 'thin', 'although', 'somewhere', 'etc', 'of', 'yours', 'my', 'around', 'part', 'all', 'whereby', 'being', 'else', 'ten', 'wherever', 'always', 'between', 'each', 'any', 'top', 'full', 'or', 'we', 'take', 'least', 'its', 'over', 'back', 'then', 'beyond', 'twelve', 'whence', 'latter', 'neither', 'show', 'must', 'well', 'nowhere', 'as', 'thence', 'if', 'amongst', 'min

In [19]:
# Create lambda function to combine or flatten our list of lists

flatten = lambda master_stops: [item for sublist in master_stops for item in sublist]

In [20]:
# Apply lambda function

flat_master_stops = flatten(master_stops)

In [21]:
# Unique stopwords in our master list 

print(set(flat_master_stops))

{'too', 'therein', 'per', 'yet', 'de', 'beforehand', 'towards', 'though', 'whatever', 'also', 'becomes', 'our', 'seems', "that'll", 'few', 'through', 'what', 'already', 'former', "wasn't", 'whereby', 'being', 'between', 'shouldn', 'take', 'beyond', 'nowhere', "she's", 'as', 'would', 'from', 'hereby', 'put', 'sometime', 'everyone', 'mightn', 'with', 'forty', 'here', 'found', 'serious', 'twenty', 'ie', 'three', 'sixty', 'needn', 'this', 'herein', 'same', 'o', 'bill', 'amount', 'became', 'why', 'hundred', 'everything', 'front', 'another', 'one', 'before', 'was', 'amoungst', 'have', "you've", 'a', 'without', 'either', 'y', 'two', 'could', 'describe', 'never', 'nobody', 'us', "isn't", 'hereafter', 'mustn', 'move', 'you', 'nine', 'aren', 'she', 'alone', 'last', "it's", 'made', 'call', 'anything', 'how', 'fifteen', 'interest', 'side', "you're", 'some', 'four', 'hereupon', 'much', 'see', 'afterwards', 'across', "mustn't", 'sincere', 'be', 'hers', "mightn't", 'just', 'of', "needn't", 'my', 'aro

In [22]:
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, # Ignore terms that a document frequency higher that .95 of stopwords
                                     min_df=2, # Ignore terms that have a document frequency of 2 
                                     stop_words='english', # Second sweep -- words listed above
                                     decode_error='ignore') # Ignore UnicodeDecodeError 
tf = tf_vectorizer.fit_transform(lotr_tokens_lems)

### Visuals on Lemmatized and Cleaned Corpus -- No Additional Stopwords

In [23]:
import numpy as np

In [24]:
# Signed up for plotly. See Jeff Hale's article: 
# https://towardsdatascience.com/its-2019-make-your-data-visualizations-interactive-with-plotly-b361e7d45dc6

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

feature_names = tf_vectorizer.get_feature_names()
count_vec = np.asarray(tf.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))
x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))

# Now I want to extract out on the top 15 and bottom 15 words
Y = np.concatenate([y[0:15], y[-16:-1]])
X = np.concatenate([x[0:15], x[-16:-1]])

# Plotting the Plot.ly plot for the Top 50 word frequencies
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word Frequencies after Preprocessing with Unaltered Stop Word List'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

# The words 'said', 'come', and 'came' are overrepresented. Going to make them stopwords and reanalyze. 

#### The words 'said', 'come', and 'came' are overrepresented. Going to make them stopwords and reanalyze. 

In [25]:
# Knowing how sensitive LDA models can be regarding word counts; I'm reticent to remove any additional stopwords. 


stopwords = nltk.corpus.stopwords.words('english')  # changing this list can have a dramatic effect on topic modelling
                                                    # because they use word counts (LDA more that NMF models)
    
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [26]:
# Remove stopwords using our new list
lotr_clean = [word for word in lotr_tokens if word.lower() not in stopwords]

In [27]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
lotr_tokens_lems = [lemmatizer.lemmatize(i) for i in lotr_clean]

In [28]:
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(lotr_tokens_lems)

In [29]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

feature_names = tf_vectorizer.get_feature_names()
count_vec = np.asarray(tf.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))
x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))

# Now I want to extract out on the top 15 and bottom 15 words
Y = np.concatenate([y[0:15], y[-16:-1]])
X = np.concatenate([x[0:15], x[-16:-1]])

# Plotting the Plot.ly plot for the Top 50 word frequencies
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word Frequencies after Preprocessing with Additional Stopwords'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

#### I'm comfortable with Frodo and Sam being at the top of our word frquencies since they're the main characters. 