In [1]:
# These notebooks are supplemental to my main notebooks: 'topic_modeling_lotr_complete' & 'word2vec_lotr_complete'
# Please see these notebooks for proper and comprehensive annotations. 
import os, glob, re, string
import numpy as np
import pandas as pd

In [2]:
with open('./the_lord_of_the_rings/the_return_of_king.txt', 'r') as file:
    king = file.read().replace('\n', '')

In [3]:
len(king)

710699

In [5]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+')
king_tokens = tokenizer.tokenize(king)
king_tokens[1971]

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

'battle'

In [10]:
# Based on previous models, it's imperative to add some stop words. 
# add stopwords - changing this list can have a dramatic effect on results in the LDA model because it uses word counts

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [11]:
king_clean = [word for word in king_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(king_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(king_clean)} words')

Length of original list: 137549 words

Length of list after stopwords removal: 62349 words


In [12]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
king_tokens_lems = [lemmatizer.lemmatize(i) for i in king_clean]
len(king_tokens_lems)

62349

In [13]:
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [14]:
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(king_tokens_lems)

In [15]:
lda = LatentDirichletAllocation(n_components=11, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 2019)

In [16]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=11, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=2019, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [17]:
# Define helper function to print top words
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [18]:
n_top_words = 10
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:merry aragorn folk ruffian ring sword coming fair chief want

Topic #1:went gandalf pippin know going land black soon turned gondor

Topic #2:say way mr shire think left stone let head dead

Topic #3:frodo hobbit good shadow ore wall mordor sun field captain

Topic #4:thing city set faramir heard master high rose tell speak

Topic #5:day away old shall looked saw stood voice year face

Topic #6:like far eye little gate hope white heart fear look

Topic #7:end lord mountain tree right round new rohan suddenly hear

Topic #8:great hand dark passed rode grey seen better east sat

Topic #9:long time road got gone jowyn north place people friend

Topic #10:sam men king light thought house night took man battle


In [19]:
# Use CountVectorizer to get total word counts in documents

vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

In [22]:
# Transform words with TfidfTransformer - This takes into account term frequency across and within documents

word_counts = vectorizer.fit_transform(king_tokens)

tfidf_transform = TfidfTransformer(smooth_idf = False)

words_tfidf = tfidf_transform.fit_transform(word_counts)

# final_words = normalize(words_tfidf, norm = 'l1')

In [23]:
# Instantiate NMF model and fit to tfidf transformed documents

model = NMF(n_components = 10, init = 'nndsvd')

# Set W as the document by topic matrix
# Set H as the topic by word matrix

W = model.fit_transform(words_tfidf)
H = model.components_

In [24]:
# Function to assign topic indices back to feature names - takes model, feature names from vectorizer, 
# and n_top_words as arguments. n_top_words selects the number of keywords per topic

def print_top_words(model, feature_names, n_top_words):
    lst = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        lst.append(message)
    return lst

In [25]:
# Set 3rd argument to number of desired topic keywords 

topics_nmf = (print_top_words(model, vectorizer.get_feature_names(), 10))

In [26]:
topics_nmf

['Topic #0: the gaffer foundering fount fountain fountains four fours fourteen fourteenth',
 'Topic #1: and zirak fragrant fountain fountains four fours fourteen fourteenth fourth',
 'Topic #2: of at is on there now were fount fowl foundering',
 'Topic #3: to but they as with at is on be have',
 'Topic #4: he his but they said not with as on now',
 'Topic #5: in his for said not all there him from were',
 'Topic #6: that but they his not as with on all be',
 'Topic #7: it but they his as not with on had be',
 'Topic #8: was his said for not as with all there is',
 'Topic #9: you his for said at is there all had on']