In [1]:
# These notebooks are supplemental to my main notebooks: 'topic_modeling_lotr_complete' & 'word2vec_lotr_complete'
# Please see these notebooks for proper and comprehensive annotations. 
import os, glob, re, string
import numpy as np
import pandas as pd

In [2]:
with open('./the_lord_of_the_rings/the_fellowship_of_the_ring.txt', 'r') as file:
    fellow = file.read().replace('\n', '')

In [3]:
len(fellow)

984984

In [4]:
import nltk;
from nltk.corpus import stopwords;
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import sklearn;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# remove punctuation and tokenize

tokenizer = RegexpTokenizer(r'\w+')
fellow_tokens = tokenizer.tokenize(fellow)
fellow_tokens[1971]

# Keeping capitalization because I want the model to treat the proper nouns accordingly. Names are important in LotR. 

'wars'

In [6]:
# Based on previous models, it's imperative to add some stop words. 
# add stopwords - changing this list can have a dramatic effect on results in the LDA model because it uses word counts

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('said') # removed due to disproportional frequency
stopwords.append('come')
stopwords.append('came')

# Additional stopwords, like proper names, could dramatically alter results. Keeping them to preserve the text. 

In [7]:
# Removes stopwords 
fellow_clean = [word for word in fellow_tokens if word.lower() not in stopwords]
print("="*90)
print(f'Length of original list: {len(fellow_tokens)} words\n')
print(f'Length of list after stopwords removal: {len(fellow_clean)} words')

Length of original list: 188191 words

Length of list after stopwords removal: 86873 words


In [8]:
# Lemmatize tokens.
lemmatizer = WordNetLemmatizer()
fellow_tokens_lems = [lemmatizer.lemmatize(i) for i in fellow_clean]
fellow_tokens_lems[2019]

'Fornost'

In [9]:
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [10]:
# Calling our overwritten Count vectorizer
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(fellow_clean)

### Build Our LDA Model

In [11]:
lda = LatentDirichletAllocation(n_components=11, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

In [12]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=11, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [13]:
# Define helper function to print top words

def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [14]:
n_top_words = 10
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:aragorn tree boromir road hill stone company sun power west

Topic #1:like dark day way know hand say gimli left heart

Topic #2:ring light land thing little suddenly stood grey lay high

Topic #3:sam gandalf away went let black fear head soon world

Topic #4:frodo river thought boat end merry place enemy turned green

Topic #5:far water white seen deep spoke mr ore knew forward

Topic #6:shall hobbit pippin wind path answered gate going evil moria

Topic #7:time looked think good passed elrond shire folk sat star

Topic #8:long great elf night heard shadow mountain voice door fell

Topic #9:eye old saw word bilbo strider hope wood round grew

Topic #10:foot cried legolas right look dwarf men face wall moment


### Build Our NMF Model

#### Non-Negative Matrix Factorization in sklearn for comparison to LDA

In [16]:
# Use CountVectorizer to get total word counts in documents

vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

In [17]:
# Transform words with TfidfTransformer - This takes into account term frequency across and within documents

word_counts = vectorizer.fit_transform(fellow_tokens)

tfidf_transform = TfidfTransformer(smooth_idf = False)

words_tfidf = tfidf_transform.fit_transform(word_counts)

# final_words = normalize(words_tfidf, norm = 'l1')

In [18]:
# Instantiate NMF model and fit to tfidf transformed documents

model = NMF(n_components = 10, init = 'nndsvd')

# Set W as the document by topic matrix
# Set H as the topic by word matrix

W = model.fit_transform(words_tfidf)
H = model.components_

In [19]:
# Function to assign topic indices back to feature names - takes model, feature names from vectorizer, 
# and n_top_words as arguments. n_top_words selects the number of keywords per topic

def print_top_words(model, feature_names, n_top_words):
    lst = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        lst.append(message)
    return lst

In [20]:
# Set 3rd argument to number of topic keywords that are desired

topics_nmf = (print_top_words(model, vectorizer.get_feature_names(), 10))

In [21]:
# Check nmf topic assignment

topics_nmf

['Topic #0: the zvram forest foretelling foretell forests forester forestall foreseen forgave',
 'Topic #1: and zvram forester forgave foretold foretelling foretell forests forestall forged',
 'Topic #2: of you but for as had frodo with there we',
 'Topic #3: to you but for as had frodo with there we',
 'Topic #4: he you but for as had is at with frodo',
 'Topic #5: in but his on is with frodo be all if',
 'Topic #6: it you for as had at frodo we have all',
 'Topic #7: that you but for as had said there with have',
 'Topic #8: was his not is on at with were have their',
 'Topic #9: they but his said is at with were him from']