In [1]:
import numpy as np
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from collections import Counter

nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download("punkt")

[nltk_data] Downloading package gutenberg to /home/jona/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jona/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jona/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#Code for dowloading corpus preprocessing

In [2]:
"""
preprocessing step of the document. First we remove the stop words and change all
the words to lower case. Then we remove the uncommon words, given a specified threshold.
"""
def preprocess(doc, threshold):
    docs = sort_chapters(doc)
    docs = filter_doc(docs)
    docs = removing_uncommon(docs, threshold)

    return docs


"""
A function for removing the stop words in a document and changing all the words
to lower case.
"""
def filter_doc(doc):
    stop_words = set(stopwords.words('english'))

    filtered_doc = []
    for chapter in doc:
        filtered_chapter = [w.lower() for w in chapter if not w.lower() in stop_words and w.isalnum()]
        filtered_doc.append(filtered_chapter)

    return filtered_doc


"""
A function for removing the less common words, i.e. the words with a higher
frequency in the document over a specified threshold.
"""
def removing_uncommon(doc, threshold):
    freq = get_counter(doc)
    
    filtered_doc = []
    for chapter in doc:
        filtered_chapter = [w for w in chapter if freq[w] >= threshold]
        filtered_doc.append(filtered_chapter)

    return filtered_doc


"""
A helper function to get a frequency counter for the document.
"""
def get_counter(doc):
    freq = Counter()
    for sentance in doc:
        for word in sentance:
            freq[word] += 1

    return freq


"""
Start by assigning all words to the unassigned class -1, just to constuct
a data structure that can hold the categories in the future
"""
def get_categories(docs):
    cats = []
    for chapter in docs:
        chap = []
        for _ in chapter:
            chap.append(-1)
        cats.append(np.array(chap))
    return cats


"""
Divide the book up into chapters, the chapters will be though of as documents
"""
def sort_chapters(doc):
    docs = []
    chapter = []
    for word in doc:
        if word != 'CHAPTER':
            chapter.append(word)
        else:
            docs.append(chapter)
            chapter = []
    return docs

#LDA

##Defining LDA model

In [3]:
class LDA:

    def __init__(self, docs, cats, n_classes, voc, alpha, beta):
        # The document we are analysing
        self.docs = docs
        # All the tokens stored in a array of lists
        self.w = np.array(docs)
        # The category for each word
        self.z = np.array(cats)
        # Number of categories we are looking for
        self.K = n_classes
        # The words in our corpus
        self.voc = voc
        # Number of words in our corpus
        self.V = len(voc)
        # Hyper parameters used for Gibb's sampling
        self.alpha = alpha
        self.beta = beta

        """
        Instead of counting these we just store the value in a matrix, this
        makes to code a bit more optimized since we do not have to allocate or
        do this computation over and over agian.
        """
        # Category distribution for each doc (chapter)
        self.n = np.zeros( [len(docs), n_classes] )
        # Category distribution for each word
        self.m = np.zeros( [len(voc), n_classes] )


    """
    This function fits our model to the data by iterating a collapsed
    Gibb's sampling n times
    """
    def fit(self, n_iteration):
        
        print("Fitting the model")
        for it in range(n_iteration):
            print("Iteration %i" %(it+1))
            self.iterate()

    """
    This function performs a single iteration in our algorithm
    """
    def iterate(self):
        
        # Allocating these now to optimize the code
        q = np.zeros(self.K)
        p = np.zeros(self.K)

        for d, chapter in enumerate(self.docs):
            for j, word in enumerate(chapter):
                word_ind = self.voc.index(word)

                # Reomve the category for our current word and updating m & n acordingly
                if self.z[d][j] != -1:
                    k = self.z[d][j]
                    self.n[d, k] -= 1
                    self.m[word_ind, k] -= 1
                    self.z[d][j] = -1 

                # computes the posterior distribution for the category distribution
                for k in range(self.K):
                    sum_m_k = np.sum(self.m[:, k]) 
                    q[k] = ( self.alpha + self.n[d, k] ) * \
                           ( self.beta + self.m[word_ind, k] ) / \
                           (self.V * self.beta + sum_m_k ) 
                # Normalizing
                p = q / np.sum(q)

                # Assigning a new category
                assigned_category = np.random.choice(self.K, 1, p = p)[0]
                
                # Updating the model object accordingly
                self.z[d][j] = assigned_category
                self.n[d, assigned_category] += 1
                self.m[word_ind, assigned_category] += 1

    """
    To evaluate our model we take a look at the highest values in the either the
    relative frequency or the total frequency.
    This function returns thoose counters.
    """
    def evaluate(self, relative=True):
        counters = self.count_occurances()

        if relative:
            for key in self.voc:
                tot_freq = sum([counters[it][key] for it in range(self.K)])
                
                for jt in range(self.K):
                    counters[jt][key] /= tot_freq

        return counters


    """
    This is a helper function for our evaluation function. Here we count the frequencies 
    each appeared in each category and save it in a list of counter objects.
    """
    def count_occurances(self):
        counters = [Counter() for _ in range(self.K)]

        for d, chapter in enumerate(self.docs):
            for j, word in enumerate(chapter):
                counters[self.z[d][j]][word] += 1

        return counters




In [4]:
def train_model(alpha, beta, n_iteration, K):
    # Moby Dick - Herman Melville
    print("Loading text document and formatting")
    moby_dick = gutenberg.words('melville-moby_dick.txt')

    threshold = 10 
    docs = preprocess(moby_dick, threshold)
    freq = get_counter(docs)
    cats = get_categories(docs)
    voc = list(freq.keys())

    model = LDA(docs, cats, K, voc, alpha, beta)
    model.fit(n_iteration)

    return model


##LDA 1: α = β = 0.1,  100 iterations, 10 categories

In [None]:
model = train_model(0.1, 0.1, 100, 10)

Loading text document and formatting


  self.w = np.array(docs)
  self.z = np.array(cats)


Fitting the model
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53


###Most comman, total frequency

In [None]:
counters = model.evaluate(relative = False)
for counter in counters:
  print(counter.most_common(10))

[(',', 4606), ('.', 1679), ("'", 1533), ('!', 858), ('"', 761), ('-', 581), ('--', 446), (';', 347), ('?', 345), ('ye', 339)]
[(';', 422), ('"', 241), ('captain', 198), ('said', 125), ('ahab', 118), ('ship', 109), (',"', 101), ('."', 96), ('ye', 93), ('?"', 91)]
[(';', 336), ('captain', 106), ('ship', 105), ("'", 94), ('"\'', 61), ('ships', 46), ('two', 44), ('men', 43), ('steelkilt', 40), ('gentlemen', 35)]
[(';', 925), ('!', 232), ('god', 116), ('thou', 83), ('yet', 79), ('white', 78), ('jonah', 55), ('old', 54), ('sailor', 54), ('would', 53)]
[('whale', 626), ('.', 563), (';', 489), ('whales', 211), ('sperm', 182), ('leviathan', 93), ('(', 87), ('oil', 69), ('found', 53), ('fish', 52)]
[(';', 582), ('boat', 262), ('ahab', 257), ('whale', 236), ('seemed', 155), ('white', 153), ('sea', 132), ('ship', 117), ('starbuck', 116), ('crew', 114)]
[('"', 326), ('."', 188), ('whale', 170), ('--', 167), ("'", 166), ('.', 97), ('-', 84), (';', 83), ('fish', 67), ('right', 41)]
[(';', 472), ('-',

###Most common, relative frequency

In [None]:
counters = model.evaluate()
for counter in counters:
  print(counter.most_common(10))

[('moving', 1.0), ('ha', 1.0), ('yon', 1.0), ('hurrah', 1.0), ('kick', 1.0), ('using', 1.0), ('tambourine', 1.0), ('pip', 0.9594594594594594), ('.)', 0.9545454545454546), ('guess', 0.9444444444444444)]
[('peleg', 1.0), ('quaker', 1.0), ('dost', 1.0), ('bildad', 1.0), ('bye', 1.0), ('guernsey', 1.0), ('elijah', 0.9333333333333333), ('spirits', 0.9), ('steward', 0.8888888888888888), ('seven', 0.875)]
[('gabriel', 1.0), ('commodore', 1.0), ("!'", 1.0), ("?'", 1.0), (",'", 1.0), ('priest', 1.0), ('leak', 1.0), ('radney', 1.0), ('steelkilt', 1.0), ('lakeman', 1.0)]
[('mild', 1.0), ('pulpit', 1.0), ('naught', 1.0), ('woe', 1.0), ('soil', 1.0), ('whiteness', 1.0), ('vice', 1.0), ('doubloon', 1.0), ('um', 1.0), ('lesson', 0.9230769230769231)]
[('cetology', 1.0), ('greenland', 1.0), ('baleen', 1.0), ('magnitude', 1.0), ('cuvier', 1.0), ('scoresby', 1.0), ('species', 1.0), ('century', 1.0), ('folio', 1.0), ('ii', 1.0)]
[('wave', 1.0), ('gradually', 1.0), ('fate', 1.0), ('bowed', 1.0), ('speed', 

##LDA 2: α = β = 0.01, 100 iterations 10 categories

In [None]:
model2 = train_model(0.01, 0.01, 100, 10)

###Most common, total frequency

In [None]:
counters = model2.evaluate(relative = False)
for counter in counters:
  print(counter.most_common(10))

[('whales', 160), ('like', 143), ('whaling', 107), ('though', 97), ('water', 86), ('three', 85), ('nantucket', 78), ('ship', 76), ('may', 76), ('peleg', 74)]
[('.', 494), ('little', 148), ('queequeg', 147), ('like', 111), (',"', 95), ('good', 89), ('come', 84), ('harpooneer', 81), ('away', 78), ('sort', 74)]
[('white', 98), ('?', 68), ('still', 66), ('times', 64), ('yet', 50), ('without', 33), ('waters', 32), ('wild', 30), ('day', 30), ('seas', 28)]
[('ahab', 470), ('ship', 252), ('thou', 236), ('man', 218), ('boat', 205), ('captain', 171), ('deck', 162), ('like', 131), ('men', 125), ('round', 109)]
[(',', 18249), ('.', 5756), (';', 3946), ('-', 2493), ("'", 2150), ('"', 1168), ('--', 1045), ('whale', 963), ('one', 889), ('upon', 547)]
[("'", 456), ('"', 266), ('ye', 110), ('.', 100), ('?', 88), ('"\'', 66), ('said', 52), ('stubb', 46), ('steelkilt', 40), ('cook', 39)]
[('.', 368), ('whale', 230), ('sperm', 115), ('?', 98), ('fish', 86), ('(', 82), ('whales', 78), ('oil', 71), ('great'

###Most common, relative frequency

In [None]:
counters = model2.evaluate()
for counter in counters:
  print(counter.most_common(10))

[('french', 1.0), ('street', 1.0), ('monstrous', 1.0), ('whereas', 1.0), ('learned', 1.0), ('spouts', 1.0), ('voyages', 1.0), ('frequently', 1.0), ('southern', 1.0), ('doubtless', 1.0)]
[('harpooneer', 1.0), ('money', 1.0), ('image', 1.0), ('begin', 1.0), ('lungs', 1.0), ('difference', 1.0), ('considering', 1.0), ('unaccountable', 1.0), ('answer', 1.0), ('social', 1.0)]
[('leagues', 1.0), ('picture', 1.0), ('palms', 1.0), ('seek', 1.0), ('faith', 1.0), ('spiritual', 1.0), ('straits', 1.0), ('george', 1.0), ('canoe', 1.0), ('subtle', 1.0)]
[('dust', 1.0), ('waves', 1.0), ('tow', 1.0), ('masts', 1.0), ('sing', 1.0), ('bringing', 1.0), ('seated', 1.0), ('rigging', 1.0), ('compasses', 1.0), ('thither', 1.0)]
[('supplied', 1.0), ('late', 1.0), ('school', 1.0), ('--', 1.0), ('coat', 1.0), ('heart', 1.0), ('body', 1.0), ('ever', 1.0), ('world', 1.0), ('somehow', 1.0)]
[("!'", 1.0), ('shipmate', 1.0), ('kick', 1.0), ('wharf', 1.0), ("?'", 1.0), (",'", 1.0), (".'", 1.0), ('radney', 1.0), ('stee

##LDA 3: α = β = 0.1, 100 iterations, 50 cetgories

In [None]:
model3 = train_model(0.1, 0.1, 100, 50)

###Most common.
First we see the 50 most common in total frequency and then we see the 50 highest relative frequencies for this models assigned classes.

In [None]:
counters = model3.evaluate()
for counter in counters:
  print(counter.most_common(10))

counters = model3.evaluate(relative = False)
for counter in counters:
  print(counter.most_common(10))

[('watches', 0.75), ('helm', 0.7407407407407407), ('parsee', 0.6666666666666666), ('closed', 0.6), ('latitude', 0.5384615384615384), ('cast', 0.5333333333333333), ('unseen', 0.5), ('apart', 0.5), ('spoken', 0.46153846153846156), ('clouds', 0.45454545454545453)]
[('school', 0.8), ('discovered', 0.7), ('remains', 0.625), ('utterly', 0.6153846153846154), ('host', 0.6), ('magnitude', 0.5909090909090909), ('creatures', 0.5882352941176471), ('flood', 0.5714285714285714), ('marked', 0.5555555555555556), ('subsequent', 0.5384615384615384)]
[('breakfast', 0.5714285714285714), ('troubled', 0.5333333333333333), ('sat', 0.5111111111111111), ('pagan', 0.5), ('boots', 0.5), ('mortals', 0.4), ('clock', 0.375), ('softly', 0.36363636363636365), ('fellows', 0.36363636363636365), ('ages', 0.35714285714285715)]
[('tackles', 0.9411764705882353), ('tub', 0.9230769230769231), ('tackle', 0.8), ('carefully', 0.7272727272727273), ('strip', 0.7), ('bucket', 0.6666666666666666), ('block', 0.6363636363636364), ('a

##LDA 4: α = β = 0.01, 100 iterations, 50 categories

In [None]:
model4 = train_model(0.01, 0.01, 100, 50)

###Same as for previous model

In [None]:
counters = model4.evaluate()
for counter in counters:
  print(counter.most_common(10))

  counters = model4.evaluate(relative = False)
for counter in counters:
  print(counter.most_common(10))

[('queen', 0.7777777777777778), ('law', 0.7619047619047619), ('possession', 0.75), ('gentleman', 0.75), ('ambergris', 0.6923076923076923), ('loose', 0.6285714285714286), ('humor', 0.6), ('blubber', 0.5882352941176471), ('hoisting', 0.5833333333333334), ('originally', 0.5294117647058824)]
[('son', 0.75), ('customary', 0.6428571428571429), ('knowing', 0.625), ('receiving', 0.6153846153846154), ('thereby', 0.6111111111111112), ('earnest', 0.6), ('sweeping', 0.5833333333333334), ('cannibals', 0.5454545454545454), ('possible', 0.5333333333333333), ('quietly', 0.5263157894736842)]
[('landlord', 0.9705882352941176), ('throwing', 0.9), ('bench', 0.8823529411764706), ('bed', 0.8266666666666667), ('crow', 0.7142857142857143), ('tomahawk', 0.6842105263157895), ('streets', 0.6), ('peter', 0.6), ('landed', 0.5833333333333334), ('idol', 0.5384615384615384)]
[('baleen', 1.0), ('cuvier', 1.0), ('scoresby', 1.0), ('folio', 1.0), ('octavo', 1.0), ('iii', 1.0), (').--', 1.0), ('ribs', 0.9047619047619048)

##Investigating model 1 again but now with 200 iterations

In [None]:
model_200it = train_model(0.1, 0.1, 200, 10)

###Relative frequency

In [None]:
counters_200 = model_200it.evaluate()
for counter in counters_200:
  print(counter.most_common(10))

[('stripped', 1.0), ('throughout', 1.0), ('external', 1.0), ('depth', 1.0), ('tun', 1.0), ('bulk', 0.96875), ('degree', 0.9615384615384616), ('surface', 0.9487179487179487), ('spine', 0.9411764705882353), ('immense', 0.9375)]
[(",'", 1.0), (".'", 1.0), ('leak', 1.0), ('radney', 1.0), ('steelkilt', 1.0), ('lakeman', 1.0), ("?'", 0.96875), ("!'", 0.9666666666666667), ('pumps', 0.9333333333333333), ('jonah', 0.9294117647058824)]
[('english', 1.0), ('cetology', 1.0), ('greenland', 1.0), ('baleen', 1.0), ('southern', 1.0), ('london', 1.0), ('john', 1.0), ('cuvier', 1.0), ('scoresby', 1.0), ('species', 1.0)]
[('hussey', 1.0), ('sick', 1.0), ('throwing', 1.0), ('landlord', 1.0), ('clam', 1.0), ('tomahawk', 1.0), ('chapel', 1.0), ('notice', 1.0), ('mrs', 1.0), ('yojo', 1.0)]
[('compasses', 1.0), ('phantom', 1.0), ('dream', 1.0), ('sleet', 1.0), ('fate', 1.0), ('spiritual', 1.0), ('continual', 1.0), ('leaned', 1.0), ('binnacle', 1.0), ('milky', 1.0)]
[('towing', 1.0), ('instantly', 1.0), ('glid

###Total frequency

In [None]:
counters_200_abs = model_200it.evaluate(relative=False)
for counter in counters_200_abs:
  print(counter.most_common(10))

[('whale', 221), ('sperm', 117), ('.', 101), ('head', 88), ('feet', 62), ('great', 58), ('side', 45), ('tail', 44), ('much', 43), ('two', 42)]
[('jonah', 79), ("'", 72), ('"\'', 63), ('captain', 62), ('steelkilt', 40), (".'", 36), ('gentlemen', 33), (",'", 32), ("?'", 31), ("!'", 29)]
[('whale', 605), ('.', 255), ('whales', 216), ('fish', 121), ('(', 110), ('-', 103), ('sperm', 99), ('?', 98), ('great', 80), ('"', 80)]
[('queequeg', 206), ('"', 169), ('said', 120), ('captain', 113), ('thought', 90), ('little', 80), ('bildad', 76), ('good', 75), ('peleg', 74), (',"', 72)]
[('ahab', 314), (';', 284), ('white', 168), ('seemed', 143), ('starbuck', 135), (',', 123), ('sea', 110), ('men', 99), ('deck', 85), ('still', 84)]
[('boat', 267), ('whale', 212), ('!"', 136), ('line', 119), ('boats', 117), ('"', 99), ('one', 96), ('!', 94), ('stubb', 91), ('cried', 83)]
[('!', 511), ('?', 116), ('.', 86), ('pip', 74), ('oh', 68), ('sailor', 59), ('look', 53), ('--', 49), ('god', 47), ('ye', 45)]
[('-'

#Hidden markov transition model
This class definition is similar to the one used for our LDA model, but some tweaks have been made and more logic has been added.

In [None]:
class HMTM:

    def __init__(self, docs, cats, n_classes, voc, alpha, beta, pi):
        # The document we are analysing
        self.docs = docs
        # All the tokens stored in a array of lists
        self.w = np.array(docs)
        # The category for each word
        self.z = np.array(cats)
        # Number of categories we are looking for
        self.K = n_classes
        # The words in our corpus
        self.voc = voc
        # Number of words in our corpus
        self.V = len(voc)
        # Hyper parameters used for Gibb's sampling
        self.alpha = alpha
        self.beta = beta
        self.pi = pi

        """
        Instead of counting these we just store the value in a matrix, this
        makes to code a bit more optimized since we do not have to allocate or
        do this computation over and over agian.
        """
        # Category transition distribution for each doc (chapter)
        self.n_trans = np.zeros( [len(docs), n_classes, n_classes] )
        # Category distribution for starting words over all docs
        self.n_start = np.zeros([n_classes])
        # Category distribution for each word
        self.m = np.zeros( [len(voc), n_classes] )


    """
    This function fits our model to the data by iterating a collapsed
    Gibb's sampling n times
    """
    def fit(self, n_iteration):
        
        print("Fitting the model")
        for it in range(n_iteration):
            print("Iteration %i" %(it+1))
            self.iterate()

    """
    This function performs a single iteration in our algorithm
    """
    def iterate(self):
        
        # Allocating these now to optimize the code
        q = np.zeros(self.K)
        p = np.zeros(self.K)

        for d, chapter in enumerate(self.docs):
            for j, word in enumerate(chapter):
                word_ind = self.voc.index(word)
                
                if j == 0:
                    #Remove current token, update m, n_trans, n_start
                    k_next = self.z[d][j+1]
                    if self.z[d][j] != -1:
                        k = self.z[d][j]
                        self.n_trans[d, k, k_next] -= 1
                        self.n_start[k] -= 1
                        self.m[word_ind, k] -= 1
                        self.z[d][j] = -1 
                
                    # computes the posterior distribution for the category distribution
                    for k in range(self.K):
                        sum_m_k = np.sum(self.m[:, k]) 
                        q[k] = ( self.alpha + self.n_start[k] ) * \
                                ( self.beta + self.m[word_ind, k] ) / \
                                (self.V * self.beta + sum_m_k ) 
                        # Normalizing
                        p = q / np.sum(q)
                
                elif j == len(chapter)-1:
                    #Remove current token, update m, n_trans
                    k_prev = self.z[d][j-1]
                    if self.z[d][j] != -1:
                        k = self.z[d][j]
                        self.n_trans[d, k_prev, k] -= 1
                        self.m[word_ind, k] -= 1
                        self.z[d][j] = -1 
                    
                    # computes the posterior distribution for the category distribution
                    for k in range(self.K):
                        sum_m_k = np.sum(self.m[:, k]) 
                        q[k] = ( self.alpha + self.n_trans[d, k_prev, k] ) * \
                                ( self.beta + self.m[word_ind, k] ) / \
                                (self.V * self.beta + sum_m_k ) 
                        # Normalizing
                        p = q / np.sum(q)
                    
                else:
                    # Remove current token, update m, 
                    k_prev = self.z[d][j-1]
                    k_next = self.z[d][j+1]
                    if self.z[d][j] != -1:
                        k = self.z[d][j]
                        self.n_trans[d, k_prev, k] -= 1
                        self.n_trans[d, k, k_next] -= 1
                        self.m[word_ind, k] -= 1
                        self.z[d][j] = -1 

                    # computes the posterior distribution for the category distribution
                    for k in range(self.K):
                        sum_m_k = np.sum(self.m[:, k]) 
                        q[k] = ( self.alpha + self.n_trans[d, k_prev, k] ) * \
                                ( self.beta + self.m[word_ind, k] ) / \
                                (self.V * self.beta + sum_m_k ) 
                        # Normalizing
                        p = q / np.sum(q)

                # Assigning a new category
                assigned_category = np.random.choice(self.K, 1, p = p)[0]
                
                # Updating the model object accordingly
                self.z[d][j] = assigned_category
                self.m[word_ind, assigned_category] += 1
                if j==0:
                    self.n_start[assigned_category] += 1
                    self.n_trans[d, assigned_category, k_next] += 1
                elif j== len(chapter)-1:
                    self.n_trans[d, k_prev, assigned_category] += 1
                else:
                    self.n_trans[d, k_prev, assigned_category] += 1
                    self.n_trans[d, assigned_category, k_next] += 1
                

    """
    To evaluate our model we take a look at the highest values in the either the
    relative frequency or the total frequency.
    This function returns thoose counters.
    """
    def evaluate(self, relative=True):
        counters = self.count_occurances()

        if relative:
            for key in self.voc:
                tot_freq = sum([counters[it][key] for it in range(self.K)])
                
                for jt in range(self.K):
                    counters[jt][key] /= tot_freq

        return counters


    """
    This is a helper function for our evaluation function. Here we count the frequencies 
    each appeared in each category and save it in a list of counter objects.
    """
    def count_occurances(self):
        counters = [Counter() for _ in range(self.K)]

        for d, chapter in enumerate(self.docs):
            for j, word in enumerate(chapter):
                counters[self.z[d][j]][word] += 1

        return counters

In [None]:
def train_hmtm_model(alpha, beta, pi, n_iteration, K):
    # Moby Dick - Herman Melville
    print("Loading text document and formatting")
    moby_dick = gutenberg.words('melville-moby_dick.txt')

    threshold = 10 
    docs = preprocess(moby_dick, threshold)
    freq = get_counter(docs)
    cats = get_categories(docs)
    voc = list(freq.keys())

    model = HMTM(docs, cats, K, voc, alpha, beta, pi)
    model.fit(n_iteration)

    return model

##HMTM 1
Training and viewing results

In [None]:
model_hmtm = train_hmtm_model(0.1, 0.1, 1, 100, 10)

counters_hmtm = model_hmtm.evaluate(relative = False)
for counter in counters_hmtm:
  print(counter.most_common(10))

Printout of above code:

[('"\'', 62), ('steelkilt', 40), (".'", 36), ("?'", 32), (",'", 32), ('gentlemen', 30), ("!'", 27), ('lakeman', 24), ('radney', 22), ('mate', 15)]

[('ginger', 16), ('bucket', 15), ('tun', 14), ('blubber', 13), ('strip', 9), ('blanket', 8), ('spades', 8), ('cutting', 8), ('mass', 8), ('tackles', 7)]

[('fish', 41), ('fast', 20), ('law', 18), ('loose', 15), ('possession', 10), ('queen', 10), ('herd', 7), ('lake', 6), ('chase', 6), ('circles', 6)]

[('white', 39), ('whiteness', 27), ('pulpit', 12), ('um', 11), ('terror', 6), ('chapel', 5), ('holy', 5), ('shark', 5), ('marble', 4), ('father', 4)]

[('.', 148), ('whale', 78), ('whales', 46), ('(', 37), ('book', 24), ('),', 22), ('porpoise', 19), ('fish', 18), ('greenland', 17), ('folio', 16)]

[('skull', 21), ('blacksmith', 20), ('skeleton', 15), ('perth', 14), ('feet', 13), ('spine', 12), ('brain', 12), ('ribs', 12), ('forge', 11), ('temple', 8)]

[('!', 95), ('pip', 73), ('sailor', 32), ('(', 22), ('gabriel', 19), ('hussey', 17), ('.)', 16), ('mrs', 13), ('tambourine', 11), ('clam', 10)]

[('bildad', 76), ('peleg', 74), ('cook', 38), ('dat', 20), ('de', 20), ('steak', 15), ('dam', 11), ('fleece', 11), ('quaker', 9), ('dost', 9)]

[('jonah', 75), ('bed', 47), ('landlord', 33), ('harpooneer', 32), ('room', 28), ('god', 26), ('shipmates', 22), ('says', 12), ('thinks', 12), ('lesson', 11)]

[(',', 18275), ('.', 6570), (';', 3949), ("'", 2606), ('-', 2492), ('"', 1433), ('whale', 1115), ('!', 1080), ('--', 1045), ('one', 900)]

Printout of above model, relative frequencies:

[('peleg', 1.0), ('quaker', 1.0), ('bildad', 1.0), ('fleece', 1.0), ('steak', 1.0), ('dat', 1.0), ('whiteness', 0.9629629629629629), ('de', 0.7105263157894737), ('den', 0.6923076923076923), ('cook', 0.6909090909090909)]

[('spine', 0.7058823529411765), ('skull', 0.6666666666666666), ('canal', 0.6666666666666666), ('ribs', 0.6190476190476191), ('skeleton', 0.5294117647058824), ('elephant', 0.5263157894736842), ('trunk', 0.4375), ('depth', 0.4), ('inches', 0.3684210526315789), ('flukes', 0.35135135135135137)]

[('radney', 1.0), ('steelkilt', 1.0), ('lakeman', 1.0), ('"\'', 0.8970588235294118), ("?'", 0.875), (".'", 0.8611111111111112), ("!'", 0.8333333333333334), (",'", 0.8125), ('gentlemen', 0.7631578947368421), ('pumps', 0.7333333333333333)]

[('cetology', 1.0), ('baleen', 1.0), ('narwhale', 1.0), ('folio', 1.0), ('ii', 1.0), ('iii', 1.0), ('porpoise', 1.0), (').--', 1.0), ('octavo', 0.9166666666666666), ('possession', 0.8333333333333334)]

[('forge', 1.0), ('blacksmith', 1.0), ('ginger', 0.9375), ('guernsey', 0.9090909090909091), ('spades', 0.9), ('needle', 0.7272727272727273), ('compasses', 0.7), ('perth', 0.6111111111111112), ('monkey', 0.5), ('needles', 0.45454545454545453)]

[('gabriel', 1.0), ('sinking', 1.0), ('bucket', 1.0), ('tun', 1.0), ('derick', 0.9166666666666666), ('jeroboam', 0.9), ('letter', 0.6428571428571429), ('noise', 0.4166666666666667), ('floats', 0.2727272727272727), ('says', 0.2608695652173913)]

[('jonah', 1.0), ('george', 0.9), ('lesson', 0.8461538461538461), ('shipmates', 0.8214285714285714), ('perseus', 0.8181818181818182), ('prophet', 0.625), ('delight', 0.6111111111111112), ('berth', 0.4), ('feels', 0.38461538461538464), ('woe', 0.3548387096774194)]

[('tub', 0.8461538461538461), ('herd', 0.8461538461538461), ('circles', 0.8181818181818182), ('straits', 0.8125), ('lake', 0.7), ('host', 0.5), ('hemp', 0.45454545454545453), ('block', 0.45454545454545453), ('strip', 0.4), ('windlass', 0.36363636363636365)]

[('mrs', 1.0), ('bunger', 1.0), ('hussey', 0.9411764705882353), ('um', 0.9166666666666666), ('clam', 0.9090909090909091), ('ramadan', 0.8181818181818182), ('ambergris', 0.7692307692307693), ('pulpit', 0.75), ('ladder', 0.75), ('cod', 0.5384615384615384)]

[('dick', 1.0), ('supplied', 1.0), ('late', 1.0), ('school', 1.0), (')', 1.0), ('pale', 1.0), ('--', 1.0), ('coat', 1.0), (',', 1.0), ('heart', 1.0)]

##HMTM 2
Training and viewing result

In [None]:
model_hmtm_2 = train_hmtm_model(0.1, 0.1, 1, 200, 50)

counters_hmtm_2 = model_hmtm_2.evaluate(relative = False)
for counter in counters_hmtm_2:
  print(counter.most_common(10))

Loading text document and formatting


  import sys
  if __name__ == '__main__':


Fitting the model
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Ite