# Trying some BTM models and evaluating output

The metrics from 01 and 02 indicate a suitable range for the number of topics. Here we fit a couple of candidate models from that range and evaluate their output.


In [1]:
# Load libraries
import io
import os

import bitermplus as btm
import dropbox
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

# Connect to Dropbox for raw data.
# The access token is read from the DROPBOX_ACCESS_TOKEN environment variable so
# it never has to be pasted into (and accidentally committed with) the notebook.
access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
if not access_token:
    raise RuntimeError(
        "Set the DROPBOX_ACCESS_TOKEN environment variable before running this notebook."
    )
dbx = dropbox.Dropbox(access_token)

# Disable pandas' chained-assignment warning for the whole notebook
pd.options.mode.chained_assignment = None

## Prepare data files

In [2]:
# Download the prepped CSV from Dropbox and parse it into a DataFrame.
# files_download hands back a pair; only the response body is used below.
md, response = dbx.files_download(
    "/ClassifyingESTC/intermediate_output/estc_btm_prepped.csv"
)
csv_buffer = io.BytesIO(response.content)
filtered_data = pd.read_csv(csv_buffer, encoding="ISO-8859-1")

# Build the BTM inputs from the cleaned titles:
#   X / vocabulary / vocab_dict -> word-frequency outputs of get_words_freqs
#   docs_vec                    -> titles vectorized against the vocabulary
#   biterms                     -> biterms derived from the vectorized titles
clean_titles = filtered_data["clean_title"]
texts = clean_titles.to_list()
X, vocabulary, vocab_dict = btm.get_words_freqs(clean_titles)
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

## Run select models

In [3]:
# (1) Trying 13 topics

# Single source of truth for the topic count: both T and alpha are derived from it,
# so they cannot drift apart when the number of topics is changed.
n_topics = 13

model = btm.BTM(
        n_dw = X, # Documents vs words frequency matrix
        vocabulary = vocabulary, # List of all words
        seed = 931, # Random state seed, set for repeatability
        T = n_topics, # Number of topics
        alpha = 50 / n_topics, # Symmetric dirichlet prior probability of a topic P(z) (literature default)
        beta = 0.01 # Symmetric dirichlet prior probability of a word given the topic P(w|z) (literature default)
)

# Keep the matrix returned by fit_transform (one row per document) so it can be
# reused later without refitting; the bare name on the last line still displays it.
doc_topics = model.fit_transform(docs_vec, biterms, iterations=600, verbose=True)
doc_topics

100%|█████████████████████████████████████████| 600/600 [27:04<00:00,  2.71s/it]
100%|██████████████████████████████████| 94690/94690 [00:02<00:00, 40056.94it/s]


array([[2.14151699e-01, 4.98752729e-02, 7.38823881e-02, ...,
        4.65718198e-02, 2.41953281e-05, 1.39938814e-02],
       [5.09094088e-02, 6.75891949e-02, 1.41212730e-02, ...,
        1.59483429e-02, 3.72211379e-08, 1.18754362e-03],
       [9.11382839e-02, 3.53062886e-01, 4.24519875e-02, ...,
        2.32339551e-03, 1.19961914e-05, 1.58628100e-04],
       ...,
       [2.40580405e-02, 2.35905710e-03, 1.16357707e-01, ...,
        1.16651206e-02, 1.06333290e-06, 9.22326986e-03],
       [2.49951698e-01, 1.51025402e-01, 1.31447158e-02, ...,
        2.26620890e-02, 1.38893774e-04, 3.79621910e-03],
       [6.72372160e-02, 9.85513256e-02, 6.00285846e-03, ...,
        3.40375052e-02, 1.04732567e-04, 3.93511681e-02]])

In [4]:
# Evaluate 13: table of the highest-ranked words for each topic of the 13-topic model

top_words_13 = btm.get_top_topic_words(model)
top_words_13

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12
0,parliament,church,god,english,th,use,sermon,king,tune,relat,cure,de,year
1,lord,answer,christ,book,sir,tabl,preach,written,love,king,diseas,anno,lord
2,majesti,wherein,christian,latin,day,sever,lord,english,new,great,use,regi,calcul
3,common,late,man,contain,william,part,god,histori,pleasant,letter,physick,et,almanack
4,hous,concern,lord,word,late,rule,late,majesti,young,armi,natur,die,meridian
5,king,england,wherein,use,account,art,church,england,man,lord,art,regni,bissextil
6,order,religion,word,set,execut,way,minist,french,good,true,medicin,ex,leapyear
7,assembl,law,life,church,relat,necessari,st,late,song,forc,way,ac,world
8,england,book,peopl,collect,london,make,london,year,true,command,bodi,angli,god
9,honour,vindic,true,prayer,sever,mani,right,first,lover,taken,sever,domini,creation


In [5]:
# (2) Trying 19 topics

# Single source of truth for the topic count: both T and alpha are derived from it,
# so they cannot drift apart when the number of topics is changed.
n_topics2 = 19

model2 = btm.BTM(
        n_dw = X, # Documents vs words frequency matrix
        vocabulary = vocabulary, # List of all words
        seed = 931, # Random state seed, set for repeatability
        T = n_topics2, # Number of topics
        alpha = 50 / n_topics2, # Symmetric dirichlet prior probability of a topic P(z) (literature default)
        beta = 0.01 # Symmetric dirichlet prior probability of a word given the topic P(w|z) (literature default)
)

# Keep the matrix returned by fit_transform (one row per document) so it can be
# reused later without refitting; the bare name on the last line still displays it.
doc_topics2 = model2.fit_transform(docs_vec, biterms, iterations=600, verbose=True)
doc_topics2

100%|█████████████████████████████████████████| 600/600 [36:53<00:00,  3.69s/it]
100%|██████████████████████████████████| 94690/94690 [00:03<00:00, 30994.32it/s]


array([[1.38730805e-01, 3.11873431e-02, 1.79810720e-01, ...,
        5.59543005e-02, 6.44385594e-02, 1.19650092e-02],
       [6.95159611e-01, 4.62764515e-02, 2.92878767e-03, ...,
        4.58391991e-03, 3.45943121e-03, 9.56231569e-04],
       [2.15093172e-02, 2.89835766e-01, 1.72206031e-02, ...,
        2.65587258e-03, 5.49968573e-02, 1.35081717e-04],
       ...,
       [4.78529059e-02, 2.60241415e-03, 8.83031892e-03, ...,
        8.37687389e-03, 4.40198125e-03, 9.20453248e-03],
       [8.16216739e-02, 1.30081764e-01, 2.19133711e-02, ...,
        1.22861171e-02, 1.03983908e-01, 3.58054423e-03],
       [2.59215774e-03, 1.11068055e-01, 1.35624265e-02, ...,
        3.09698867e-02, 3.45043666e-02, 2.57143087e-02]])

In [6]:
# Evaluate 19: table of the highest-ranked words for each topic of the 19-topic model

top_words_19 = btm.get_top_topic_words(model2)
top_words_19

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15,topic16,topic17,topic18
0,sir,church,king,god,de,year,th,church,use,act,tune,day,letter,anno,written,sermon,cure,parliament,year
1,lord,answer,majesti,christ,la,account,account,prayer,tabl,parliament,love,th,lord,regi,english,preach,diseas,lord,lord
2,honour,wherein,england,lord,le,relat,relat,book,english,england,new,book,armi,de,french,god,use,common,calcul
3,right,concern,ireland,man,en,histori,true,set,rule,majesti,pleasant,sale,sir,et,histori,late,physick,order,almanack
4,knight,late,lord,christian,imprimatur,present,execut,christian,art,blank,young,sold,relat,regni,author,church,natur,hous,meridian
5,william,england,princ,wherein,van,england,last,holi,ad,king,man,three,forc,die,book,minist,medicin,majesti,bissextil
6,court,religion,franc,peopl,end,great,late,english,contain,person,song,auction,great,ex,act,st,make,assembl,leapyear
7,earl,book,charl,word,du,part,sever,togeth,method,sever,lover,catalogu,command,angli,latin,lord,sort,publish,god
8,parliament,law,scotland,life,dialogu,contain,death,god,necessari,law,good,st,taken,domini,late,london,bodi,print,creation
9,case,vindic,great,true,et,mani,day,word,latin,kingdom,maid,near,sent,cum,first,reverend,way,concern,world


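**TODO:** compare the 13-topic and 19-topic solutions above and record the conclusion here. Placeholder: I think XX seems better.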