# Executing and analysing the main BTM

Having prepared the data and selected our model, we will run the final model and save the outputs.

In [1]:
# Load libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import bitermplus as btm
from nltk.corpus import stopwords

# Turn off pandas' chained-assignment warning for this notebook
pd.set_option("mode.chained_assignment", None)

# ---- Run configuration ----

# How many topics the final model uses (value taken from the narrow search, 02/02a)
ntopics = 19

# Short tag describing this run; it is embedded in the output file names
run_type = "NONUMBERS_DEDUPED_AdStpwds"

# Folder that receives the outputs; file names are appended directly to this
# string, so it must end in "/"
output_folder = "/Users/jamiesanders/Dropbox/ClassifyingESTC/results/"

## Prepare data files

In [2]:
# Read the processed data set from the intermediate output folder
prepped_csv = "/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/estc_btm_prepped.csv"
filtered_data = pd.read_csv(prepped_csv)

# Build the BTM inputs from the cleaned titles
clean_titles = filtered_data["clean_title"]
texts = clean_titles.to_list()

# X is the documents-vs-words frequency matrix and vocabulary the word list;
# both are handed to the model constructor further down
X, vocabulary, vocab_dict = btm.get_words_freqs(clean_titles)

# Vectorise the documents against that vocabulary, then derive the biterms
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

## Run chosen model

In [3]:
# Hyperparameters of the chosen model, gathered in one place
btm_params = dict(
    n_dw=X,                 # Documents vs words frequency matrix
    vocabulary=vocabulary,  # List of all words
    T=ntopics,              # Number of topics
    alpha=50 / ntopics,     # Symmetric dirichlet prior probability of a topic P(z) (literature default)
    beta=0.01,              # Symmetric dirichlet prior probability of a word given the topic P(w|z) (literature default)
    seed=931,               # Random state seed, set for repeatability
)
model = btm.BTM(**btm_params)

# Fit on the vectorised documents and their biterms. The call is kept as the
# last expression so the array it returns is echoed as the cell output.
model.fit_transform(docs_vec, biterms, iterations=600, verbose=True)

100%|█████████████████████████████████████████| 600/600 [34:37<00:00,  3.46s/it]
100%|██████████████████████████████████| 94690/94690 [00:02<00:00, 33727.43it/s]


array([[1.38730805e-01, 3.11873431e-02, 1.79810720e-01, ...,
        5.59543005e-02, 6.44385594e-02, 1.19650092e-02],
       [6.95159611e-01, 4.62764515e-02, 2.92878767e-03, ...,
        4.58391991e-03, 3.45943121e-03, 9.56231569e-04],
       [2.15093172e-02, 2.89835766e-01, 1.72206031e-02, ...,
        2.65587258e-03, 5.49968573e-02, 1.35081717e-04],
       ...,
       [4.78529059e-02, 2.60241415e-03, 8.83031892e-03, ...,
        8.37687389e-03, 4.40198125e-03, 9.20453248e-03],
       [8.16216739e-02, 1.30081764e-01, 2.19133711e-02, ...,
        1.22861171e-02, 1.03983908e-01, 3.58054423e-03],
       [2.59215774e-03, 1.11068055e-01, 1.35624265e-02, ...,
        3.09698867e-02, 3.45043666e-02, 2.57143087e-02]])

## Saving output

In [6]:
# Shared file-name prefix for everything this run writes:
# <output_folder>BTM<ntopics>_<run_type>__
prefix = f"{output_folder}BTM{ntopics}_{run_type}__"

# Topic-vs-document probability matrix taken from the fitted model
# (rows are topics, columns are documents, as the attribute name and the
# preview below indicate)
topic_doc_mat = pd.DataFrame(model.matrix_topics_docs_)

# Preview the first rows
topic_doc_mat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94680,94681,94682,94683,94684,94685,94686,94687,94688,94689
0,0.1387308,0.69516,0.021509,2.122421e-06,1.160653e-06,0.001511,1.595839e-08,0.012673,0.004342,0.002427,...,0.004498,0.313115,0.04631,0.01074485,0.0009475545,0.04411096,0.03866,0.047853,0.081622,0.002592158
1,0.03118734,0.046276,0.289836,0.0001397354,3.617359e-07,0.02472,0.001963455,0.043833,0.164854,0.031784,...,0.158796,0.056213,0.064663,0.08404406,0.03723422,0.1438186,0.08504,0.002602,0.130082,0.1110681
2,0.1798107,0.002929,0.017221,1.416502e-06,7.746189e-07,0.019313,3.368735e-05,0.021223,0.006268,0.058344,...,0.017532,0.081998,0.018546,0.0161317,0.03079085,0.01382835,0.07804,0.00883,0.021913,0.01356243
3,0.04027834,0.015272,0.014795,6.765009e-07,0.05619319,0.128679,5.08658e-09,0.038688,0.256291,0.074795,...,0.279345,0.02593,0.026698,0.02473841,0.041277,0.02492005,0.136009,0.098291,0.011968,0.002658302
4,1.863697e-07,3e-06,1e-06,2.308252e-05,1.262276e-05,6.4e-05,1.735564e-07,5e-06,6.7e-05,0.000706,...,3e-06,3e-06,5.6e-05,1.599264e-10,1.821761e-10,1.316764e-07,0.002013,0.000105,0.000128,2.346361e-08


In [9]:
# Save the topic-vs-document matrix to Dropbox
topic_doc_mat.to_csv(prefix + "TOPIC_DOC_MAT.csv", index=False)

# Most characteristic words for each topic
topwords = btm.get_top_topic_words(model)

# Save to Dropbox
topwords.to_csv(prefix + "TOP_WORDS.csv", index=False)

# The built-in function for top docs isn't working, so the ranking is done by hand.
n_top_docs = 20  # Number of highest-scoring titles kept per topic

# Transpose so that each row is a document and each column a topic, then attach
# the topic scores to the document metadata by row position.
doc_topic_mat = topic_doc_mat.transpose()
assert len(doc_topic_mat) == len(filtered_data), (
    "topic_doc_mat and filtered_data describe different numbers of documents; "
    "the positional join below would misalign titles and topic scores"
)
results_joined = filtered_data.reset_index(drop=True).join(doc_topic_mat)

# For every topic, rank the documents by that topic's score and keep the top
# titles. Only the title and the one score column are sorted (instead of the
# whole joined frame); the sort key is the same column, so the resulting order
# is the same. Columns are collected in a dict and the frame is built once.
top_docs_by_topic = {}
for col in range(ntopics):
    top_titles = (
        results_joined[["title", col]]
        .sort_values(col, axis=0, ascending=False)["title"]
        .head(n_top_docs)
    )
    # Drop the original row labels so every column lines up on rank 0..n-1
    top_docs_by_topic[col] = top_titles.reset_index(drop=True)

top_docs = pd.DataFrame(top_docs_by_topic)

# Save to Dropbox
top_docs.to_csv(prefix + "TOP_DOCS.csv", index=False)