# Load modules

In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import codecs
import string
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

import spacy

nlp = spacy.load("en")

import time
from sys import stdout
import cPickle as pickle

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import logging

# Configure the root logger: show INFO and above, each record prefixed with
# its timestamp and severity.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s : %(levelname)s : %(message)s",
)

In [None]:
# Every project path is relative to the working directory ("" adds no prefix).
_base_directory = ""

# Input side: HTML transcripts and the per-speech text files extracted from them.
corpora_directory = os.path.join(_base_directory, "corpora")
hansard_speeches_directory = os.path.join(corpora_directory, "hansard-speeches")
hansard_transcript_directory = os.path.join(corpora_directory, "hansard-transcripts")

# Output side: saved models/corpora and generated figures.
figures_directory = os.path.join(_base_directory, "figures", "hansard")
saves_directory = os.path.join(_base_directory, "saves/hansard")

In [None]:
def check_create_directory(directory):
    """Create `directory` (including missing parents) unless it already exists."""
    if os.path.exists(directory):
        return

    os.makedirs(directory)

In [None]:
# Make sure every output directory exists before anything is written to it.
for _directory in (
    saves_directory,
    hansard_speeches_directory,
    figures_directory,
):
    check_create_directory(_directory)

# Get speeches from hansard transcripts

In [None]:
def _speaker_name(ptag):
    """Return the speaker label found in a paragraph's <strong> or <b> tag.

    <strong> is preferred. If it is absent or has no single string, <b> is
    used. Returns the placeholder "key" when no usable tag exists, and None
    when <b> exists but has no single string.
    """
    strong = ptag.strong
    if strong is not None and strong.string is not None:
        return strong.string.strip("\n")

    bold = ptag.b
    if bold is None:
        return "key"
    if bold.string is None:
        # Kept for compatibility: a None speaker is passed on to the caller.
        return None
    return bold.string.strip("\n")


def _opening_paragraph(ptag):
    """Return the text that follows the speaker label in its own paragraph.

    The text is taken from the third child onwards; presumably the first two
    children are leading whitespace and the bold speaker tag (TODO confirm
    against the transcript markup). Children without a single string are
    skipped, and a paragraph with fewer than three children yields "".
    """
    pieces = []
    for node in ptag.contents[2:]:
        text = node.string
        if text is not None:
            pieces.append(text.strip("\n"))
    return "".join(pieces)


def get_speeches(soup):
    """Get the speeches of a transcript in html markup.

    This function:
        (a) treats every <p> that contains a <b> or <strong> tag as the start
        of a speech (a heuristic: bold is also used for other transcript text
        such as "absent" or "present"),
        (b) assigns all paragraphs from that <p> up to the next bold <p> (or
        the end of the document) to the same speech,
        (c) returns each speech as a single-entry dictionary {speaker: list of
        paragraph strings}.

    Nodes that have no single string (for example <br/> or a tag with several
    children) are skipped instead of re-appending the previous paragraph.

    Parameter:
        BeautifulSoup object (BeautifulSoup(html file, "html.parser"))

    Return:
        list of dictionaries where {key: value} pairs are {speaker: speech}
        pairs, where speech is a list of paragraphs in the speech.
    """
    ptags = list(soup.find_all("p"))

    # positions of the paragraphs that open a speech
    btag_location = [
        position
        for position, ptag in enumerate(ptags)
        if ptag.b is not None or ptag.strong is not None
    ]

    # each speech ends where the next one starts; the last runs to the end
    end_locations = btag_location[1:] + [len(ptags)]

    speeches = []
    for start_index, end_index in zip(btag_location, end_locations):
        key = _speaker_name(ptags[start_index])
        paragraphs = [_opening_paragraph(ptags[start_index])]

        for ptag in ptags[start_index + 1 : end_index]:
            for content in ptag.contents:
                text = content.string
                if text is not None:
                    paragraphs.append(text.strip("\n"))

        speeches.append({key: paragraphs})

    return speeches

In [None]:
def new_numbered_filename(counter, directory=hansard_speeches_directory):
    """Return the path of the file "<counter>.txt" inside `directory`."""
    return os.path.join(directory, str(counter) + ".txt")

In [None]:
def check_procedural_speech(speech, char_limit=150, word_limit=30):
    """Tag a speech as a 'procedural' speech using a crude heuristic.

    After collapsing all whitespace runs (including newlines) to single
    spaces, a speech is 'procedural' if:
    (i)  its length in characters is at most char_limit, or
    (ii) its number of space-separated words is at most word_limit.

    Examples are:
        (i)   Madam Speaker, I agree (consent, concur, etc.)
        (ii)  Does the Leader of the House have the general assent of the hon Members present to so move?
        (iii) Order. Leader of the House.
        (iv)  Mr Speaker, may I take Question Nos 2 and 3 together?
        (v)   Mr Saktiandi, keep it short, please.
        (vi)  Yes, Mr Seah Kian Peng.

    Argument:
        speech is a string/unicode variable
        char_limit is the largest character count still considered procedural
        word_limit is the largest word count still considered procedural

    Return:
        Boolean: True if procedural, False otherwise
    """

    # normalise whitespace first so blank lines and padding do not inflate
    # the character count or the word count
    speech = re.sub(r"\s+", " ", speech).strip()

    return (len(speech) <= char_limit) or (len(speech.split(" ")) <= word_limit)

### Get speeches of hansard transcripts
Helper functions:
   1. get_speeches(soup)
   2. check_procedural_speech(text)
 
Steps:
1. get all transcript file names from directory
2. pass transcript (html format) to bs to get soup object
3. use get_speeches(soup) to obtain speaker-speech chunks 
4. write each speech to its own sequentially numbered text file if check_procedural_speech(speech) is False

In [None]:
# Split every transcript into speeches and write each non-procedural speech
# to its own sequentially numbered text file.
transcript_filenames = sorted(os.listdir(hansard_transcript_directory))

# number of speech files written so far; also the name of the latest file
counter = 0

for filename in transcript_filenames:
    stdout.write("\rProcessing %s" % filename)

    # read the whole transcript into one string
    filepath = os.path.join(hansard_transcript_directory, filename)
    with codecs.open(filepath, "r", encoding="utf-8") as fh:
        transcript = " ".join(fh.readlines())

    soup = BeautifulSoup(transcript, "html.parser")
    speeches = get_speeches(soup)

    for speech_dict in speeches:
        for speaker, speech in speech_dict.items():
            # procedural speeches are not written out
            if check_procedural_speech(" ".join(speech)):
                continue

            counter += 1
            speech_path = new_numbered_filename(counter)

            with codecs.open(speech_path, "w", encoding="utf-8") as f:
                # the speaker opens the file; if it cannot be written
                # (TypeError, presumably a non-string speaker) an empty
                # first line is written instead
                try:
                    f.write(speaker)
                except TypeError:
                    f.write("\n")

                # one paragraph per line
                for paragraph in speech:
                    f.write(paragraph + "\n")

# Get sentences and train gensim Phraser

In [None]:
def clean_speech(speech):
    """Clean speeches from parliament transcript.

    Removes:
        (i)   timestamps (e.g. "5.55 pm", "11:00 am")
        (ii)  column markers (e.g. "column: 1234")
        (iii) page markers (e.g. "page: 12")
        (iv)  each parenthesised or bracketed aside, individually, so text
              lying between two separate asides is kept
        (v)   characters outside string.printable
        (vi)  digits
        (vii) runs of whitespace (collapsed to a single space)

    Argument:
        speech is a unicode/str variable (needs unicode for further spacy processing)

    Return:
        unicode/str var

    """

    # remove strange non-eng characters
    speech = "".join(char for char in speech if char in string.printable)

    # transcript markers; the two bracket patterns are non-greedy so that
    # "(a) text (b)" loses only "(a)" and "(b)", not the text in between
    marker_patterns = (
        r"page:\s*?[0-9]+",  # page markers e.g. page: xx
        r"[0-9]+[.:]\s*?[0-9]+\s*?pm",  # timestamp e.g. 5.55 pm
        r"[0-9]+[.:]\s*?[0-9]+\s*?am",  # timestamp e.g. 11:00 am
        r"column:\s*?[0-9]+",  # column markers e.g. column: xxxx
        r"\(.+?\)",  # anything in parentheses
        r"\[.+?\]",  # anything in brackets
    )
    speech = re.sub("|".join(marker_patterns), " ", speech)

    # remove digits
    speech = re.sub(r"[0-9]+", " ", speech)

    # collapse whitespace
    speech = re.sub(r"\s+", " ", speech)

    return speech

In [None]:
def return_token(token):
    """
    Decide whether a spaCy token (spacy.tokens.token.Token) should be kept.

    A token is rejected (False) when any of these holds, and kept (True)
    otherwise:
        (i)   its entity type is one of person, date, time, percent, money,
              quantity, ordinal or cardinal
        (ii)  it is punctuation
        (iii) it is a stopword
    Entity types found here: https://spacy.io/usage/linguistic-features

    Argument:
        spacy.tokens.token.Token object.

    Return:
        Boolean.
    """
    excluded_entity_types = (
        "PERSON",
        "DATE",
        "TIME",
        "PERCENT",
        "MONEY",
        "QUANTITY",
        "ORDINAL",
        "CARDINAL",
    )

    rejected = (
        token.ent_type_ in excluded_entity_types
        or token.is_punct
        or token.is_stop
    )

    return not rejected

In [None]:
def sentence_stream(directory=hansard_speeches_directory):
    """
    Generator: iterate over all documents in `directory`
    (default: hansard_speeches_directory) and yield one sentence at a time
    as a list of tokens:

        (i)   read the document from each file
        (ii)  clean the document content using clean_speech()
        (iii) process it using spacy's nlp
        (iv)  keep the lower-cased lemma of every token accepted by return_token()
        (v)   yield the list of tokens of each sentence

    Files are read as 1.txt, 2.txt, ... up to the number of entries in
    `directory`, so the directory is expected to hold only such files.

    Argument:
        Directory containing document files.

    Return:
        List of sentence tokens for each sentence.

    """
    # count the files in the directory that was actually passed in
    number_of_files = len(os.listdir(directory))

    for file_no in range(1, number_of_files + 1):
        filepath = os.path.join(directory, str(file_no) + ".txt")
        with codecs.open(filepath, "r", encoding="utf-8") as fh:
            speech = " ".join(fh.readlines())

        # basic cleaning of speech
        speech = clean_speech(speech)

        # get sentences
        parsed_speech = nlp(speech, disable=["tagger"])

        for sentence in parsed_speech.sents:
            yield [
                token.lemma_.lower() for token in sentence if return_token(token)
            ]

In [None]:
# List the named entities of a parsed speech, one per line.
# NOTE(review): in the visible code `parsed_speech` is only assigned inside
# sentence_stream(); it must already exist in the session for this cell to run.
for num, entity in enumerate(parsed_speech.ents):
    print("Entity {}: {} - {}".format(num + 1, entity, entity.label_))
    print("")

In [None]:
# Materialise every tokenised sentence in memory (runs the whole generator).
ss = list(sentence_stream())

In [None]:
# Display the tokenised sentences in the notebook.
ss

## Bigram phraser

In [None]:
# Collect bigram statistics over the sentence stream; candidate pairs are
# scored with NPMI and joined with "_".
bigram_phrases = Phrases(
    sentences=sentence_stream(),
    scoring="npmi",
    threshold=0.55,
    min_count=10,
    delimiter="_",
    max_vocab_size=40000000,
)

In [None]:
# Wrap the collected bigram statistics in a Phraser, which is what gets
# applied to token lists below.
bigram_phraser = Phraser(bigram_phrases)

In [None]:
# Show every sentence after bigram merging, separated by a rule.
for sent in ss:
    print(" ".join(bigram_phraser[sent]))
    print("---------------------------")

In [None]:
# Save the bigram phraser; despite its name, the variable holds a file path
# inside saves_directory.
bigram_phraser_directory = os.path.join(saves_directory, "bigram-phraser-hansard")
bigram_phraser.save(bigram_phraser_directory)

In [None]:
# Reload the saved bigram phraser (lets a session start from here without
# retraining).
bigram_phraser_directory = os.path.join(saves_directory, "bigram-phraser-hansard")
bigram_phraser = Phraser.load(bigram_phraser_directory)

## Trigram phraser

In [None]:
# Collect phrase statistics over the bigram-merged sentence stream, so that
# a merged bigram plus a neighbouring token can form a trigram.
trigram_phrases = Phrases(
    sentences=bigram_phraser[sentence_stream()],
    scoring="npmi",
    threshold=0.7,
    min_count=10,
    delimiter="_",
    max_vocab_size=40000000,
)

In [None]:
# Wrap the collected trigram statistics in a Phraser for application to
# bigram-merged token lists.
trigram_phraser = Phraser(trigram_phrases)

In [None]:
# Show every sentence after bigram and then trigram merging, separated by a rule.
for sent in bigram_phraser[ss]:
    print(" ".join(trigram_phraser[sent]))
    print("---------------------------")

In [None]:
# Save the trigram phraser; the variable holds a file path inside
# saves_directory.
trigram_phraser_directory = os.path.join(saves_directory, "trigram-phraser-hansard")
trigram_phraser.save(trigram_phraser_directory)

In [None]:
# Reload the saved trigram phraser.
trigram_phraser_directory = os.path.join(saves_directory, "trigram-phraser-hansard")
trigram_phraser = Phraser.load(trigram_phraser_directory)

# Tokenize speeches

In [None]:
def tokenized_corpus_stream(directory=hansard_speeches_directory):
    """Tokenise every speech file in `directory` and merge its phrases.

    For each file: read it, clean it with clean_speech(), keep the
    lower-cased lemma of every token accepted by return_token(), then apply
    bigram_phraser followed by trigram_phraser.

    Despite the name, this is not a generator: all documents are collected
    and returned together. Files are processed in os.listdir() order.

    Argument:
        Directory containing the speech files.

    Return:
        List with one phrase-merged token sequence per file.
    """
    # list the directory once; it also gives the total for the progress line
    speech_files = os.listdir(directory)
    number_of_files = len(speech_files)

    trigram_corpus = list()

    for ix, speech_file in enumerate(speech_files):
        stdout.write("\rProcessing %s/%s" % (ix + 1, number_of_files))

        filepath = os.path.join(directory, speech_file)
        with codecs.open(filepath, "r", encoding="utf-8") as fh:
            speech = " ".join(fh.readlines())

        # basic cleaning of speech
        speech = clean_speech(speech)

        # tokenized speech using nlp
        tokenized_speech = [
            token.lemma_.lower()
            for token in nlp(speech, disable=["tagger"])
            if return_token(token)
        ]

        # merge bigrams first, then trigrams on top of the merged bigrams
        bigram_speech = bigram_phraser[tokenized_speech]
        trigram_speech = trigram_phraser[bigram_speech]

        trigram_corpus.append(trigram_speech)

    return trigram_corpus

In [None]:
# Tokenise all speeches (one token sequence per speech file).
trigram_corpus = tokenized_corpus_stream()

In [None]:
# Pickle the tokenised corpus; the variable holds a file path inside
# saves_directory.
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-hansard")

with open(trigram_corpus_directory, "wb") as f:
    pickle.dump(trigram_corpus, f)

## Get dictionary

In [None]:
# load corpus: read the pickled tokenised corpus back into `corpus`
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-hansard")
with open(trigram_corpus_directory, "rb") as f:
    corpus = pickle.load(f)

In [None]:
# Build the token dictionary from the corpus loaded in the cell above, so
# this section also works in a fresh session where `trigram_corpus` has not
# been recomputed.
dictionary = Dictionary(corpus)

In [None]:
# Prune the vocabulary in place (per gensim's Dictionary.filter_extremes:
# no_below is a minimum document count, no_above a maximum document fraction).
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [None]:
# Save the filtered dictionary; the variable holds a file path.
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-hansard.dict")
dictionary.save(dictionary_directory)

## Vectorized tokenized corpus

In [None]:
# load corpus: the pickled tokenised corpus
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-hansard")
with open(trigram_corpus_directory, "rb") as f:
    trigram_corpus = pickle.load(f)

# load dictionary: the filtered dictionary saved earlier
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-hansard.dict")
dictionary = Dictionary.load(dictionary_directory)

In [None]:
# Convert every document to bag-of-words form with the dictionary and write
# the result to disk in Matrix Market format. The full list of vectors is
# built in memory before serialising.
vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-hansard.mm"
)
MmCorpus.serialize(
    vectorized_corpus_directory, [dictionary.doc2bow(doc) for doc in trigram_corpus]
)

In [None]:
# vectorized_corpus = [dictionary.doc2bow(doc) in tokenized_corpus_stream()]

In [None]:
# vectorized_corpus_directory = os.path.join(saves_directory, 'trigram-vectorized-corpus-hansard.mm')
# MmCorpus.serialize(vectorized_corpus_directory, vectorized_corpus)

# Find optimal k
Requires:
* vectorized_corpus
* dictionary
* corpus (in text form - list of lists of document tokens)

## Get coherence scores

In [None]:
# load vectorized_corpus as stream (MmCorpus reads from the file on disk)
vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-hansard.mm"
)
vectorized_corpus = MmCorpus(vectorized_corpus_directory)

# load dictionary
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-hansard.dict")
dictionary = Dictionary.load(dictionary_directory)

# load corpus: the tokenised texts, passed as `texts` to the coherence models
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-hansard")
with open(trigram_corpus_directory, "rb") as f:
    corpus = pickle.load(f)

In [None]:
# Empty score table: one column per coherence measure, indexed by num_topics.
_score_columns = ["c_v", "c_uci", "c_npmi", "u_mass"]
df = pd.DataFrame(columns=_score_columns + ["num_topics"])
df = df.set_index("num_topics")

In [None]:
MAX_TOPICS = 150  # exclusive bound: the largest k tried is MAX_TOPICS - 2
TOPN = 10  # top n words in topics to use when evaluating topic coherence
PROCESSES = 1  # passed to CoherenceModel as `processes`
COHERENCE_MEASURES = ("c_v", "c_uci", "c_npmi", "u_mass")

# For every even k in [2, MAX_TOPICS): train an LDA model, score it with each
# coherence measure, and checkpoint the score table to Excel.
for k in np.arange(2, MAX_TOPICS, 2):
    ALPHA = float(50) / k
    stdout.write(
        "\rTopic modelling %s topics (%s)" % (k, time.strftime("%Y-%m-%d %H:%M"))
    )

    # train base LDA model
    tm = LdaMulticore(
        corpus=vectorized_corpus,
        num_topics=k,
        id2word=dictionary,
        workers=2,
        chunksize=2000,
        passes=1,
        batch=False,
        alpha=ALPHA,
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=100,
        gamma_threshold=0.001,
        random_state=0,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
    )

    # score the model with every measure; the models differ only in `coherence`
    for measure in COHERENCE_MEASURES:
        coherence_model = CoherenceModel(
            model=tm,
            topics=None,
            texts=corpus,
            corpus=vectorized_corpus,
            dictionary=None,
            coherence=measure,
            topn=TOPN,
            processes=PROCESSES,
        )

        # .loc adds row k on first use (DataFrame.set_value is deprecated)
        df.loc[k, measure] = coherence_model.get_coherence()

    # rewrite the file after every k so a long run can be inspected or resumed
    df.to_excel("coherence-scores-hansard-alpha.xlsx")

In [None]:
# Reload the saved coherence scores, indexed by number of topics.
df = pd.read_excel("coherence-scores-hansard-alpha.xlsx", index_col="num_topics")

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Convert every column to numeric; values that cannot be parsed become NaN
# (errors="coerce").
df = df.apply(lambda c: pd.to_numeric(c, errors="coerce"))

In [None]:
# Matplotlib defaults applied to every figure below: larger fonts, larger
# markers and a 12 x 8 figure size.
glob_params = {
    "legend.fontsize": "xx-large",
    "figure.titlesize": "xx-large",
    "axes.labelsize": "xx-large",
    "axes.titlesize": "xx-large",
    "xtick.labelsize": "xx-large",
    "ytick.labelsize": "xx-large",
    "lines.markersize": 12.0,
    "figure.figsize": [12, 8],
}

plt.rcParams.update(glob_params)

## Plot c_v

In [None]:
# Number of topics with the highest c_v score. A Series has a single axis,
# so no axis argument is passed.
df["c_v"].idxmax()

In [None]:
# The five topic counts with the highest c_v scores.
df.nlargest(5, "c_v").index.tolist()

In [None]:
# Plot c_v against the number of topics and mark the best-scoring point.
c_v_directory = os.path.join(figures_directory, "c-v")

# idxmax() gives the index label (number of topics) of the maximum, which is
# the x coordinate wanted here; argmax() gives a position on current pandas.
local_max = (df["c_v"].idxmax(), df["c_v"].max())

# the label gives plt.legend() an entry to show
plt.plot(df.index, df["c_v"], label="c_v")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=(local_max),
    xytext=(local_max[0] + 5, local_max[1] - 0.005),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_v coherence measure")
plt.legend(loc="best")

plt.savefig(c_v_directory)
plt.show()

## Plot c_uci

In [None]:
# Number of topics with the highest c_uci score. A Series has a single axis,
# so no axis argument is passed.
df["c_uci"].idxmax()

In [None]:
# The five topic counts with the highest c_uci scores.
df.nlargest(5, "c_uci").index.tolist()

In [None]:
# Plot c_uci against the number of topics and mark the best-scoring point.
c_uci_directory = os.path.join(figures_directory, "c-uci")

# idxmax() gives the index label (number of topics) of the maximum, which is
# the x coordinate wanted here; argmax() gives a position on current pandas.
local_max = (df["c_uci"].idxmax(), df["c_uci"].max())

# the label gives plt.legend() an entry to show
plt.plot(df.index, df["c_uci"], label="c_uci")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=(local_max),
    xytext=(local_max[0] + 5, local_max[1] - 0.01),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_uci coherence measure")
plt.legend(loc="best")

plt.savefig(c_uci_directory)
plt.show()

## Plot c_npmi

In [None]:
# Number of topics with the highest c_npmi score. A Series has a single axis,
# so no axis argument is passed.
df["c_npmi"].idxmax()

In [None]:
# The five topic counts with the highest c_npmi scores.
df.nlargest(5, "c_npmi").index.tolist()

In [None]:
# Plot c_npmi against the number of topics and mark the best-scoring point.
c_npmi_directory = os.path.join(figures_directory, "c-npmi")

# idxmax() gives the index label (number of topics) of the maximum, which is
# the x coordinate wanted here; argmax() gives a position on current pandas.
local_max = (df["c_npmi"].idxmax(), df["c_npmi"].max())

# the label gives plt.legend() an entry to show
plt.plot(df.index, df["c_npmi"], label="c_npmi")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=(local_max),
    xytext=(local_max[0] + 5, local_max[1] - 0.001),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_npmi coherence measure")
plt.legend(loc="best")

plt.savefig(c_npmi_directory)
plt.show()

## Plot u_mass

In [None]:
# Number of topics with the highest u_mass score. A Series has a single axis,
# so no axis argument is passed.
df["u_mass"].idxmax()

In [None]:
# The five topic counts with the highest u_mass scores.
df.nlargest(5, "u_mass").index.tolist()

In [None]:
# Plot u_mass against the number of topics and point an arrow at the
# best-scoring point.
u_mass_directory = os.path.join(figures_directory, "u-mass")

# idxmax() gives the index label (number of topics) of the maximum, which is
# the x coordinate wanted here; argmax() gives a position on current pandas.
local_max = (df["u_mass"].idxmax(), df["u_mass"].max())

# the label gives plt.legend() an entry to show
plt.plot(df.index, df["u_mass"], label="u_mass")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
# the annotation text is placed at the maximum's coordinates rounded to the
# nearest ten (x) and to one decimal (y)
plt.annotate(
    "local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=(local_max),
    xytext=(round(local_max[0], -1), round(local_max[1], 1)),
    arrowprops=dict(facecolor="black"),
)

plt.xlabel("Number of topics")
plt.ylabel("u_mass coherence measure")
plt.legend(loc="best")

plt.savefig(u_mass_directory)
plt.show()

## Plot normalised scores

In [None]:
# normalising coherence scores
def normalise_scores(vector):
    """
    Min-max scale scores to the [0, 1] range using:
        x_normalised = (x - x_min)/(x_max - x_min)

    Argument:
        Iterable that provides .min() and .max() methods (e.g. a numpy
        array or a pandas Series)

    Returns:
        List of the scaled values, in input order
    """

    # the minimum and the spread are computed once, not per element
    lowest = vector.min()
    spread = vector.max() - lowest

    scaled = []
    for score in vector:
        scaled.append((score - lowest) / spread)

    return scaled

In [None]:
# Rescale every coherence column to [0, 1]. normalise_scores() returns a
# plain list, so each result is wrapped in a Series that carries the column's
# own index; this keeps num_topics as the index of the result, which the
# plot below uses for its x values.
transformed_scores_df = df.apply(
    lambda column: pd.Series(normalise_scores(column), index=column.index)
)

In [None]:
# Display the normalised scores in the notebook.
transformed_scores_df

In [None]:
# Plot the normalised c_v, c_uci and c_npmi curves together and mark the
# point where normalised c_npmi is highest.
normalised_directory = os.path.join(figures_directory, "normalised-scores")

# idxmax() gives the index label of the maximum, matching the x values used
# for the curves; argmax() gives a position on current pandas.
local_max = (
    transformed_scores_df["c_npmi"].idxmax(),
    transformed_scores_df["c_npmi"].max(),
)

# create plot; each curve is labelled so the legend can tell them apart
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_v"],
    color="teal",
    label="c_v",
)
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_uci"],
    color="orange",
    label="c_uci",
)
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_npmi"],
    color="maroon",
    label="c_npmi",
)

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.5)

plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=(local_max),
    xytext=(local_max[0] + 5, local_max[1] - 0.03),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("Normalised coherence scores [0, 1]")
plt.legend(loc="best")

# plt.savefig(normalised_directory)
plt.show()

In [None]:
# The five rows with the highest normalised c_v score.
transformed_scores_df.nlargest(5, "c_v")

# LDA
Requires:
* vectorized_corpus
* dictionary

In [None]:
# Load the saved dictionary used as id2word for the LDA models below.
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-hansard.dict")
dictionary = Dictionary.load(dictionary_directory)

In [None]:
# from gensim.test.utils import datapath

# Open the serialised bag-of-words corpus used to train the LDA models below.
vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-hansard.mm"
)

vectorized_corpus = MmCorpus(vectorized_corpus_directory)

In [None]:
# Number of topics and the matching alpha = 50 / K; the bare name on the
# last line displays the value in the notebook.
K = 80
ALPHA = float(50) / K
ALPHA

In [None]:
# Fit LDA with the current K and ALPHA: one pass over the corpus, one worker,
# a fixed random_state, and evaluation disabled (eval_every=None).
lda = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=K,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=1,
    alpha=ALPHA,
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=1000,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the model just trained; the file name records K = 80.
lda_directory = os.path.join(saves_directory, "lda-hansard-80-alpha.model")
lda.save(lda_directory)

#### K = 50 topics

In [None]:
# Number of topics and the matching alpha = 50 / K; the bare name on the
# last line displays the value in the notebook.
K = 50
ALPHA = float(50) / K
ALPHA

In [None]:
# Fit LDA with the current K and ALPHA, using the same settings as the other
# LDA cells; this rebinds `lda`.
lda = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=K,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=1,
    alpha=ALPHA,
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=1000,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the model just trained; the file name records K = 50.
lda_directory = os.path.join(saves_directory, "lda-hansard-50-alpha.model")
lda.save(lda_directory)

#### K = 100 topics

In [None]:
# Number of topics and the matching alpha = 50 / K; the bare name on the
# last line displays the value in the notebook.
K = 100
ALPHA = float(50) / K
ALPHA

In [None]:
# Fit LDA with the current K and ALPHA, using the same settings as the other
# LDA cells; this rebinds `lda`.
lda = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=K,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=1,
    alpha=ALPHA,
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=1000,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the model just trained; the file name records K = 100.
lda_directory = os.path.join(saves_directory, "lda-hansard-100-alpha.model")
lda.save(lda_directory)