# Load modules

In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Standard library
import os
import re
import time
import string
import pickle
from sys import stdout

# Third-party - data/numeric
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Third-party - NLP
import spacy
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# Load spaCy model
nlp = spacy.load("en")

# Notebook settings
%matplotlib inline

In [None]:
import logging

logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)

In [None]:
# Folders, relative to the working directory, for the raw corpora and for the
# intermediate artefacts this notebook writes.
corpora_directory = os.path.join("corpora")
saves_directory = os.path.join("saves", "st")

In [None]:
def check_create_directory(directory):
    """Create `directory` (and any missing parents) unless the path already exists."""
    if os.path.exists(directory):
        return

    os.makedirs(directory)

In [None]:
# Make sure the output folder exists before anything is written to it.
check_create_directory(saves_directory)

# Pre-processing

## Load and clean the ST corpus

In [None]:
# Path of the Excel workbook with the raw articles.
st_file = os.path.join(corpora_directory, "all_articles_final.xlsx")

# The cleaning cell below reads the "content" column of this table.
df = pd.read_excel(st_file)

In [None]:
def clean_content(text):
    """
    Strip scraping artefacts, e-mail addresses and digits from a text.

    Argument:
        text: string to clean.

    Return:
        Cleaned string in which every run of whitespace is a single space.
    """
    # Literal fragments to delete, applied in this order. The raw strings
    # match an escaped "\n" / "\r" (backslash followed by a letter), not real
    # newline characters.
    remove_list = [r"\n',", r"'\n',", r'\n",', r'\n"', r"\n", r"\r", "$", r"\\", "\\"]

    for item in remove_list:
        text = text.replace(item, "")

    # remove email addresses; this has to run before digits are stripped,
    # otherwise an all-numeric local part (e.g. "12345@example.com") is reduced
    # to "@example.com", which the pattern no longer matches
    text = re.sub(r"\S+@\S+", "", text)

    # remove digits
    text = re.sub(r"[0-9]+", "", text)

    # replace multiple whitespaces with one whitespace
    text = re.sub(r"\s+", " ", text)

    return text


# Clean every article; the first and last character of each raw "content"
# value are dropped before cleaning.
# `DataFrame.get_value` was deprecated in pandas 0.21 and removed in 1.0, so
# the column is iterated directly (same values, same row order).
corpus_st = [clean_content(content[1:-1]) for content in df["content"]]

In [None]:
# NOTE: despite the name, this is the path of the pickle file itself.
corpus_st_directory = os.path.join(saves_directory, "corpus-st")

# Persist the cleaned corpus so it can be reloaded without re-running the cleaning.
with open(corpus_st_directory, "wb") as fh:
    pickle.dump(corpus_st, fh)

## Adding custom stopwords

In [None]:
# STOP_WORDS is not imported anywhere else in this notebook, so bring it into
# scope here to avoid a NameError.
from spacy.lang.en.stop_words import STOP_WORDS

# unicode literals (spacy reads only unicode); the u"" prefix is valid on both
# Python 2 and Python 3, unlike the Python-2-only `unicode()` builtin
additional_stopwords = [u"i", u"mr", u"dr", u"ms", u"tell", u"cent", u"reporter"]

# add to spacy's STOP_WORDS
STOP_WORDS.update(additional_stopwords)

# set word.is_stop to True for removal of stopwords
for stopword in STOP_WORDS:
    nlp.vocab[stopword].is_stop = True

## Getting sentences and training phrasers

In [None]:
corpus_st_directory = os.path.join(saves_directory, "corpus-st")

# Reload the pickled corpus. pickle can execute arbitrary code on load, so only
# open files this notebook produced itself.
with open(corpus_st_directory, "rb") as fh:
    corpus = pickle.load(fh)

In [None]:
def return_token(token):
    """
    Decide whether a spaCy token should be kept.

    A token is rejected when any of the following holds:
        (i)   its entity type is PERSON, DATE, TIME, PERCENT, MONEY,
              QUANTITY, ORDINAL or CARDINAL
        (ii)  it is punctuation
        (iii) it is a stopword
    Entity types found here: https://spacy.io/usage/linguistic-features

    Argument:
        spacy.tokens.token.Token object.

    Return:
        Boolean: True to keep the token, False to reject it.
    """
    excluded_entities = (
        "PERSON",
        "DATE",
        "TIME",
        "PERCENT",
        "MONEY",
        "QUANTITY",
        "ORDINAL",
        "CARDINAL",
    )

    rejected = (
        token.ent_type_ in excluded_entities or token.is_punct or token.is_stop
    )

    return not rejected

In [None]:
# Disabled cell (`if False`): eager variant that collects every sentence into a
# list and pickles it. It filters on punctuation and stopwords only; the
# `return_token` based filter is commented out.
if False:
    sentence_stream = list()

    c = 0

    for doc in corpus_st:
        c += 1
        stdout.write("\rGetting sentences from document %s/%s" % (c, len(corpus_st)))

        parsed_doc = nlp(doc, disable=["tagger"])

        for sentence in parsed_doc.sents:
            sentence_tokens = list()
            for token in sentence:
                if not token.is_punct and not token.is_stop:
                    sentence_tokens.append(token.lemma_.lower())

            #         sentence_tokens = [token.lemma_.lower() for token in sentence if return_token(token)]
            sentence_stream.append(sentence_tokens)

    sentence_stream_directory = os.path.join(saves_directory, "sentence-stream-st")

    with open(sentence_stream_directory, "wb") as fh:
        pickle.dump(sentence_stream, fh)

    print("\nSaved sentence_stream list to %s" % sentence_stream_directory)

In [None]:
def sentence_stream(corpus):
    """
    Generator: lazily yield the filtered tokens of every sentence in corpus.

    Argument:
        corpus: iterable of documents; each one is passed to `nlp` as text.

    Yields:
        List of lower-cased lemmas, one list per sentence, containing only
        the tokens accepted by `return_token`.
    """
    for text in corpus:
        # the tagger component is switched off for this parse
        for sent in nlp(text, disable=["tagger"]).sents:
            yield [tok.lemma_.lower() for tok in sent if return_token(tok)]

In [None]:
# sentence_stream_directory = os.path.join(saves_directory, 'sentence-stream-st')

# with open(sentence_stream_directory, 'rb') as fh:
#     sentence_stream = pickle.load(fh)

In [None]:
# len(sentence_stream)

In [None]:
# Materialise the generator once so the sentences can be inspected and reused.
ss = list(sentence_stream(corpus))

In [None]:
ss

## Bigram phraser

In [None]:
# Learn two-word collocations from the sentence stream. Detected phrases are
# joined with "_" and scored with NPMI; `threshold` is applied to that score.
bigram_phrases = Phrases(
    sentences=sentence_stream(corpus),
    min_count=10,
    threshold=0.7,
    max_vocab_size=40000000,
    delimiter="_",
    scoring="npmi",
)

In [None]:
# Wrap the trained Phrases model in a Phraser, which is what gets applied to sentences.
bigram_phraser = Phraser(bigram_phrases)

In [None]:
# Eyeball the bigram output. print(...) with a single argument behaves the same
# on Python 2 and is required on Python 3, where the print statement is a
# syntax error.
for sent in ss:
    print(" ".join(bigram_phraser[sent]))
    print("---------------------------")

In [None]:
# Save the bigram phraser (the variable holds a file path, not a directory).
bigram_phraser_directory = os.path.join(saves_directory, "bigram-phraser-st")
bigram_phraser.save(bigram_phraser_directory)

In [None]:
# Apply the bigram phraser to every sentence. `sentence_stream` is a generator,
# so this parses the whole corpus with spaCy again.
bigram_sentences = [bigram_phraser[sent] for sent in sentence_stream(corpus)]

In [None]:
# Pickle the bigram-transformed sentences.
bigram_sentence_stream_directory = os.path.join(
    saves_directory, "bigram-sentence-stream-st"
)

with open(bigram_sentence_stream_directory, "wb") as fh:
    pickle.dump(bigram_sentences, fh)

## Trigram phraser

In [None]:
# Second pass: train on sentences already run through the bigram phraser, so a
# bigram token can merge with a neighbouring token. Same settings as the bigram model.
trigram_phrases = Phrases(
    sentences=bigram_phraser[sentence_stream(corpus)],
    min_count=10,
    threshold=0.7,
    max_vocab_size=40000000,
    delimiter="_",
    scoring="npmi",
)

In [None]:
# Wrap the trained Phrases model in a Phraser, which is what gets applied to sentences.
trigram_phraser = Phraser(trigram_phrases)

In [None]:
# Eyeball the trigram output. print(...) with a single argument behaves the
# same on Python 2 and is required on Python 3, where the print statement is a
# syntax error.
for sent in bigram_phraser[ss]:
    print(" ".join(trigram_phraser[sent]))
    print("------------------------------")

In [None]:
# Save the trigram phraser (the variable holds a file path, not a directory).
trigram_phraser_directory = os.path.join(saves_directory, "trigram-phraser-st")
trigram_phraser.save(trigram_phraser_directory)

# Tokenize corpus

In [None]:
corpus_st_directory = os.path.join(saves_directory, "corpus-st")

# Reload the pickled corpus so this section can run on its own.
with open(corpus_st_directory, "rb") as fh:
    corpus_st = pickle.load(fh)

In [None]:
tokenized_corpus = []
total_docs = len(corpus_st)

for position, text in enumerate(corpus_st, start=1):
    stdout.write("\rTokenizing document %s/%s" % (position, total_docs))

    # parse the text with spaCy; the tagger and NER components are switched off
    parsed = nlp(text, disable=["tagger", "ner"])

    # keep the lower-cased lemma of every token that is neither punctuation
    # nor a stopword
    tokenized_corpus.append(
        [tok.lemma_.lower() for tok in parsed if not (tok.is_punct or tok.is_stop)]
    )


# write the result to disk
tokenized_corpus_directory = os.path.join(saves_directory, "tokenized_corpus-st")

with open(tokenized_corpus_directory, "wb") as fh:
    pickle.dump(tokenized_corpus, fh)

## Converting the tokenized corpus to phrases and vectorizing it

In [None]:
tokenized_corpus_directory = os.path.join(saves_directory, "tokenized_corpus-st")

# Reload the tokenized documents (one list of tokens per document).
with open(tokenized_corpus_directory, "rb") as fh:
    tokenized_corpus = pickle.load(fh)

In [None]:
bigram_phraser_directory = os.path.join(saves_directory, "bigram-phraser-st")
bigram_phraser = Phraser.load(bigram_phraser_directory)

# Each element of tokenized_corpus is a whole document here (the loop variable
# is named `sent`, but it is not a single sentence).
bigram_corpus = [bigram_phraser[sent] for sent in tokenized_corpus]

In [None]:
trigram_phraser_directory = os.path.join(saves_directory, "trigram-phraser-st")
trigram_phraser = Phraser.load(trigram_phraser_directory)

# Apply the trigram phraser on top of the bigram-transformed documents.
trigram_corpus = [trigram_phraser[sent] for sent in bigram_corpus]

In [None]:
# Pickle the phrase-merged corpus.
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-st")

with open(trigram_corpus_directory, "wb") as f:
    pickle.dump(trigram_corpus, f)

## Get dictionary

In [None]:
# load corpus
# NOTE: rebinds `corpus`, which earlier in the notebook held the cleaned raw texts.
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-st")
with open(trigram_corpus_directory, "rb") as f:
    corpus = pickle.load(f)

In [None]:
# Build the token <-> id mapping from the phrase-merged corpus.
dictionary = Dictionary(corpus)
# Drop tokens found in fewer than 10 documents or in more than 50% of documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)

dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-st.dict")
dictionary.save(dictionary_directory)

## Vectorized corpus

In [None]:
# Convert every document to bag-of-words using `corpus`, the trigram corpus
# that the "Get dictionary" section reloads from disk and builds `dictionary`
# from. `trigram_corpus` only exists if the phrase-conversion cells were run
# in the same session, so relying on it raises NameError on a fresh kernel.
vectorized_corpus = [dictionary.doc2bow(doc) for doc in corpus]

# Serialize in Matrix Market format so it can be streamed back with MmCorpus.
vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-st.mm"
)
MmCorpus.serialize(vectorized_corpus_directory, vectorized_corpus)

# Find optimal k
Requires:
* vectorized_corpus
* dictionary
* corpus (in text form - list of lists of document tokens)

## Get coherence scores

In [None]:
# Reload the three inputs of the coherence search from disk.

# load vectorized_corpus as stream
vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-st.mm"
)
vectorized_corpus = MmCorpus(vectorized_corpus_directory)

# load dictionary
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-st.dict")
dictionary = Dictionary.load(dictionary_directory)

# load corpus (token lists, passed as `texts` to the coherence models)
trigram_corpus_directory = os.path.join(saves_directory, "trigram-corpus-st")
with open(trigram_corpus_directory, "rb") as f:
    corpus = pickle.load(f)

In [None]:
# Empty results table: indexed by number of topics, one column per coherence measure.
df2 = pd.DataFrame(
    columns=["c_v", "c_uci", "c_npmi", "u_mass", "num_topics"]
).set_index("num_topics")

In [None]:
# df = pd.read_excel('coherence-scores-st.xlsx', index_col='num_topics')

In [None]:
MAX_TOPICS = 30
TOPN = 10  # top n words in topics to use when evaluating topic coherence
PROCESSES = 1  # number of worker processes used when estimating coherence
COHERENCE_MEASURES = ("c_v", "c_uci", "c_npmi", "u_mass")

# NOTE: np.arange excludes the stop value, so k runs 2, 4, ..., MAX_TOPICS - 2.
for k in np.arange(2, MAX_TOPICS, 2):
    stdout.write(
        "\rTopic modelling %s topics (%s)" % (k, time.strftime("%Y-%m-%d %H:%M"))
    )

    # train base LDA model
    tm = LdaMulticore(
        corpus=vectorized_corpus,
        num_topics=k,
        id2word=dictionary,
        workers=2,
        chunksize=2000,
        passes=1,
        batch=False,
        alpha="symmetric",
        eta=None,
        decay=0.5,
        offset=1.0,
        eval_every=10,
        iterations=100,
        gamma_threshold=0.001,
        random_state=0,
        minimum_probability=0.01,
        minimum_phi_value=0.01,
        per_word_topics=False,
    )

    # Score the model with every coherence measure; the measures differ only
    # in the `coherence` argument.
    for measure in COHERENCE_MEASURES:
        coherence_model = CoherenceModel(
            model=tm,
            topics=None,
            texts=corpus,
            corpus=vectorized_corpus,
            dictionary=None,
            coherence=measure,
            topn=TOPN,
            processes=PROCESSES,
        )

        # store coherence score; `DataFrame.set_value` was removed in pandas
        # 1.0, and `.loc` also creates the row for k when it does not exist yet
        df2.loc[k, measure] = coherence_model.get_coherence()
#     df2.to_excel('coherence-scores-st.xlsx')  # disabled: uncomment to overwrite the saved scores

In [None]:
# Load saved coherence scores. NOTE: this rebinds `df`, which earlier held the
# raw articles table.
df = pd.read_excel("coherence-scores-st.xlsx", index_col="num_topics")

In [None]:
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df = df.apply(lambda c: pd.to_numeric(c, errors="coerce"))

In [None]:
# Matplotlib defaults (font sizes, marker size, figure size) for the figures below.
glob_params = {
    "legend.fontsize": "xx-large",
    "figure.titlesize": "xx-large",
    "axes.labelsize": "xx-large",
    "axes.titlesize": "xx-large",
    "xtick.labelsize": "x-large",
    "ytick.labelsize": "x-large",
    "lines.markersize": 12.0,
    "figure.figsize": [12, 8],
}

plt.rcParams.update(glob_params)

In [None]:
# Output folder for the coherence plots.
figures_directory = os.path.join("figures", "st")

In [None]:
# Make sure the figures folder exists before plt.savefig writes to it.
check_create_directory(figures_directory)

## Plot c_v

In [None]:
df.nlargest(5, "c_v").index.tolist()

In [None]:
c_v_directory = os.path.join(figures_directory, "c-v-article")

# (x, y) of the best score. `idxmax` returns the index label (the number of
# topics); `Series.argmax` returns an integer position on pandas >= 1.0, which
# would put the marker and annotation at the wrong x coordinate.
local_max = (df["c_v"].idxmax(), df["c_v"].max())

# label the line so that plt.legend() below has an entry to show
plt.plot(df.index, df["c_v"], label="c_v")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=local_max,
    xytext=(local_max[0] + 5, local_max[1] - 0.01),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_v coherence measure")
plt.legend(loc="best")

plt.savefig(c_v_directory)
plt.show()

## Plot c_uci

In [None]:
df.nlargest(5, "c_uci").index.tolist()

In [None]:
c_uci_directory = os.path.join(figures_directory, "c-uci-article")

# (x, y) of the best score. `idxmax` returns the index label (the number of
# topics); `Series.argmax` returns an integer position on pandas >= 1.0, which
# would put the marker and annotation at the wrong x coordinate.
local_max = (df["c_uci"].idxmax(), df["c_uci"].max())

# label the line so that plt.legend() below has an entry to show
plt.plot(df.index, df["c_uci"], label="c_uci")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=local_max,
    xytext=(local_max[0] + 5, local_max[1] - 0.02),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_uci coherence measure")
plt.legend(loc="best")

plt.savefig(c_uci_directory)
plt.show()

## Plot c_npmi

In [None]:
df.nlargest(5, "c_npmi").index.tolist()

In [None]:
c_npmi_directory = os.path.join(figures_directory, "c-npmi-article")

# (x, y) of the best score. `idxmax` returns the index label (the number of
# topics); `Series.argmax` returns an integer position on pandas >= 1.0, which
# would put the marker and annotation at the wrong x coordinate.
local_max = (df["c_npmi"].idxmax(), df["c_npmi"].max())

# label the line so that plt.legend() below has an entry to show
plt.plot(df.index, df["c_npmi"], label="c_npmi")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "Local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=local_max,
    xytext=(local_max[0] + 5, local_max[1] - 0.005),
    size="x-large",
)

plt.xlabel("Number of topics")
plt.ylabel("c_npmi coherence measure")
plt.legend(loc="best")

plt.savefig(c_npmi_directory)
plt.show()

## Plot u_mass

In [None]:
df.nlargest(5, "u_mass").index.tolist()

In [None]:
u_mass_directory = os.path.join(figures_directory, "u-mass-article")

# (x, y) of the best score. `idxmax` returns the index label (the number of
# topics); `Series.argmax` returns an integer position on pandas >= 1.0, which
# would put the marker and annotation at the wrong x coordinate.
local_max = (df["u_mass"].idxmax(), df["u_mass"].max())

# label the line so that plt.legend() below has an entry to show
plt.plot(df.index, df["u_mass"], label="u_mass")

plt.plot(local_max[0], local_max[1], "o", color="red", alpha=0.7)

# https://matplotlib.org/users/annotations.html
plt.annotate(
    "local max\n (" + str(local_max[0]) + ", " + str(local_max[1])[:5] + ")",
    xy=local_max,
    # text is placed at the point's coordinates rounded to the nearest 10 / 0.1
    xytext=(round(local_max[0], -1), round(local_max[1], 1)),
    arrowprops=dict(facecolor="black"),
)

plt.xlabel("Number of topics")
plt.ylabel("u_mass coherence measure")
plt.legend(loc="best")

plt.savefig(u_mass_directory)
plt.show()

## Plot normalised scores

In [None]:
# normalising coherence scores
def normalise_scores(vector):
    """
    Min-max scale scores onto the closed interval [0, 1] using:
        x_normalised = (x - x_min)/(x_max - x_min)

    Argument:
        Iterable exposing .min() and .max() (e.g. numpy array, pandas Series)

    Returns:
        List with the rescaled values, in input order
    """

    # compute the extremes once instead of once per element
    lowest = vector.min()
    span = vector.max() - lowest

    rescaled = []
    for score in vector:
        rescaled.append((score - lowest) / span)

    return rescaled


# Rescale each column of df independently (DataFrame.apply works column-wise by default).
transformed_scores_df = df.apply(normalise_scores)

In [None]:
df.nlargest(5, "c_npmi").index.tolist()

In [None]:
df.nlargest(5, "c_v").index.tolist()

In [None]:
df.nlargest(5, "c_uci").index.tolist()

In [None]:
transformed_scores_df.nlargest(5, "c_npmi").index.tolist()

In [None]:
transformed_scores_df.nlargest(5, "c_v").index.tolist()

In [None]:
transformed_scores_df.nlargest(5, "c_uci").index.tolist()

In [None]:
normalised_directory = os.path.join(figures_directory, "normalised-scores-article")

# (x, y) of the best normalised c_npmi score; not drawn on this figure.
# `idxmax` returns the index label (the number of topics), whereas
# `Series.argmax` returns an integer position on pandas >= 1.0.
local_max = (
    transformed_scores_df["c_npmi"].idxmax(),
    transformed_scores_df["c_npmi"].max(),
)

# create plot; every line is labelled so that the legend can tell the three
# measures apart
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_v"],
    color="teal",
    label="c_v",
)
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_uci"],
    color="orange",
    label="c_uci",
)
plt.plot(
    transformed_scores_df.index,
    transformed_scores_df["c_npmi"],
    color="maroon",
    label="c_npmi",
)

plt.xlabel("Number of topics")
plt.ylabel("Normalised coherence scores [0, 1]")
plt.legend(loc="best")

plt.savefig(normalised_directory)
plt.show()

# LDA

In [None]:
# Reload the dictionary and the serialized bag-of-words corpus for the LDA runs below.
dictionary_directory = os.path.join(saves_directory, "trigram-dictionary-st.dict")
dictionary = Dictionary.load(dictionary_directory)

vectorized_corpus_directory = os.path.join(
    saves_directory, "trigram-vectorized-corpus-st.mm"
)
vectorized_corpus = MmCorpus(vectorized_corpus_directory)

## K*=40

In [None]:
# 40-topic model. Apart from num_topics, the settings are identical to the
# 30- and 50-topic runs below; the seed is fixed (random_state=0).
lda40 = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=40,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=2,
    alpha="symmetric",
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=200,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the 40-topic model (the variable holds a file path, not a directory).
lda40_directory = os.path.join(saves_directory, "lda-st-40.model")
lda40.save(lda40_directory)

## K=30

In [None]:
# 30-topic model. Apart from num_topics, the settings are identical to the
# 40- and 50-topic runs; the seed is fixed (random_state=0).
lda30 = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=30,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=2,
    alpha="symmetric",
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=200,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the 30-topic model (the variable holds a file path, not a directory).
lda30_directory = os.path.join(saves_directory, "lda-st-30.model")
lda30.save(lda30_directory)

## K=50

In [None]:
# 50-topic model. Apart from num_topics, the settings are identical to the
# 30- and 40-topic runs above; the seed is fixed (random_state=0).
lda50 = LdaMulticore(
    corpus=vectorized_corpus,
    num_topics=50,
    id2word=dictionary,
    chunksize=2000,
    passes=1,
    workers=2,
    alpha="symmetric",
    eta=None,
    decay=0.5,
    offset=1.0,
    eval_every=None,
    iterations=200,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=0,
    minimum_phi_value=0.01,
    per_word_topics=False,
)

In [None]:
# Save the 50-topic model (the variable holds a file path, not a directory).
lda50_directory = os.path.join(saves_directory, "lda-st-50.model")
lda50.save(lda50_directory)