# LDA

In [2]:
import csv
import html
import os
import urllib.parse

import pandas as pd

In [3]:
data_file = "./data/preprocessed_data.csv"

# Load the preprocessed reports into a DataFrame:
#   * on_bad_lines="warn" -> malformed rows produce a warning instead of an error
#   * dtype="str"         -> every column is read as text, numbers included
# Missing values are then replaced with empty strings.
df = pd.read_csv(data_file, on_bad_lines="warn", dtype="str").fillna("")

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(9605, 44)

In [5]:
# Show the first record to confirm which columns are available.
df.head(1)

Unnamed: 0.1,Unnamed: 0,MDR_REPORT_KEY,MDR_TEXT_KEY,TEXT_TYPE_CODE,PATIENT_SEQUENCE_NUMBER,DATE_REPORT,FOI_TEXT,DEVICE_EVENT_KEY,IMPLANT_FLAG,DATE_REMOVED_FLAG,...,COMBINATION_PRODUCT_FLAG,UDI-DI,UDI-PUBLIC,TOKENIZED_TEXT,NOPUNCT_TEXT,NOSTOPWORDS_TEXT,NODIGITS_TEXT,POS_TEXT,LEMMATIZED_TEXT,STEMMED_TEXT
0,106741,6383024,106903842,N,1,,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,,,,...,N,,,"['based', 'on', 'additional', 'information', '...","['based', 'on', 'additional', 'information', '...","['based', 'additional', 'information', 'receiv...","['based', 'additional', 'information', 'receiv...","[('based', 'VBN'), ('additional', 'JJ'), ('inf...","['base', 'additional', 'information', 'receive...","['base', 'addit', 'inform', 'receiv', 'complai..."


In [5]:
import gensim
from gensim.utils import simple_preprocess


def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, so non-string values are
    accepted. ``deacc=True`` strips accent marks from the tokens; punctuation
    is already discarded by the tokenizer itself.

    Args:
        sentences: Iterable of documents.

    Yields:
        The token list produced for each document, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)


# Tokenize the free-text narrative (FOI_TEXT) of every report.
data = df["FOI_TEXT"].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Preview: at most the first 30 tokens of the first document.
print(data_words[0][:30])

['based', 'on', 'additional', 'information', 'received', 'this', 'complaint', 'is', 'not', 'medtronic', 'product', 'if', 'information', 'is', 'provided', 'in', 'the', 'future', 'supplemental', 'report', 'will', 'be', 'issued']


In [6]:
# Build the bigram and trigram models.
# Both are trained on `data_words`, i.e. the tokens BEFORE stop-word removal.
# NOTE(review): make_bigrams() is later applied to the stop-word-filtered
# documents, so training and application inputs differ - confirm this is intended.
bigram = gensim.models.Phrases(
    data_words, min_count=5, threshold=100
)  # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
# NOTE(review): trigram_mod is only referenced by make_trigrams(), which is
# never called in the cells visible here.
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
# NLTK Stop words
import nltk

# nltk.download("stopwords")
from nltk.corpus import stopwords

# NLTK's English stop words plus a few extra tokens to drop.
stop_words = [*stopwords.words("english"), "from", "subject", "re", "edu", "use"]


# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens listed in ``stop_words``.

    Each document is converted with ``str()`` and run through
    ``simple_preprocess`` again before filtering, so ``texts`` may contain
    raw strings or already-tokenized lists.

    Args:
        texts: Iterable of documents.

    Returns:
        list[list[str]]: One filtered token list per input document.
    """
    # Snapshot the module-level list as a set so each membership test is a
    # hash lookup instead of a scan over the whole stop-word list.
    stop_set = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in stop_set]
        for doc in texts
    ]


def make_bigrams(texts):
    """Apply the module-level bigram phraser ``bigram_mod`` to each document.

    Args:
        texts: Iterable of token lists.

    Returns:
        list: ``bigram_mod[doc]`` for every document, in input order.
    """
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod[tokens])
    return phrased


def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to each document.

    Args:
        texts: Iterable of token lists.

    Returns:
        list: ``trigram_mod[bigram_mod[doc]]`` for every document, in input order.
    """
    phrased = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased.append(trigram_mod[with_bigrams])
    return phrased


def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag names are documented at https://spacy.io/api/annotation.

    Relies on the module-level spaCy pipeline ``nlp``, which must be assigned
    before this function is called.

    Args:
        texts: Iterable of token lists; each list is joined with single
            spaces and processed by ``nlp``.
        allowed_postags: Container of coarse POS tags (``token.pos_`` values)
            whose tokens are kept. The default is an immutable tuple so it
            cannot be modified accidentally between calls.

    Returns:
        list[list[str]]: The lemmas of the kept tokens, one list per document,
        in input order.
    """
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe processes the documents as a stream in batches, which avoids
    # the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [8]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's small English pipeline with the parser and NER stages disabled
# (for efficiency). lemmatization() reads this module-level `nlp`, so it has to
# be assigned before the call below.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(
    data_words_bigrams, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]
)

# Preview: at most the first 30 lemmas of the first document.
print(data_lemmatized[:1][0][:30])

['base', 'additional', 'information', 'receive', 'complaint', 'medtronic', 'product', 'information', 'provide', 'future', 'supplemental', 'report', 'issue']


In [9]:
# First document after stop-word removal and bigram merging, for comparison
# with its lemmatized form printed by the previous cell.
data_words_bigrams[0]

['based',
 'additional',
 'information',
 'received',
 'complaint',
 'medtronic',
 'product',
 'information',
 'provided',
 'future',
 'supplemental',
 'report',
 'issued']

In [10]:
import gensim.corpora as corpora

# Map each distinct lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized

# Bag-of-words encoding: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Preview: at most the first 30 pairs of the first document.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]


In [11]:
# Build the baseline LDA model on the full corpus: 10 topics, 10 passes,
# chunks of 100 documents, and a fixed random_state so reruns use the same seed.
# alpha and eta are not passed here, so the library defaults apply.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [12]:
from pprint import pprint

# Print the weighted keywords of each topic in the baseline model
pprint(lda_model.print_topics())
# Apply the model to the corpus.
# NOTE(review): doc_lda is not read by any cell visible here.
doc_lda = lda_model[corpus]

[(0,
  '0.150*"pain" + 0.113*"patient" + 0.107*"treatment" + 0.045*"implant" + '
  '0.029*"symptom" + 0.028*"surgery" + 0.028*"include" + 0.027*"complication" '
  '+ 0.026*"pelvic" + 0.023*"vaginal"'),
 (1,
  '0.085*"patient" + 0.066*"report" + 0.054*"scientific" + 0.053*"procedure" + '
  '0.038*"corporation" + 0.035*"implant" + 0.029*"advantage" + 0.027*"perform" '
  '+ 0.023*"experience" + 0.022*"mesh"'),
 (2,
  '0.469*"choose" + 0.053*"number" + 0.037*"send" + 0.028*"frequent" + '
  '0.027*"evidence" + 0.026*"code" + 0.022*"find" + 0.018*"slight" + '
  '0.017*"report" + 0.017*"packing"'),
 (3,
  '0.043*"vaginal" + 0.038*"mesh" + 0.021*"bladder" + 0.017*"sling" + '
  '0.016*"cystoscopy" + 0.015*"urinary" + 0.012*"note" + 0.011*"patient" + '
  '0.011*"anterior" + 0.010*"procedure"'),
 (4,
  '0.067*"representative" + 0.067*"patient" + 0.064*"implant" + 0.063*"report" '
  '+ 0.047*"sling" + 0.046*"urinary" + 0.028*"sphincter" + 0.027*"device" + '
  '0.025*"artificial" + 0.021*"due"'),
 

In [13]:
from gensim.models import CoherenceModel

# Compute the c_v coherence score of the baseline model against the
# lemmatized texts; this is the reference value for the tuning runs below.
# TODO: Pickle these as the baseline models
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence Score: ", coherence_lda)

Coherence Score:  0.5142455499661397


In [14]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: Id-to-word mapping used both to train the model and to
            score its coherence.
        k: Number of topics.
        a: Value passed to ``LdaMulticore`` as ``alpha``.
        b: Value passed to ``LdaMulticore`` as ``eta``.
        texts: Tokenized documents used for the coherence computation.
            Defaults to the module-level ``data_lemmatized``, i.e. the full
            set of documents even when ``corpus`` is only a subset.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    if texts is None:
        texts = data_lemmatized

    # TODO: Pickle each lda_model and coherence_model_lda
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the dictionary that was passed in; the previous version
    # silently used the global `id2word` and ignored the parameter.
    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=texts, dictionary=dictionary, coherence="c_v"
    )

    return coherence_model_lda.get_coherence()

In [15]:
import numpy as np
import tqdm

# NOTE(review): `grid` is not read by any other cell visible here.
grid = {}
grid["Validation_Set"] = {}

# Topics range
# range() excludes its stop value, so with these settings k takes the values
# 3 and 4; max_topics itself is never evaluated.
min_topics = 3
max_topics = 5
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: candidate values passed to LdaMulticore as `alpha`.
# A wider sweep is kept below for reference:
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append("symmetric")
# alpha.append("asymmetric")

alpha = [0.01, 0.5, "symmetric"]

# Beta parameter: candidate values passed to LdaMulticore as `eta`.
# A wider sweep is kept below for reference:
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append("symmetric")
beta = [0.01, 0.5, "symmetric"]

# Validation sets: the corpus clipped to 70% and 75% of its documents.
num_of_docs = len(corpus)

corpus_sets = [
    gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.70)),
    gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
]

# Labels for corpus_sets, in the same order.
corpus_title = ["70% Corpus", "75% Corpus"]

# Column-oriented accumulator: each grid-search run appends one value to
# every list, so the dict converts directly into a DataFrame.
model_results = {
    "Validation_Set": [],
    "Topics": [],
    "Alpha": [],
    "Beta": [],
    "Coherence": [],
}

# Can take a long time to run (the recorded run needed about 31 minutes for
# 36 fits). Change the condition to False to skip the search.
if True:
    total_runs = len(beta) * len(alpha) * len(topics_range) * len(corpus_title)

    # The context manager closes the progress bar even when a fit raises.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses, paired with their labels
        for title, corpus_subset in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(
                            corpus=corpus_subset, dictionary=id2word, k=k, a=a, b=b
                        )
                        # Save the model results
                        model_results["Validation_Set"].append(title)
                        model_results["Topics"].append(k)
                        model_results["Alpha"].append(a)
                        model_results["Beta"].append(b)
                        model_results["Coherence"].append(cv)

                        pbar.update(1)

    # Persist the full results table once every combination has been scored.
    pd.DataFrame(model_results).to_csv(
        "./data/lda_tuning_results.csv", index=False
    )

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [30:55<00:00, 51.54s/it]   


## Start of Analysis Section

In [16]:
# Turn the accumulated grid-search results into a DataFrame for analysis.
model_results_df = pd.DataFrame(model_results)

In [17]:
import pickle
import datetime

# Tag shared by the artifacts written in this session:
# "<YYYY-MM-DD>-<epoch seconds>". The epoch part is computed with
# datetime.timestamp() because the "%s" strftime directive is a
# platform-specific extension that is not available everywhere.
_tag_time = datetime.datetime.now()
TAG = f"{_tag_time:%Y-%m-%d}-{int(_tag_time.timestamp())}"

# Snapshot the tuning results so they survive a kernel restart.
# NOTE(review): the file name uses max_topics, which the topic range itself
# excludes - confirm this is the intended label.
with open(
    f"./data/model_results_df-{max_topics}-topics-{TAG}.pickle", "wb"
) as f:
    pickle.dump(model_results_df, f)

In [20]:
# Display the upper bound used for the topic range (excluded by range()).
max_topics

5

In [18]:
# Inspect the coherence score recorded for each parameter combination.
# TODO: Graph the coherence scores based on some common criteria
model_results_df

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,70% Corpus,3,0.01,0.01,0.581722
1,70% Corpus,3,0.01,0.5,0.585841
2,70% Corpus,3,0.01,symmetric,0.598325
3,70% Corpus,3,0.5,0.01,0.579251
4,70% Corpus,3,0.5,0.5,0.581705
5,70% Corpus,3,0.5,symmetric,0.584441
6,70% Corpus,3,symmetric,0.01,0.57985
7,70% Corpus,3,symmetric,0.5,0.588638
8,70% Corpus,3,symmetric,symmetric,0.588638
9,70% Corpus,4,0.01,0.01,0.557099


In [21]:
# Hyperparameters selected for the final model.
# NOTE(review): `alpha` and `beta` held the candidate lists in the grid-search
# cell; rebinding them to scalars here discards those lists until that cell is
# re-run.
num_topics = 4
alpha = 0.01
beta = "symmetric"

# Refit on the full corpus with the selected settings, using the same seed,
# chunk size and number of passes as the tuning runs. This replaces the
# baseline model previously bound to `lda_model`.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=alpha,
    eta=beta,
)

In [22]:
import pickle
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()

# Cache file for the prepared visualization data.
LDAvis_data_filepath = f"./data/ldavis_tuned_{num_topics}-topics-{TAG}.pickle"

# Preparing the visualization is a bit time consuming - change the condition
# to False to reuse an existing cache file instead of recomputing it.
if True:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, "wb") as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE: unpickling can execute arbitrary code; only load files written by
# this notebook.
with open(LDAvis_data_filepath, "rb") as f:
    LDAvis_prepared = pickle.load(f)

# Standalone HTML export. The name mirrors the pickle's; the topic count is
# no longer appended a second time after TAG, which corrupted the timestamp.
pyLDAvis.save_html(
    LDAvis_prepared,
    f"./data/ldavis_tuned_{num_topics}-topics-{TAG}.html",
)

LDAvis_prepared

## Publish HTML Output to S3

In [23]:
import os
import fnmatch
import subprocess

bucket = "praxis-2023-html-output"
website = f"http://{bucket}.s3-website-us-west-2.amazonaws.com"

# Recursively collect every "*.html" file below the current directory,
# sorted alphabetically by path.
file_list = sorted(
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(".")
    for name in fnmatch.filter(names, "*.html")
)

# Create the HTML file and write the header
index_path = os.path.join(".", "index.html")
with open(index_path, "w") as f:
    f.write(
        """<html>
        <head>
            <title>Praxis 2023 HTML Output</title>
            <style>
                table {
                    border-collapse: collapse;
                    width: 100%;
                }
                th, td {
                    text-align: left;
                    padding: 8px;
                }
                th {
                    background-color: #007bff;
                    color: #fff;
                    font-weight: bold;
                }
                tr:nth-child(even) {
                    background-color: #f2f2f2;
                }
                tr:hover {
                    background-color: #ddd;
                }
            </style>
        </head>
        <body>
            <table>
                <tr><th>Name</th><th>Size</th></tr>\n
    """
    )

    # Loop through each file and add a row to the table
    for file_name in file_list:
        # Skip the index itself. Comparing against index_path (built with
        # os.path.join, like the entries of file_list) also works where the
        # path separator is not "/".
        if file_name == index_path:
            continue

        file_size = os.path.getsize(file_name)
        # URLs always use "/" and must be percent-encoded; the visible label
        # must be HTML-escaped. Otherwise names containing spaces, "&", "#"
        # or "<" produce broken links or broken markup.
        url_path = urllib.parse.quote(file_name.replace(os.sep, "/"))
        label = html.escape(file_name)
        # Whole megabytes, rounded down (files under 1 MiB show as "0 MB").
        size_mb = file_size // 1048576
        f.write(
            f'<tr><td><a href="{website}/{url_path}" target="_blank" rel="noopener noreferrer">{label}</a></td><td>{size_mb} MB</td></tr>\n'
        )

    # Write the footer; the file is closed when the with-block exits
    f.write("</table></body></html>")

# Upload only the "*.html" files below the current directory to the bucket:
# everything is excluded first, then "*.html" is re-included.
command = [
    "aws",
    "s3",
    "sync",
    ".",
    f"s3://{bucket}",
    "--exclude",
    "*",
    "--include",
    "*.html",
    "--no-progress",
]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the output
print(output.stdout)
if output.returncode != 0:
    # stderr is captured too; without this a failed sync (bad credentials,
    # missing bucket, ...) would look identical to a sync with nothing to do.
    print(f"aws s3 sync exited with status {output.returncode}:\n{output.stderr}")
print("fin")

upload: 28-LDA/ldavis_tuned_25-topics-2023-04-01-168033058625.html to s3://praxis-2023-html-output/28-LDA/ldavis_tuned_25-topics-2023-04-01-168033058625.html
upload: ./index.html to s3://praxis-2023-html-output/index.html
upload: 31-NMF-with-Sentences/coherence_score.html to s3://praxis-2023-html-output/31-NMF-with-Sentences/coherence_score.html

fin
