In [1]:
# !python3 -m pip install gensim nltk pandas
# https://towardsdatascience.com/topic-modeling-with-latent-dirichlet-allocation-e7ff75290f8
# https://towardsdatascience.com/gaussian-mixture-model-clusterization-how-to-select-the-number-of-components-clusters-553bef45f6e4


# Topic extraction using LDA

## Importing modules & defining globals


In [2]:
import os
import numpy as np
import pandas as pd
from typing import Union

np.random.seed(42)

DATA_PATH = "../data/raw"


## Loading data


In [3]:
io = os.path.join(DATA_PATH, "Pubmed5k.xlsx")
sheet_name = "random 5k"
df = pd.read_excel(io, sheet_name, index_col=[0])


## Exploring the data


In [4]:
df.head()


Unnamed: 0_level_0,Title,Abstract
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1
34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...


TODO: add more EDA


## Following simple approach from [here](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

### Importing required packages & downloading assets


In [5]:
import nltk

nltk.download("wordnet")
nltk.download("stopwords")


[nltk_data] Downloading package wordnet to /home/prime/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/prime/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


### combining `gensim`'s stopwords w/ `nltk`'s


In [7]:
STOPWORDS = STOPWORDS.union(set(stopwords.words("english")))


### Preprocessing


In [8]:
# combine article title w/ abstract to obtain a single document
df_joined = df["Title"] + " " + df["Abstract"]
df_joined.name = "document"


def preprocess(
    document: str,
    min_len: int = 2,
    max_len: int = 15,
    stopwords: frozenset = frozenset(),
    pos: Union[str, list] = "n",
) -> str:
    """Tokenise the document, drop stopwords, then lowercasing and lemmatising each token

    TODO:
    """
    pp_doc = simple_preprocess(document, min_len=min_len, max_len=max_len)
    doc_non_stop = [token for token in pp_doc if token not in stopwords]
    lemmatise = WordNetLemmatizer().lemmatize
    if type(pos) == str:
        return [lemmatise(token, pos=pos) for token in doc_non_stop]
    else:
        result = doc_non_stop
        for p in pos:
            result = [lemmatise(token, pos=p) for token in result]
        return result


# preprocessing the corpus
pos = ["a", "n", "r", "s", "v"]
df_processed = df_joined.apply(preprocess, stopwords=STOPWORDS, pos=pos)


In [9]:
# show a random document
" ".join(df_processed.iloc[np.random.randint(0, len(df_processed))])


'research cooperative mechanism government enterprise basin ecological compensation base differential game ecological compensation important mean basin pollution control exist research mainly focus government level ignore important role enterprise paper introduce enterprise process ecological compensation firstly suppose ecological compensation compose government enterprise government dominant position ecological compensation input government enterprise produce social reputation ecological compensation enterprise produce advertise effect consumer demand affect social reputation advertise effect compensation strategy government enterprise analyze construct differential game model research show certain condition cost share mechanism realize pareto improvement benefit government enterprise cooperative mechanism benefit government enterprise optimal finally validity conclusion verify case analysis sensitivity analysis relevant parameter carry conclusion provide reference government establi

### Counting tokens


In [10]:
dictionary = Dictionary(df_processed)


### Filtering out words


In [11]:
# removing tokens that appear in less that 1% (.01) or in more than 50% (.5) of the corpus
dictionary.filter_extremes(int(0.01 * len(df_processed)), 0.5, None)


### Creating BoW


In [12]:
df_bow = df_processed.apply(dictionary.doc2bow)


### Creating TF-IDF model


In [13]:
tfidf_model = TfidfModel(df_bow.values.tolist())


In [14]:
df_tfidf = df_bow.apply(lambda doc: tfidf_model[doc])


### LDA model


In [15]:
lda_model = LdaMulticore(df_bow, id2word=dictionary)
lda_model.print_topics()


[(69,
  '0.011*"cell" + 0.009*"cd" + 0.009*"day" + 0.009*"age" + 0.008*"case" + 0.007*"tumor" + 0.007*"covid" + 0.007*"group" + 0.006*"high" + 0.006*"present"'),
 (34,
  '0.012*"dna" + 0.010*"data" + 0.009*"cell" + 0.008*"infection" + 0.007*"high" + 0.007*"patient" + 0.007*"model" + 0.007*"base" + 0.007*"analysis" + 0.006*"approach"'),
 (3,
  '0.009*"process" + 0.009*"clinical" + 0.009*"cell" + 0.009*"patient" + 0.008*"review" + 0.007*"include" + 0.007*"indicator" + 0.007*"care" + 0.007*"dna" + 0.006*"damage"'),
 (62,
  '0.010*"patient" + 0.008*"analysis" + 0.008*"screen" + 0.008*"data" + 0.007*"sequence" + 0.007*"method" + 0.006*"present" + 0.006*"age" + 0.005*"health" + 0.005*"disease"'),
 (8,
  '0.024*"cancer" + 0.015*"cell" + 0.014*"patient" + 0.009*"effect" + 0.009*"treatment" + 0.009*"increase" + 0.008*"high" + 0.008*"health" + 0.007*"base" + 0.006*"screen"'),
 (70,
  '0.012*"treatment" + 0.010*"protein" + 0.008*"high" + 0.007*"case" + 0.007*"cell" + 0.007*"increase" + 0.007*"can

In [16]:
def get_top_n_topics(
    corpus: pd.DataFrame, model: LdaMulticore, n: int = 3
) -> pd.DataFrame:
    """calculates the top `n` topics for each document in `df` through the model
    
    Parameters:
    -----------
    
    Returns:
    --------
    
    TODO: complete the pydoc
    """
    # if not isinstance(df, pd.DataFrame):
    #     df = pd.DataFrame(df)
    df_lda = corpus.apply(
        lambda doc: sorted(model[doc], key=(lambda tup: tup[1]), reverse=True)[:n]
    )
    corpus = pd.DataFrame(corpus)

    for i in range(n):
        corpus[f"topic_{i+1}"] = df_lda.apply(
            lambda topics: topics[i][0] if len(topics) > i else None
        )
        corpus[f"topic_{i+1}_prop"] = df_lda.apply(
            lambda topics: topics[i][1] if len(topics) > i else None
        )

    # FIXME: some documents might have less than `n` possible topics
    # to avoid N/As, set them to the previous topic
    for i in range(2, n + 1):
        idx = corpus[f"topic_{i}"].isna()
        cols = [f"topic_{i}", f"topic_{i}_prop"]
        cols_prev = [f"topic_{i-1}", f"topic_{i-1}_prop"]
        corpus.loc[idx, cols] = corpus.loc[idx, cols_prev].values

    return corpus.drop(columns=["document"])


In [17]:
df_topic = get_top_n_topics(df_bow, lda_model, 3)
df_topic.head()


In [19]:
df_topic.to_csv(os.path.join(DATA_PATH, "bow_100_topic.csv"))


In [20]:
lda_model = LdaMulticore(df_tfidf, id2word=dictionary)
lda_model.print_topics()


[(53,
  '0.006*"model" + 0.005*"patient" + 0.005*"data" + 0.004*"human" + 0.004*"group" + 0.004*"chronic" + 0.004*"extraction" + 0.004*"method" + 0.004*"hand" + 0.004*"function"'),
 (48,
  '0.009*"habitat" + 0.007*"social" + 0.007*"fusion" + 0.007*"model" + 0.006*"disease" + 0.006*"distribution" + 0.006*"patient" + 0.006*"fluid" + 0.005*"inhibitor" + 0.005*"health"'),
 (24,
  '0.008*"patient" + 0.008*"health" + 0.008*"group" + 0.007*"cov" + 0.007*"sars" + 0.005*"covid" + 0.005*"complication" + 0.005*"disease" + 0.005*"post" + 0.005*"community"'),
 (60,
  '0.007*"specie" + 0.007*"covid" + 0.006*"patient" + 0.005*"review" + 0.005*"cod" + 0.005*"alcohol" + 0.005*"infection" + 0.005*"disease" + 0.005*"sequence" + 0.004*"consumption"'),
 (82,
  '0.007*"health" + 0.005*"behaviour" + 0.005*"cell" + 0.005*"problem" + 0.004*"patient" + 0.004*"data" + 0.004*"culture" + 0.004*"protein" + 0.004*"technique" + 0.004*"covid"'),
 (28,
  '0.008*"patient" + 0.008*"protein" + 0.008*"systemic" + 0.006*"in

In [21]:
df_topic = get_top_n_topics(df_tfidf, lda_model, 3)
df_topic.head()


Unnamed: 0_level_0,topic_1,topic_1_prop,topic_2,topic_2_prop,topic_3,topic_3_prop
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34153941,7,0.231999,2.0,0.204838,85.0,0.159615
34153942,67,0.334933,44.0,0.207649,70.0,0.195407
34153964,72,0.326289,85.0,0.274621,24.0,0.166924
34153968,1,0.37717,11.0,0.313724,95.0,0.193986
34153978,16,0.301278,64.0,0.190193,31.0,0.146866


In [22]:
df_topic.to_csv(os.path.join(DATA_PATH, "tfidf_100_topic.csv"))
