In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import gensim
import spacy
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from gensim.models import CoherenceModel

## Dataset Exploration

In [2]:
#read dataset from excel file
dataset = pd.read_excel('Pubmed5k.xlsx')

In [3]:
dataset.head()

Unnamed: 0,ArticleID,Title,Abstract
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...


In [4]:
#check size of dataset
dataset.shape

(4999, 3)

In [5]:
#show sample of titles
dataset.iloc[0]['Title']

'Stable Coordination Variability in Overground Walking and Running at Preferred and Fixed Speeds.'

In [6]:
#show sample of abstract
dataset.iloc[0]['Abstract']

'Coordination variability (CV) is commonly analyzed to understand dynamical qualities of human locomotion. The purpose of this study was to develop guidelines for the number of trials required to inform the calculation of a stable mean lower limb CV during overground locomotion. Three-dimensional lower limb kinematics were captured for 10 recreational runners performing 20 trials each of preferred and fixed speed walking and running. Stance phase CV was calculated for 9 segment and joint couplings using a modified vector coding technique. The number of trials required to achieve a CV mean within 10% of 20 strides average was determined for each coupling and individual. The statistical outputs of mode (walking vs running) and speed (preferred vs fixed) were compared when informed by differing numbers of trials. A minimum of 11 trials were required for stable mean stance phase CV. With fewer than 11 trials, CV was underestimated and led to an oversight of significant differences between 

In [7]:
#check dataset info
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleID  4999 non-null   int64 
 1   Title      4999 non-null   object
 2   Abstract   4999 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [8]:
#check null values
dataset.isnull().sum()

ArticleID    0
Title        0
Abstract     0
dtype: int64

In [9]:
#check duplicates
dataset.duplicated().sum()

0

In [10]:
#check duplicates in columns
dataset['Abstract'].duplicated().sum()

10

* There are 10 duplicated values in the Abstract column.

In [11]:
#check duplicates in columns
dataset['Title'].duplicated().sum()

0

In [12]:
#check duplicates in columns
dataset['ArticleID'].duplicated().sum()

0

## Dataset Preprocessing

In [13]:
#check for Abstract Duplicates
dataset[dataset['Abstract'].duplicated()]

Unnamed: 0,ArticleID,Title,Abstract
2590,34669440,Peptide-based urinary monitoring of fibrotic n...,[Figure: see text].
2591,34669441,A rapid assay provides on-site quantification ...,[Figure: see text].
2592,34669442,Fatal enhanced respiratory syncytial virus dis...,[Figure: see text].
2593,34669443,Macrophage migration inhibitory factor drives ...,[Figure: see text].
2594,34669444,"Development of ICT01, a first-in-class, anti-B...",[Figure: see text].
3872,34258891,Too much of a good thing in ischemic mitral: l...,No abstract present.
3873,34258892,COVID-19 infection and cardiometabolic complic...,No abstract present.
3874,34258893,Comments on Cardiovascular effects of waterpip...,No abstract present.
3875,34258894,A case of COVID-19 infection quickly relieved ...,No abstract present.
4757,34425679,Study of anabolic activity of dry extracts of ...,This article presents the results of the study...


In [14]:
dataset[dataset['Abstract']==dataset.iloc[4757]['Abstract']]

Unnamed: 0,ArticleID,Title,Abstract
446,34237945,Studium anabolické aktivity suchých extrakt&#3...,This article presents the results of the study...
4757,34425679,Study of anabolic activity of dry extracts of ...,This article presents the results of the study...


In [15]:
dataset.iloc[446]['Title']

'Studium anabolické aktivity suchých extrakt&#367; list&#367; a oddenk&#367; Iris hungarica na modelu hydrokortizonem navozeného katabolismu bílkovin.'

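* Some samples have no real abstract, only a placeholder such as `[Figure: see text].` or `No abstract present.`
* One abstract appears twice: the two rows share the same abstract, but the title of one of them (index 446) is written in another language.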

In [16]:
# Collect the row labels to drop: every row whose abstract repeats an earlier one
is_repeat = dataset['Abstract'].duplicated()
duplicate_labels = dataset.loc[is_repeat].index.to_list()
# The last flagged label is the English-titled copy of the translated pair;
# keep that row and drop its twin with the non-English title (label 446) instead
duplicate_labels[-1] = 446
indices = np.array(duplicate_labels)

In [17]:
#drop duplicates
dataset.drop(index = indices,inplace=True)

In [18]:
#check after remove duplicates
dataset[dataset['Abstract'].duplicated()]

Unnamed: 0,ArticleID,Title,Abstract


In [19]:
#check for instances that have not abstract data
dataset[dataset['Abstract']=='[Figure: see text].']

Unnamed: 0,ArticleID,Title,Abstract
2589,34669439,DNA binding to TLR9 expressed by red blood cel...,[Figure: see text].


In [20]:
dataset[dataset['Abstract']=='No abstract present.']

Unnamed: 0,ArticleID,Title,Abstract
3871,34258890,Closing gaps in the care of patients with hear...,No abstract present.


In [21]:
# Drop the remaining rows whose abstract is only a placeholder string
placeholder_rows = dataset['Abstract'].isin(['[Figure: see text].', 'No abstract present.'])
dataset.drop(index=dataset.loc[placeholder_rows].index, inplace=True)

In [22]:
#check after dropping
dataset[(dataset['Abstract']=='[Figure: see text].')|(dataset['Abstract']=='No abstract present.')]

Unnamed: 0,ArticleID,Title,Abstract


In [23]:
dataset.shape

(4987, 3)

In [24]:
preprocessed_data = dataset.copy()

In [25]:
#filter dataset using regex
def filter_data(text):
    '''
     Remove special characters and digits, collapse whitespace and lowercase.
     Underscores count as word characters and are kept.
     * Parameters:
         text: (String) raw text
     * Return:
         (String) cleaned, lower-cased text
    '''
    # Raw strings stop Python from treating the regex escapes as (invalid)
    # string escapes. The character class no longer lists '+', so a literal
    # plus sign is removed like every other special character.
    without_symbols = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_symbols)
    # Collapse whitespace last: removing digits can leave runs of spaces behind
    return re.sub(r'\s+', ' ', without_digits).lower()

In [26]:
#remove digits and special characters
preprocessed_data['Title']    = preprocessed_data['Title'].apply(filter_data)
preprocessed_data['Abstract'] = preprocessed_data['Abstract'].apply(filter_data)

In [27]:
preprocessed_data.iloc[0]['Abstract']

'coordination variability cv is commonly analyzed to understand dynamical qualities of human locomotion the purpose of this study was to develop guidelines for the number of trials required to inform the calculation of a stable mean lower limb cv during overground locomotion threedimensional lower limb kinematics were captured for recreational runners performing trials each of preferred and fixed speed walking and running stance phase cv was calculated for segment and joint couplings using a modified vector coding technique the number of trials required to achieve a cv mean within of strides average was determined for each coupling and individual the statistical outputs of mode walking vs running and speed preferred vs fixed were compared when informed by differing numbers of trials a minimum of trials were required for stable mean stance phase cv with fewer than trials cv was underestimated and led to an oversight of significant differences between mode and speed future overground loc

### Extract topics from Abstract column
###### preprocessing: 
* remove stop words after filtering out special characters and digits
* lemmatize the words after splitting sentences into tokens
* use a bigram model to join frequently co-occurring word pairs into single tokens (in the code below this is applied before lemmatization)

In [28]:
#remove stop words and split sentences into tokens
def sent_to_words(sentences):
    '''
     Tokenize every sentence and discard stop words.
     * Parameters:
         sentences: iterable of strings
     * Return:
         list holding one list of kept tokens per input sentence
    '''
    return [
        [
            word
            for word in gensim.utils.simple_preprocess(sentence)
            if word not in gensim.parsing.preprocessing.STOPWORDS
        ]
        for sentence in sentences
    ]
 

In [29]:
#load spacy to lemmatize words
nlp = spacy.load('en_core_web_md',disable=['parser', 'ner'])

In [30]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    '''
     Lemmatize tokenized documents, keeping only the requested parts of speech.
     * Parameters:
         texts: iterable of token lists (one list per document)
         allowed_postags: POS tags (compared against token.pos_) to keep
     * Return:
         list holding one list of lemmas per input document
    '''
    # Honour the caller-supplied allowed_postags; it must not be overwritten
    # with the default inside the function. The default list is only read,
    # never mutated, so sharing it between calls is safe.
    texts_out = []
    for sent in texts:
        # Re-join the tokens so the module-level `nlp` pipeline can tag them
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [31]:
def make_bigrams(texts):
    '''
     Apply the module-level bigram_mod phraser to every tokenized document.
     * Parameters:
         texts: iterable of token lists
     * Return:
         list holding the phraser output for each document
    '''
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

### Split dataset into train and test

In [32]:
#split dataset
train_data,test_data = train_test_split(preprocessed_data,test_size=0.01,random_state=0)

In [33]:
#size of training data
len(train_data)

4937

In [34]:
#size of testing data
len(test_data)

50

In [35]:
# Tokenize the last column (the abstracts) of the training split
train_data_words = sent_to_words(train_data.iloc[:, -1].tolist())
# Same for the held-out split
data = test_data.iloc[:, -1].tolist()
test_data_words = sent_to_words(data)

In [36]:
# Build the bigram model from the training tokens (no trigram model is built in this notebook)
bigram = gensim.models.Phrases(train_data_words, min_count=3, threshold=10) # higher threshold fewer phrases.
# Phraser wrapper around the trained model; make_bigrams applies it to each document
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [37]:
train_data_words_bigrams = make_bigrams(train_data_words)
test_data_words_bigrams = make_bigrams(test_data_words)

In [38]:
train_data_lemmatized = lemmatization(train_data_words_bigrams)
test_data_lemmatized = lemmatization(test_data_words_bigrams)

In [39]:
print(train_data_lemmatized[0])

['evaluate_effect', 'lowcalorie', 'diet', 'hour_urinary', 'metabolic', 'parameter', 'obese', 'adult', 'idiopathic', 'calcium_oxalate', 'kidney', 'stonesadult', 'idiopathic', 'calcium_oxalate', 'stone', 'former', 'body_mass', 'index_bmi', 'kgm', 'know', 'lithogenic', 'metabolic', 'abnormality', 'submit', 'lowcalorie', 'diet', 'week', 'enrolment', 'anthropometric', 'measure', 'serum', 'exam', 'hour_urinary', 'metabolic', 'parameter', 'body', 'impedance', 'collect', 'month', 'prior', 'dietary', 'intervention', 'end', 'week', 'correlation', 'waist_circumference', 'loss', 'fat', 'loss', 'variation', 'hour_urinary', 'lithogenic', 'parameter', 'calcium_oxalate', 'urinary', 'supersaturation', 'equation', 'patients_enrolle', 'participate', 'study', 'prescribe', 'diet', 'kcalday', 'mean_age', 'female', 'participant', 'shift', 'obesity', 'bmi_kgm', 'bmi_kgm', 'significant_correlation', 'baseline', 'hour_urinary', 'oxalate', 'weight', 'correlation', 'variation', 'weight', 'waist_circumference', 'f

### Data preparation for model

In [40]:
dictionary = gensim.corpora.Dictionary(train_data_lemmatized)

In [41]:
'''
Checking dictionary created
'''
# Print the first 11 (id, token) pairs of the dictionary
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

0 abnormality
1 adult
2 anthropometric
3 associate
4 baseline
5 bmi_kgm
6 body
7 body_mass
8 calcium_oxalate
9 caox
10 collect


In [42]:
len(dictionary)

42654

In [43]:
# Prune the vocabulary in place. Per gensim's filter_extremes parameters:
# no_below=5 drops tokens found in fewer than 5 documents, no_above=0.1 drops
# tokens found in more than 10% of documents, keep_n=None sets no size cap.
dictionary.filter_extremes(no_below=5, no_above=0.1, keep_n= None)

In [44]:
len(dictionary)

8680

In [45]:
bow_corpus = [dictionary.doc2bow(doc) for doc in train_data_lemmatized]

In [46]:
len(bow_corpus)

4937

In [47]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 5
bow_doc_x = bow_corpus[document_num]

# Slice rather than index positions 0..6, so a document with fewer than 7
# distinct in-vocabulary tokens does not raise IndexError.
for word_id, word_count in bow_doc_x[:7]:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary[word_id],
                                                     word_count))

Word 37 ("parameter") appears 1 time.
Word 56 ("condition") appears 1 time.
Word 78 ("interaction") appears 1 time.
Word 81 ("observation") appears 2 time.
Word 94 ("suggest") appears 1 time.
Word 182 ("review") appears 2 time.
Word 191 ("water") appears 1 time.


### Train the model on training data

In [64]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,
                                   random_state=42,
                                   passes = 10,
                                   workers = 2)

In [65]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.008*"covid" + 0.007*"risk" + 0.006*"outcome" + 0.006*"score" + 0.006*"woman" + 0.005*"symptom" + 0.005*"month" + 0.005*"day" + 0.005*"rate" + 0.005*"mortality"


Topic: 1 
Words: 0.005*"stress" + 0.005*"performance" + 0.004*"network" + 0.004*"exposure" + 0.004*"difference" + 0.004*"impact" + 0.003*"condition" + 0.003*"activity" + 0.003*"area" + 0.003*"temperature"


Topic: 2 
Words: 0.012*"child" + 0.006*"intervention" + 0.005*"rate" + 0.005*"score" + 0.004*"participant" + 0.004*"student" + 0.003*"mother" + 0.003*"pain" + 0.003*"month" + 0.003*"number"


Topic: 3 
Words: 0.004*"design" + 0.004*"product" + 0.004*"region" + 0.004*"technology" + 0.004*"application" + 0.003*"information" + 0.003*"system" + 0.003*"quality" + 0.003*"efficiency" + 0.003*"property"


Topic: 4 
Words: 0.006*"protein" + 0.005*"specie" + 0.004*"population" + 0.004*"concentration" + 0.004*"sample" + 0.004*"structure" + 0.003*"activity" + 0.003*"sequence" + 0.003*"age" + 0.003*"community"


Topic

### Using coherence model to evaluate the optimum number of topics extracted

In [66]:
#get the value of coherence
coherence_model_lda = CoherenceModel(model=lda_model, texts=train_data_lemmatized, dictionary=dictionary, coherence='c_v')

In [67]:
print('Coherence value: ',coherence_model_lda.get_coherence())

Coherence value:  0.3720903985533456


### Test the model on an unseen document

In [68]:
# NOTE(review): this previews the abstract of test document 1, whereas the title
# preview and the topic inference further down both use test document 0 — confirm
# which document is intended.
test_data.iloc[1]['Abstract']

'the objective of this study were to identify the fatty acid composition for decanoic c tridecanoic c myristic c pentadecanoic c palmitic c stearic c oleic cnc linoleic cnc arachidic c arachidonic cn heneicosanoic c erucic cn and cisdocosahexaenoic cn acids by neocallimastix orpinomyces caecomyces and piromyces species of rumen fungus during in vitro culture fatty acid fa profi le of anaerobic fungi comprises carbon chains of length ranging from to were analyzed as methyl esters analysis of fatty acids was performed using gas chromatographymass spectrophotometer gcms fa measures are presented as proportions of relative amounts total fatty acid the highest amounts of fatty acids for all samples were found as myristic c acid the tridecanoic c acid represented the second abundant fa in the fungi in all experimental groups stearic acid c was the third major fatty acid for isolates investigated in the current study in addition another fatty acid was palmitic c acid with relative amount repr

In [92]:
test_data.iloc[0]['Title']

'microcatheters for antegrade recanalization of chronic total coronary occlusions feasibility and safety of the corsair a retrospective registrybased single operator experience'

##### Top 3 most relevant topics with their probability scores

In [91]:
# Bag-of-words encoding of the first held-out document
bow_vector = dictionary.doc2bow(test_data_lemmatized[0])

# Rank the inferred topics by score (highest first) and report the top three
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics[:3]:
    print("Score: {} Topic: {}".format(score, lda_model.print_topic(index)))
    print('')

Score: 0.3383961617946625 Topic: 0.012*"child" + 0.006*"intervention" + 0.005*"rate" + 0.005*"score" + 0.004*"participant" + 0.004*"student" + 0.003*"mother" + 0.003*"pain" + 0.003*"month" + 0.003*"number"

Score: 0.2990000545978546 Topic: 0.008*"covid" + 0.007*"risk" + 0.006*"outcome" + 0.006*"score" + 0.006*"woman" + 0.005*"symptom" + 0.005*"month" + 0.005*"day" + 0.005*"rate" + 0.005*"mortality"

Score: 0.2273796945810318 Topic: 0.005*"stress" + 0.005*"performance" + 0.004*"network" + 0.004*"exposure" + 0.004*"difference" + 0.004*"impact" + 0.003*"condition" + 0.003*"activity" + 0.003*"area" + 0.003*"temperature"



In [93]:
import pickle 
import pyLDAvis
# Newer pyLDAvis releases ship the gensim adapter as `pyLDAvis.gensim_models`
# (renamed from `pyLDAvis.gensim`); try the new name first and fall back to
# the old one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensim_vis.prepare(lda_model, bow_corpus, dictionary)
LDAvis_prepared