In [8]:
# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# model building package
import sklearn

# package to clean text
import re
import nltk

# for text preprocessing
import re
import spacy

import string
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from __future__ import print_function

In [5]:
# Make sure the WordNet corpus is available in the local NLTK data directory
# (the call reports "already up-to-date" when nothing needs downloading).
# NOTE(review): WordNetLemmatizer is imported but not used in the visible cells,
# so this download may be unnecessary — confirm before removing.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\menna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Imports for this specific cleaning task.
# The stop-word corpus has to be present locally before
# nltk.corpus.stopwords.words('english') is called further down.
import nltk; nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer  # NOTE(review): not referenced in the visible cells
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\menna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Load the PubMed abstracts workbook (path is relative to the notebook's working
# directory) and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# index; consider index_col=0 if it carries no information.
df = pd.read_excel('Pubmed5k.xlsx')
df.head()

Unnamed: 0,ArticleID,Title,Abstract
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...


In [9]:
# Shared resources for the cleaning function below.

# NLTK's English stop-word list; tokens found in it are discarded.
my_stopwords = nltk.corpus.stopwords.words('english')

# Porter stemmer; only its bound .stem method (word -> stem) is kept around.
_porter_stemmer = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False)
word_rooter = _porter_stemmer.stem

# Characters treated as punctuation. '#' is not part of this set, so it survives
# punctuation stripping.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_Abstract(Abstract, bigrams=False, *, stop_words=None, stemmer=None,
                   punctuation=None):
    """Normalise one abstract into a space-separated string of stemmed tokens.

    Pipeline: lower-case the text, replace punctuation with spaces, delete
    ASCII digits, split on whitespace, drop stop words, stem every token that
    does not contain '#', and optionally append adjacent-token bigrams.

    Parameters
    ----------
    Abstract : str
        Raw text. Any non-string value (for example the NaN that pandas uses
        for an empty cell, or None) yields '' instead of raising.
    bigrams : bool, default False
        When True, each pair of adjacent tokens is appended to the token list
        as 'first_second'.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the module-level ``my_stopwords``.
    stemmer : callable, optional
        Maps a token to its stem. Defaults to the module-level ``word_rooter``.
    punctuation : str, optional
        Characters that are replaced by a space. Defaults to the module-level
        ``my_punctuation``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, with no leading, trailing
        or doubled spaces.
    """
    # Missing abstracts have no text to clean; return an empty document rather
    # than failing on .lower().
    if not isinstance(Abstract, str):
        return ''

    # Resolve the optional overrides; the globals are only touched when needed.
    # A frozenset makes the per-token membership test constant-time.
    stop_set = frozenset(my_stopwords if stop_words is None else stop_words)
    stem = word_rooter if stemmer is None else stemmer
    punct_chars = my_punctuation if punctuation is None else punctuation

    # One translation table does both character-level steps: every punctuation
    # character becomes a space and every ASCII digit is deleted. Characters are
    # mapped literally, so regex metacharacters in the punctuation string
    # (backslash, ']', '^', '-') need no escaping.
    table = str.maketrans(punct_chars, ' ' * len(punct_chars), '0123456789')
    text = Abstract.lower().translate(table)

    # split() with no argument collapses every run of whitespace and never
    # yields empty strings, so stripped digits or leading/trailing punctuation
    # cannot leave empty tokens behind (these used to surface as double spaces
    # and as '_word' / 'word_' bigrams).
    tokens = [word for word in text.split() if word not in stop_set]

    # Tokens containing '#' are kept verbatim; everything else is stemmed.
    tokens = [word if '#' in word else stem(word) for word in tokens]

    if bigrams:
        tokens = tokens + [first + '_' + second
                           for first, second in zip(tokens, tokens[1:])]
    return ' '.join(tokens)


In [10]:
# Run the cleaning function over every abstract and keep the result in a new
# column next to the raw text, then display that column.
cleaned_abstracts = df['Abstract'].apply(clean_Abstract)
df['clean_Abstract'] = cleaned_abstracts
df['clean_Abstract']

0       coordin variabl cv commonli analyz understand ...
1       clinic scenario dynam knee valgu dkv mechan al...
2       variou methodolog report assess real world epi...
3       outcom acut ischem stroke ai vari accord clini...
4       hear loss children result development deficit ...
                              ...                        
4994    integr care pathway icp prevail concept health...
4995    object medicin provid human best possibl healt...
4996    research involv particip  chines student first...
4997    studi explor impact pre intervent effect commu...
4998    medic regimen complex mrc may influenc health ...
Name: clean_Abstract, Length: 4999, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer used to turn the cleaned abstracts into count vectors.
#   max_df=0.9 -> ignore terms that appear in more than 90% of the documents
#   min_df=25  -> ignore terms that appear in fewer than 25 documents
# The token pattern is written as a raw string so that the regex escapes
# (\w, \$, \d, \., \S) reach the regex engine untouched instead of relying on
# Python's deprecated handling of unrecognised string escape sequences.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# Document-term count matrix, converted to a dense NumPy array
# (one row per abstract, one column per vocabulary term).
tf = vectorizer.fit_transform(df['clean_Abstract']).toarray()

# tf_feature_names tells us which term each column in the matrix represents
tf_feature_names = vectorizer.get_feature_names_out()

In [12]:
# Build the LDA topic model (it is fitted in a separate step).
from sklearn.decomposition import LatentDirichletAllocation
number_of_topics = 3  # number of topics to extract; passed to the model as n_components
lda = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)  # fixed seed for repeatable runs

In [13]:
# Fit the topic model on the document-term count matrix `tf`.
lda.fit(tf)

LatentDirichletAllocation(n_components=3, random_state=0)

In [14]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is read as one topic's per-term weights.
    feature_names : sequence
        Term for each column of ``components_``; indexed by column position.
    no_top_words : int
        How many of the heaviest terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, "Topic <k> words" and "Topic <k> weights",
        ordered from the heaviest term down. Weights are strings formatted to
        one decimal place.
    """
    columns = {}
    for topic_no, weights in enumerate(model.components_):
        # Column indices sorted by descending weight, cut to the requested length;
        # computed once and shared by the words and weights columns.
        ranked = weights.argsort()[::-1][:no_top_words]

        top_terms = []
        top_weights = []
        for col in ranked:
            top_terms.append('{}'.format(feature_names[col]))
            top_weights.append('{:.1f}'.format(weights[col]))

        columns["Topic %d words" % (topic_no)] = top_terms
        columns["Topic %d weights" % (topic_no)] = top_weights
    return pd.DataFrame(columns)


In [21]:
# Show the 20 highest-weighted terms of each topic, with their weights, as a table.
no_top_words = 20
display_topics(lda, tf_feature_names, no_top_words)

  and should_run_async(code)


Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,studi,2920.7,patient,4276.5,use,2359.4
1,use,2509.6,p,1965.5,speci,1436.7
2,health,2456.8,group,1786.8,studi,1421.5
3,covid,1757.6,studi,1619.8,cell,1409.0
4,care,1473.8,treatment,1514.0,model,1106.9
5,data,1423.9,cancer,1328.3,result,1060.9
6,patient,1211.1,diseas,1307.4,effect,1060.0
7,particip,1186.7,clinic,1182.2,base,1038.4
8,among,1060.2,associ,1179.8,differ,1034.4
9,associ,1051.2,use,1095.1,activ,1026.4


In [22]:
# Approximate log-likelihood of the corpus under the fitted model, used as a score.
# It is evaluated on `tf`, the same matrix the model was fitted on, so this is an
# in-sample figure.
lda.score(tf)

  and should_run_async(code)


-3823175.9069933514

In [23]:
# Interactive visualisation of the fitted LDA topics with pyLDAvis.
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# pyLDAvis.lda_model — confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
# The dense count array is wrapped in np.matrix before being handed to prepare().
# NOTE(review): presumably prepare() relies on matrix-only methods; verify before
# replacing this with the plain array.
dtm = np.matrix(tf) 
# mds='tsne' and R=20 are passed straight to pyLDAvis; see its documentation for
# the layout method and the number of terms shown per topic.
pyLDAvis.sklearn.prepare(lda,dtm, vectorizer,mds='tsne', R=20)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(
