<a href="https://colab.research.google.com/github/NganTran-0017/Disaster-Analysis/blob/main/DA_Query_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
""" This notebook query a search from pmc database using E-Utilities, and the reponse contains
  the search results in MEDLINE format, similar to the .txt file that can be downloaded from the website """

""" Queries list:
  "disaster informatics"[All Fields] AND ("2016/01/01"[PubDate] : "2019/12/31"[PubDate])
  "disaster informatics"[All Fields] AND ("2020/01/01"[PubDate] : "2022/12/31"[PubDate])

  "crisis informatics"[All Fields] AND ("2016/01/01"[PubDate] : "2019/12/31"[PubDate])
  "crisis informatics"[All Fields] AND ("2020/01/01"[PubDate] : "2022/12/31"[PubDate])

  "pandemic crisis"[All Fields] AND ("2016/01/01"[PubDate] : "2019/12/31"[PubDate])
  "pandemic crisis"[All Fields] AND ("2020/01/01"[PubDate] : "2022/12/31"[PubDate])
  
  """
!pip install biopython
  ## Using KeyBERT to extract keywords from abstract
!pip install keybert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 7.6 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.6.0-py2.py3-none-any.whl (22 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.9 MB/s 
[?25hCollecting rich>=10.4.0
  Downloading rich-12.5.1-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 48.2 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 4.6 MB/s 
Collecting transfor

In [None]:
from requests.models import ReadTimeoutError
from requests.api import request
import requests

def _xml_tag_text(xml, tag):
    """Return the stripped text between the first ``<tag>``/``</tag>`` pair in ``xml``, or None."""
    open_tag = '<{}>'.format(tag)
    close_tag = '</{}>'.format(tag)
    start = xml.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)
    end = xml.find(close_tag, start)
    if end == -1:
        return None
    return xml[start:end].strip()


def query_search(db, query, outfile, retmax):
    """Run an E-utilities search and save the matching records in MEDLINE format.

    The query is posted to ``esearch.fcgi`` with ``usehistory=y``; the ``QueryKey``
    and ``WebEnv`` values found in that response are then posted to ``efetch.fcgi``
    with ``rettype=medline``, and the text of the EFetch response is written to
    ``outfile + '.txt'`` (UTF-8).

    Args:
        db: Value sent as the ``db`` parameter of both requests (e.g. ``'pmc'``).
        query: Search term sent as the ``term`` parameter of ESearch.
        outfile: Output path without extension; ``'.txt'`` is appended.
        retmax: Value sent as the ``retmax`` parameter of both requests.

    Raises:
        requests.HTTPError: If either request returns an HTTP error status.
        ValueError: If the ESearch response contains no QueryKey or WebEnv.
    """
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    # Generous read timeout so a stalled connection cannot hang the notebook forever.
    timeout_seconds = 300

    # Query ESearch
    search_params = {'db': db, 'term': query, 'usehistory': 'y', 'retmax': retmax}
    search = requests.post(url=base + 'esearch.fcgi?', params=search_params,
                           timeout=timeout_seconds)
    search.raise_for_status()

    # Read the whole element text: the query key is not guaranteed to be one digit.
    query_key = _xml_tag_text(search.text, 'QueryKey')
    web_env = _xml_tag_text(search.text, 'WebEnv')
    if not query_key or not web_env:
        raise ValueError(
            'ESearch response has no QueryKey/WebEnv: {}'.format(search.text[:500]))
    print('Query key: {}'.format(query_key))
    print('WebEnv: {}'.format(web_env))

    # Query EFetch
    fetch_params = {'db': db, 'query_key': query_key, 'WebEnv': web_env,
                    'rettype': 'medline', 'retmax': retmax}
    medline_output = requests.post(url=base + 'efetch.fcgi?', params=fetch_params,
                                   timeout=timeout_seconds)
    medline_output.raise_for_status()

    # Write with an explicit encoding so the parsing cell (which reads utf8) agrees.
    with open(outfile + '.txt', 'w', encoding='utf8') as out:
        out.write(medline_output.text)
    

# Search parameters for this run. `outfile` is a base name without extension:
# query_search appends '.txt', and the later parsing / keyword cells reuse the
# same global to locate that file and to name the extracted-keywords file.
db = 'pmc'   
# One of the queries listed at the top of the notebook.
query = '"pandemic crisis"[All Fields] AND ("2020/01/01"[PubDate] : "2022/12/31"[PubDate])'
outfile = "pandemiccrisis_20_22"
# Performs two HTTP requests and writes outfile + '.txt'.
query_search(db, query, outfile, retmax=5000)



Query key: 1 at index 251 
WebEnv: MCID_6323ee9bd3abbe21c3706618 at index [274, 312] 


In [None]:
# Parse MEDLINE data

from Bio import Medline
import pandas as pd
from tqdm import tqdm

def _medline_field(record, tag, label, record_id=None):
    """Return ``record[tag]``; if the tag is absent, print a notice and return ''.

    ``record_id`` is the record's PMC value (it already carries the 'PMC' prefix)
    and is included in the notice when given; pass None to omit it.
    """
    if tag in record:
        return record[tag]
    if record_id is None:
        print('No {} found'.format(label))
    else:
        print('No {} found in {}'.format(label, record_id or '<record without PMC id>'))
    return ''


# One dict per MEDLINE record; missing fields are stored as ''.
alldata = []
with open(outfile+'.txt', encoding= 'utf8') as f:
    # Medline.parse reads lazily from the handle, so iterate inside the `with`.
    for record in Medline.parse(f):
        pid      = _medline_field(record, 'PMC', 'PID')
        title    = _medline_field(record, 'TI', 'Title')
        abstract = _medline_field(record, 'AB', 'Abstract', pid)
        authors  = _medline_field(record, 'AU', 'Authors', pid)
        journal  = _medline_field(record, 'JT', 'Journal', pid)
        # NOTE(review): almost every record in the saved run lacks 'DP' —
        # confirm which tag carries the publication date in PMC MEDLINE exports.
        pub_date = _medline_field(record, 'DP', 'Published Date', pid)
        language = _medline_field(record, 'LA', 'Language', pid)

        alldata.append({
            'PMC': pid,
            'Title': title,
            'Abstract': abstract,
            'Authors': authors,
            'Journal': journal,
            'Language': language,
            'Published Date': pub_date
        })

# Kept for later cells / display; always equals the number of parsed records.
count = len(alldata)

count, len(alldata)

No Published Date found in PMCPMC9446650
No Published Date found in PMCPMC9444316
No Published Date found in PMCPMC9442597
No Published Date found in PMCPMC9438885
No Published Date found in PMCPMC9434505
No Published Date found in PMCPMC9434190
No Published Date found in PMCPMC9434072
No Published Date found in PMCPMC9428381
No Published Date found in PMCPMC9428377
No Abstract found in PMCPMC9423697
No Published Date found in PMCPMC9423697
No Published Date found in PMCPMC9420080
No Published Date found in PMCPMC9419650
No Published Date found in PMCPMC9418651
No Published Date found in PMCPMC9415258
No Published Date found in PMCPMC9412141
No Published Date found in PMCPMC9411139
No Published Date found in PMCPMC9401204
No Abstract found in PMCPMC9134623
No Published Date found in PMCPMC9397183
No Published Date found in PMCPMC9396745
No Published Date found in PMCPMC9395906
No Published Date found in PMCPMC9395878
No Published Date found in PMCPMC9391218
No Published Date found in P

(4644, 4644)

In [None]:
#alldata

In [None]:
from keybert import KeyBERT

# Build a KeyBERT model with its default settings; later cells reuse `kw_model`.
kw_model = KeyBERT()

# Quick check on the first parsed record: keywords from its abstract and its title.
first_record = alldata[0]
keywords = kw_model.extract_keywords(first_record['Abstract'])
keytitle = kw_model.extract_keywords(first_record['Title'])

# Displayed as the cell result.
('From Title ', keytitle, 'From abstract', keywords)

('From Title ',
 [('intrahepatic', 0.3766),
  ('metabolic', 0.3458),
  ('fat', 0.3443),
  ('nafld', 0.3349),
  ('covid', 0.257)],
 'From abstract',
 [('dietary', 0.3364),
  ('metabolic', 0.2531),
  ('nutritional', 0.2497),
  ('fatty', 0.2486),
  ('liver', 0.2437)])

In [None]:

def extract_keyword( source, ngrams, outfile, diversive = True ):
  """Extract KeyBERT keywords from one field of every record in ``alldata``.

  A header line followed by one line per record is appended to
  ``outfile + '-extracted-kw.txt'``. Uses the notebook globals ``alldata``
  (list of record dicts) and ``kw_model`` (KeyBERT instance).

  Args:
    source: Key of the record field to read, e.g. 'Title' or 'Abstract'.
    ngrams: Upper bound of the keyphrase n-gram range; the range is (1, ngrams).
    outfile: Base name of the output file; '-extracted-kw.txt' is appended.
    diversive: When true, keywords are selected with MMR (diversity=0.7) and the
      section is labelled 'Diverse'; when false, MMR is turned off and the
      section is labelled 'Non-diverse'.

  Returns:
    A list with one ``extract_keywords`` result per record, in ``alldata`` order.
  """
  use_mmr = bool(diversive)
  diverse = 'Diverse' if use_mmr else 'Non-diverse'
  keywords_list = []

  # Append mode: each call adds its own labelled section to the same file.
  with open(outfile + '-extracted-kw.txt', 'a', encoding='utf8') as f:
    f.write('\n\n{} {} Keywords from {}:\n'.format(diverse, ngrams, source))
    for record in alldata:
      # The label written above must match the algorithm actually used, so the
      # flag drives use_mmr instead of it being hard-coded to True.
      keywords = kw_model.extract_keywords(record[source],
                                           keyphrase_ngram_range=(1, ngrams),
                                           stop_words='english',
                                           use_mmr=use_mmr, diversity=0.7)
      keywords_list.append(keywords)
      f.write('{}\n'.format(keywords))
  return keywords_list

# Fields to mine and the maximum n-gram sizes to try for each of them.
sources = ['Title', 'Abstract']
ngrams = [2, 3]

# Every (field, n-gram size) pair, fields outermost; each call appends one
# section to the shared extracted-keywords file.
for s, n in [(field, size) for field in sources for size in ngrams]:
  print('Extracting {} from {}'.format(n, s))
  extract_keyword(s, n, outfile)



Extracting 2 from Title
Extracting 3 from Title
Extracting 2 from Abstract
Extracting 3 from Abstract


# Keywords (1-2 grams) from every abstract with KeyBERT's default selection
# (no stop-word list, no MMR). One line per record is appended to the same
# extracted-keywords file used by extract_keyword.
keywords_list = []
with open(outfile + '-extracted-kw.txt', 'a', encoding='utf8') as f:
  for record in alldata:
      keywords = kw_model.extract_keywords(record['Abstract'], keyphrase_ngram_range=(1,2))
      f.write('%s\n' %keywords)
      keywords_list.append(keywords)
# No explicit f.close(): the `with` block has already closed the file.

# Show only a preview; the full list has one entry per record.
keywords_list[:5]