In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import xmltodict
import json
from collections import Counter

In [None]:
# Location of the tab-separated search-history export that is loaded below.
# NOTE(review): absolute path; adjust for the environment the notebook runs in.
fpath = '/data/pubmed-data.tsv'

## Reading in Search History data

In [None]:
# Load the tab-separated search-history file into a DataFrame.
data = pd.read_csv(fpath, sep='\t')

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Rows whose query term is exactly 'covid 19' (exact, case-sensitive match).
data[data['query_term']=='covid 19']

## Getting list of PMIDs

In [None]:
# Each entry of the PMID column is a comma-separated string of PMIDs;
# flatten all of them into one list (duplicates kept).
pmids = data['PMID'].tolist()
pmid_list = [pmid for entry in pmids for pmid in entry.split(',')]

In [None]:
# Total PMIDs across all rows versus distinct PMIDs; the difference is how
# many times a PMID repeats.
print("Total number of PMIDs: {}".format(len(pmid_list)))
print("Unique number of PMIDs: {}".format(len(set(pmid_list))))

In [None]:
# The 10 PMIDs that occur most often, as (pmid, count) pairs ordered by
# descending count. Passing n to most_common() avoids sorting every entry
# only to throw away all but the first 10.
most_common = Counter(pmid_list).most_common(10)

In [None]:
# Keep only the PMID from each (pmid, count) pair.
most_common_ids = [pmid for pmid, _count in most_common]

In [None]:
# Display the most frequently occurring PMIDs.
most_common_ids

In [None]:
# Lookup table: PMID (string) -> number of occurrences, built from the
# (pmid, count) pairs.
most_common_dict = dict(most_common)

In [None]:
# Second pass over the PMID column: record how many PMIDs each row holds and
# rebuild pmid_list from only the first 10 PMIDs of each row.
# NOTE: this replaces the pmid_list built earlier in the notebook.
pmids = data['PMID'].tolist()
pmid_list = []
nums = []
for i in pmids:
    ids = i.split(',')
    # Count PMIDs, not characters: len() on the raw comma-separated string
    # returned the string length rather than the number of results.
    nums.append(len(ids))
    # NOTE(review): 10 presumably matches one page of results -- confirm.
    pmid_list.extend(ids[:10])

In [None]:
# Attach the per-row values collected in `nums` as a new column
# (adds a column to `data` in place).
data['num_results'] = nums

## Search Summary Stats

In [None]:
# Report the median of the num_results column, then plot its distribution
# across all rows. The trailing ';' suppresses the text repr of the title.
print(f"Median number of search results: {data['num_results'].median()}")
plt.hist(data['num_results'])
plt.title("Number of results (all searches)");

In [None]:
# Number of rows per sort_algorithm value, as a {value: count} dict.
data['sort_algorithm'].value_counts().to_dict()

## Getting data from PubMed

In [None]:
def request_pmids(ids):
    """Fetch PubMed records for the given PMIDs and return them as JSON text.

    All PMIDs are sent in one comma-separated EFetch request. The response
    body is parsed with ``xmltodict`` and serialised with ``json.dumps``.

    Parameters
    ----------
    ids : iterable of str or int, or a single str/int
        PMIDs to fetch. Must contain at least one id.

    Returns
    -------
    str
        JSON text of the parsed response.

    Raises
    ------
    ValueError
        If ``ids`` is empty.
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(ids, (str, int)):
        ids = [ids]
    # str() lets callers pass integer PMIDs as well as strings.
    list_ids = ','.join(str(pmid) for pmid in ids)
    if not list_ids:
        raise ValueError("request_pmids() needs at least one PMID")
    url = "https://eutilspreview.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}".format(list_ids)
    # The timeout keeps a stalled connection from hanging the notebook;
    # raise_for_status() reports HTTP errors instead of handing an error
    # page to the XML parser.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return json.dumps(xmltodict.parse(r.text))

In [None]:
# Fetch PubMed metadata for the most common PMIDs (network call); the result
# is the JSON text returned by request_pmids.
jsonString = request_pmids(most_common_ids)

In [None]:
# Inspect the first article of the response to see the nested record layout.
# NOTE(review): the [0] index assumes 'PubmedArticle' is a list, i.e. that
# more than one article came back -- confirm for single-id requests.
json.loads(jsonString)['PubmedArticleSet']['PubmedArticle'][0]

## Parsing PubMed data

In [None]:
def get_abstract(record):
    """Extract the abstract from one parsed PubMed article record.

    The value is read from
    ``record['MedlineCitation']['Article']['Abstract']['AbstractText']``.

    Parameters
    ----------
    record : dict
        One article record.

    Returns
    -------
    tuple
        ``(abstract, hasStructuredAbstract)``. ``abstract`` is one of:

        * the abstract text (taken from ``'#text'`` when ``AbstractText`` is
          a mapping),
        * the untouched list of sections when ``AbstractText`` is a list, in
          which case ``hasStructuredAbstract`` is True,
        * ``''`` when no abstract could be found; a notice is printed.
    """
    abstract = ''
    hasStructuredAbstract = False

    try:
        abstract_node = record['MedlineCitation']['Article']['Abstract']
        abstract_text = abstract_node['AbstractText'] if abstract_node else None
    except (KeyError, TypeError, IndexError):
        # Records without an abstract are expected; report it below instead
        # of failing. Only lookup errors are swallowed here.
        abstract_text = None

    if abstract_text:
        # isinstance() also accepts dict subclasses such as OrderedDict.
        if isinstance(abstract_text, dict):
            # A mapping without '#text' carries attributes only, so there is
            # no abstract text to return.
            abstract = abstract_text.get('#text') or ''
        elif isinstance(abstract_text, list):
            abstract = abstract_text
            hasStructuredAbstract = True
        else:
            abstract = abstract_text

    if abstract == '':
        print("Did not retrieve an abstract")

    return abstract, hasStructuredAbstract
    
def get_pubtype(record):
    """Return the publication types of one parsed PubMed article record.

    The value is read from
    ``record['MedlineCitation']['Article']['PublicationTypeList']['PublicationType']``,
    which may be a single entry or a list of entries. Each entry may be a
    mapping holding the name under ``'#text'`` or a plain string.

    Parameters
    ----------
    record : dict
        One article record.

    Returns
    -------
    list
        Publication type names. On a lookup error the error is printed and
        whatever was collected so far (possibly ``[]``) is returned.
    """
    pubs = []
    try:
        pub_types = record['MedlineCitation']['Article']['PublicationTypeList']['PublicationType']
        if pub_types:
            # Normalise a single entry to a one-element list.
            if not isinstance(pub_types, list):
                pub_types = [pub_types]
            for entry in pub_types:
                if isinstance(entry, dict):
                    pubs.append(entry['#text'])
                elif entry:
                    # Entry given as a plain string rather than a mapping;
                    # accepted in the same way get_title accepts both forms.
                    pubs.append(entry)
    except Exception as e:
        print("** Error retrieving pubtype")
        print(e)

    return pubs

def get_title(record):
    """Return the article title of one parsed PubMed article record.

    The title is read from ``record['MedlineCitation']['Article']['ArticleTitle']``.
    When that value can be indexed with ``'#text'`` the indexed value is
    returned; otherwise the value itself is returned unchanged.
    """
    title = record['MedlineCitation']['Article']['ArticleTitle']
    try:
        return title['#text']
    except Exception:
        # Plain-string titles (or mappings without '#text') are returned as-is.
        return title
        

In [None]:
def extract_xml(jsonString):
    """Turn the JSON text of a fetched article set into a list of flat records.

    Parameters
    ----------
    jsonString : str
        JSON text with the articles under
        ``['PubmedArticleSet']['PubmedArticle']``.

    Returns
    -------
    list of dict
        One dict per successfully parsed article with the keys ``pmid``,
        ``journal``, ``title``, ``abstract``, ``hasAbstract``,
        ``hasStructuredAbstract`` and ``pubtype``. Articles that fail to
        parse are reported with a printed message and skipped. An empty or
        missing article set yields ``[]``.
    """
    parsed = []
    # An empty article set is parsed as None; a missing key is treated the
    # same way.
    article_set = json.loads(jsonString).get('PubmedArticleSet') or {}
    res = article_set.get('PubmedArticle') or []
    # A response with exactly one article holds a bare dict, not a list;
    # iterating that dict would walk its keys instead of articles.
    if isinstance(res, dict):
        res = [res]

    for i in res:
        try:
            # Inside the try so one malformed record does not abort the batch.
            pmid = int(i['MedlineCitation']['PMID']['#text'])
            print(f"\nExtracting {pmid}")
            abstract, hasStructuredAbstract = get_abstract(i)
            hasAbstract = bool(abstract)
            record = {
                "pmid": pmid,
                "journal": i['MedlineCitation']['Article']['Journal']['Title'],
                "title": get_title(i),
                "abstract": abstract,
                "hasAbstract": hasAbstract,
                "hasStructuredAbstract": hasStructuredAbstract,
                "pubtype": get_pubtype(i)
            }
            parsed.append(record)
        except Exception as e:
            print("** Error extracting XML")
            print(e)

    return parsed

In [None]:
# Convert the fetched JSON text into a list of per-article dicts.
parsed = extract_xml(jsonString)

In [None]:
# Build one summary row per parsed article: the fields copied from the parsed
# record (abstract text left out) plus how often the PMID occurred, looked up
# by its string form in most_common_dict.
summary_fields = ("pmid", "title", "journal", "pubtype", "hasAbstract", "hasStructuredAbstract")
plist = []
for article in parsed:
    row = {field: article[field] for field in summary_fields}
    row["appears_in_results"] = most_common_dict[str(article['pmid'])]
    plist.append(row)

In [None]:
# Tabulate the summary rows: one DataFrame row per entry of plist.
d = pd.DataFrame(plist)

In [None]:
# Display the summary table.
d