In [9]:
"""
Scrape all the mesh terms from the pubmed papers
"""
import math
import urllib

from bs4 import BeautifulSoup
from contextlib import closing
import cxutils as cx
import matplotlib.pyplot as plt
from requests import get
from requests.exceptions import RequestException

# Too old, need to figure out a replacement
# from wordcloud import WordCloud
# %matplotlib qt

In [6]:
# List of paper titles found on pubmed after removing FPs.
titles = cx.read_lines(os.path.join('search_results', 'pubmed', 'with-constraints.txt'))

base_url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term='

In [31]:
# The base page with search results. This is uncleaned (false positives).
# scrape_page = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=(mimic-ii+OR+mimic-iii+OR+mimicii+OR+mimiciii+OR+mimic-2+OR+mimic-3+OR+mimic2+OR+mimic3)+AND+(physionet+OR+icu+OR+%E2%80%9Cintensive+care%E2%80%9D+OR+%E2%80%9Ccritical+care%E2%80%9D)'

In [7]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [10]:
all_mesh_terms = []
all_publication_types = []

for title in titles:
    print('Title: {}'.format(title))
    search_url = base_url + urllib.parse.quote_plus(title)
    content = simple_get(search_url)
    html = BeautifulSoup(content, 'html.parser')
    # The publication type. May be present.
    pub_item = html.find('a', alsec='ptyp')
    if pub_item:
        all_publication_types.append(pub_item.text)
    # The mesh items
    mesh_items = html.find_all('a', alsec='mesh')
    print('Number of items: {}'.format(len(mesh_items)))
    for item in mesh_items:
        all_mesh_terms.append(item.text)


Title: prediction model of in-hospital mortality in intensive care unit patients with heart failure: machine learning-based, retrospective analysis of the mimic-iii database
Number of items: 0
Title: effects of circadian heart rate variation on short-term and long-term mortality in intensive care unit patients: a retrospective cohort study based on mimic-ii database
Number of items: 0
Title: the impact of transthoracic echocardiography on the short-term prognosis of elderly patients in the intensive care unit: a retrospective analysis based on the mimic-iii database
Number of items: 0
Title: a comprehensive evaluation for the prediction of mortality in intensive care units with lstm networks: patients with cardiovascular disease
Number of items: 0
Title: risk factors for death and their predictive value on diabetic kidney disease patients in intensive care unit based on mimic-iii database
Number of items: 0
Title: outcomes for patients with sepsis following admission to the intensive c

TypeError: object of type 'NoneType' has no len()

In [84]:
# Save the results
cx.write_lines(os.path.join('search_results', 'pubmed', 'publication-types.txt'), all_publication_types)
cx.write_lines(os.path.join('search_results', 'pubmed', 'mesh-terms.txt'), all_mesh_terms)

In [68]:
all_publication_types

['Research Support, N.I.H., Extramural',
 'Research Support, N.I.H., Intramural',
 'Research Support, N.I.H., Extramural',
 'Research Support, N.I.H., Extramural',
 'Comparative Study',
 "Research Support, Non-U.S. Gov't",
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., Intramural',
 "Research Support, Non-U.S. Gov't",
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., Extramural',
 'Research Support, N.I.H., Extramural',
 'Research Support, N.I.H., Extramural',
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., Extramural',
 'Research Support, N.I.H., Extramural',
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., Extramural',
 'Observational Study',
 "Research Support, Non-U.S. Gov't",
 "Research Support, Non-U.S. Gov't",
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., Extramural',
 'Comparative Study',
 'Research Support, N.I.H., Extramural',
 "Research Support, Non-U.S. Gov't",
 'Research Support, N.I.H., E

In [67]:
all_mesh_terms

['Aged',
 'Aged, 80 and over',
 'Algorithms',
 'Boston',
 'Female',
 'Hospital Mortality*',
 'Humans',
 'Intensive Care Units/statistics & numerical data*',
 'Machine Learning/statistics & numerical data*',
 'Male',
 'Middle Aged',
 'ROC Curve',
 'Reproducibility of Results',
 'Risk',
 'Severity of Illness Index*',
 'Acute Disease',
 'Aged',
 'Databases, Factual*',
 'Erythrocyte Indices*',
 'Female',
 'Humans',
 'Intensive Care Units*',
 'Male',
 'Middle Aged',
 'Multivariate Analysis',
 'Pancreatitis/blood*',
 'Pancreatitis/diagnosis*',
 'Prognosis',
 'Retrospective Studies',
 'Academic Medical Centers',
 'Adult',
 'African Americans/statistics & numerical data',
 'Aged',
 'Aged, 80 and over',
 'Asian Americans/statistics & numerical data',
 'Boston/epidemiology',
 'Continental Population Groups/statistics & numerical data*',
 'Electronic Health Records',
 'European Continental Ancestry Group/statistics & numerical data',
 'Female',
 'Health Status Disparities*',
 'Hospital Mortality/

In [69]:
set(all_mesh_terms)

{'Academic Medical Centers',
 'Access to Information',
 'Access to Information*',
 'Acute Disease',
 'Acute Kidney Injury/blood*',
 'Acute Kidney Injury/mortality',
 'Acute Kidney Injury/mortality*',
 'Acute Kidney Injury/physiopathology',
 'Adolescent',
 'Adult',
 'African Americans/statistics & numerical data',
 'Age Factors',
 'Aged',
 'Aged, 80 and over',
 'Airway Extubation/statistics & numerical data*',
 'Alanine Transaminase/blood',
 'Alcoholism/mortality*',
 'Algorithms',
 'Algorithms*',
 'Anti-Bacterial Agents/therapeutic use',
 'Antipyretics/adverse effects*',
 'Antipyretics/therapeutic use*',
 'Area Under Curve',
 'Arrhythmias, Cardiac/diagnosis',
 'Arrhythmias, Cardiac/diagnosis*',
 'Arrhythmias, Cardiac/physiopathology*',
 'Arterial Pressure',
 'Artifacts',
 'Artificial Intelligence',
 'Artificial Intelligence*',
 'Asian Americans/statistics & numerical data',
 'Bayes Theorem',
 'Bilirubin/blood',
 'Biomarkers',
 'Biomarkers/blood',
 'Biomedical Engineering/methods',
 'Bio

In [88]:
print(len(all_mesh_terms))
print(len(set(all_mesh_terms)))

1028
432


In [103]:
# Clean the mesh terms. Strip the * and remove things after /
for i in range(len(all_mesh_terms)):
    all_mesh_terms[i] = all_mesh_terms[i].strip('*').split('/')[0]

print('Total mesh terms:', len(all_mesh_terms))
print('Unique mesh terms:', len(set(all_mesh_terms)))

Total mesh terms: 1028
Unique mesh terms: 258


In [121]:
# Generating figures
mesh_text = ' '.join(term for term in all_mesh_terms)
wordcloud = WordCloud(width=1920, height=1080, background_color="white").generate(mesh_text)

# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [102]:
all_mesh_terms

['Aged',
 'Aged, 80 and over',
 'Algorithms',
 'Boston',
 'Female',
 'Hospital Mortality',
 'Humans',
 'Intensive Care Units',
 'Machine Learning',
 'Male',
 'Middle Aged',
 'ROC Curve',
 'Reproducibility of Results',
 'Risk',
 'Severity of Illness Index',
 'Acute Disease',
 'Aged',
 'Databases, Factual',
 'Erythrocyte Indices',
 'Female',
 'Humans',
 'Intensive Care Units',
 'Male',
 'Middle Aged',
 'Multivariate Analysis',
 'Pancreatitis',
 'Pancreatitis',
 'Prognosis',
 'Retrospective Studies',
 'Academic Medical Centers',
 'Adult',
 'African Americans',
 'Aged',
 'Aged, 80 and over',
 'Asian Americans',
 'Boston',
 'Continental Population Groups',
 'Electronic Health Records',
 'European Continental Ancestry Group',
 'Female',
 'Health Status Disparities',
 'Hospital Mortality',
 'Humans',
 'Intensive Care Units',
 'Male',
 'Middle Aged',
 'Retrospective Studies',
 'Adult',
 'Artificial Intelligence',
 'Critical Care',
 'Databases, Factual',
 'Decision Support Systems, Clinical',
 

In [101]:
set(all_mesh_terms)

{'Academic Medical Centers',
 'Access to Information',
 'Acute Disease',
 'Acute Kidney Injury',
 'Adolescent',
 'Adult',
 'African Americans',
 'Age Factors',
 'Aged',
 'Aged, 80 and over',
 'Airway Extubation',
 'Alanine Transaminase',
 'Alcoholism',
 'Algorithms',
 'Anti-Bacterial Agents',
 'Antipyretics',
 'Area Under Curve',
 'Arrhythmias, Cardiac',
 'Arterial Pressure',
 'Artifacts',
 'Artificial Intelligence',
 'Asian Americans',
 'Bayes Theorem',
 'Bilirubin',
 'Biomarkers',
 'Biomedical Engineering',
 'Biomedical Research',
 'Blood Glucose',
 'Blood Pressure',
 'Blood Pressure Determination',
 'Body Mass Index',
 'Body Temperature',
 'Boston',
 'Brain Injuries',
 'Brain Ischemia',
 'Calcium',
 'Cardiology',
 'Cell Count',
 'Child',
 'Child, Preschool',
 'Circadian Rhythm',
 'Classification',
 'Clinical Alarms',
 'Clinical Coding',
 'Clinical Laboratory Techniques',
 'Clostridium Infections',
 'Clostridium difficile',
 'Cluster Analysis',
 'Cohort Studies',
 'Comorbidity',
 'Co

In [124]:
# Get the counts, and count percentages (divided by number of papers)
counts = []

for term in set(all_mesh_terms):
    wc = all_mesh_terms.count(term)
    counts.append([wc, math.ceil(wc / len(titles) * 100), term])
    
counts.sort(reverse=True)

In [125]:
counts

[[85, 68, 'Humans'],
 [46, 37, 'Intensive Care Units'],
 [37, 30, 'Databases, Factual'],
 [32, 26, 'Algorithms'],
 [27, 22, 'Female'],
 [26, 21, 'Critical Care'],
 [25, 20, 'Male'],
 [24, 20, 'Middle Aged'],
 [21, 17, 'Aged'],
 [21, 17, 'Adult'],
 [17, 14, 'Prognosis'],
 [17, 14, 'Electronic Health Records'],
 [16, 13, 'Retrospective Studies'],
 [16, 13, 'Critical Illness'],
 [15, 12, 'Reproducibility of Results'],
 [14, 12, 'Sepsis'],
 [11, 9, 'Monitoring, Physiologic'],
 [11, 9, 'Hypotension'],
 [11, 9, 'Hospital Mortality'],
 [10, 8, 'Sensitivity and Specificity'],
 [10, 8, 'Risk Assessment'],
 [9, 8, 'Time Factors'],
 [9, 8, 'Software'],
 [9, 8, 'Signal Processing, Computer-Assisted'],
 [9, 8, 'ROC Curve'],
 [9, 8, 'Blood Pressure'],
 [8, 7, 'Severity of Illness Index'],
 [8, 7, 'Multiple Organ Failure'],
 [8, 7, 'Data Mining'],
 [8, 7, 'Aged, 80 and over'],
 [7, 6, 'Risk Factors'],
 [7, 6, 'Regression Analysis'],
 [7, 6, 'Predictive Value of Tests'],
 [7, 6, 'Pattern Recognition, 

In [126]:
# Write results to pipe delimited file.
with open('/home/cx1111/Projects/citation-tracker/search_results/pubmed/mesh-counts.csv', 'w') as f:
    f.write('term|count_percentage\n')
    for count, pcount, word in counts:
        f.write('|'.join([word, str(pcount)]) + '\n')

In [129]:
# Organize the publication types
pub_counts = []

for term in set(all_publication_types):
    wc = all_publication_types.count(term)
    pub_counts.append([wc, math.ceil(wc / len(titles) * 100), term])    

# There is also the unspecified category
wc = len(titles) - len(all_publication_types)
pub_counts.append([wc, math.ceil(wc / len(titles) * 100), 'Unspecified'])

pub_counts.sort(reverse=True)

In [130]:
pub_counts

[[78, 63, 'Unspecified'],
 [19, 16, 'Research Support, N.I.H., Extramural'],
 [15, 12, "Research Support, Non-U.S. Gov't"],
 [6, 5, 'Comparative Study'],
 [3, 3, 'Research Support, N.I.H., Intramural'],
 [2, 2, "Research Support, U.S. Gov't, Non-P.H.S."],
 [1, 1, 'Observational Study'],
 [1, 1, 'Evaluation Studies']]

In [None]:
plt.figure()
country.max().sort_values(by="points",ascending=False)["points"].plot.bar()
plt.xticks(rotation=50)
plt.xlabel("Country of Origin")
plt.ylabel("Highest point of Wines")
plt.show()

In [149]:
# Bar graph of publication types
x = range(len(pub_counts))
plt.figure()
plt.bar(x, [p[1] for p in pub_counts])
# Strip the long labels on the x axis
plt.xticks(x, [p[2].strip('Research Support, ') for p in pub_counts])
plt.title('Publication Types')
plt.xlabel('Category')
plt.ylabel('Percentage')
plt.show()

In [148]:
# Bar graph of publication types, excluding unspecified
scale_factor = sum(p[1] for p in pub_counts) /  sum(p[1] for p in pub_counts[1:])
x = range(len(pub_counts)-1)
plt.figure()
plt.bar(x, [p[1]*scale_factor for p in pub_counts[1:]])
# Strip the long labels on the x axis
plt.xticks(x, [p[2].strip('Research Support, ') for p in pub_counts[1:]])
plt.title('Publication Types (excluding unspecified)')
plt.xlabel('Category')
plt.ylabel('Percentage')
plt.show()

In [151]:
pub_counts

[[78, 63, 'Unspecified'],
 [19, 16, 'Research Support, N.I.H., Extramural'],
 [15, 12, "Research Support, Non-U.S. Gov't"],
 [6, 5, 'Comparative Study'],
 [3, 3, 'Research Support, N.I.H., Intramural'],
 [2, 2, "Research Support, U.S. Gov't, Non-P.H.S."],
 [1, 1, 'Observational Study'],
 [1, 1, 'Evaluation Studies']]

In [153]:
# Write results to pipe delimited file.
with open('/home/cx1111/Projects/citation-tracker/search_results/pubmed/publication-types.csv', 'w') as f:
    f.write('publication_type|count_percentage|count_percentage_excluding_unspecified\n')
    for count, pcount, ptype in pub_counts:
        if ptype == 'Unspecified':
            f.write('|'.join([ptype, str(pcount), '0']) + '\n')
        else:
            f.write('|'.join([ptype, str(pcount), str(math.ceil(pcount * scale_factor))]) + '\n')
