# Extracting Data via API Requests

# Load Libraries and Functions

In [3]:
# API, System Spec, Data Format Libraries 
import requests
import json
import os
import datetime
import re
import time
import warnings
warnings.filterwarnings('ignore')
from io import StringIO
import sys
import warnings
import pandas as pd
import numpy as np
import csv
import random
import shutil

# SQL Interface Libraries
import pymysql as mysql
import mysql.connector
import pyodbc
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import event
# mysql password
PASSWORD = "password"

# Data Vis libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from tabulate import tabulate
from wordcloud import WordCloud

# Natural Language Processing Libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from string import punctuation
import string
import nltk
from collections import OrderedDict
from pandas import json_normalize 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
punctuation = set(punctuation)
punctuation.update({'_', '-','‘'})
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

#nltk.download('words')
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
english_words = set(words.words())

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from imblearn.metrics import specificity_score, sensitivity_score
from nltk.classify import NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import cross_val_score, cross_val_predict

# Unsupervised Natural Language Processing Libraries
import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models
import spacy
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\halee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Call Data from PUBMED Entrez API

## Do a Test Call to API to check for bugs

In [23]:
# First find UIDs with search for Veteran
# Then run IDs in batches to get abstracts as text

## Run full call to API

In [9]:
# Base URL for PubMed API
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Function to add a random pause
def random_pause():
    time.sleep(1 + 2 * random.random())

# Function to fetch data from PubMed API
def fetch_pubmed_data(search_terms, max_requests):
    all_articles = []
    request_count = 0
    retrieved_ids = set()
    
    payload = {
        'db': 'pubmed',
        'term': search_terms,
        'retmode': 'json',
        'retmax': 10000,
        'usehistory': 'y'
    }
    
    response = requests.post(f"{BASE_URL}/esearch.fcgi", data=payload)
    search_data = response.json()
    webenv = search_data['esearchresult']['webenv']
    query_key = search_data['esearchresult']['querykey']
    count = int(search_data['esearchresult']['count'])
    
    while request_count < max_requests and len(all_articles) < count:
        payload = {
            'db': 'pubmed',
            'query_key': query_key,
            'WebEnv': webenv,
            'retmode': 'json',
            'retstart': request_count * 100,
            'retmax': 100,
        }
        
        response = requests.post(f"{BASE_URL}/esummary.fcgi", data=payload)
        summary_data = response.json()
        
        for uid in summary_data['result']['uids']:
            if uid not in retrieved_ids:
                article = summary_data['result'][uid]
                article_data = {
                    'pubmedid': article.get('uid'),
                    'doi': article.get('elocationid'),
                    'title': article.get('title'),
                    'abstract': article.get('abstract'),
                    'date': article.get('pubdate'),
                    'journal': article.get('fulljournalname'),
                    'clinical_trial_number': article.get('articleids', [{}])[0].get('value', ''),
                    'article_type': article.get('pubtype', []),
                    'document_summary': article.get('docsum', '')
                }
                all_articles.append(article_data)
                retrieved_ids.add(uid)
        
        request_count += 1
        random_pause()
    
    return all_articles

# Fetch the data
search_terms = "(Veteran OR veteran)"
articles = fetch_pubmed_data(search_terms, max_requests=2)

# Output the first few articles
for article in articles[:5]:
    print(article)

{'pubmedid': '39025471', 'doi': 'doi: 10.1055/a-2368-9008', 'title': 'Evaluating the Clinical Reliability and Reference Values of the International Outcome Inventory for Cochlear Implants in a DOD Population.', 'abstract': None, 'date': '2024 Jul 18', 'journal': 'Journal of the American Academy of Audiology', 'clinical_trial_number': '39025471', 'article_type': ['Journal Article'], 'document_summary': ''}
{'pubmedid': '39025253', 'doi': 'pii: S1542-3565(24)00633-5. doi: 10.1016/j.cgh.2024.05.051', 'title': 'IBD Matchmaking - Rational Combination Therapy.', 'abstract': None, 'date': '2024 Jul 16', 'journal': 'Clinical gastroenterology and hepatology : the official clinical practice journal of the American Gastroenterological Association', 'clinical_trial_number': '39025253', 'article_type': ['Journal Article', 'Review'], 'document_summary': ''}
{'pubmedid': '39025177', 'doi': 'doi: 10.1016/j.ijbiomac.2024.133955', 'title': 'Disulfiram inhibits coronaviral main protease by conjugating to

In [10]:
test_df = pd.DataFrame(articles)
test_df

Unnamed: 0,pubmedid,doi,title,abstract,date,journal,clinical_trial_number,article_type,document_summary
0,39025471,doi: 10.1055/a-2368-9008,Evaluating the Clinical Reliability and Refere...,,2024 Jul 18,Journal of the American Academy of Audiology,39025471,[Journal Article],
1,39025253,pii: S1542-3565(24)00633-5. doi: 10.1016/j.cgh...,IBD Matchmaking - Rational Combination Therapy.,,2024 Jul 16,Clinical gastroenterology and hepatology : the...,39025253,"[Journal Article, Review]",
2,39025177,doi: 10.1016/j.ijbiomac.2024.133955,Disulfiram inhibits coronaviral main protease ...,,2024 Jul 16,International journal of biological macromolec...,39025177,[Journal Article],
3,39025090,pii: S2213-2600(24)00169-3. doi: 10.1016/S2213...,Biomarker-defined endotypes of pulmonary fibro...,,2024 Jul 15,The Lancet. Respiratory medicine,39025090,[Journal Article],
4,39024961,doi: 10.1016/j.schres.2024.06.054,Neurocognition in adolescents and young adults...,,2024 Jul 17,Schizophrenia research,39024961,[Journal Article],
...,...,...,...,...,...,...,...,...,...
195,38995211,doi: 10.1152/ajpheart.00726.2023,Interplay of Race and Neighborhood Deprivation...,,2024 Jul 12,American journal of physiology. Heart and circ...,38995211,[Journal Article],
196,38995164,pii: glae175. doi: 10.1093/gerona/glae175,Long-term trajectories of low back pain in old...,,2024 Jul 12,"The journals of gerontology. Series A, Biologi...",38995164,[Journal Article],
197,38994987,doi: 10.3390/cells13131135,The Role of PGC-1α in Aging Skin Barrier Funct...,,2024 Jul 2,Cells,38994987,"[Journal Article, Review]",
198,38994972,doi: 10.3390/cells13131120,"Orthotopic Models Using New, Murine Lung Adeno...",,2024 Jun 28,Cells,38994972,[Journal Article],


In [14]:
# Base URL for PubMed API
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Function to add a random pause
def random_pause():
    time.sleep(1 + 2 * random.random())

# Function to fetch data from PubMed API
def fetch_pubmed_data(max_requests):
    all_articles = []
    request_count = 0
    retrieved_ids = set()
    
    payload = {
        'db': 'pubmed',
        'term': 'veteran',
        'retmode': 'json',
        'retmax': 10000,
        'usehistory': 'y'
    }
    
    response = requests.post(f"{BASE_URL}/esearch.fcgi", data=payload)
    search_data = response.json()
    webenv = search_data['esearchresult']['webenv']
    query_key = search_data['esearchresult']['querykey']
    count = int(search_data['esearchresult']['count'])
    
    while request_count < max_requests and len(all_articles) < count:
        payload = {
            'db': 'pubmed',
            'query_key': query_key,
            'WebEnv': webenv,
            'retmode': 'json',
            'retstart': request_count * 100,
            'retmax': 100,
        }
        
        response = requests.post(f"{BASE_URL}/esummary.fcgi", data=payload)
        summary_data = response.json()
        
        for uid in summary_data['result']['uids']:
            if uid not in retrieved_ids:
                article = summary_data['result'][uid]
                article_data = {
                    'pubmedid': article.get('uid'),
                    'doi': article.get('elocationid'),
                    'title': article.get('title'),
                    'date': article.get('pubdate'),
                    'journal': article.get('fulljournalname'),
                    'clinical_trial_number': article.get('articleids', [{}])[0].get('value', ''),
                    'article_type': article.get('pubtype', []),
                    'document_summary': article.get('docsum', '')
                }
                all_articles.append(article_data)
                retrieved_ids.add(uid)
        
        request_count += 1
        random_pause()
    
    return all_articles

# Fetch the data
articles = fetch_pubmed_data(max_requests=2)

# Output the first few articles
for article in articles[:5]:
    print(article)

{'pubmedid': '39028544', 'doi': 'pii: 24m15257. doi: 10.4088/JCP.24m15257', 'title': 'Effects of Low-Dose Ketamine Infusion on the Positive and Negative Domains of Hopelessness and Suicidal Thoughts.', 'date': '2024 Jul 8', 'journal': 'The Journal of clinical psychiatry', 'clinical_trial_number': '39028544', 'article_type': ['Journal Article', 'Randomized Controlled Trial'], 'document_summary': ''}
{'pubmedid': '39028406', 'doi': 'doi: 10.1007/s11606-024-08940-2', 'title': 'Misuse of Prescribed and Nonprescribed Substances Among U.S. Cancer Survivors.', 'date': '2024 Jul 19', 'journal': 'Journal of general internal medicine', 'clinical_trial_number': '39028406', 'article_type': ['Journal Article'], 'document_summary': ''}
{'pubmedid': '39028405', 'doi': 'doi: 10.1007/s11606-024-08938-w', 'title': "No Association Between Medicare Advantage Providers' Network Restrictiveness and Star Rating Between 2013 and 2017: An Observational Study.", 'date': '2024 Jul 19', 'journal': 'Journal of gen

In [18]:
# Base URL for PubMed API
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Function to add a random pause
def random_pause():
    time.sleep(1 + 2 * random.random())

# Function to fetch data from PubMed API
def fetch_pubmed_abstracts(max_requests):
    all_articles = []
    request_count = 0
    retrieved_ids = set()
    
    # First API call to esearch to get UIDs
    payload = {
        'db': 'pubmed',
        'term': '(veteran OR Veteran) AND NOT review[pt]',
        'retmode': 'json',
        'retmax': 500,
        'usehistory': 'y'
    }
    
    response = requests.post(f"{BASE_URL}/esearch.fcgi", data=payload)
    search_data = response.json()
    
    if 'esearchresult' not in search_data:
        print("Error: 'esearchresult' not in search_data")
        return all_articles

    webenv = search_data['esearchresult']['webenv']
    query_key = search_data['esearchresult']['querykey']
    count = int(search_data['esearchresult']['count'])
    
    while request_count < max_requests and len(all_articles) < count:
        payload = {
            'db': 'pubmed',
            'query_key': query_key,
            'WebEnv': webenv,
            'retmode': 'json',
            'retstart': request_count * 500,
            'retmax': 500,
        }
        
        response = requests.post(f"{BASE_URL}/esummary.fcgi", data=payload)
        summary_data = response.json()
        print(summary_data)
        
        if 'result' not in summary_data:
            print("Error: 'result' not in summary_data")
            break

        for uid in summary_data['result']['uids']:
            if uid not in retrieved_ids:
                retrieved_ids.add(uid)
        
        request_count += 1
        random_pause()
    
    # Second API call to efetch for abstracts
    for i in range(0, len(retrieved_ids), 500):
        uids_batch = list(retrieved_ids)[i:i+500]
        payload = {
            'db': 'pubmed',
            'id': ','.join(uids_batch),
            'retmode': 'text',
            'rettype': 'abstract'
        }
        
        response = requests.post(f"{BASE_URL}/efetch.fcgi", data=payload)
        abstracts = response.text
        
        for abstract in abstracts.split('\n\n'):
            all_articles.append(abstract.strip())
        
        random_pause()
    
    return all_articles

# Fetch the data
articles = fetch_pubmed_abstracts(max_requests=1)

# Output the first few articles
for article in articles[:5]:
    print(article)
    print("\n---\n")

{'header': {'type': 'esummary', 'version': '0.3'}, 'result': {'uids': ['39025858', '39025253', '39024357', '39021520', '39020071', '39020007', '39019223', '39017830', '39007360', '39005041', '39001720', '39001412', '39000239', '39000032', '38999300', '38997260', '38995805', '38995483', '38994987', '38994587', '38993763', '38991799', '38990904', '38987064', '38987014', '38982930', '38982304', '38980067', '38979497', '38978144', '38976221', '38974594', '38973128', '38972766', '38972468', '38970594', '38967905', '38967663', '38967397', '38967145', '38964821', '38964373', '38962842', '38960783', '38960534', '38960530', '38960496', '38957985', '38956330', '38955889', '38955096', '38954524', '38952362', '38950808', '38950161', '38948323', '38946118', '38946101', '38945657', '38945653', '38945651', '38945649', '38945645', '38944502', '38944430', '38940226', '38937015', '38937014', '38929042', '38929040', '38927967', '38927393', '38925497', '38922136', '38921350', '38919622', '38914279', '3891

### Convert Raw JSON to Dataframe

In [21]:
pubmed_data = pd.json_normalize(articles)

In [22]:
pubmed_data

0
1
2
3
4
...
3665
3666
3667
3668
3669


# Examine Data Structure

### PUBMED Data