# Extracting Data via API Requests

# Load Libraries and Functions

In [None]:
import requests
import json
import os
import datetime
import re
import time
import warnings
from io import StringIO
import sys

from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import shutil
import pandas as pd
import numpy as np
from tabulate import tabulate
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from imblearn.metrics import specificity_score, sensitivity_score
import nltk
from nltk.classify import NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from sklearn.model_selection import cross_val_score, cross_val_predict

import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models

import spacy
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from wordcloud import WordCloud

import en_core_web_sm
nlp = en_core_web_sm.load()
# Download necessary NLTK data
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('stopwords')

In [None]:
# Notebook-wide workflow settings

# Silence every warning category for the remainder of the session
warnings.filterwarnings("ignore")

# Shared WordNet lemmatizer instance for later text processing
lemmatizer = WordNetLemmatizer()

# Call Data from ClinicalTrials.gov API

## Do a Test Call to API to check for bugs

In [None]:
# Endpoint of the ClinicalTrials.gov v2 studies API
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Query for the smoke test: completed studies restricted to a single sex,
# returning only the listed fields and only a handful of records.
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    'postFilter.advanced': (
        '(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)'
        ' AND (AREA[Sex]MALE OR AREA[Sex]FEMALE)'
    ),
    # The API takes the requested fields as one pipe-delimited string
    'fields': '|'.join([
        'NCTId',
        'Condition',
        'StartDate',
        'PrimaryCompletionDate',
        'BriefSummary',
        'EnrollmentCount',
        'Sex',
        'MinimumAge',
        'MaximumAge',
    ]),
    'pageSize': 3,  # Limit to 3 records for testing
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to the studies endpoint and return the parsed JSON.

    The target URL is read from the module-level ``base_url`` at call time.

    Args:
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on any other status code, on a network/timeout error, or
        when the body is not valid JSON. Failures are reported via ``print``.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
        # Print the final URL to debug any issues with the query parameters
        print(f"Request URL: {response.url}")
        if response.status_code == 200:
            return response.json()
        print(f"Failed to retrieve data: {response.status_code} - {response.text}")
        return None
    # ValueError covers a malformed JSON body on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Request failed: {e}")
        return None

# Run the small test query against the API
data = fetch_clinical_trials(params)

# Pretty-print the JSON payload when one came back; otherwise report the failure
print(json.dumps(data, indent=2) if data else "No data retrieved or error occurred.")

## Run full call to API

In [None]:
# Endpoint of the ClinicalTrials.gov v2 studies API
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Base query for the full pull: completed studies, selected fields only,
# fetched in pages of 1000 records.
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    'postFilter.advanced': '(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)',
    # The API takes the requested fields as one pipe-delimited string
    'fields': '|'.join([
        'NCTId',
        'Condition',
        'StartDate',
        'PrimaryCompletionDate',
        'BriefSummary',
        'EnrollmentCount',
        'Sex',
        'MinimumAge',
        'MaximumAge',
    ]),
    'pageSize': 1000,  # Number of records to fetch per request
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to the studies endpoint and return the parsed JSON.

    The target URL is read from the module-level ``base_url`` at call time.

    Args:
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on any other status code, on a network/timeout error, or
        when the body is not valid JSON. Outcomes are reported via ``print``.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
        # Uncomment to debug how the query parameters were encoded:
        #print(f"Request URL: {response.url}")
        if response.status_code == 200:
            print("API call successful")
            return response.json()
        print(f"Failed to retrieve data: {response.status_code} - {response.text}")
        return None
    # ValueError covers a malformed JSON body on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Request failed: {e}")
        return None

# Function to extract relevant data from the API response
def extract_study_data(study):
    """Flatten one study record from the API response into a single-level dict.

    Args:
        study: One element of the response's ``studies`` list; its
            ``protocolSection`` holds the per-topic modules read below.

    Returns:
        A dict with the keys NCTId, Condition, StartDate,
        PrimaryCompletionDate, BriefSummary, EnrollmentCount, Sex, MinimumAge
        and MaximumAge. Missing values fall back to ``'N/A'`` (``['N/A']``
        for Condition). Returns ``None`` when the record has no ``sex`` or
        no ``briefSummary``.
    """
    protocol = study.get('protocolSection', {})
    eligibility = protocol.get('eligibilityModule', {})
    description = protocol.get('descriptionModule', {})

    # Records without a sex or a brief summary are excluded
    if not ('sex' in eligibility and 'briefSummary' in description):
        return None

    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    conditions = protocol.get('conditionsModule', {})
    design = protocol.get('designModule', {})

    start_struct = status.get('startDateStruct', {})
    completion_struct = status.get('primaryCompletionDateStruct', {})
    enrollment = design.get('enrollmentInfo', {})

    record = {}
    record['NCTId'] = identification.get('nctId', 'N/A')
    record['Condition'] = conditions.get('conditions', ['N/A'])
    record['StartDate'] = start_struct.get('date', 'N/A')
    record['PrimaryCompletionDate'] = completion_struct.get('date', 'N/A')
    record['BriefSummary'] = description.get('briefSummary', 'N/A')
    record['EnrollmentCount'] = enrollment.get('count', 'N/A')
    record['Sex'] = eligibility.get('sex', 'N/A')
    record['MinimumAge'] = eligibility.get('minimumAge', 'N/A')
    record['MaximumAge'] = eligibility.get('maximumAge', 'N/A')
    return record

# Fetch and process the data with a delay between requests and avoid duplicates
def fetch_and_process_data(params, max_requests):  # Increase max_requests for more data retrieval
    """Page through the studies API and collect one flat record per unique study.

    Args:
        params: Base query parameters. The dict is copied, so the paging
            token added between requests never leaks into the caller's dict.
        max_requests: Upper bound on the number of API requests to make.

    Returns:
        A list of the dicts produced by ``extract_study_data``, one per
        unique NCT id. Studies that ``extract_study_data`` rejects (it
        returns ``None`` for them) are skipped rather than stored.
    """
    request_params = dict(params)
    all_extracted_data = []
    seen_nct_ids = set()

    for request_number in range(max_requests):
        data = fetch_clinical_trials(request_params)
        if data:
            studies = data.get('studies', [])
            next_page_token = data.get('nextPageToken')

            print(f"Number of studies fetched: {len(studies)}")  # Debugging statement
            for study in studies:
                nct_id = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId')
                if not nct_id or nct_id in seen_nct_ids:
                    print(f"Duplicate or missing NCTId: {nct_id}")
                    continue
                seen_nct_ids.add(nct_id)
                extracted_data = extract_study_data(study)
                # None marks a study excluded by extract_study_data; storing
                # it would put null rows into the saved JSON / DataFrame.
                if extracted_data is not None:
                    all_extracted_data.append(extracted_data)

            if not next_page_token:
                break  # No more pages to fetch
            request_params['pageToken'] = next_page_token
        # A failed request leaves request_params untouched, so the next
        # iteration retries the same page.

        # Pause for a short, random period to avoid overwhelming the server;
        # there is nothing to wait for after the final request.
        if request_number < max_requests - 1:
            time.sleep(5 + 10 * random.random())

    return all_extracted_data

# Narrow the base query to single-sex (MALE or FEMALE) studies and fetch them
random_sample_params = {
    **params,
    'postFilter.advanced': params['postFilter.advanced'] + ' AND (AREA[Sex]MALE OR AREA[Sex]FEMALE)',
}
random_sample_studies = fetch_and_process_data(random_sample_params, max_requests=15)

# Only one query is run here, so its results make up the whole data set
all_data = random_sample_studies

print("Data fetching and processing complete.")

## Save Raw API Data as .csv and .json

In [None]:
# Tabulate the extracted study records
all_df = pd.DataFrame(data=all_data)

# Make sure the "raw_data_files" output folder exists
os.makedirs("raw_data_files", exist_ok=True)

# Write the table as CSV (without the index column) into "raw_data_files"
all_df.to_csv(
    path_or_buf=os.path.join("raw_data_files", "clinical_trials.csv"),
    index=False,
)

# Write the same records as indented JSON into "raw_data_files"
with open(os.path.join("raw_data_files", "clinical_trials.json"), mode='w') as outfile:
    json.dump(all_data, fp=outfile, indent=2)

# Call Data from PUBMED Entrez API

## Do a Test Call to API to check for bugs

## Run full call to API

In [None]:
# Endpoint of the Entrez ESearch utility (returns matching record IDs)
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# Search PubMed and ask for JSON output, up to 1000 IDs per request
params = dict(
    db='pubmed',
    term='completed[Title/Abstract] AND (interventional[Filter] OR observational[Filter])',
    retmode='json',
    retmax=1000,  # Number of records to fetch per request
)

# Function to make the API call to get IDs
def fetch_pubmed_ids(params, timeout=30):
    """Run one ESearch request and return the parsed JSON response.

    The target URL is read from the module-level ``base_url`` at call time.

    Args:
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on any other status code, on a network/timeout error, or
        when the body is not valid JSON. Outcomes are reported via ``print``.
    """
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            print("API call successful")
            return response.json()
        print(f"Failed to retrieve data: {response.status_code} - {response.text}")
        return None
    # ValueError covers a malformed JSON body on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Request failed: {e}")
        return None

# Function to fetch the details for the IDs retrieved
def fetch_pubmed_details(ids, timeout=30):
    """Fetch ESummary records for a batch of PubMed IDs.

    Args:
        ids: Iterable of PubMed UIDs (strings or ints).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` when ``ids`` is empty, on any other status code, on a
        network/timeout error, or when the body is not valid JSON.
    """
    id_list = [str(uid) for uid in ids]
    if not id_list:
        return None  # nothing to look up; avoid a pointless request

    details_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    details_params = {
        'db': 'pubmed',
        'id': ','.join(id_list),
        'retmode': 'json',
    }
    try:
        # The ID list goes in the POST body: a batch of up to 1000 UIDs makes
        # a GET URL very long, and NCBI asks for POST above roughly 200 UIDs.
        response = requests.post(details_url, data=details_params, timeout=timeout)
        if response.status_code == 200:
            print("Details API call successful")
            return response.json()
        print(f"Failed to retrieve details: {response.status_code} - {response.text}")
        return None
    # ValueError covers a malformed JSON body on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Details request failed: {e}")
        return None

# Function to extract relevant data from the API response
def extract_study_data(study):
    """Map one ESummary record onto the flat dict used by the rest of the notebook.

    Args:
        study: A per-UID summary dict from the ESummary ``result`` object.

    Returns:
        A dict with the keys Id, Title, Source, PubDate, Authors, Abstract,
        Volume, Issue and Pages. Scalar fields default to ``'N/A'``;
        Authors is a list of each author's ``name`` (empty when the record
        lists no authors).
    """
    leading_fields = (
        ('Id', 'uid'),
        ('Title', 'title'),
        ('Source', 'source'),
        ('PubDate', 'pubdate'),
    )
    trailing_fields = (
        # NOTE(review): 'Abstract' is filled from 'elocationid', which does
        # not look like abstract text — confirm the intended source field.
        ('Abstract', 'elocationid'),
        ('Volume', 'volume'),
        ('Issue', 'issue'),
        ('Pages', 'pages'),
    )

    summary = {}
    for out_key, source_key in leading_fields:
        summary[out_key] = study.get(source_key, 'N/A')

    author_names = []
    for author in study.get('authors', []):
        author_names.append(author['name'])
    summary['Authors'] = author_names

    for out_key, source_key in trailing_fields:
        summary[out_key] = study.get(source_key, 'N/A')
    return summary

# Fetch and process the data
def fetch_and_process_data(params, max_requests=15):  # Increase max_requests for more data retrieval
    """Page through ESearch results and collect one flat record per PubMed ID.

    ESearch pages by offset: each request asks for the window starting at
    ``retstart``, and the response's ``count`` gives the total number of
    matches. The offset advances by the number of IDs actually returned.

    Args:
        params: Base ESearch parameters. The dict is copied, so the paging
            offset never leaks into the caller's dict. A ``retstart`` already
            present is used as the starting offset.
        max_requests: Upper bound on the number of ESearch requests to make.

    Returns:
        A list of the dicts produced by ``extract_study_data``.
    """
    request_params = dict(params)
    retstart = int(request_params.get('retstart', 0))
    all_extracted_data = []

    for request_number in range(max_requests):
        request_params['retstart'] = retstart
        data = fetch_pubmed_ids(request_params)
        if data:
            search_result = data.get('esearchresult', {})
            ids = search_result.get('idlist', [])
            if not ids:
                # Empty page: past the last match, or the search returned no
                # ID list (e.g. an error payload) — nothing more to fetch.
                break

            details = fetch_pubmed_details(ids)
            if details:
                result = details.get('result', {})
                for study_id in result.get('uids', []):
                    study = result.get(study_id)
                    if study:
                        all_extracted_data.append(extract_study_data(study))

                # Advance only after the page was fully processed, so a
                # failed details call is retried on the next iteration.
                retstart += len(ids)
                if retstart >= int(search_result.get('count', 0)):
                    break  # No more pages to fetch

        # Pause for a short, random period to avoid overwhelming the server;
        # there is nothing to wait for after the final request.
        if request_number < max_requests - 1:
            time.sleep(5 + 10 * random.random())

    return all_extracted_data

# Run the PubMed query; its results are the whole data set for this source,
# so both names refer to the same list of records
all_data = random_sample_studies = fetch_and_process_data(params, max_requests=15)

print("Data fetching and processing complete.")

In [None]:
all_data

# At this point all_data holds the PubMed records, while all_df is still the
# clinical-trials table. Build a separate DataFrame and write to PubMed-specific
# files so the clinical-trials outputs are not overwritten with PubMed data.
pubmed_df = pd.DataFrame(all_data)

# Make sure the "raw_data_files" output folder exists
os.makedirs("raw_data_files", exist_ok=True)

# Save the PubMed DataFrame to a CSV file in the "raw_data_files" folder
pubmed_df.to_csv(os.path.join("raw_data_files", "pubmed_studies.csv"), index=False)

# Save the PubMed records to a JSON file in the "raw_data_files" folder
with open(os.path.join("raw_data_files", "pubmed_studies.json"), 'w') as outfile:
    json.dump(all_data, outfile, indent=2)

# Data Wrangling to Standardize Features from Both Data Sources

### Clinical Trial Data

### PUBMED Data

### Combine Data