# Call Data from Clinicaltrial.gov API

In [13]:
# Import libraries
import requests
import json
import os
import datetime
import re
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import shutil
import pandas as pd

### Do a Test Call to API to check for bugs

In [28]:
# Define the base URL for the API
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Define the query parameters and desired fields
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    'postFilter.advanced': 'AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL',
    'fields': 'NCTId|Condition|PrimaryCompletionDate|BriefSummary|EnrollmentCount|Sex|MinimumAge|MaximumAge',
    'pageSize': 5  # Limit to 5 records for testing
}

# Function to make the API call
def fetch_clinical_trials(params):
    try:
        response = requests.get(base_url, params=params, headers={"accept": "application/json"})
        # Print the final URL to debug any issues with the query parameters
        print(f"Request URL: {response.url}")
        # Check if the request was successful
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to retrieve data: {response.status_code} - {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Fetch the data and print it to test the connection
data = fetch_clinical_trials(params)

# Check if data was retrieved successfully
if data:
    # Print the JSON data
    print(json.dumps(data, indent=2))
else:
    print("No data retrieved or error occurred.")



Request URL: https://clinicaltrials.gov/api/v2/studies?format=json&postFilter.overallStatus=COMPLETED&postFilter.advanced=AREA%5BStudyType%5DINTERVENTIONAL+OR+OBSERVATIONAL&fields=NCTId%7CCondition%7CPrimaryCompletionDate%7CBriefSummary%7CEnrollmentCount%7CSex%7CMinimumAge%7CMaximumAge&pageSize=5
{
  "studies": [
    {
      "protocolSection": {
        "identificationModule": {
          "nctId": "NCT00117624"
        },
        "statusModule": {},
        "descriptionModule": {
          "briefSummary": "The purpose of this study is to compare the efficacy of darbepoetin alfa administered using a front-loading approach with subjects receiving standard weekly dosing in the treatment of anemia in subjects with a non-myeloid malignancy and receiving multicycle chemotherapy."
        },
        "conditionsModule": {
          "conditions": [
            "Anemia"
          ]
        },
        "designModule": {},
        "eligibilityModule": {
          "sex": "ALL",
          "minimumAge

## Run full call to API

In [29]:
# Define the base URL for the API
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Define the query parameters and desired fields
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    'postFilter.advanced': 'AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL',
    'fields': 'NCTId|Condition|PrimaryCompletionDate|BriefSummary|EnrollmentCount|Sex|MinimumAge|MaximumAge',
    'pageSize': 100  # Number of records to fetch per request
}

# Function to make the API call
def fetch_clinical_trials(params):
    try:
        response = requests.get(base_url, params=params, headers={"accept": "application/json"})
        # Print the final URL to debug any issues with the query parameters
        #print(f"Request URL: {response.url}")
        # Check if the request was successful
        if response.status_code == 200:
            print("API call successful")
            return response.json()
        else:
            print(f"Failed to retrieve data: {response.status_code} - {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Function to extract relevant data from the API response
def extract_study_data(study):
    protocol_section = study.get('protocolSection', {})
    identification_module = protocol_section.get('identificationModule', {})
    status_module = protocol_section.get('statusModule', {})
    description_module = protocol_section.get('descriptionModule', {})
    conditions_module = protocol_section.get('conditionsModule', {})
    design_module = protocol_section.get('designModule', {})
    eligibility_module = protocol_section.get('eligibilityModule', {})
    
    return {
        'NCTId': identification_module.get('nctId', 'N/A'),
        'Condition': conditions_module.get('conditions', ['N/A']),
        'PrimaryCompletionDate': status_module.get('primaryCompletionDateStruct', {}).get('date', 'N/A'),
        'BriefSummary': description_module.get('briefSummary', 'N/A'),
        'EnrollmentCount': design_module.get('enrollmentInfo', {}).get('count', 'N/A'),
        'Sex': eligibility_module.get('sex', 'N/A'),
        'MinimumAge': eligibility_module.get('MinimumAge', 'N/A'),
        'MaximumAge': eligibility_module.get('MaximumAge', 'N/A'),
    }

# Fetch and process the data with a delay between requests and avoid duplicates
def fetch_and_process_data(params, max_requests=100):  # Increase max_requests for full data retrieval
    all_extracted_data = []
    seen_nct_ids = set()
    next_page_token = None

    for _ in range(max_requests):
        if next_page_token:
            params['pageToken'] = next_page_token

        data = fetch_clinical_trials(params)
        if data:
            studies = data.get('studies', [])
            next_page_token = data.get('nextPageToken', None)

            print(f"Number of studies fetched: {len(studies)}")  # Debugging statement
            for study in studies:
                nct_id = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId')
                if nct_id and nct_id not in seen_nct_ids:  # Make loop to eliminate adding duplicate study IDs
                    seen_nct_ids.add(nct_id)
                    extracted_data = extract_study_data(study)
                    all_extracted_data.append(extracted_data)
                    #print(json.dumps(extracted_data, indent=2))  # Debugging statement
                else:
                    print(f"Duplicate or missing NCTId: {nct_id}")

            if not next_page_token:
                break  # No more pages to fetch

        # Pause the execution for a short, random period of time to avoid overwhelming the server
        time.sleep(5 + 10 * random.random())
    
    return all_extracted_data

# Fetch the data and process it
all_extracted_data = fetch_and_process_data(params)

# Optionally, save the data to a file
with open('clinical_trials_data.json', 'w') as outfile:
    json.dump(all_extracted_data, outfile, indent=2)

print("Data fetching and processing complete.")

API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of studies fetched: 100
API call successful
Number of s

## Examine Compiled Data from API Call (100 records per quest, 100 requests total, n = 10,000)

In [30]:
# Convert the extracted data to a DataFrame
clinical_trials_df = pd.DataFrame(all_extracted_data)
# Save DataFrame to a CSV file
clinical_trials_df.to_csv('clinical_trials_data.csv', index=False)
# Dataframe shape
clinical_trials_df.shape

(10000, 8)

In [31]:
# Display the DataFrame
clinical_trials_df.head(25)

Unnamed: 0,NCTId,Condition,PrimaryCompletionDate,BriefSummary,EnrollmentCount,Sex,MinimumAge,MaximumAge
0,NCT00117624,[Anemia],,The purpose of this study is to compare the ef...,,ALL,,
1,NCT02105324,[Type 1 Diabetes],2014-08,This study will test the hypothesis that a wea...,19.0,ALL,,
2,NCT05109624,[COVID-19 Acute Respiratory Distress Syndrome],2022-01-01,the aim of the study is to assess safety and e...,52.0,ALL,,
3,NCT03298724,[Behavior Problem],2018-06-25,There are two aims of this study (a) to examin...,31.0,ALL,,
4,NCT02898324,[Bullying],2017-12,"Bullying is a major problem worldwide and, wit...",4485.0,ALL,,
5,NCT04297124,[Healthy Volunteer],2021-06-04,"CC-90009-CP-001 is a Phase 1, single-center, o...",8.0,MALE,,
6,NCT00759824,[Small Cell Lung Carcinoma],2013-12,The primary aim of this study is to determine ...,64.0,ALL,,
7,NCT00116324,[Asthma],,The purpose of this study is to examine a spec...,150.0,ALL,,
8,NCT00811824,[Breast Cancer],2009-09,This study is testing the effects of exercise ...,42.0,FEMALE,,
9,NCT04347824,[Covid-19],2020-12-31,"This non-interventional, observational study r...",223.0,ALL,,


# EDA of Data Other than Brief Summary

In [None]:
# Missing values

In [33]:
# Gender stats
clinical_trials_df['Sex'].value_counts()

Sex
ALL       8479
FEMALE     970
MALE       545
N/A          6
Name: count, dtype: int64

In [None]:
# Age stats by sex

In [None]:
# Date stats by sex

In [None]:
# Number of conditions by sex

In [None]:
# Enrollment Numbers by Sex

# Apply Preprocessing Steps to Brief Summary Text Data