# Call Data from the ClinicalTrials.gov API

In [13]:
# Import libraries
import requests
import json
import os
import datetime
import re
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import shutil
import pandas as pd

### Do a Test Call to API to check for bugs

In [38]:
# Endpoint of the studies API used for the test call
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Query parameters for a small test request.
# NOTE(review): only INTERVENTIONAL carries the AREA[StudyType] prefix; confirm
# that OBSERVATIONAL is also matched against StudyType and not some default area.
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    # Two parenthesised clauses combined with AND: study type and sex
    'postFilter.advanced': ' AND '.join([
        '(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)',
        '(AREA[Sex]MALE OR AREA[Sex]FEMALE)',
    ]),
    # Pipe-separated list of the fields requested for each study
    'fields': '|'.join([
        'NCTId',
        'Condition',
        'PrimaryCompletionDate',
        'BriefSummary',
        'EnrollmentCount',
        'Sex',
        'MinimumAge',
        'MaximumAge',
    ]),
    'pageSize': 5,  # Limit to 5 records for testing
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to ``base_url`` and return the parsed JSON body.

    Args:
        params: Query parameters forwarded to ``requests.get``.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. Without it a stalled connection would block the
            call indefinitely.

    Returns:
        The decoded JSON response on HTTP 200. ``None`` when the status code
        is anything else or when ``requests`` raises a ``RequestException``
        (including timeouts); the reason is printed in both cases.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
        # Print the final URL to debug any issues with the query parameters
        print(f"Request URL: {response.url}")
        # Check if the request was successful
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to retrieve data: {response.status_code} - {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Run the test request once to verify the connection
data = fetch_clinical_trials(params)

# An empty or missing result means the call failed; otherwise dump the JSON
if not data:
    print("No data retrieved or error occurred.")
else:
    pretty_json = json.dumps(data, indent=2)
    print(pretty_json)



Request URL: https://clinicaltrials.gov/api/v2/studies?format=json&postFilter.overallStatus=COMPLETED&postFilter.advanced=%28AREA%5BStudyType%5DINTERVENTIONAL+OR+OBSERVATIONAL%29+AND+%28AREA%5BSex%5DMALE+OR+AREA%5BSex%5DFEMALE%29&fields=NCTId%7CCondition%7CPrimaryCompletionDate%7CBriefSummary%7CEnrollmentCount%7CSex%7CMinimumAge%7CMaximumAge&pageSize=5
{
  "studies": [
    {
      "protocolSection": {
        "identificationModule": {
          "nctId": "NCT02178735"
        },
        "statusModule": {
          "primaryCompletionDateStruct": {
            "date": "2015-11"
          }
        },
        "descriptionModule": {
          "briefSummary": "To evaluate the clinical outcome and urodynamic effect of two novel vaginal tailored mesh surgeries."
        },
        "conditionsModule": {
          "conditions": [
            "Pelvic Organ Prolapse"
          ]
        },
        "designModule": {
          "enrollmentInfo": {
            "count": 104
          }
        },
     

## Run Full API Call to Retrieve an Equal Number of Male and Female Studies

In [49]:
# Endpoint of the studies API used for the full run
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Shared query parameters; the per-sex requests start from a copy of this dict.
# NOTE(review): only INTERVENTIONAL carries the AREA[StudyType] prefix; confirm
# that OBSERVATIONAL is also matched against StudyType and not some default area.
params = {
    'format': 'json',
    'postFilter.overallStatus': 'COMPLETED',
    # Two parenthesised clauses combined with AND: study type and sex
    'postFilter.advanced': ' AND '.join([
        '(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)',
        '(AREA[Sex]MALE OR AREA[Sex]FEMALE)',
    ]),
    # Pipe-separated list of the fields requested for each study
    'fields': '|'.join([
        'NCTId',
        'Condition',
        'PrimaryCompletionDate',
        'BriefSummary',
        'EnrollmentCount',
        'Sex',
        'MinimumAge',
        'MaximumAge',
    ]),
    'pageSize': 1000,  # Number of records to fetch per request
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to ``base_url`` and return the parsed JSON body.

    Args:
        params: Query parameters forwarded to ``requests.get``.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. Without it a stalled connection would block the
            whole paginated run indefinitely.

    Returns:
        The decoded JSON response on HTTP 200. ``None`` when the status code
        is anything else or when ``requests`` raises a ``RequestException``
        (including timeouts); the reason is printed in both cases.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
        # Check if the request was successful
        if response.status_code == 200:
            print("API call successful")
            return response.json()
        else:
            print(f"Failed to retrieve data: {response.status_code} - {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Function to extract relevant data from the API response
def extract_study_data(study):
    """Flatten one study record from the API response into a plain dict.

    Every value is looked up under ``study['protocolSection']``. A missing
    module or field yields the placeholder ``'N/A'`` (``['N/A']`` for the
    list of conditions) instead of raising.

    Args:
        study: One element of the ``studies`` list in the API response.

    Returns:
        A dict with the keys ``NCTId``, ``Condition``,
        ``PrimaryCompletionDate``, ``BriefSummary``, ``EnrollmentCount``,
        ``Sex``, ``MinimumAge`` and ``MaximumAge``, in that order.
    """
    protocol = study.get('protocolSection', {})

    def module(name):
        # Absent modules are treated as empty so the chained lookups stay safe
        return protocol.get(name, {})

    completion = module('statusModule').get('primaryCompletionDateStruct', {})
    enrollment = module('designModule').get('enrollmentInfo', {})
    eligibility = module('eligibilityModule')

    record = {}
    record['NCTId'] = module('identificationModule').get('nctId', 'N/A')
    record['Condition'] = module('conditionsModule').get('conditions', ['N/A'])
    record['PrimaryCompletionDate'] = completion.get('date', 'N/A')
    record['BriefSummary'] = module('descriptionModule').get('briefSummary', 'N/A')
    record['EnrollmentCount'] = enrollment.get('count', 'N/A')
    record['Sex'] = eligibility.get('sex', 'N/A')
    record['MinimumAge'] = eligibility.get('minimumAge', 'N/A')
    record['MaximumAge'] = eligibility.get('maximumAge', 'N/A')
    return record

# Fetch and process the data with a delay between requests and avoid duplicates
def fetch_and_process_data(params, max_requests=5):
    """Page through the API and collect one extracted record per unique study.

    Each iteration calls ``fetch_clinical_trials`` and, when a response comes
    back, passes every study with a not-yet-seen NCT id through
    ``extract_study_data``. Paging follows the ``nextPageToken`` returned by
    the API and stops early once no token is returned.

    Args:
        params: Base query parameters. The dict is copied, so the caller's
            object is never modified (the page token is added to the copy).
        max_requests: Upper bound on the number of requests issued. Increase
            it for full data retrieval.

    Returns:
        A list of dicts as produced by ``extract_study_data``, in the order
        the studies were received.
    """
    # Private copy: 'pageToken' must not leak into the dict the caller passed in
    request_params = dict(params)
    all_extracted_data = []
    seen_nct_ids = set()
    next_page_token = None

    for request_index in range(max_requests):
        if next_page_token:
            request_params['pageToken'] = next_page_token

        data = fetch_clinical_trials(request_params)
        if data:
            studies = data.get('studies', [])
            next_page_token = data.get('nextPageToken', None)

            print(f"Number of studies fetched: {len(studies)}")  # Debugging statement
            for study in studies:
                nct_id = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId')
                if nct_id and nct_id not in seen_nct_ids:
                    seen_nct_ids.add(nct_id)
                    all_extracted_data.append(extract_study_data(study))
                else:
                    print(f"Duplicate or missing NCTId: {nct_id}")

            if not next_page_token:
                break  # No more pages to fetch

        # A failed request falls through to here: the attempt is used up and
        # the same page is requested again on the next iteration.
        # Pause for a short, random period to avoid overwhelming the server,
        # but only when another request will actually follow.
        if request_index < max_requests - 1:
            time.sleep(5 + 10 * random.random())

    return all_extracted_data

# Run the paginated fetch once per sex, each with its own copy of the
# shared parameters and an extra sex clause appended to the advanced filter.

# MALE-only studies
male_params = dict(params)
male_params['postFilter.advanced'] = male_params['postFilter.advanced'] + ' AND AREA[Sex]MALE'
male_studies = fetch_and_process_data(male_params)

# FEMALE-only studies
female_params = dict(params)
female_params['postFilter.advanced'] = female_params['postFilter.advanced'] + ' AND AREA[Sex]FEMALE'
female_studies = fetch_and_process_data(female_params)

# Male records first, then female records
all_extracted_data = [*male_studies, *female_studies]

# Persist the combined records as indented JSON
with open('clinical_trials_data.json', 'w') as outfile:
    outfile.write(json.dumps(all_extracted_data, indent=2))

print("Data fetching and processing complete.")

API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
Data fetching and processing complete.


## Examine Compiled Data from API Call (1,000 records per request, 10 requests total, n = 10,000)

In [50]:
# Convert the extracted data (a list of per-study dicts) to a DataFrame
clinical_trials_df = pd.DataFrame(all_extracted_data)
# Save DataFrame to a CSV file; index=False leaves the row index out of the file
clinical_trials_df.to_csv('clinical_trials_data.csv', index=False)
# Dataframe shape as (rows, columns); being the last expression, it is the cell's output
clinical_trials_df.shape

(10000, 8)

In [54]:
# Display the first and last rows of the DataFrame together as a (head, tail) tuple
clinical_trials_df.head(), clinical_trials_df.tail()

(         NCTId                                          Condition  \
 0  NCT00061035                              [Prostatic Neoplasms]   
 1  NCT00283335                           [Coronary Heart Disease]   
 2  NCT05734235  [Ultraviolet B Radiation, Microvesicle Particles]   
 3  NCT03751735  [Erectile Dysfunction Associated With Type 2 D...   
 4  NCT00821535     [Human Immunodeficiency Virus (HIV) Infection]   
 
   PrimaryCompletionDate                                       BriefSummary  \
 0                   N/A  Dr. Frederick Millard, MD, Associate Clinical ...   
 1               1998-09  This was a double-blind randomized trial compa...   
 2            2020-03-05  This study is designed to assess if ultraviole...   
 3            2019-01-13  Efficacy of Intracavernous injection of Wharto...   
 4               2009-05  To confirm safety and pharmacokinetics of mara...   
 
   EnrollmentCount   Sex MinimumAge MaximumAge  
 0              18  MALE   18 Years        N/A  
 1  

# EDA of Data Other than Brief Summary

In [None]:
# Missing values

In [55]:
# Gender stats: number of records for each distinct value in the 'Sex' column
clinical_trials_df['Sex'].value_counts()

Sex
MALE      5000
FEMALE    5000
Name: count, dtype: int64

In [None]:
# Age stats by sex

In [None]:
# Date stats by sex

In [None]:
# Number of conditions by sex

In [None]:
# Enrollment Numbers by Sex

# Apply Preprocessing Steps to Brief Summary Text Data