# Call Data from the ClinicalTrials.gov API

In [13]:
# Import libraries
import requests
import json
import os
import datetime
import re
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import shutil
import pandas as pd

### Do a Test Call to API to check for bugs

In [38]:
# Endpoint that every request in this notebook is sent to
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Small smoke-test query: completed studies limited to a single sex.
# The filter clauses and the field list are assembled from their parts so
# each piece can be read (and edited) on its own line.
# NOTE(review): AREA[...] may apply only to the term right after it, in which
# case OBSERVATIONAL is matched outside StudyType - confirm against the API's
# search-expression documentation.
params = {
    "format": "json",
    "postFilter.overallStatus": "COMPLETED",
    "postFilter.advanced": " AND ".join([
        "(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)",
        "(AREA[Sex]MALE OR AREA[Sex]FEMALE)",
    ]),
    "fields": "|".join([
        "NCTId",
        "Condition",
        "PrimaryCompletionDate",
        "BriefSummary",
        "EnrollmentCount",
        "Sex",
        "MinimumAge",
        "MaximumAge",
    ]),
    "pageSize": 5,  # Keep the test call to five records
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to ``base_url`` and return the decoded JSON body.

    Args:
        params: Query parameters passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can wait indefinitely on a stalled
            connection and hang the notebook cell.

    Returns:
        The parsed JSON on HTTP 200; ``None`` on any other status code, on a
        request error, or when the body cannot be decoded as JSON.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    # Print the final URL to debug any issues with the query parameters
    print(f"Request URL: {response.url}")

    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code} - {response.text}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # JSON decode errors are ValueError subclasses; on older versions of
        # requests they are not RequestException, so catch them explicitly.
        print(f"Failed to decode JSON response: {e}")
        return None

# Run the test query once to confirm that the connection and parameters work
data = fetch_clinical_trials(params)

# Report the failure first; otherwise pretty-print the returned payload
if not data:
    print("No data retrieved or error occurred.")
else:
    print(json.dumps(data, indent=2))



Request URL: https://clinicaltrials.gov/api/v2/studies?format=json&postFilter.overallStatus=COMPLETED&postFilter.advanced=%28AREA%5BStudyType%5DINTERVENTIONAL+OR+OBSERVATIONAL%29+AND+%28AREA%5BSex%5DMALE+OR+AREA%5BSex%5DFEMALE%29&fields=NCTId%7CCondition%7CPrimaryCompletionDate%7CBriefSummary%7CEnrollmentCount%7CSex%7CMinimumAge%7CMaximumAge&pageSize=5
{
  "studies": [
    {
      "protocolSection": {
        "identificationModule": {
          "nctId": "NCT02178735"
        },
        "statusModule": {
          "primaryCompletionDateStruct": {
            "date": "2015-11"
          }
        },
        "descriptionModule": {
          "briefSummary": "To evaluate the clinical outcome and urodynamic effect of two novel vaginal tailored mesh surgeries."
        },
        "conditionsModule": {
          "conditions": [
            "Pelvic Organ Prolapse"
          ]
        },
        "designModule": {
          "enrollmentInfo": {
            "count": 104
          }
        },
     

## Run full call to API
### Equal Number of Male and Female Studies for training set (N = 10000, n = 5000 each)
### Call a mixed set of single-sex (male-only or female-only) studies for the testing set (n = 3000)

In [57]:
# Endpoint that every request in this notebook is sent to
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Shared base query for the full pull: completed studies of either type.
# A sex clause is appended to 'postFilter.advanced' per cohort later on.
# NOTE(review): AREA[...] may apply only to the term right after it, in which
# case OBSERVATIONAL is matched outside StudyType - confirm against the API's
# search-expression documentation.
params = {
    "format": "json",
    "postFilter.overallStatus": "COMPLETED",
    "postFilter.advanced": "(AREA[StudyType]INTERVENTIONAL OR OBSERVATIONAL)",
    "fields": "|".join([
        "NCTId",
        "Condition",
        "PrimaryCompletionDate",
        "BriefSummary",
        "EnrollmentCount",
        "Sex",
        "MinimumAge",
        "MaximumAge",
    ]),
    "pageSize": 1000,  # Records requested per API call
}

# Function to make the API call
def fetch_clinical_trials(params, timeout=30):
    """Send one GET request to ``base_url`` and return the decoded JSON body.

    Args:
        params: Query parameters passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can wait indefinitely on a stalled
            connection and hang the whole paging loop.

    Returns:
        The parsed JSON on HTTP 200; ``None`` on any other status code, on a
        request error, or when the body cannot be decoded as JSON.
    """
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={"accept": "application/json"},
            timeout=timeout,
        )
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code} - {response.text}")
        return None

    try:
        payload = response.json()
    except ValueError as e:
        # JSON decode errors are ValueError subclasses; on older versions of
        # requests they are not RequestException, so catch them explicitly.
        print(f"Failed to decode JSON response: {e}")
        return None

    # Only report success once the body has actually been decoded
    print("API call successful")
    return payload

# Function to extract relevant data from the API response
def extract_study_data(study):
    """Flatten one study record from the API response into a single-level dict.

    Any module or field that is absent from the record is reported as 'N/A'
    (the condition list falls back to ['N/A']).
    """
    protocol = study.get('protocolSection', {})

    def module(name):
        # A module missing from the record behaves like an empty one
        return protocol.get(name, {})

    completion = module('statusModule').get('primaryCompletionDateStruct', {})
    enrollment = module('designModule').get('enrollmentInfo', {})
    eligibility = module('eligibilityModule')

    # Keys are inserted in the column order used by the downstream DataFrame
    record = {}
    record['NCTId'] = module('identificationModule').get('nctId', 'N/A')
    record['Condition'] = module('conditionsModule').get('conditions', ['N/A'])
    record['PrimaryCompletionDate'] = completion.get('date', 'N/A')
    record['BriefSummary'] = module('descriptionModule').get('briefSummary', 'N/A')
    record['EnrollmentCount'] = enrollment.get('count', 'N/A')
    record['Sex'] = eligibility.get('sex', 'N/A')
    record['MinimumAge'] = eligibility.get('minimumAge', 'N/A')
    record['MaximumAge'] = eligibility.get('maximumAge', 'N/A')
    return record

# Fetch and process the data with a delay between requests and avoid duplicates
def fetch_and_process_data(params, max_requests):
    """Page through the API and collect one flattened record per unique study.

    Args:
        params: Base query parameters. The dict is copied before use, so the
            caller's dict never gains a 'pageToken' entry.
        max_requests: Upper bound on the number of API calls, counting failed
            calls as well.

    Returns:
        A list of dicts built by ``extract_study_data``, in the order the
        studies were received, with repeated NCT IDs dropped.
    """
    # Work on a private copy: the page token is request state, not caller input
    request_params = dict(params)
    all_extracted_data = []
    seen_nct_ids = set()

    for request_number in range(max_requests):
        if request_number:
            # Pause for a short, random period between consecutive requests to
            # avoid overwhelming the server. Sleeping here (instead of at the
            # end of the loop body) avoids a pointless wait after the final call.
            time.sleep(5 + 10 * random.random())

        data = fetch_clinical_trials(request_params)
        if not data:
            # Failed call: the page token is unchanged, so the next iteration
            # retries the same page.
            continue

        studies = data.get('studies', [])
        print(f"Number of studies fetched: {len(studies)}")  # Debugging statement

        for study in studies:
            nct_id = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId')
            if nct_id and nct_id not in seen_nct_ids:
                seen_nct_ids.add(nct_id)
                all_extracted_data.append(extract_study_data(study))
            else:
                print(f"Duplicate or missing NCTId: {nct_id}")

        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            break  # No more pages to fetch
        request_params['pageToken'] = next_page_token

    return all_extracted_data

# Build one query per cohort: the shared base query plus a sex clause.
# Rebuilding the dict keeps the base `params` untouched and the key order intact.
male_params = {
    **params,
    'postFilter.advanced': params['postFilter.advanced'] + ' AND AREA[Sex]MALE',
}
female_params = {
    **params,
    'postFilter.advanced': params['postFilter.advanced'] + ' AND AREA[Sex]FEMALE',
}
random_sample_params = {
    **params,
    'postFilter.advanced': params['postFilter.advanced'] + ' AND (AREA[Sex]MALE OR AREA[Sex]FEMALE)',
}

# Training cohorts: five pages each of male-only and female-only studies
male_studies = fetch_and_process_data(male_params, max_requests=5)
female_studies = fetch_and_process_data(female_params, max_requests=5)

# Testing cohort: three pages of the combined male-or-female query.
# NOTE(review): this takes the first pages of the query rather than a random
# draw, and the previews further down show NCT01898065 in both the training
# and the testing frame - check for train/test overlap before modelling.
random_sample_studies = fetch_and_process_data(random_sample_params, max_requests=3)

# Training set = male + female cohorts; testing set = the combined-query pull
all_training_data = male_studies + female_studies
all_testing_data = random_sample_studies

# Keep a JSON copy of each set on disk
with open('clinical_trials_train.json', 'w') as outfile:
    outfile.write(json.dumps(all_training_data, indent=2))

with open('clinical_trials_test.json', 'w') as outfile:
    outfile.write(json.dumps(all_testing_data, indent=2))

print("Data fetching and processing complete.")

API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
API call successful
Number of studies fetched: 1000
Data fetching and processing complete.


## Examine Compiled Training Data from API Call (1,000 records per request, 10 requests total, n = 10,000)

In [58]:
# Build the training DataFrame from the list of extracted study records
train_df = pd.DataFrame(all_training_data)
# Save the training set to CSV; index=False leaves out the row-index column
train_df.to_csv('clinical_trials_train.csv', index=False)
# Show the (rows, columns) shape as the cell output
train_df.shape

(10000, 8)

In [59]:
# Preview the first and last rows of the training set; returning both as a
# tuple makes the cell display head and tail together
train_df.head(), train_df.tail()

(         NCTId                     Condition PrimaryCompletionDate  \
 0  NCT01898065     [Prostate Adenocarcinoma]               2014-11   
 1  NCT01484665             [Prostate Cancer]               2012-06   
 2  NCT01804465             [Prostate Cancer]            2020-02-27   
 3  NCT00124566             [Prostate Cancer]               2006-01   
 4  NCT00309166  [Infections, Papillomavirus]            2007-06-01   
 
                                         BriefSummary EnrollmentCount   Sex  \
 0  With functional imaging development, it become...              20  MALE   
 1  Decision-aids are tools to educate patients on...              72  MALE   
 2  The purpose of this study is to find out what ...              50  MALE   
 3  The purpose of this study is to assess the eff...             135  MALE   
 4  The main aim of this vaccine is to prevent cer...             270  MALE   
 
   MinimumAge MaximumAge  
 0   18 Years        N/A  
 1   50 Years   75 Years  
 2   18 Years  

# Examine Testing Data from API Call

In [60]:
# Build the testing DataFrame from the list of extracted study records
test_df = pd.DataFrame(all_testing_data)
# Save the testing set to CSV; index=False leaves out the row-index column
test_df.to_csv('clinical_trials_test.csv', index=False)
# Show the (rows, columns) shape as the cell output
test_df.shape

(3000, 8)

In [63]:
# Preview the first 15 rows of the testing set
test_df.head(15)

Unnamed: 0,NCTId,Condition,PrimaryCompletionDate,BriefSummary,EnrollmentCount,Sex,MinimumAge,MaximumAge
0,NCT03705065,[Analgesia],2018-10-08,"This is a Phase 4, randomized, open-label stud...",30,FEMALE,18 Years,
1,NCT00110565,"[Rheumatoid Arthritis, Menopause]",2007-07,The purpose of this study is to determine whet...,87,FEMALE,35 Years,
2,NCT03564665,[Hot Flashes],2021-04-10,The goal of this study is to further evaluate ...,40,FEMALE,25 Years,85 Years
3,NCT00052065,[Ovarian Neoplasms],2006-03,"This is a dose-ranging, open-label, Phase 1-2a...",28,FEMALE,18 Years,
4,NCT01478165,[Postoperative Nausea and Vomiting],2011-11,A total intravenous anesthesia (TIVA) is a use...,100,FEMALE,19 Years,
5,NCT02297165,[Anorexic],2017-09,The purpose of this study is to estimate the i...,100,FEMALE,13 Years,24 Years
6,NCT01898065,[Prostate Adenocarcinoma],2014-11,"With functional imaging development, it become...",20,MALE,18 Years,
7,NCT02377765,[Overactive Bladder],2016-10,This study will help to determine the effectiv...,24,FEMALE,18 Years,
8,NCT03690765,[Endometriosis],2019-11-10,"A non-interventional, observational program to...",350,FEMALE,18 Years,45 Years
9,NCT04118465,[Breech Presentation; Before Labor],2020-12-31,A comparison of external cephalic (ECV) succes...,70,FEMALE,18 Years,45 Years


# EDA of Data Other than Brief Summary

In [None]:
# Missing values

In [65]:
# Number of training studies for each value of the Sex column
train_df['Sex'].value_counts()

Sex
MALE      5000
FEMALE    5000
Name: count, dtype: int64

In [64]:
# Number of testing studies for each value of the Sex column
test_df['Sex'].value_counts()

Sex
FEMALE    1971
MALE      1029
Name: count, dtype: int64

In [None]:
# Age stats by sex

In [None]:
# Date stats by sex

In [None]:
# Number of conditions by sex

In [None]:
# Enrollment Numbers by Sex

# Apply Preprocessing Steps to Brief Summary Text Data

# Descriptive Stats of Text Data

In [None]:
# Create function for descriptive stats
def descriptive_stats(tokens, num_tokens=5, verbose=True):
    """Compute summary statistics for a collection of tokens.

    Args:
        tokens: Iterable of string tokens. Lists, tuples, generators and
            other one-shot iterators are all accepted, because the input is
            consumed exactly once.
        num_tokens: How many of the most frequent tokens to list when
            ``verbose`` is true.
        verbose: When true, print the statistics and the most common tokens.

    Returns:
        ``[total_tokens, unique_tokens, lexical_diversity, num_characters]``.
        Lexical diversity is ``unique_tokens / total_tokens``, or 0 when
        there are no tokens.
    """
    # A single Counter pass yields every statistic, so the input is never
    # re-scanned and does not need to support len().
    token_counts = Counter(tokens)
    total_tokens = sum(token_counts.values())  # Number of tokens
    unique_tokens = len(token_counts)  # Number of unique tokens
    num_characters = sum(
        len(token) * count for token, count in token_counts.items()
    )  # Number of characters
    lexical_diversity = unique_tokens / total_tokens if total_tokens > 0 else 0

    if verbose:
        print(f"There are {total_tokens} tokens in the data.")
        print(f"There are {unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The {num_tokens} most common tokens are:")
        for token, count in token_counts.most_common(num_tokens):
            print(f"'{token}': {count} times")

    # Return list of values
    return [total_tokens, unique_tokens, lexical_diversity, num_characters]