In [37]:
import json
import os
import time

import pandas as pd
import requests

# Diseases for which trial data can be downloaded; pick one by index below.
diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy"
]

disease_to_consider = diseases[4]

# Endpoint and query parameters for the first API call.
base_url = "https://clinicaltrials.gov/api/v2/studies"
params = {
    "query.titles": disease_to_consider,
    "pageSize": 100
}

# Seconds to wait for the server before giving up on a request. Without a
# timeout a stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
# Pause between consecutive page requests.
PAGE_DELAY_SECONDS = 3

# Not used by this cell; kept so the name still exists for later cells.
data_list = []
# One entry per fetched page; each entry is that page's list of studies.
all_data = []

# Follow the pagination until the response carries no nextPageToken.
while True:
    # Print the current URL (for debugging purposes; not URL-encoded).
    print("Fetching data from:", base_url + '?' + '&'.join(f"{k}={v}" for k, v in params.items()))

    # Network-level failures are reported the same way as HTTP errors:
    # print, stop paging, and keep whatever was collected so far.
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print("Failed to fetch data. Request error:", exc)
        break

    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        break

    data = response.json()  # Parse JSON response
    studies = data.get('studies', [])  # Extract the list of studies
    all_data.append(studies)
    # Report progress for every page, including the last one.
    print(len(all_data))

    # Continue with the next page if the API returned a token for it.
    nextPageToken = data.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken

    time.sleep(PAGE_DELAY_SECONDS)

# Make sure the output directory exists before writing the page-nested result.
os.makedirs("data", exist_ok=True)
with open("data/"+disease_to_consider+".json", "w", encoding="utf-8") as json_file:
    json.dump(all_data, json_file, indent=4)

Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=duchenne muscular dystrophy&pageSize=100
1
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=duchenne muscular dystrophy&pageSize=100&pageToken=NF0g5JqBl_kgwA
2
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=duchenne muscular dystrophy&pageSize=100&pageToken=NF0g5JKOlfgoyQQ
3
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=duchenne muscular dystrophy&pageSize=100&pageToken=NF0g5JGOlPYvwgI


In [30]:
# Preview only the first study of the first fetched page; displaying the
# whole page (up to 100 nested study records) floods the notebook output.
all_data[0][:1]

[{'protocolSection': {'identificationModule': {'nctId': 'NCT00160420',
    'orgStudyIdInfo': {'id': 'M02-408'},
    'organization': {'fullName': 'Abbott', 'class': 'INDUSTRY'},
    'briefTitle': 'A Long-Term Study to Evaluate the Safety of Asoprisnil in the Treatment of Women With Endometriosis From Study M01-398',
    'officialTitle': 'A Phase 2, 12-Month, Open Label Extension Study to Evaluate the Safety of J867(5 mg QD) in Subjects With Endometriosis'},
   'statusModule': {'statusVerifiedDate': '2008-05',
    'overallStatus': 'COMPLETED',
    'expandedAccessInfo': {'hasExpandedAccess': False},
    'startDateStruct': {'date': '2002-12'},
    'primaryCompletionDateStruct': {'date': '2004-07', 'type': 'ACTUAL'},
    'completionDateStruct': {'date': '2004-07', 'type': 'ACTUAL'},
    'studyFirstSubmitDate': '2005-09-08',
    'studyFirstSubmitQcDate': '2005-09-08',
    'studyFirstPostDateStruct': {'date': '2005-09-12', 'type': 'ESTIMATED'},
    'lastUpdateSubmitDate': '2008-05-27',
    'l

In [None]:
import json

# Function to read a JSON file
def read_json_file(file_path):
    """Load the UTF-8 encoded JSON document at ``file_path`` and return the parsed object."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

# Example usage
file_path = "data/endometriosis.json"  # Replace with your JSON file path
# The file holds a list of pages, each page being a list of study records.
studies = read_json_file(file_path)


def _study_to_row(study):
    """Flatten one study record into a dict of display-ready column values.

    Every nested module is looked up with ``.get`` so that a study lacking a
    module produces the placeholder text instead of raising ``KeyError``.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})

    conditions = protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])

    # Multi-valued fields are collapsed to one comma-separated string each.
    if interventions_list:
        interventions = ', '.join(
            intervention.get('name', 'No intervention name listed')
            for intervention in interventions_list
        )
    else:
        interventions = "No interventions listed"

    if locations_list:
        locations = ', '.join(
            f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
            for location in locations_list
        )
    else:
        locations = "No locations listed"

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
        "Conditions": ', '.join(conditions),
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
        "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
        "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": ', '.join(design.get('phases', ['Not Available'])),
    }


# One row per study, walking every page in order.
data_list = [_study_to_row(study) for batch_study in studies for study in batch_study]

df = pd.DataFrame(data_list)

print(df)
print(len(df))

# Name the CSV after the file that was actually read rather than after a
# variable left over from another cell, so the data and its file name cannot
# disagree. The "_procesed" suffix spelling is kept so file names stay stable.
output_path = file_path.rsplit(".", 1)[0] + "_procesed.csv"
df.to_csv(output_path, index=False)

         NCT ID  Acronym      Overall Status  Start Date  \
0   NCT00160420  Unknown           COMPLETED     2002-12   
1   NCT00619866  Unknown           COMPLETED  2008-02-19   
2   NCT04664660  Unknown             UNKNOWN  2020-03-10   
3   NCT00461838  Unknown           COMPLETED     1996-09   
4   NCT03305120  Unknown           COMPLETED  2016-09-16   
..          ...      ...                 ...         ...   
95  NCT06502548   EDISON  NOT_YET_RECRUITING  2024-09-30   
96  NCT00001848  Unknown           COMPLETED     1998-11   
97  NCT02575248  Unknown             UNKNOWN     2014-10   
98  NCT06289257  Unknown           COMPLETED  2024-05-06   
99  NCT04591548  Unknown           COMPLETED  2016-05-01   

                                           Conditions  \
0                                       Endometriosis   
1                                 Endometriosis, Pain   
2               Endometriosis, Obstetric Complication   
3                                       Endometrios

In [24]:
# `studies` is a list of pages (each page is itself a list of study records),
# so this prints the number of pages, not the number of individual trials.
print(len(studies))

2


In [28]:
# Preview only the first study of the first page; displaying the whole page
# of nested study records floods the notebook output.
studies[0][:1]

[{'protocolSection': {'identificationModule': {'nctId': 'NCT00160420',
    'orgStudyIdInfo': {'id': 'M02-408'},
    'organization': {'fullName': 'Abbott', 'class': 'INDUSTRY'},
    'briefTitle': 'A Long-Term Study to Evaluate the Safety of Asoprisnil in the Treatment of Women With Endometriosis From Study M01-398',
    'officialTitle': 'A Phase 2, 12-Month, Open Label Extension Study to Evaluate the Safety of J867(5 mg QD) in Subjects With Endometriosis'},
   'statusModule': {'statusVerifiedDate': '2008-05',
    'overallStatus': 'COMPLETED',
    'expandedAccessInfo': {'hasExpandedAccess': False},
    'startDateStruct': {'date': '2002-12'},
    'primaryCompletionDateStruct': {'date': '2004-07', 'type': 'ACTUAL'},
    'completionDateStruct': {'date': '2004-07', 'type': 'ACTUAL'},
    'studyFirstSubmitDate': '2005-09-08',
    'studyFirstSubmitQcDate': '2005-09-08',
    'studyFirstPostDateStruct': {'date': '2005-09-12', 'type': 'ESTIMATED'},
    'lastUpdateSubmitDate': '2008-05-27',
    'l