In [None]:
# ==============================================================================
# Step 1: Install and Import Necessary Libraries
# ==============================================================================
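# Install requests and pandas first if they are not already available
# in your environment (e.g. `pip install requests pandas`).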

import requests
import pandas as pd
import json
import time

# Confirm that the imports above completed without raising.
print("Libraries imported successfully.")

# ==============================================================================
# Step 2: Define API Configuration and Search Parameters
# ==============================================================================
# The base URL for the ClinicalTrials.gov API v2
BASE_URL = "https://clinicaltrials.gov/api/v2/studies"

# Search expression; fetch_clinical_trials sends it as the `query.term` parameter.
SEARCH_QUERY = "Ozempic AND Type 2 Diabetes"

# Overall-status filter; sent as the `filter.overallStatus` parameter.
STUDY_STATUS = "RECRUITING"  # Other options: "COMPLETED", "NOT_YET_RECRUITING", etc.

# --- Pagination Settings ---
# Set a limit on the number of pages to fetch to avoid making too many requests.
# Set to None to fetch all pages.
MAX_PAGES = 5

# ==============================================================================
# Step 3: Function to Fetch and Process Data from the API
# ==============================================================================

def _extract_trial_record(study):
    """Flatten one study object from the API response into a row dictionary."""
    # `or {}` / `or []` (instead of a .get() default) also covers keys that are
    # present but explicitly null, which would otherwise raise AttributeError.
    protocol = study.get('protocolSection') or {}
    id_module = protocol.get('identificationModule') or {}
    status_module = protocol.get('statusModule') or {}
    conditions_module = protocol.get('conditionsModule') or {}
    design_module = protocol.get('designModule') or {}
    sponsor_module = protocol.get('sponsorCollaboratorsModule') or {}

    conditions = conditions_module.get('conditions') or []
    phases = design_module.get('phases') or []
    start_struct = status_module.get('startDateStruct') or {}
    completion_struct = status_module.get('completionDateStruct') or {}
    lead_sponsor = sponsor_module.get('leadSponsor') or {}

    return {
        'NCT ID': id_module.get('nctId', 'N/A'),
        'Title': id_module.get('briefTitle', 'N/A'),
        'Status': status_module.get('overallStatus', 'N/A'),
        # Empty lists fall back to 'N/A', consistent with every other column.
        'Conditions': ", ".join(conditions) if conditions else 'N/A',
        'Study Type': design_module.get('studyType', 'N/A'),
        'Phases': ", ".join(phases) if phases else 'N/A',
        'Start Date': start_struct.get('date', 'N/A'),
        'Completion Date': completion_struct.get('date', 'N/A'),
        'Sponsor': lead_sponsor.get('name', 'N/A'),
    }


def fetch_clinical_trials(query, status=None, max_pages=None, timeout=30):
    """
    Fetches trial data from the ClinicalTrials.gov API, handles pagination,
    and extracts relevant fields.

    Args:
        query: Search expression sent as the `query.term` parameter.
        status: Optional value for the `filter.overallStatus` parameter;
            no status filter is sent when this is falsy.
        max_pages: Maximum number of pages to request. None means keep going
            until the API stops returning a `nextPageToken`. Zero or a
            negative number means no request is made at all.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dictionaries, one per study, with the keys 'NCT ID',
        'Title', 'Status', 'Conditions', 'Study Type', 'Phases',
        'Start Date', 'Completion Date' and 'Sponsor'. Missing values are
        reported as 'N/A'. If a request fails or the response is not valid
        JSON, the error is printed and the studies collected so far are
        returned.
    """
    all_trials_data = []

    # A page budget of zero (or less) means there is nothing to fetch.
    if max_pages is not None and max_pages <= 0:
        return all_trials_data

    # These parameters are identical for every page; only the page token changes.
    params = {
        'query.term': query,
        'pageSize': 100,  # Studies requested per page
        'format': 'json',
    }
    if status:
        params['filter.overallStatus'] = status

    page_count = 0

    while True:
        print(f"Fetching page {page_count + 1}...")

        # --- Make the API call ---
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(BASE_URL, params=params, timeout=timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
            # Decoding stays inside the try so a non-JSON body is reported
            # like any other failure instead of discarding collected data.
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"An error occurred: {e}")
            break

        studies = data.get('studies', [])
        if not studies:
            print("No more studies found.")
            break

        # --- Extract desired information from each study ---
        all_trials_data.extend(_extract_trial_record(study) for study in studies)

        # --- Handle pagination ---
        page_token = data.get('nextPageToken')
        page_count += 1

        if not page_token:
            print("All pages have been fetched.")
            break
        if max_pages is not None and page_count >= max_pages:
            print(f"Reached max page limit of {max_pages}.")
            break

        params['pageToken'] = page_token

        # Be a good citizen and don't spam the API
        time.sleep(0.5)

    return all_trials_data

# ==============================================================================
# Step 4: Run the Extraction and Create a Pandas DataFrame
# ==============================================================================

print(f"Starting extraction for query: '{SEARCH_QUERY}'")
trials_list = fetch_clinical_trials(
    SEARCH_QUERY,
    status=STUDY_STATUS,
    max_pages=MAX_PAGES,
)

if not trials_list:
    print("\nNo data was extracted. The program will now exit.")
else:
    # Each dictionary in the list becomes one row of the table.
    df = pd.DataFrame(trials_list)
    print(f"\nSuccessfully extracted data for {len(df)} trials.")

    # Show the first rows as a quick preview.
    # `display` is supplied by the notebook environment rather than imported here.
    print("\n--- Data Preview ---")
    display(df.head())

    # Summarise columns, non-null counts and dtypes.
    print("\n--- DataFrame Info ---")
    df.info()

    # ==========================================================================
    # Step 5: Save the DataFrame to a CSV File in Google Colab
    # ==========================================================================
    # The CSV is written to the current working directory; in Colab that is
    # the virtual machine's file system, visible in the 'Files' sidebar tab.
    file_name = "clinical_trials_data.csv"
    df.to_csv(file_name, index=False)
    print(f"\nData successfully saved to '{file_name}'. You can download it from the Colab file browser.")

Libraries imported successfully.
Starting extraction for query: 'Ozempic AND Type 2 Diabetes'
Fetching page 1...
All pages have been fetched.

Successfully extracted data for 20 trials.

--- Data Preview ---


Unnamed: 0,NCT ID,Title,Status,Conditions,Study Type,Phases,Start Date,Completion Date,Sponsor
0,NCT05870462,Semaglutide and Vascular Regeneration,RECRUITING,"Atherosclerosis, Cardiovascular Diseases, Diab...",INTERVENTIONAL,PHASE4,2023-04-29,2024-12,Canadian Medical and Surgical Knowledge Transl...
1,NCT06533527,Holding vs. Continuing Incretin-based Therapie...,RECRUITING,"Diabetes Mellitus, Type 2, Obesity, Gastroparesis",INTERVENTIONAL,,2024-07-31,2025-07,The Cleveland Clinic
2,NCT05305794,"Effect of Weekly GLP1 Agonist Treatment in ""do...",RECRUITING,Double Diabetes,INTERVENTIONAL,PHASE3,2022-07-12,2026-08,Centre Hospitalier Universitaire Dijon
3,NCT06634927,"Pharmacokinetic Similarity, Safety, and Immuno...",RECRUITING,Type 2 Diabetes,INTERVENTIONAL,PHASE1,2024-09-20,2025-02-10,"Hangzhou Zhongmei Huadong Pharmaceutical Co., ..."
4,NCT05569772,Semaglutide for the Treatment of Glucose Intol...,RECRUITING,Glucose Intolerance After a Recent History of ...,INTERVENTIONAL,PHASE3,2023-09-14,2028-12,Universitaire Ziekenhuizen KU Leuven



--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   NCT ID           20 non-null     object
 1   Title            20 non-null     object
 2   Status           20 non-null     object
 3   Conditions       20 non-null     object
 4   Study Type       20 non-null     object
 5   Phases           20 non-null     object
 6   Start Date       20 non-null     object
 7   Completion Date  20 non-null     object
 8   Sponsor          20 non-null     object
dtypes: object(9)
memory usage: 1.5+ KB

Data successfully saved to 'clinical_trials_data.csv'. You can download it from the Colab file browser.
