In [1]:
import requests
import pandas as pd
import os

# Intervention names (cancer therapies) to look up; each entry is passed to
# fetch_and_process_data as its own search term.
# NOTE(review): several entries look like spelling variants of one another
# (e.g. "tislelizumab" / "tilelizumab" / "tilesizumab", "envafolimab" /
# "envalfolimab" / "envolimab") — confirm these are intentional.
interventions_list = [
    "ociperlimab", "domvanalimab", "sintilimab", "monalizumab",
    "camrelizumab", "quavonlimab", "adebrelimab", "envalfolimab",
    "sugemalimab", "vibostolimab", "tiragolumab", "penpulimab",
    "tilelizumab", "cosibelimab", "envafolimab", "tifcemalimab",
    "carrelizumab", "socazolimab", "tislelizumab", "envolimab",
    "botensilimab", "tilesizumab", "cadonilimab", "triprilimab",
    "zimberelimab", "spartalizumab", "favezelimab", "tebotelimab",
    "balstilimab", "prolgolimab", "sotigalimab", "budigalimab",
    "ivonescimab", "catumaxomab", "alnuctamab", "serplulimab",
    "linvoseltamab", "tarlatamab", "emactuzumab", "fianlimab",
    "cetrelimab", "cobolimab", "feladilimab", "nofazinlimab",
    "ezabenlimab", "livmoniplimab", "odronextamab", "rulonilimab",
]

# Accumulator for the row dicts gathered across all interventions; starts empty
combined_data_list = []

# Helper: flatten one study record into per-arm intervention rows
def _extract_study_rows(study, intervention):
    """Return one row dict per (intervention, arm) pairing found in ``study``.

    Every nested module is read with ``.get`` so that a study lacking an
    optional section (for example ``descriptionModule``) falls back to the
    default values instead of raising ``KeyError``.

    Args:
        study: One element of the ``studies`` list from the API response.
        intervention: The search term that produced this study; stored in
            each row under the ``'intervention'`` key.

    Returns:
        A list of row dicts; empty when the study has no intervention whose
        arm labels match one of its arm groups.
    """
    protocol = study.get('protocolSection') or {}
    identification = protocol.get('identificationModule') or {}
    status = protocol.get('statusModule') or {}
    description = protocol.get('descriptionModule') or {}
    arms_interventions = protocol.get('armsInterventionsModule') or {}

    nct_id = identification.get('nctId', 'Unknown')
    start_date = (status.get('startDateStruct') or {}).get('date', 'Unknown Date')
    official_title = identification.get('officialTitle', "")
    detailed_description = description.get('detailedDescription', "")

    # Index the arms by label so each intervention can be joined to its arms.
    # If two arms share a label, the later one wins.
    arm_dict = {
        arm.get('label', ''): {
            'arm_label': arm.get('label', ''),
            'arm_type': arm.get('type', ''),
            'arm_description': arm.get('description', ''),
            'intervention_names': ', '.join(arm.get('interventionNames') or []),
        }
        for arm in (arms_interventions.get('armGroups') or [])
    }

    rows = []
    for intervention_detail in (arms_interventions.get('interventions') or []):
        arm_labels = intervention_detail.get('armGroupLabels') or []
        for arm_label in arm_labels:
            arm = arm_dict.get(arm_label)
            if arm is None:
                # The intervention refers to an arm the study does not list
                continue
            rows.append({
                'nctId': nct_id,
                'startDate': start_date,
                'official_title': official_title,
                'detailed_description': detailed_description,
                'arm_label': arm['arm_label'],
                'arm_type': arm['arm_type'],
                'arm_description': arm['arm_description'],
                'intervention_names': arm['intervention_names'],
                'intervention_type': intervention_detail.get('type', ''),
                'intervention_name': intervention_detail.get('name', ''),
                'intervention_description': intervention_detail.get('description', ''),
                'intervention_arm_labels': ', '.join(arm_labels),
                'intervention_other_names': ', '.join(intervention_detail.get('otherNames') or []),
                'intervention': intervention,  # Add the intervention as a label
            })
    return rows


# Helper: convert a raw start-date string to a pandas timestamp
def _parse_start_date(date_str):
    """Parse ``date_str`` trying '%m/%d/%Y', then '%Y-%m', then '%Y-%m-%d'.

    A value matching none of the formats (such as the 'Unknown Date'
    placeholder) is coerced to ``NaT`` rather than raising.
    """
    for fmt in ('%m/%d/%Y', '%Y-%m'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    return pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')


# Function to fetch and process data for a single intervention
def fetch_and_process_data(intervention):
    """Fetch Phase 1 studies for ``intervention`` and collect the dosage rows.

    Pages through the ClinicalTrials.gov v2 ``studies`` endpoint, flattens
    every study into per-arm intervention rows, keeps only the rows that
    carry the earliest start date, and of those only the rows whose arm or
    intervention description mentions 'mg' or 'mcg'.

    Args:
        intervention: Intervention name used as the ``query.intr`` term.

    Returns:
        None. Matching rows are appended to the module-level
        ``combined_data_list`` as dicts.

    A failed request (network error, non-200 status, or a body that is not
    valid JSON) is reported with ``print`` and ends the paging loop; rows
    gathered from earlier pages are still processed.
    """
    # Initial URL for the first API call
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.intr": intervention,
        "filter.advanced": "AREA[Phase]PHASE1"
    }

    # Rows gathered for this intervention across all pages
    data_list = []

    # Loop until there is no nextPageToken
    while True:
        try:
            # Without a timeout a stalled connection would block the whole run
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("Failed to fetch data for intervention:", intervention, "Error:", exc)
            break

        if response.status_code != 200:
            print("Failed to fetch data for intervention:", intervention, "Status code:", response.status_code)
            break

        try:
            data = response.json()
        except ValueError as exc:
            print("Failed to fetch data for intervention:", intervention, "Error:", exc)
            break

        for study in data.get('studies', []):
            data_list.extend(_extract_study_rows(study, intervention))

        # Follow the pagination token, or stop when the last page was read
        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            break
        params['pageToken'] = next_page_token

    if not data_list:
        return

    df = pd.DataFrame(data_list)

    # Convert 'startDate' to datetime format, handling different date formats
    df['startDate'] = df['startDate'].apply(_parse_start_date)

    # Keep only the rows with the earliest date
    df_sorted = df.sort_values(by='startDate')
    earliest_date = df_sorted['startDate'].min()
    earliest_rows = df_sorted[df_sorted['startDate'] == earliest_date]

    # Filter rows with dosage information
    dosage_pattern = 'mg|mcg'
    has_dosage = (
        earliest_rows['arm_description'].str.contains(dosage_pattern, case=False, na=False)
        | earliest_rows['intervention_description'].str.contains(dosage_pattern, case=False, na=False)
    )
    data_with_dosage = earliest_rows[has_dosage]

    # Add the data to the combined list
    combined_data_list.extend(data_with_dosage.to_dict('records'))

# Loop through each intervention and fetch/process data
for intervention in interventions_list:
    fetch_and_process_data(intervention)

# Create a DataFrame from the combined data list
combined_df = pd.DataFrame(combined_data_list)

# Save the combined DataFrame to a CSV file
output_file_path = "output/Phase1/phase1_result.csv"
# output_file_path = "output/Phase1/phase1_result_carcinoma.csv"

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

combined_df.to_csv(output_file_path, index=False)

print(f"Saved to {output_file_path}")
print(len(combined_df))

Saved to output/Phase1/phase1_result.csv
140


In [2]:
# Report how many distinct intervention search terms ended up in the results
distinct_interventions = combined_df['intervention'].unique()
distinct_count = len(distinct_interventions)
print(distinct_count)

20


In [9]:
######### The original keywords to remove, with EGFR etc. and all the marketed drugs
# keywords_to_remove = ['EGFR', 'VEGF', 'HER2', 'gene', 'virus', 'cell', 'Tecartus', 'Kadcyla', 'Proleukin', 'Rybrevant', 'Tecentriq', 
#                       'Bavencio', 'Yescarta', 'Blenrep', 'Avastin', 'Mvasi', 'Zirabev', 'Blincyto', 'Adcetris', 'Libtayo', 'Erbitux', 
#                       'Carvykti', 'Darzalex', 'Darzalex Faspro', 'Jemperli', 'Imfinzi', 'Empliciti', 'Elrexfio', 'Padcev', 'Epkinly', 
#                       'Enhertu', 'Mylotarg', 'Columvi', 'Zevalin', 'Abecma', 'Besponsa', 'Yervoy', 'Sarclisa', 'Breyanzi', 'Zynlonta', 
#                       'Margenza', 'Elahere', 'Poteligeo', 'Lunsumio', 'Adstiladrin', 'Danyelza', 'Portrazza', 'Opdivo', 'Opdualag', 
#                       'Gazyva', 'Gazyvaro', 'Arzerra', 'Vectibix', 'Sylatron', 'Keytruda', 'Perjeta', 'Phesgo', 'Polivy', 'Cyramza', 
#                       'Zynyz', 'Blitzima', 'Riximyo', 'Rituxan', 'MabThera', 'Rixathon', 'Truxima', 'Trodelvy', 'Sylvant', 'Provenge', 
#                       'Minjuvi', 'Monjuvi', 'Elzonris', 'Imlygic', 'Talvey', 'Kimmtrak', 'Tecvayli', 'Kymriah', 'Tivdak', 'Loqtorzi', 
#                       'Herceptin', 'Kanjinti', 'Herzuma', 'Ontruzant', 'Ogivri', 'Trazimera', 'Zercepac', 'Herceptin Hylecta', 'Imjudo', 
#                       'Zaltrap', 'Brexucabtagene autoleucel', 'Ado-trastuzumab emtansine', 'Aldesleukin', 'Amivantamab-vmjw', 'Atezolizumab', 
#                       'Avelumab', 'Axicabtagene ciloleucel', 'Belantamab Mafodotin-blmf', 'Bevacizumab', 'Bevacizumab-awwb', 'Bevacizumab-bvzr', 
#                       'Blinatumomab', 'Brentuximab Vedotin', 'Cemiplimab-rwlc', 'Cetuximab', 'Ciltacabtagene autoleucel', 'Daratumumab', 
#                       'Daratumumab and hyaluronidase-fihj', 'Dostarlimab-gxly', 'Durvalumab', 'Elotuzumab', 'Elranatamab-bcmm', 
#                       'Enfortumab Vedotin-ejfv', 'Epcoritamab-bysp', 'Fam-trastuzumab deruxtecan-nxki', 'Gemtuzumab Ozogamicin', 
#                       'Glofitamab-gxbm', 'Ibritumomab Tiuxetan', 'Idecabtagene Vicleucel', 'Inotuzumab Ozogamicin', 'Ipilimumab', 
#                       'Isatuximab-irfc', 'Lisocabtagene Maraleucel', 'Loncastuximab tesirine-lpy', 'Margetuximab-cmkb', 
#                       'Mirvetuximab soravtansine-gynx', 'Mogamulizumab', 'Mosunetuzumab-axgb', 'Nadofaragene firadenovec-vncg', 
#                       'Naxitamab', 'Necitumumab', 'Nivolumab', 'Nivolumab and relatlimab-rmbw', 'Obinutuzumab', 'Ofatumumab', 'Panitumumab', 
#                       'Peginterferon alfa-2b', 'Pembrolizumab', 'Pertuzumab', 'Pertuzumab, Trastuzumab, hyaluronidase-zzxf combination', 
#                       'Polatuzumab Vedotin-Piiq', 'Ramucirumab', 'Retifanlimab-dlwr', 'Rituximab', 'Rituximab-abbs', 'Sacituzumab Govitecan-hziy', 
#                       'Siltuximab', 'Sipuleucel-T', 'Tafasitamab', 'Tafasitamab-cxix', 'Tagraxofusp-erzs', 'Talimogene laherparepvec', 
#                       'Talquetamab-tgvs', 'Tebentafusp-tebn', 'Teclistamab-cpyv', 'Tisagenlecleucel', 'Tisotumab Vedotin-tftv', 'Toripalimab-tpzi', 
#                       'Trastuzumab', 'Trastuzumab and Hyaluronidase', 'Tremelimumab-actl', 'ziv-Aflibercept']

######### The modified (split-up) keywords of marketed drugs
# # The marketed drug list is modified compared to the previous one: every single word (taken from the previously composite names) is included
# marketed_drugs_list = ["Proleukin", "Aldesleukin", "Tecentriq", "Atezolizumab", "Bavencio", "Avelumab", "Blincyto", "Blinatumomab", 
#                        "Libtayo", "Cemiplimab-rwlc", "Cemiplimab", "Jemperli", "Dostarlimab-gxly", "Dostarlimab", "Imfinzi", "Durvalumab", 
#                        "Empliciti", "Elotuzumab", "Elrexfio", "Elranatamab-bcmm", "Elranatamab", "Epkinly", "Epcoritamab-bysp", "Epcoritamab", 
#                        "Columvi", "Glofitamab-gxbm", "Glofitamab", "Yervoy", "Ipilimumab", "Lunsumio", "Mosunetuzumab-axgb", "Mosunetuzumab", 
#                        "Opdivo", "Nivolumab", "Opdualag", "Nivolumab and relatlimab-rmbw", "Nivolumab", "relatlimab-rmbw", "relatlimab", 
#                        "Vectibix", "Panitumumab", "Keytruda", "Pembrolizumab", "Zynyz", "Retifanlimab-dlwr", "Retifanlimab", "Talvey", 
#                        "Talquetamab-tgvs", "Talquetamab", "Kimmtrak", "Tebentafusp-tebn", "Tebentafusp",  "Tecvayli", "Teclistamab-cpyv", 
#                        "Teclistamab", "Loqtorzi", "Toripalimab-tpzi", "Toripalimab", "Imjudo", "Tremelimumab-actl", "Tremelimumab"]