In [None]:
import json
import re

import numpy as np
import pandas as pd
import requests

# [Clinical Trials](https://clinicaltrials.gov/) Data ([API](https://clinicaltrials.gov/api/gui/home))

In [None]:
def _flatten_field(values):
    """
    Collapse one field's list of values into a single DataFrame cell.

    Args:
        values (list): The list of strings the API returned for one field of one study.

    Returns:
        NaN for an empty list, the lone element for a one-item list, and the
        items joined with ", " otherwise.
    """
    if len(values) == 0:
        return np.nan
    if len(values) == 1:
        return values[0]
    return ", ".join(values)


def get_ctgov_data(search_expr, cols, max_rank=100, min_rank=1):
    """
    Retrieve study fields from the clinicaltrials.gov API and return them as a pandas DataFrame.

    Every study becomes one row. 'Rank' is copied as-is; every other field is a
    list in the response and is flattened to a single cell (NaN when empty, the
    value itself when there is one, a ", "-joined string when there are several).
    All studies, including the first one, are flattened the same way.

    Args:
        search_expr (str): The search expression to be used in retrieving data from the API.
            It is inserted into the URL unchanged, so pass it already URL-encoded
            (for example "heart+attack").
        cols (list): A list of fields to retrieve from the API.
        max_rank (int): The maximum rank of the data to retrieve. Default is 100.
        min_rank (int): The minimum rank of the data to retrieve. Default is 1.

    Returns:
        pandas.DataFrame: A DataFrame containing the retrieved data. When the response
        holds no studies, an empty DataFrame with the columns 'Rank' + cols is returned.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # NOTE(review): this targets the classic `api/query/study_fields` endpoint —
    # confirm it is still served before relying on it.
    url = (f'https://clinicaltrials.gov/api/query/study_fields?expr='
           f'{search_expr}&fields={",".join(cols)}&min_rnk={min_rank}&max_rnk={max_rank}&fmt=JSON')

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = json.loads(response.content)['StudyFieldsResponse']

    studies = data.get('StudyFields') or []
    if not studies:
        # Nothing matched: return an empty frame instead of failing on studies[0].
        return pd.DataFrame(columns=['Rank', *cols])

    # Column order follows the key order of the first study in the response.
    columns = {key: [] for key in studies[0]}
    for study in studies:
        for key, cells in columns.items():
            value = study[key]
            cells.append(value if key == 'Rank' else _flatten_field(value))
    return pd.DataFrame(columns)


def get_inclusion_exclusion(text):
    """
    Split the eligibility criteria text of a clinical trial into its inclusion and exclusion parts.

    Each part starts right after its header ('Inclusion Criteria:' or
    'Exclusion Criteria:') and runs up to the other header when that header
    comes later in the text, otherwise to the end of the text. The headers may
    therefore appear in either order, and either one may be missing. Leading and
    trailing newlines are stripped and inner newlines are turned into '. '.

    Args:
        text (str): The eligibility criteria text of a clinical trial. Non-string
            values (such as NaN for a missing field) are accepted.

    Returns:
        pandas.Series: A two-element Series [inclusion, exclusion]. An element is NaN
        when its header is absent, its section is empty, or `text` is not a string.

    """
    if not isinstance(text, str):
        return pd.Series([np.nan, np.nan])

    inc_header = 'Inclusion Criteria:'
    exc_header = 'Exclusion Criteria:'
    inc_idx = text.find(inc_header)  # -1 when the header is absent
    exc_idx = text.find(exc_header)

    def _section(header_idx, header, other_idx):
        # Text belonging to `header`, or NaN when the header is absent or its section is empty.
        if header_idx == -1:
            return np.nan
        start = header_idx + len(header)
        # The section stops at the other header only if that header comes later.
        end = other_idx if other_idx > header_idx else len(text)
        cleaned = text[start:end].strip("\n").replace('\n', '. ').replace(". .", ".")
        return cleaned or np.nan

    return pd.Series([
        _section(inc_idx, inc_header, exc_idx),
        _section(exc_idx, exc_header, inc_idx),
    ])


# How many of each unit make up one year; used to express every age in years.
_AGE_UNITS_PER_YEAR = {
    'year': 1.0,
    'month': 12.0,
    'week': 365.25 / 7,
    'day': 365.25,
    'hour': 365.25 * 24,
    'minute': 365.25 * 24 * 60,
}


def _age_to_years(age):
    """
    Convert an age string such as "18 Years" or "6 Months" to a number of years.

    Args:
        age: The age value to convert. Anything that is not a string yields NaN.

    Returns:
        float: The first number found in `age`, scaled by the unit that follows it.
        A missing or unrecognised unit is treated as years. NaN when `age`
        contains no digits.
    """
    if not isinstance(age, str):
        return np.nan
    match = re.search(r'(\d+)\s*([A-Za-z]*)', age)
    if match is None:
        return np.nan
    unit = match.group(2).lower().rstrip('s')  # "Years" -> "year"
    return float(match.group(1)) / _AGE_UNITS_PER_YEAR.get(unit, 1.0)


def preprocess_ctgov_data(df):
    """
    This function preprocesses the clinical trial data by filtering out trials that are not recruiting,
    extracting the minimum and maximum age criteria, and extracting the inclusion and
    exclusion criteria from the eligibility criteria text.

    Ages are returned as floats expressed in years, so "6 Months" becomes 0.5
    rather than 6.0. Age columns that are entirely missing, and inputs with no
    recruiting trials, are handled without raising.

    Args:
        df (pandas.DataFrame): The DataFrame containing the clinical trial data to preprocess.
            It must provide the columns 'OverallStatus', 'MinimumAge', 'MaximumAge',
            'EligibilityCriteria' and 'Rank'. The input frame is not modified.

    Returns:
        pandas.DataFrame: The preprocessed DataFrame: only rows whose 'OverallStatus' is
        'Recruiting', with 'InclusionCriteria' and 'ExclusionCriteria' added and
        'EligibilityCriteria', 'OverallStatus' and 'Rank' dropped.

    """
    df = df[df['OverallStatus'] == 'Recruiting'].copy()

    # Element-wise parsing also works when a column is all-NaN (float dtype),
    # where the `.str` accessor would raise.
    for age_col in ('MinimumAge', 'MaximumAge'):
        df[age_col] = df[age_col].map(_age_to_years).astype('float')

    # Collect the (inclusion, exclusion) pairs first so that an empty frame
    # still gets both columns instead of failing on a two-column assignment.
    pairs = [tuple(get_inclusion_exclusion(text)) for text in df['EligibilityCriteria']]
    df['InclusionCriteria'] = [inclusion for inclusion, _ in pairs]
    df['ExclusionCriteria'] = [exclusion for _, exclusion in pairs]

    return df.drop(columns=['EligibilityCriteria', 'OverallStatus', 'Rank'])

In [None]:
# Fetching data from the ClinicalTrials website
# The raw API result keeps its own name (`df_ctgov_raw`) so the preprocessed
# frame `df_ctgov` never overwrites it and each stage can be inspected separately.
extract_cols = [
    "NCTId", "EligibilityCriteria", "OverallStatus", "MinimumAge",
    "MaximumAge", "Gender", "HealthyVolunteers", "Condition",
]
df_ctgov_raw = get_ctgov_data("heart+attack", extract_cols, min_rank=1, max_rank=100)
df_ctgov = preprocess_ctgov_data(df_ctgov_raw)

In [None]:
# Preview the first rows of the preprocessed trials table
df_ctgov.head()

Unnamed: 0,NCTId,MinimumAge,MaximumAge,Gender,HealthyVolunteers,Condition,InclusionCriteria,ExclusionCriteria
4,NCT03412435,19.0,,All,No,"Myocardial Infarction, Acute, Coronary Stenosis",All consecutive acute myocardial infarction pa...,
12,NCT03022552,21.0,99.0,Female,Accepts Healthy Volunteers,Myocardial Infarction,Acute ischemic symptoms compatible with diagno...,"Recent use of vasospastic agents, such as coca..."
17,NCT04050163,20.0,80.0,All,No,Myocardial Infarction,Patients aged 20~80. Acute Myocardial Infarcti...,Age <20 or >80. Pregnant or breast feeding. Po...
23,NCT03968445,21.0,,All,No,Myocardial Infarction,"Participation in UAB IRB protocol ""Neuroinflam...",Contraindication to MRI. Pregnancy. Lactation....
34,NCT03600259,,,All,No,"Myocardial Infarction, Acute",Hospitalized patients with acute myocardial in...,Those who did not meet the diagnosis of acute ...


In [None]:
# Checking for NULL values
# Print the (rows, columns) shape, then display the number of missing values per column.
print(df_ctgov.shape)
df_ctgov.isnull().sum()

(16, 8)


NCTId                0
MinimumAge           1
MaximumAge           9
Gender               0
HealthyVolunteers    1
Condition            0
InclusionCriteria    1
ExclusionCriteria    2
dtype: int64

In [None]:
# Saving data to disk
# NOTE(review): hardcoded absolute path — presumably a Google Drive folder mounted
# in Colab; confirm the directory exists in your environment before running this cell.
df_ctgov.to_parquet("/content/drive/MyDrive/Courses/2. Spring 23/3. BMIN521/Project/data/active_studies.parquet")