# CLINICAL TRIAL ANALYTICAL DASHBOARD

## Before to start


Befor to start, ensure the kernel used to run this Notebook have installed the proper packages. I usually work with uv package manager and create specific Kernels, I proceed with following code to test package availability & installation if required:

In [1]:
!uv pip freeze | grep mysql-connector-python || uv add mysql-connector-python

[1mmysql-connector-python[0m==9.5.0


In [2]:
!uv pip freeze | grep tqdm || uv add tqdm

[1mtqdm[0m==4.67.2


In [3]:
!uv pip freeze | grep schedule || uv add schedule

[1mschedule[0m==1.2.2


In [4]:
!uv pip freeze | grep plotly || uv add plotly

[1mplotly[0m==6.5.2


## 1. IMPORTS


In [90]:
import pandas as pd
import mysql.connector
from mysql.connector import Error
import os
from dotenv import load_dotenv
import numpy as np
from pathlib import Path
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from loguru import logger
# Siempre usa display() en lugar de print()
from IPython.display import display


## 2. FUNCTIONS


### 2.1 Data adquisiton

In [6]:

# Database connection with retry for the seed check
def get_db_connection_with_retry(retries=3, delay=1):
    for attempt in range(retries):
        try:
            connection = mysql.connector.connect(
                host=os.getenv('DB_HOST', 'mysql'),
                port=int(os.getenv('DB_PORT', 3306)),
                user=os.getenv('DB_USER', 'user'),
                password=os.getenv('DB_PASSWORD', 'pass'),
                database=os.getenv('DB_NAME', 'clinicaltrials')
            )
            return connection
        except Error as e:
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                logger.error(f"Database connection failed after {retries} attempts: {e}")
                return None
    return None


In [7]:
def table_summary(df_dict, table_name):
    df = df_dict[table_name]
    summary = pd.DataFrame({
        'Column': df.columns,
        'Total_Values': df.count(),
        'Different_Values': df.nunique(),
        'Missing_Count': df.isnull().sum(),
        'Missing_%': (df.isnull().sum() / len(df) * 100).round(1),
        'Data_Type': df.dtypes
    })
    summary['Unique_%'] = (summary['Different_Values'] / summary['Total_Values'] * 100).round(1)
    return summary.sort_values('Missing_%', ascending=False).reset_index(drop=True)


In [8]:
def count_empty_strings(df):
    """
    Counts empty strings ("") in ALL columns of the DataFrame.
    
    Args:
        df (pd.DataFrame): DataFrame to analyze
    
    Returns:
        pd.DataFrame: Report with counts and % of "" per column
    """
    import pandas as pd
    
    # Detect "" in all columns
    empty_counts = {}
    
    for col in df.columns:
        # Count exactly "" (not NaN, not spaces)
        empty_mask = (df[col].astype(str) == "")  # Convert to str for consistency
        count = empty_mask.sum()
        total = len(df)
        pct = (count / total * 100).round(2)
        
        empty_counts[col] = {
            'Empty_Strings': count,
            'Total_Rows': total,
            'Empty_%': pct
        }
    
    # Report sorted by most "" first
    report = pd.DataFrame(empty_counts).T
    report = report.sort_values('Empty_%', ascending=False)
    
    print(f"üîç EMPTY STRINGS ANALYSIS ('') - Shape: {df.shape}")
    print("=" * 60)
    
    if report['Empty_Strings'].sum() == 0:
        print("‚úÖ NO empty strings ('') found in any column")
        print()
    else:
        display(report)
        print(f"\nüìä TOTAL empty strings: {report['Empty_Strings'].sum()}")
        print()
    
    return report



### 2.2 filling functions

In [9]:
# NOT USED IN THIS NOTEBOOK! ONLY FOR REFERENCE

def get_mesh_term(condition_name: str) -> Optional[str]:
    """Fetch MeSH descriptor ID for a condition name."""
    if not condition_name:
        return None

    url = "https://id.nlm.nih.gov/mesh/lookup/descriptor"
    params = {
        "label": condition_name,
        "match": "contains",
    }

    try:
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()
        results = resp.json() or []
        if not results:
            return None
        resource = results[0].get("resource")
        if not resource:
            return None
        return resource.rsplit("/", 1)[-1]
    except Exception as e:
        logger.error(f"error:{e}")
        return None


### 2.3 Grafication

In [10]:
BLUE_GREEN_PALETTE = [
    "#08306B",  # azul marino
    "#08519C",  # azul profundo
    "#2171B5",  # azul medio
    "#2C7FB8",  # azul verdoso
    "#1D91C0",  # cian oscuro
    "#41B6C4",  # turquesa
    "#2CA25F",  # verde azulado
    "#006D2C",  # verde oscuro
    "#00441B"   # verde muy oscuro
]

# 1. BAR CHART
def plot_bar(df, column_name, top_n=15, title=None):
    """Vertical bar chart"""
    vc = df[column_name].value_counts().head(top_n).reset_index()
    vc.columns = [column_name, 'Count']
    
    fig = px.bar(vc, x=column_name, y='Count',
                title=title or f'{column_name} Distribution',
                text='Count')
    fig.update_traces(textposition='outside')
    fig.update_xaxes(tickangle=45)
    fig.show()

# 2. HORIZONTAL BAR
def plot_barh(df, column_name, top_n=15, title=None):
    """Horizontal bar chart"""
    vc = df[column_name].value_counts().head(top_n).reset_index()
    vc.columns = [column_name, 'Count']
    
    fig = px.bar(vc, y=column_name, x='Count', orientation='h',
                title=title or f'Horizontal {column_name} Distribution',
                text='Count')
    fig.update_traces(textposition='outside')
    fig.show()

# 3. PIE CHART
def plot_pie(df, column_name, top_n=10, title=None):
    """Pie chart for top N categories"""
    vc = df[column_name].value_counts().head(top_n)
    
    fig = px.pie(values=vc.values, names=vc.index,
                title=title or f'{column_name} Proportions (Top {top_n})',
                hole=0.3,  # Donut style
                color_discrete_sequence=BLUE_GREEN_PALETTE)
    fig.show()

# 4. HISTOGRAM OF FREQUENCIES
def plot_hist_freq(df, column_name, title=None):
    """Histogram of value frequencies (frequency distribution)"""
    vc = df[column_name].value_counts()
    freq_vc = vc.value_counts().sort_index()
    
    fig = px.bar(x=freq_vc.index, y=freq_vc.values,
                title=title or f'{column_name} Frequency Histogram',
                labels={'x': 'Count', 'y': 'Number of Values'})
    fig.show()

# 5. BOX PLOT
def plot_box(df, column_name, title=None):
    """Box plot of counts per category"""
    vc = df[column_name].value_counts().reset_index()
    vc.columns = [column_name, 'Count']
    
    fig = px.box(vc, y='Count', x=column_name,
                title=title or f'{column_name} Counts Box Plot')
    fig.show()

# 6. VIOLIN PLOT
def plot_violin(df, column_name, title=None):
    """Violin plot - density distribution"""
    if pd.api.types.is_numeric_dtype(df[column_name]):
        fig = px.violin(df, y=column_name, 
                       title=title or f'{column_name} Violin Plot')
    else:
        vc = df[column_name].value_counts().reset_index()
        vc.columns = [column_name, 'Count']
        fig = px.violin(vc, y='Count', x=column_name,
                       title=title or f'{column_name} Counts Violin Plot')
    fig.show()

# 7. TREEMAP
def plot_treemap(df, column_name, top_n=20, title=None):
    """Interactive treemap visualization"""
    vc = df[column_name].value_counts().head(top_n).reset_index()
    vc.columns = [column_name, 'Count']
    
    fig = px.treemap(vc, path=[column_name], values='Count',
                    title=title or f'{column_name} Treemap')
    fig.show()

# 8. SCATTER PLOT
def plot_scatter(df, column_name, title=None):
    """Scatter plot of counts vs position"""
    vc = df[column_name].value_counts().head(20).reset_index()
    vc.columns = [column_name, 'Count']
    
    fig = px.scatter(vc, x=range(len(vc)), y='Count', size='Count',
                    hover_name=column_name,
                    title=title or f'{column_name} Counts Scatter Plot')
    fig.show()

# 9. HEATMAP
def plot_heatmap(df, column_name, top_n=15, title=None):
    """Heatmap of value frequencies"""
    vc = df[column_name].value_counts().head(top_n)
    
    fig = px.imshow([vc.values], 
                   labels=dict(x=column_name, y="Count", color="Frequency"),
                   title=title or f'{column_name} Heatmap',
                   aspect="auto")
    fig.show()



In [61]:

def plot_category_boxplot(df, cat_col, num_col, title=None):
    """
    Generates interactive Plotly boxplot for categorical predictor vs numerical target.
    
    Parameters:
    - df: pandas DataFrame
    - cat_col: str, categorical column name (x-axis)
    - num_col: str, numerical target column (y-axis)
    - title: str, optional plot title
    
    Returns: Plotly Figure object (fig.show() to display)
    """
    if title is None:
        title = f"{num_col} Distribution by {cat_col}"
    
    fig = px.box(df, x=cat_col, y=num_col,
                 title=title,
                 color=cat_col,  # Color boxes by category
                 points="outliers",  # Show outlier points
                 notched=True)  # Notches for median confidence
    
    fig.update_layout(
        xaxis_title=cat_col,
        yaxis_title=num_col,
        showlegend=False,
        height=500,
        template="plotly_white"
    )
    
    fig.update_traces(quartilemethod="exclusive", boxmean=True)
    
    return fig

# Usage example:
# fig = plot_category_boxplot(merged_df, 'status', 'enrollment_count')
# fig.show()


### 2.4 Data modification and augmentation

In [94]:

def add_duration_columns(studies_df):
    """
    Adds duration_days and duration_years for finished trials only.
    Fixed: Handles pd.NA properly.
    """
    df = studies_df.copy()

    # Convert to datetime (safe)
    for col in ['start_date', 'completion_date', 'primary_completion_date']:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    # End date: completion_date or primary_completion_date
    end_date = df['completion_date'].fillna(df['primary_completion_date'])

    # Finished trials mask
    finished_status = ['COMPLETED', 'TERMINATED', 'WITHDRAWN']
    finished_mask = (
        df['status'].isin(finished_status) &
        df['start_date'].notna() &
        end_date.notna()
    )

    # Duration calculation
    df['duration_days'] = np.nan
    if finished_mask.any():
        duration_series = (end_date[finished_mask] - df.loc[finished_mask, 'start_date']).dt.days
        df.loc[finished_mask, 'duration_days'] = duration_series

    # Convert to numeric safely (handles NaN)
    df['duration_years'] = pd.to_numeric(df['duration_days'], errors='coerce') / 365.25
    
    # Round for readability
    df['duration_years'] = df['duration_years'].round(2)
    
    return df


In [39]:
therapeutic_areas_keywords = {
    "Oncology": ["cancer", "tumor", "carcinoma", "leukemia", "melanoma", "breast cancer", "AML", "glioblastoma", "glioma"],
    "Cardiology": ["heart", "cardiac", "myocardial", "infarction", "heart failure", "LVAD", "coronary", "arrhythmia"],
    "Neurology": ["brain", "stroke", "Alzheimer", "Parkinson", "epilepsy", "multiple sclerosis", "neuropathy", "dementia"],
    "Infectious": ["infection", "virus", "bacterial", "COVID", "HIV", "influenza", "antibiotic", "vaccine"],
    "Endocrine": ["diabetes", "thyroid", "insulin", "hormone", "metabolic", "obesity"],
    "Respiratory": ["asthma", "COPD", "lung", "pulmonary", "pneumonia", "respiration"],
    "Gastroenterology": ["liver", "hepatitis", "IBD", "Crohn", "ulcerative colitis", "colon", "gastrointestinal"],
    "Dermatology": ["skin", "psoriasis", "eczema", "dermatitis", "melanoma"],
    "Immunology": ["immune", "autoimmune", "rheumatoid", "arthritis", "allergy"],
    "Hematology": ["blood", "anemia", "hemoglobin", "lymphoma", "clotting", "thrombosis"]
}

def classify_therapeutic_area(text):
    """
    Classifies text into the most probable therapeutic area based on keyword matches.
    Returns 'area (score: X)' or 'No match'.
    """
    text_lower = text.lower()
    scores = {}
    for area, keywords in therapeutic_areas_keywords.items():
        count = sum(1 for kw in keywords if kw.lower() in text_lower)
        scores[area] = count
    if max(scores.values()) == 0:
        return "Unknown (score: 0)"
    max_area = max(scores, key=scores.get)
    max_score = scores[max_area]
    return f"{max_area} (score: {max_score})"


In [12]:
### 2.1 Grafication

## 3. DDBB CONECTION & DATAFRAMES GENERATION


In case you can not access to ddbb due to run it through a jupyter notebook. You can proceed with the next steps to grant acces to ddbb:
1. Get inside the mysql docker to modify privileges (use the root's passord)
```bash
docker exec -it mysql-clinical-db  mysql -u root -p
```
2.  Inside MySQL (modify the tu_password to the root's passord):
```sql
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'tu_password';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'172.%' IDENTIFIED BY 'tu_password';  -- Cubre Docker IPs
FLUSH PRIVILEGES;
SELECT user, host FROM mysql.user WHERE user='root';  -- Verifica '%'
EXIT;
```
It should print on screen:
```
+------+-----------+
| user | host      |
+------+-----------+
| root | %         |
| root | localhost |
+------+-----------+
```


3.  Restart dockers:
```bash
docker compose down && docker compose up -d
```

4.  Connect to ddbb using root as user:

        os.environ['DB_HOST'] = '127.0.0.1' # e.g., 'your_remote_db_host.com' or '127.0.0.1' if running locally

        os.environ['DB_PORT'] = '3306' # Your specific MySQL port

        os.environ['DB_USER'] = 'root' # Your MySQL username

        os.environ['DB_PASSWORD'] = 'rootpass' # Your MySQL password

        os.environ['DB_NAME'] = 'clinicaltrials' # Your database name





In [13]:
# --- IMPORTANT: Replace these with your actual MySQL database credentials ---
# Ensure your MySQL database is accessible from this Colab environment.


# Uncomment the following two lines if you have a .env file with your credentials
# from dotenv import load_dotenv
# load_dotenv()

# Alternatively, set them directly as environment variables:
# Example placeholders. You MUST change these to your actual database details.
os.environ['DB_HOST'] = '127.0.0.1' # e.g., 'your_remote_db_host.com' or '127.0.0.1' if running locally
os.environ['DB_PORT'] = '3306' # Your specific MySQL port, often 3306
os.environ['DB_USER'] = 'root' # Your MySQL username
os.environ['DB_PASSWORD'] = 'rootpass' # Your MySQL password
os.environ['DB_NAME'] = 'clinicaltrials' # Your database name

print("Placeholder MySQL database credentials have been set. Please update them with your actual details.")
print("After updating, re-run the connection cell below (J1qAT_SARyIP).")

Placeholder MySQL database credentials have been set. Please update them with your actual details.
After updating, re-run the connection cell below (J1qAT_SARyIP).


In [14]:
tables_ddbb= ['studies', 'conditions', 'interventions', 'outcomes', 'sponsors', 'locations', 'study_design']


In [15]:
connection = get_db_connection_with_retry()
dict_df_clinical_trials = {}
if connection:
    try:

        for table in tables_ddbb:
            query = f'SELECT * FROM {table}'
            dict_df_clinical_trials[table] = pd.read_sql_query(query, connection)
            logger.success(f"Data from '{table}' table loaded successfully. Number of records: {len(dict_df_clinical_trials[table])}")
        logger.success("All Data from ddbb loaded successfully.")
    except Error as e:
        logger.error(f"Error reading data from MySQL: {e}")
    finally:
        if connection.is_connected():
            connection.close()
            logger.info("MySQL connection closed.")
else:
    logger.error("Failed to connect to MySQL database.")

  dict_df_clinical_trials[table] = pd.read_sql_query(query, connection)
[32m2026-02-03 18:27:40.877[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [32m[1mData from 'studies' table loaded successfully. Number of records: 10000[0m
[32m2026-02-03 18:27:40.905[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [32m[1mData from 'conditions' table loaded successfully. Number of records: 17750[0m
[32m2026-02-03 18:27:40.938[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [32m[1mData from 'interventions' table loaded successfully. Number of records: 16715[0m
[32m2026-02-03 18:27:41.052[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [32m[1mData from 'outcomes' table loaded successfully. Number of records: 61766[0m
[32m2026-02-03 18:27:41.070[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [32m[1mData from 'sponsors' table lo

## 4. DATA EXPLORATORY ANALYSIS

### 4.0. Tables overviews

In [16]:
for table in tables_ddbb:
    print (f"# # # # # # # {table}   # # # # # # #")
    display(dict_df_clinical_trials[table].head(10))

# # # # # # # studies   # # # # # # #


Unnamed: 0,study_id,nct_id,title,acronym,status,phase,study_type,start_date,completion_date,primary_completion_date,enrollment,enrollment_type,brief_summary,eligibility_criteria,minimum_age,maximum_age,gender,created_at,updated_at
0,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,50.0,ESTIMATED,The investigators will be evaluating the use o...,Inclusion Criteria:\n\n* Cognitively able to c...,18 Years,,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
1,2,NCT03353935,Functional Outcomes After Nerve Sparing Surger...,,COMPLETED,,OBSERVATIONAL,2016-01-01,2017-03-01,2017-03-01,36.0,ACTUAL,Patients who underwent surgery for deep endome...,Inclusion Criteria:\n\n* Deep endometriosis\n*...,18 Years,55 Years,FEMALE,2026-02-02 19:29:25,2026-02-02 19:29:25
2,3,NCT05783635,Alcohol Screening and Preoperative Interventio...,ASPIRE-2,RECRUITING,,INTERVENTIONAL,2023-04-17,2027-02-28,2027-02-28,440.0,ESTIMATED,"This sequential, multiple assignment, randomiz...",Inclusion Criteria:\n\n1. Completed consent fo...,21 Years,75 Years,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
3,4,NCT02341235,Self-monitoring Activity: a Randomized Trial o...,SMARTGOAL,COMPLETED,,INTERVENTIONAL,2015-02-01,2021-06-30,2020-06-30,90.0,ACTUAL,The purpose of this study is to compare an enh...,Inclusion Criteria:\n\n1. Age between 45 and 7...,45 Years,75 Years,FEMALE,2026-02-02 19:29:25,2026-02-02 19:29:25
4,5,NCT04518735,Evolution of COVID-19 in Anticoagulated or Ant...,CORONA,COMPLETED,,OBSERVATIONAL,2020-04-01,2020-06-30,2020-06-30,1707.0,ACTUAL,"CORONA is a retrospective, observational, one ...",Inclusion Criteria:\n\n* Patient admitted for ...,,,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
5,6,NCT01192035,PI or NNRTI as First-line Treatment of HIV in ...,PIONA,COMPLETED,PHASE4,INTERVENTIONAL,2011-05-01,2014-09-01,2014-09-01,400.0,ACTUAL,BACKGROUND: Since 1996 the combination of thre...,Inclusion Criteria:\n\n* Antiretroviral treatm...,18 Years,,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
6,7,NCT06856135,Expanded Access to Vedolizumab for Children an...,,AVAILABLE,,EXPANDED_ACCESS,,,,,,The expanded access program (EAP) allows peopl...,Inclusion Criteria:\n\n1. The participant has ...,2 Years,,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
7,8,NCT05726435,Effects of Soluble Dietary Fiber on Sport Effi...,FiberPlay,COMPLETED,,INTERVENTIONAL,2021-09-01,2021-12-20,2021-10-31,20.0,ACTUAL,Athlete nutrition is becoming an increasingly ...,Inclusion Criteria:\n\n* Professional basketba...,18 Years,22 Years,MALE,2026-02-02 19:29:25,2026-02-02 19:29:25
8,9,NCT01445535,Phase 1 Trial of Siplizumab and Dose-Adjusted ...,,COMPLETED,PHASE1,INTERVENTIONAL,2009-01-13,2020-10-22,2011-04-01,15.0,ACTUAL,Studies conducted at the National Cancer Insti...,* INCLUSION CRITERIA:\n\nCluster of differenti...,18 Years,120 Years,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25
9,10,NCT06455735,68Ga-JH04 PET/CT in Patients With Various Type...,,RECRUITING,EARLY_PHASE1,INTERVENTIONAL,2024-03-01,2026-04-01,2025-12-01,30.0,ESTIMATED,As a novel radiotracer targeting fibroblast ac...,Inclusion Criteria:\n\nVarious solid tumors wi...,18 Years,80 Years,ALL,2026-02-02 19:29:25,2026-02-02 19:29:25


# # # # # # # conditions   # # # # # # #


Unnamed: 0,condition_id,study_id,condition_name,mesh_term
0,1,1,Distal Radius Fracture,
1,2,1,Tendon Rupture,
2,3,2,Endometriosis,
3,4,3,Alcohol Drinking,
4,5,4,Breast Cancer,
5,6,4,Obesity,
6,7,5,Covid19,
7,8,6,HIV-1,
8,9,7,Crohn's Disease,
9,10,7,Ulcerative Colitis,


# # # # # # # interventions   # # # # # # #


Unnamed: 0,intervention_id,study_id,intervention_type,name,description
0,1,1,DEVICE,Versawrap membrane,Versawrap membrane will be placed between dist...
1,2,2,OTHER,Validated questionnaires,Patients were administered pre- and post-opera...
2,3,3,BEHAVIORAL,Enhanced Usual Care (pre-operative),The enhanced usual care will receive standard ...
3,4,3,BEHAVIORAL,Preoperative Virtual Health Coaching,Preoperative Virtual Coaching is based on prin...
4,5,3,BEHAVIORAL,Usual surgical care (post-operative),This group will receive standard post-operativ...
5,6,3,BEHAVIORAL,Postoperative Virtual Health Coaching,Postoperative Virtual Coaching uses the same f...
6,7,3,BEHAVIORAL,On-Track (Post-operative),On-Track is a mobile and web-accessible health...
7,8,4,BEHAVIORAL,Game intervention,The game will target motivation via narrative ...
8,9,4,BEHAVIORAL,Standard intervention,The electronic activity monitor will monitor s...
9,10,5,OTHER,Antithrombotic Therapy (anticoagulant and/or a...,Review of medical records during hospitalizati...


# # # # # # # outcomes   # # # # # # #


Unnamed: 0,outcome_id,study_id,outcome_type,measure,time_frame,description
0,1,1,Primary,Range of Motion: Thumb Interphalangeal and Ind...,6 months,Standardized clinical examination (relative to...
1,2,1,Primary,Range of Motion: Thumb and Index Finger,6 months,Standardized clinical examination (relative to...
2,3,1,Primary,Range of Motion: Wrist,6 months,Standardized clinical examination (relative to...
3,4,1,Primary,Ultrasound Assessment of Flexor Pollicis Longu...,6 Months,Tendons in the volar forearm will be directly ...
4,5,1,Secondary,Complications,6 months,Incidence of any of the following perioperativ...
5,6,1,Secondary,Patient-Rated Wrist Evaluation (PRWE),6 Months,The Patient-Rated Wrist Evaluation (PRWE) meas...
6,7,1,Secondary,"Quick Disabilities of the Arm, Shoulder and Ha...",6 Months,"The Quick Disabilities of the Arm, Shoulder an..."
7,8,1,Secondary,Visual Analog Scale (VAS) pain scores,6 Months,The Visual Analog Scale measures patient repor...
8,9,1,Secondary,Subjective Reporting Specific to Tendon Function,6 Months,The number of participants reporting the follo...
9,10,1,Secondary,Ultrasound Assessment of Flexor Pollicis Longu...,6 Months,Tendons in the volar forearm will be directly ...


# # # # # # # sponsors   # # # # # # #


Unnamed: 0,sponsor_id,study_id,agency,agency_class,lead_or_collaborator
0,1,1,"University of Colorado, Denver",OTHER,lead
1,2,2,Universit√† degli Studi dell'Insubria,OTHER,lead
2,3,3,University of Michigan,OTHER,lead
3,4,3,National Institute on Alcohol Abuse and Alcoho...,NIH,collaborator
4,5,4,"The University of Texas Medical Branch, Galveston",OTHER,lead
5,6,4,"American Cancer Society, Inc.",OTHER,collaborator
6,7,5,Fundaci√≥ Institut de Recerca de l'Hospital de ...,OTHER,lead
7,8,6,University of Aarhus,OTHER,lead
8,9,6,Aarhus University Hospital Skejby,OTHER,collaborator
9,10,6,Bandim Health Project,OTHER,collaborator


# # # # # # # locations   # # # # # # #


Unnamed: 0,location_id,study_id,facility,city,state,country,continent
0,1,1,University of Colorado Health Hospital,Aurora,Colorado,United States,North America
1,2,1,Denver Health Hospital,Denver,Colorado,United States,North America
2,3,2,Department of Obstetrics and Gynecology Univer...,Varese,,Italy,Europe
3,4,3,University of Michigan,Ann Arbor,Michigan,United States,North America
4,5,4,The University of Texas Medical Branch,Galveston,Texas,United States,North America
5,6,5,Hospital de la Santa Creu i Sant Pau,Barcelona,Catalonia,Spain,Europe
6,7,6,Centro de Tratamento Ambulatoria do Hospital N...,Bissau,,Guinea-Bissau,Africa
7,8,7,University of California San Francisco,San Francisco,California,United States,North America
8,9,7,Children's Center for Digestive Healthcare,Atlanta,Georgia,United States,North America
9,10,7,Seattle Children's Hospital,Seattle,Washington,United States,North America


# # # # # # # study_design   # # # # # # #


Unnamed: 0,design_id,study_id,allocation,intervention_model,masking,primary_purpose,observational_model
0,1,1,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,
1,2,2,,,,,CASE_ONLY
2,3,3,RANDOMIZED,SEQUENTIAL,SINGLE,TREATMENT,
3,4,4,RANDOMIZED,PARALLEL,SINGLE,SUPPORTIVE_CARE,
4,5,5,,,,,CASE_CONTROL
5,6,6,RANDOMIZED,PARALLEL,NONE,TREATMENT,
6,7,8,RANDOMIZED,PARALLEL,DOUBLE,OTHER,
7,8,9,,SEQUENTIAL,NONE,TREATMENT,
8,9,10,RANDOMIZED,PARALLEL,NONE,DIAGNOSTIC,
9,10,11,RANDOMIZED,PARALLEL,SINGLE,PREVENTION,


### 4.1. Statistical Analysis

In [27]:
for table in tables_ddbb:
    print (f"# # # # # # # {table}   # # # # # # #")
    display(dict_df_clinical_trials[table].describe())
            

# # # # # # # studies   # # # # # # #


Unnamed: 0,study_id,enrollment,created_at,updated_at
count,10000.0,9872.0,10000,10000
mean,5000.5,2066.886,2026-02-02 19:29:38.556600,2026-02-02 19:29:38.556600
min,1.0,0.0,2026-02-02 19:29:25,2026-02-02 19:29:25
25%,2500.75,30.0,2026-02-02 19:29:32,2026-02-02 19:29:32
50%,5000.5,68.0,2026-02-02 19:29:38,2026-02-02 19:29:38
75%,7500.25,190.0,2026-02-02 19:29:46,2026-02-02 19:29:46
max,10000.0,4238504.0,2026-02-02 19:29:52,2026-02-02 19:29:52
std,2886.89568,59251.77,,


# # # # # # # conditions   # # # # # # #


Unnamed: 0,condition_id,study_id
count,17750.0,17750.0
mean,8875.5,4962.439211
std,5124.127975,2875.612055
min,1.0,1.0
25%,4438.25,2485.0
50%,8875.5,4972.0
75%,13312.75,7408.0
max,17750.0,10000.0


# # # # # # # interventions   # # # # # # #


Unnamed: 0,intervention_id,study_id
count,16715.0,16715.0
mean,8358.0,5018.800239
std,4825.34921,2900.037951
min,1.0,1.0
25%,4179.5,2496.5
50%,8358.0,5032.0
75%,12536.5,7532.0
max,16715.0,10000.0


# # # # # # # outcomes   # # # # # # #


Unnamed: 0,outcome_id,study_id
count,61766.0,61766.0
mean,30883.5,5098.013405
std,17830.4527,2837.621139
min,1.0,1.0
25%,15442.25,2683.0
50%,30883.5,5281.5
75%,46324.75,7454.75
max,61766.0,10000.0


# # # # # # # sponsors   # # # # # # #


Unnamed: 0,sponsor_id,study_id
count,15861.0,15861.0
mean,7931.0,4972.140975
std,4578.820645,2904.195303
min,1.0,1.0
25%,3966.0,2461.0
50%,7931.0,5000.0
75%,11896.0,7460.0
max,15861.0,10000.0


# # # # # # # locations   # # # # # # #


Unnamed: 0,location_id,study_id
count,56916.0,56916.0
mean,28458.5,5297.656406
std,16430.378298,2936.649709
min,1.0,1.0
25%,14229.75,2851.0
50%,28458.5,5353.0
75%,42687.25,7848.0
max,56916.0,10000.0


# # # # # # # study_design   # # # # # # #


Unnamed: 0,design_id,study_id
count,9851.0,9851.0
mean,4926.0,5008.067303
std,2843.883085,2888.100171
min,1.0,1.0
25%,2463.5,2507.5
50%,4926.0,5016.0
75%,7388.5,7509.5
max,9851.0,10000.0


The only true numerical column is number of enrollments. Such variable has a great distribution with a range of 

In [40]:
for table in tables_ddbb:
    print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
    print (f"Missing values:")
    df=pd.DataFrame(dict_df_clinical_trials[table].isna().sum(), columns=['Missing Values'])
    df['% missing'] = ((df['Missing Values'] / len(dict_df_clinical_trials[table])) * 100).round(2)

    df['type'] = dict_df_clinical_trials[table].dtypes
    display(df)
       

# # # # # # studies (10000 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
study_id,0,0.0,int64
nct_id,0,0.0,str
title,0,0.0,str
acronym,7188,71.88,str
status,0,0.0,str
phase,2455,24.55,str
study_type,0,0.0,str
start_date,103,1.03,object
completion_date,285,2.85,object
primary_completion_date,382,3.82,object


# # # # # # conditions (17750 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
condition_id,0,0.0,int64
study_id,0,0.0,int64
condition_name,0,0.0,str
mesh_term,17750,100.0,object


# # # # # # interventions (16715 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
intervention_id,0,0.0,int64
study_id,0,0.0,int64
intervention_type,0,0.0,str
name,5,0.03,str
description,1542,9.23,str


# # # # # # outcomes (61766 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
outcome_id,0,0.0,int64
study_id,0,0.0,int64
outcome_type,0,0.0,str
measure,0,0.0,str
time_frame,874,1.42,str
description,11249,18.21,str


# # # # # # sponsors (15861 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
sponsor_id,0,0.0,int64
study_id,0,0.0,int64
agency,0,0.0,str
agency_class,23,0.15,str
lead_or_collaborator,0,0.0,str


# # # # # # locations (56916 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
location_id,0,0.0,int64
study_id,0,0.0,int64
facility,0,0.0,str
city,0,0.0,str
state,0,0.0,str
country,0,0.0,str
continent,20,0.04,str


# # # # # # study_design (9851 records in total) # # # # # # 
Missing values:


Unnamed: 0,Missing Values,% missing,type
design_id,0,0.0,int64
study_id,0,0.0,int64
allocation,2397,24.33,str
intervention_model,2405,24.41,str
masking,2396,24.32,str
primary_purpose,2431,24.68,str
observational_model,7540,76.54,str


### 4.2. Missing data & Uniqueness analysis

In [51]:
for table in tables_ddbb:
    print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
    df_summary =table_summary(df_dict=dict_df_clinical_trials, table_name=table)
    # print (f"Distribution:")
    # df=pd.DataFrame(dict_df_clinical_trials[table].count(), columns=['Total Values Values'])
    # df['unic values'] = df['Different Values'].unique().sum() 
    display(df_summary)
       

# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,acronym,2812,2758,7188,71.9,str,98.1
1,maximum_age,5367,176,4633,46.3,str,3.3
2,phase,7545,6,2455,24.6,str,0.1
3,minimum_age,9352,121,648,6.5,str,1.3
4,primary_completion_date,9618,2620,382,3.8,object,27.2
5,enrollment_type,9689,2,311,3.1,str,0.0
6,completion_date,9715,2638,285,2.8,object,27.2
7,enrollment,9872,1090,128,1.3,float64,11.0
8,start_date,9897,2916,103,1.0,object,29.5
9,study_id,10000,10000,0,0.0,int64,100.0


# # # # # # conditions (17750 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,mesh_term,0,0,17750,100.0,object,
1,condition_id,17750,17750,0,0.0,int64,100.0
2,study_id,17750,9977,0,0.0,int64,56.2
3,condition_name,17750,8850,0,0.0,str,49.9


# # # # # # interventions (16715 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,description,15173,13948,1542,9.2,str,91.9
1,intervention_id,16715,16715,0,0.0,int64,100.0
2,study_id,16715,8966,0,0.0,int64,53.6
3,intervention_type,16715,11,0,0.0,str,0.1
4,name,16710,13113,5,0.0,str,78.5


# # # # # # outcomes (61766 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,description,50517,42796,11249,18.2,str,84.7
1,time_frame,60892,16646,874,1.4,str,27.3
2,study_id,61766,9694,0,0.0,int64,15.7
3,outcome_id,61766,61766,0,0.0,int64,100.0
4,measure,61766,50876,0,0.0,str,82.4
5,outcome_type,61766,2,0,0.0,str,0.0


# # # # # # sponsors (15861 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,agency_class,15838,8,23,0.1,str,0.1
1,sponsor_id,15861,15861,0,0.0,int64,100.0
2,study_id,15861,10000,0,0.0,int64,63.0
3,agency,15861,6621,0,0.0,str,41.7
4,lead_or_collaborator,15861,2,0,0.0,str,0.0


# # # # # # locations (56916 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,location_id,56916,56916,0,0.0,int64,100.0
1,study_id,56916,9000,0,0.0,int64,15.8
2,facility,56916,29877,0,0.0,str,52.5
3,city,56916,6553,0,0.0,str,11.5
4,state,56916,1660,0,0.0,str,2.9
5,country,56916,149,0,0.0,str,0.3
6,continent,56896,6,20,0.0,str,0.0


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,Column,Total_Values,Different_Values,Missing_Count,Missing_%,Data_Type,Unique_%
0,observational_model,2311,9,7540,76.5,str,0.4
1,primary_purpose,7420,9,2431,24.7,str,0.1
2,intervention_model,7446,5,2405,24.4,str,0.1
3,allocation,7454,3,2397,24.3,str,0.0
4,masking,7455,5,2396,24.3,str,0.1
5,study_id,9851,9851,0,0.0,int64,100.0
6,design_id,9851,9851,0,0.0,int64,100.0


***Coments***: 
- Column **continent** was filled using data from country's column and packages pycountry (done when modifying ddbb with )
- **Mesh_term** data was filled when available using an external API ("https://id.nlm.nih.gov/mesh/lookup/descriptor") and data from condition_name's column.  (done when modifying ddbb with )
- Column *mesh_term* was removed as no clear way to get its value (to be reevaluted)
- Unique_% in certain cases shows warnings of duplicity (for instance in "acronym", "title", "brief_summary" columns in "studies" table, values near 100% but lower indicate duplicity) --> to be checked!
- On the other side, unique_% close to 0 values indicates low --> to be depeer studied!

### 4.3. Duplicity & Distribution analysis

In [95]:
for table in tables_ddbb:
    print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
    for col in dict_df_clinical_trials[table].columns:
        if "_id" not in col:
            vc = pd.DataFrame(dict_df_clinical_trials[table][col].value_counts().head(5)).reset_index()
            display(vc) 

# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,title,count
0,[Trial of device that is not approved or clear...,23
1,Efficacy and Safety Study of SHP647 as Inducti...,2
2,Quantitative and Clinical Assessment of Flexor...,1
3,Functional Outcomes After Nerve Sparing Surger...,1
4,Alcohol Screening and Preoperative Interventio...,1


Unnamed: 0,acronym,count
0,RCT,5
1,SMART,4
2,PREDICT,3
3,SCOPE,3
4,SAFE,3


Unnamed: 0,status,count
0,COMPLETED,5406
1,UNKNOWN,1566
2,RECRUITING,1155
3,TERMINATED,616
4,NOT_YET_RECRUITING,440


Unnamed: 0,phase,count
0,,3773
1,PHASE2,1217
2,PHASE1,1127
3,PHASE3,696
4,PHASE4,631


Unnamed: 0,study_type,count
0,INTERVENTIONAL,7546
1,OBSERVATIONAL,2416
2,,23
3,EXPANDED_ACCESS,15


Unnamed: 0,start_date,count
0,2015-01-01,46
1,2014-01-01,44
2,2013-01-01,43
3,2012-01-01,41
4,2016-01-01,40


Unnamed: 0,completion_date,count
0,2025-12-31,73
1,2025-12-01,71
2,2026-12-31,65
3,2024-12-31,59
4,2026-12-01,52


Unnamed: 0,primary_completion_date,count
0,2025-12-01,69
1,2025-12-31,68
2,2015-12-01,57
3,2024-12-31,52
4,2026-12-31,52


Unnamed: 0,enrollment,count
0,30.0,367
1,60.0,348
2,40.0,315
3,20.0,304
4,100.0,286


Unnamed: 0,enrollment_type,count
0,ACTUAL,6164
1,ESTIMATED,3525


Unnamed: 0,brief_summary,count
0,,23
1,The proposed study will explore whether remote...,2
2,Investigators are building an empirical eviden...,2
3,The investigators will be evaluating the use o...,1
4,Patients who underwent surgery for deep endome...,1


Unnamed: 0,eligibility_criteria,count
0,,27
1,No eligibility criteria,9
2,Inclusion Criteria:\n\n* Patients must have pe...,2
3,Inclusion Criteria:\n\n* Normal color vision\n...,2
4,"Inclusion Criteria:\n\n* healthy, adult subjec...",2


Unnamed: 0,minimum_age,count
0,18 Years,6312
1,20 Years,324
2,40 Years,204
3,21 Years,199
4,50 Years,173


Unnamed: 0,maximum_age,count
0,65 Years,644
1,80 Years,535
2,75 Years,517
3,70 Years,451
4,60 Years,278


Unnamed: 0,gender,count
0,ALL,8617
1,FEMALE,974
2,MALE,379
3,All,30


Unnamed: 0,created_at,count
0,2026-02-02 19:29:38,447
1,2026-02-02 19:29:32,444
2,2026-02-02 19:29:37,438
3,2026-02-02 19:29:49,437
4,2026-02-02 19:29:47,434


Unnamed: 0,updated_at,count
0,2026-02-02 19:29:38,447
1,2026-02-02 19:29:32,444
2,2026-02-02 19:29:37,438
3,2026-02-02 19:29:49,437
4,2026-02-02 19:29:47,434


# # # # # # conditions (17750 records in total) # # # # # # 


Unnamed: 0,condition_name,count
0,Healthy,198
1,Breast Cancer,136
2,Obesity,119
3,Stroke,95
4,Hypertension,91


Unnamed: 0,mesh_term,count


# # # # # # interventions (16715 records in total) # # # # # # 


Unnamed: 0,intervention_type,count
0,DRUG,6866
1,OTHER,2968
2,DEVICE,1657
3,BEHAVIORAL,1571
4,PROCEDURE,1569


Unnamed: 0,name,count
0,Placebo,649
1,placebo,63
2,Dexamethasone,35
3,Pembrolizumab,32
4,laboratory biomarker analysis,32


Unnamed: 0,description,count
0,Given IV,121
1,Correlative studies,82
2,Given PO,61
3,Ancillary studies,44
4,Placebo,37


# # # # # # outcomes (61766 records in total) # # # # # # 


Unnamed: 0,outcome_type,count
0,Secondary,41157
1,Primary,20609


Unnamed: 0,measure,count
0,Overall survival,107
1,Overall Survival (OS),89
2,Overall Survival,79
3,Overall survival (OS),74
4,Adverse events,65


Unnamed: 0,time_frame,count
0,6 months,999
1,12 months,860
2,1 year,724
3,Baseline,655
4,2 years,586


Unnamed: 0,description,count
0,"Major events: all-cause death, cardiac death, ...",154
1,"NICMs will include but not limit to: DCM, HCM,...",147
2,Diagnostic concordance in terms of sensitivity...,98
3,Multimodal diagnostic workup is a combination ...,98
4,The multimodal diagnostic workup is a combinat...,98


# # # # # # sponsors (15861 records in total) # # # # # # 


Unnamed: 0,agency,count
0,National Cancer Institute (NCI),243
1,AstraZeneca,96
2,GlaxoSmithKline,88
3,Pfizer,83
4,"National Heart, Lung, and Blood Institute (NHLBI)",74


Unnamed: 0,agency_class,count
0,OTHER,10251
1,INDUSTRY,3095
2,UNKNOWN,830
3,NIH,800
4,OTHER_GOV,535


Unnamed: 0,lead_or_collaborator,count
0,lead,10000
1,collaborator,5861


# # # # # # locations (56916 records in total) # # # # # # 


Unnamed: 0,facility,count
0,Research Site,3467
1,,2848
2,Novartis Investigative Site,1462
3,GSK Investigational Site,1377
4,Local Institution,467


Unnamed: 0,city,count
0,New York,561
1,Seoul,498
2,Houston,460
3,Boston,437
4,London,418


Unnamed: 0,state,count
0,,20522
1,California,2687
2,Florida,1771
3,Texas,1676
4,Ohio,1417


Unnamed: 0,country,count
0,United States,26516
1,China,3360
2,France,2870
3,Germany,2059
4,Spain,1885


Unnamed: 0,continent,count
0,North America,28462
1,Europe,17383
2,Asia,8436
3,South America,1129
4,Africa,750


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,allocation,count
0,RANDOMIZED,4955
1,,1673
2,NON_RANDOMIZED,826


Unnamed: 0,intervention_model,count
0,PARALLEL,4492
1,SINGLE_GROUP,2026
2,CROSSOVER,591
3,SEQUENTIAL,234
4,FACTORIAL,103


Unnamed: 0,masking,count
0,NONE,4139
1,SINGLE,1134
2,DOUBLE,1035
3,QUADRUPLE,634
4,TRIPLE,513


Unnamed: 0,primary_purpose,count
0,TREATMENT,4787
1,PREVENTION,816
2,SUPPORTIVE_CARE,428
3,OTHER,424
4,BASIC_SCIENCE,351


Unnamed: 0,observational_model,count
0,COHORT,1430
1,CASE_ONLY,301
2,CASE_CONTROL,280
3,OTHER,239
4,ECOLOGIC_OR_COMMUNITY,26


Comments:
- Same title was found for 2 different studies, let's check it: in the next point.
- Generic title "[Trial of device that is not approved or cleared by the U.S. FDA]" was used 23 times 

Warnings:
- values "" are counted as a value instead of missing values (see "state", "facility",...) --> to be checked if there is relevance on missing value determination for such values
- in "measure" from table "outcomes" 4 different options that should be the same due to no difference in reporting Overall survival 107 / Overall Survival (OS)	89 / Overall Survival	79 / Overall survival (OS)74  --> to be deeper studied if info considered relevant!

Data duplicity for a title of one study was considered as a warning signal, let's deep in this case

In [74]:
dict_df_clinical_trials[table][dict_df_clinical_trials["studies"]["title"]=="Efficacy and Safety Study of SHP647 as Induction Therapy in Participants With Moderate to Severe Ulcerative Colitis"]

  dict_df_clinical_trials[table][dict_df_clinical_trials["studies"]["title"]=="Efficacy and Safety Study of SHP647 as Induction Therapy in Participants With Moderate to Severe Ulcerative Colitis"]


Unnamed: 0,design_id,study_id,allocation,intervention_model,masking,primary_purpose,observational_model
3981,3982,4057,RANDOMIZED,CROSSOVER,NONE,OTHER,
6839,6840,6955,,SINGLE_GROUP,NONE,TREATMENT,


Actually, the 2 studies shares the title but have a different design. It's not a relevant issue.

I will checked the pressence of empty strin ("") in all the tables

In [93]:
for table in tables_ddbb:
    print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
    count_empty_strings(dict_df_clinical_trials[table])

# # # # # # studies (10000 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (10000, 19)


Unnamed: 0,Empty_Strings,Total_Rows,Empty_%
eligibility_criteria,27.0,10000.0,0.27
study_type,23.0,10000.0,0.23
brief_summary,23.0,10000.0,0.23
title,0.0,10000.0,0.0
nct_id,0.0,10000.0,0.0
study_id,0.0,10000.0,0.0
acronym,0.0,10000.0,0.0
start_date,0.0,10000.0,0.0
completion_date,0.0,10000.0,0.0
status,0.0,10000.0,0.0



üìä TOTAL empty strings: 73.0

# # # # # # conditions (17750 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (17750, 4)
‚úÖ NO empty strings ('') found in any column

# # # # # # interventions (16715 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (16715, 5)
‚úÖ NO empty strings ('') found in any column

# # # # # # outcomes (61766 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (61766, 6)
‚úÖ NO empty strings ('') found in any column

# # # # # # sponsors (15861 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (15861, 5)
‚úÖ NO empty strings ('') found in any column

# # # # # # locations (56916 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (56916, 7)


Unnamed: 0,Empty_Strings,Total_Rows,Empty_%
state,20522.0,56916.0,36.06
facility,2848.0,56916.0,5.0
location_id,0.0,56916.0,0.0
study_id,0.0,56916.0,0.0
city,0.0,56916.0,0.0
country,0.0,56916.0,0.0
continent,0.0,56916.0,0.0



üìä TOTAL empty strings: 23370.0

# # # # # # study_design (9851 records in total) # # # # # # 
üîç EMPTY STRINGS ANALYSIS ('') - Shape: (9851, 7)
‚úÖ NO empty strings ('') found in any column



Columns that have a relevant number of "" values are **state** and **facility** from **locations** table.

In [111]:
for table in tables_ddbb:
    
    for col in dict_df_clinical_trials[table].columns:
        if "_id" not in col:
            vc = pd.DataFrame(dict_df_clinical_trials[table][col].value_counts().reset_index())
            if 0<len(vc) <=4:
                print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
                display(vc) 
                plot_pie(dict_df_clinical_trials[table], col, top_n=10, title=f"Ratios of {col} in table {table}")
            

# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,study_type,count
0,INTERVENTIONAL,7546
1,OBSERVATIONAL,2416
2,,23
3,EXPANDED_ACCESS,15


# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,enrollment_type,count
0,ACTUAL,6164
1,ESTIMATED,3525


# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,gender,count
0,ALL,8617
1,FEMALE,974
2,MALE,379
3,All,30


# # # # # # outcomes (61766 records in total) # # # # # # 


Unnamed: 0,outcome_type,count
0,Secondary,41157
1,Primary,20609


# # # # # # sponsors (15861 records in total) # # # # # # 


Unnamed: 0,lead_or_collaborator,count
0,lead,10000
1,collaborator,5861


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,allocation,count
0,RANDOMIZED,4955
1,,1673
2,NON_RANDOMIZED,826


Insights:


* **Study type**: The dataset is strongly skewed toward interventional studies.

    Interventional: 7,546 (75.5%) ‚Äî dominant study type

    Observational: 2,416 (24.2%)

    Unspecified / missing: 23 (0.2%)

    Expanded access: 15 (0.15%)

* **Enrollment type**: Most studies report final (actual) enrollment, but over one-third rely on estimates.

    Actual enrollment: 6,164 (61.6%)

    Estimated enrollment: 3,525 (35.3%)

    Missing enrollment type: 311 (3.1%)


* **Gender eligibility**: The vast majority of studies are open to all genders

    All genders: 8,617 (86.2%)

    Female only: 974 (9.7%)

    Male only: 379 (3.8%)
 

* **Outcomes**: Studies define about twice as many secondary outcomes as primary outcomes.

    Secondary outcomes: 41,157 (66.6%)

    Primary outcomes: 20,609 (33.4%)

* **Sponsors**:  Every study has exactly one lead sponsor, with many involving additional collaborators.

    Lead sponsors: 10,000 (63.1%)

    Collaborators: 5,861 (36.9%)



*  **Study design ‚Äì Allocation** Roughly half of the studies are randomized, but a substantial portion lack allocation data


    Randomized: 4,955 (50.3%)

    Non-randomized: 826 (8.4%)

    Not applicable / missing (NA): 1,673 (17.0%)

    Unaccounted records: 2,397 (24.3%)


In [110]:
for table in tables_ddbb:
    
    for col in dict_df_clinical_trials[table].columns:
        if "_id" not in col:
            vc = pd.DataFrame(dict_df_clinical_trials[table][col].value_counts().reset_index())
            if 4<len(vc) <=9:
                print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
                display(vc) 
                plot_bar(dict_df_clinical_trials[table], col, top_n=15, title= f"Horizontal Bar Chart of {col} in table {table}")
                plot_pie(dict_df_clinical_trials[table], col, top_n=10, title=f"Ratios of {col} in table {table}")

# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,phase,count
0,,3773
1,PHASE2,1217
2,PHASE1,1127
3,PHASE3,696
4,PHASE4,631
5,EARLY_PHASE1,101


# # # # # # sponsors (15861 records in total) # # # # # # 


Unnamed: 0,agency_class,count
0,OTHER,10251
1,INDUSTRY,3095
2,UNKNOWN,830
3,NIH,800
4,OTHER_GOV,535
5,FED,178
6,NETWORK,132
7,INDIV,17


# # # # # # locations (56916 records in total) # # # # # # 


Unnamed: 0,continent,count
0,North America,28462
1,Europe,17383
2,Asia,8436
3,South America,1129
4,Africa,750
5,Oceania,736


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,intervention_model,count
0,PARALLEL,4492
1,SINGLE_GROUP,2026
2,CROSSOVER,591
3,SEQUENTIAL,234
4,FACTORIAL,103


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,masking,count
0,NONE,4139
1,SINGLE,1134
2,DOUBLE,1035
3,QUADRUPLE,634
4,TRIPLE,513


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,primary_purpose,count
0,TREATMENT,4787
1,PREVENTION,816
2,SUPPORTIVE_CARE,428
3,OTHER,424
4,BASIC_SCIENCE,351
5,DIAGNOSTIC,335
6,HEALTH_SERVICES_RESEARCH,197
7,SCREENING,64
8,DEVICE_FEASIBILITY,18


# # # # # # study_design (9851 records in total) # # # # # # 


Unnamed: 0,observational_model,count
0,COHORT,1430
1,CASE_ONLY,301
2,CASE_CONTROL,280
3,OTHER,239
4,ECOLOGIC_OR_COMMUNITY,26
5,CASE_CROSSOVER,17
6,DEFINED_POPULATION,9
7,FAMILY_BASED,8
8,NATURAL_HISTORY,1


* **Phases**: The high count of NA (3773) and early-stage trials (PHASE2 at 1217, PHASE1 at 1127) suggests a trend toward exploratory research in pharma, but critically, this indicates high failure risks in drug development pipelines, with only 696 PHASE3 trials signaling limited near-term commercialization.

* **Sponsors**: "OTHER" agencies dominate at 10251, followed by INDUSTRY (3095), highlighting academia/non-profit leadership in trials; this could boost trends in raw material futures for lab supplies (e.g., via commodity indices), yet risks include funding volatility from UNKNOWN (830) and government sources, potentially disrupting ETF stability in healthcare sectors.

* **Locations**: North America leads with 28462 trials, far ahead of Europe (17383), pointing to a concentrated innovation hub that may increase regional demand for pharma raw materials like APIs; critically, this geographic imbalance risks supply chain disruptions from geopolitical tensions, affecting global ETFs like VHT.

* **Intervention Model**: PARALLEL designs prevail at 4492, emphasizing efficiency in comparative testing.

* **Masking**: NONE masking is most common (55%), indicating open-label studies for practicality.

* **Primary Purpose**: TREATMENT dominates nearly with 2 thrids of total, underscoring a focus on therapeutic advancements;far away are PREVENTION with 15% and other ones with around 4-5% UPPORTIVE_CARE,	BASIC_SCIENCE or DIAGNOSTICs .

* **Observational Model**: COHORT models lead with 6 of each 10 studies, favoring longitudinal data collection.

In [109]:
for table in tables_ddbb:
    
    for col in dict_df_clinical_trials[table].columns:
        if "_id" not in col:
            vc = pd.DataFrame(dict_df_clinical_trials[table][col].value_counts().reset_index())
            if 9<len(vc)<21:
                print (f"# # # # # # {table} ({len(dict_df_clinical_trials[table])} records in total) # # # # # # ")
                vc["ratio %"]= (100* vc[ 'count']/vc['count'].sum()).round(2)
                display(vc) 
                plot_barh(dict_df_clinical_trials[table], col, top_n=15, title= f"Horizontal Bar Chart of {col} in table {table}")

        

# # # # # # studies (10000 records in total) # # # # # # 


Unnamed: 0,status,count,ratio %
0,COMPLETED,5406,54.06
1,UNKNOWN,1566,15.66
2,RECRUITING,1155,11.55
3,TERMINATED,616,6.16
4,NOT_YET_RECRUITING,440,4.4
5,ACTIVE_NOT_RECRUITING,390,3.9
6,WITHDRAWN,274,2.74
7,ENROLLING_BY_INVITATION,91,0.91
8,SUSPENDED,24,0.24
9,WITHHELD,23,0.23


# # # # # # interventions (16715 records in total) # # # # # # 


Unnamed: 0,intervention_type,count,ratio %
0,DRUG,6866,41.08
1,OTHER,2968,17.76
2,DEVICE,1657,9.91
3,BEHAVIORAL,1571,9.4
4,PROCEDURE,1569,9.39
5,BIOLOGICAL,757,4.53
6,DIETARY_SUPPLEMENT,483,2.89
7,DIAGNOSTIC_TEST,453,2.71
8,RADIATION,228,1.36
9,COMBINATION_PRODUCT,82,0.49


* **Studies Status**: Most studies (54.06%) are COMPLETED, indicating a majority have reached conclusion.

* **Interventions**: DRUG interventions dominate at 41.08%, far ahead of other types.


## 5. ANSWER RELEVANT QUESTIONS

**Business Questions to Answer**
You must address the following core business questions:
1. Trial Landscape Overview: What is the distribution of clinical trials by phase, status, and therapeutic area? How has this
evolved over time?
2. Completion Analysis: Which factors are associated with higher trial completion rates? Are there patterns in trials that get
terminated or withdrawn?
3. Enrollment Performance: What are the trends in patient enrollment across different trial types? Which conditions attract
the most participants?
4. Geographic Insights: How are clinical trials distributed globally? Are there regional specializations in certain therapeutic
areas?
5. Duration Analysis: What is the typical duration of trials by phase and therapeutic area? Which trials take significantly
longer than expected?

In [17]:
# To answer the questions all info will be merged into one unic dataframe taking study_id as the key to join on)
merged_df = dict_df_clinical_trials["studies"].copy()

for name, df_other in dict_df_clinical_trials.items():
    if name == "studies":
        continue
    merged_df = merged_df.merge(
        df_other,
        on="study_id",
        how="left",
        suffixes=("", f"_{name}")
    )

display(merged_df.head())

Unnamed: 0,study_id,nct_id,title,acronym,status,phase,study_type,start_date,completion_date,primary_completion_date,...,city,state,country,continent,design_id,allocation,intervention_model,masking,primary_purpose,observational_model
0,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,...,Aurora,Colorado,United States,North America,1.0,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,
1,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,...,Denver,Colorado,United States,North America,1.0,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,
2,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,...,Aurora,Colorado,United States,North America,1.0,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,
3,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,...,Denver,Colorado,United States,North America,1.0,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,
4,1,NCT04976335,Quantitative and Clinical Assessment of Flexor...,,RECRUITING,,INTERVENTIONAL,2021-09-13,2027-07-01,2027-07-01,...,Aurora,Colorado,United States,North America,1.0,RANDOMIZED,SINGLE_GROUP,DOUBLE,PREVENTION,


### 5.1. Trial Landscape Overview: What is the distribution of clinical trials by phase, status, and therapeutic area? How has this evolved over time?

To answer this question regarding therapeutic area, i propose the following calssification of 9 different  the new column "therapeutic_area"

In [22]:
dict_df_clinical_trials['studies'].columns

Index(['study_id', 'nct_id', 'title', 'acronym', 'status', 'phase',
       'study_type', 'start_date', 'completion_date',
       'primary_completion_date', 'enrollment', 'enrollment_type',
       'brief_summary', 'eligibility_criteria', 'minimum_age', 'maximum_age',
       'gender', 'created_at', 'updated_at'],
      dtype='str')

In [45]:
dict_df_clinical_trials['studies']['therapeutic_area_score']= dict_df_clinical_trials['studies']['brief_summary'].apply(classify_therapeutic_area)

In [48]:
dict_df_clinical_trials['studies']['therapeutic_area']= dict_df_clinical_trials['studies']['therapeutic_area_score'].apply(lambda x: x.split(" ")[0])

In [49]:

df_ther_area = pd.DataFrame(dict_df_clinical_trials['studies']['therapeutic_area'].value_counts()).reset_index()
df_ther_area

Unnamed: 0,therapeutic_area,count
0,Unknown,4423
1,Oncology,1584
2,Infectious,703
3,Cardiology,587
4,Endocrine,579
5,Neurology,571
6,Gastroenterology,442
7,Hematology,393
8,Respiratory,330
9,Immunology,208


In [50]:
plot_pie(dict_df_clinical_trials['studies'], 'therapeutic_area', top_n=15, title="Ratios of therapeutic_area in studies")

Deleting  studies where therapeutic ares is unkown, to check ratios:

In [51]:
plot_pie(dict_df_clinical_trials['studies'][dict_df_clinical_trials['studies']['therapeutic_area']!="Unknown"], 'therapeutic_area', top_n=15, title="Ratios of therapeutic_area in studies")

Deleting studies with Not available phase definition,  to check ratios:

In [52]:
plot_pie(dict_df_clinical_trials['studies'], 'phase', top_n=15, title="Ratios of phase in studies")

In [53]:
plot_pie(dict_df_clinical_trials['studies'][dict_df_clinical_trials['studies']['phase']!="NA"], 'phase', top_n=15, title="Ratios of phase in studies")

### 5.2. Completion Analysis: Which factors are associated with higher trial completion rates? Are there patterns in trials that get terminated or withdrawn?

Steps to answer the question:
1. Clinical trial statuses can be messy. Before analysis, it should be **collapsed and filtered** into a binary target variable to gain clarity:

    Success Group: "Completed."

    Failure/Attrition Group: "Terminated," "Withdrawn," or "Suspended."

    Exclude: "Recruiting" or "Active, not recruiting" (as their outcome is not yet known).

2. Univariate "**Risk Factor**" Identification

Identify which individual variables show the strongest "pull" toward termination.
    For Categorical Factors (e.g., Phase, Study Type): Use Chi-Square Tests of Independence or Cram√©r's V. This tells you if, for example, Phase II trials have a statistically higher withdrawal rate than Phase III. (Odds ratio could be another way stenghs of correlations)

    For Numerical Factors (e.g., Enrollment Goal, Duration): Use a Point Biserial Correlation study.

3. Multivariate study with a Multivariate Logistic Regression (one-hot-encoding, standarize numeric variables, avoid missing data and higly correlated variables)

In [54]:
dict_df_clinical_trials['studies']["status"].unique()

<StringArray>
[             'RECRUITING',               'COMPLETED',
               'AVAILABLE',   'ACTIVE_NOT_RECRUITING',
               'WITHDRAWN',                 'UNKNOWN',
 'ENROLLING_BY_INVITATION',      'NOT_YET_RECRUITING',
              'TERMINATED',               'SUSPENDED',
     'NO_LONGER_AVAILABLE',                'WITHHELD',
  'APPROVED_FOR_MARKETING']
Length: 13, dtype: str

### 5.3. Enrollment Performance: What are the trends in patient enrollment across different trial types? Which conditions attract the most participants?

In [74]:
dict_df_clinical_trials['studies']["study_type"].unique()

<StringArray>
['INTERVENTIONAL', 'OBSERVATIONAL', 'EXPANDED_ACCESS', '']
Length: 4, dtype: str

In [72]:

conn = get_db_connection_with_retry()
if conn is None:
    logger.error("Failed to connect to database")
    raise Exception("Database connection failed")

try:
    # Query 1: Trends by trial characteristics (MySQL median workaround)
    trends_query = """
    SELECT 
        study_type,
        phase,
        status,
        COUNT(*) as trial_count,
        ROUND(AVG(enrollment), 0) as avg_enrollment,
        SUM(enrollment) as total_enrollment,
        (SELECT enrollment FROM studies s2 
            WHERE s2.study_type = s1.study_type 
            AND s2.phase = s1.phase 
            AND s2.status = s1.status
            ORDER BY ABS(enrollment - (SELECT AVG(enrollment) FROM studies s3 
                                    WHERE s3.study_type = s1.study_type 
                                    AND s3.phase = s1.phase 
                                    AND s3.status = s1.status))
            LIMIT 1) as approx_median
    FROM studies s1 
    WHERE enrollment > 0 
    GROUP BY study_type, phase, status
    ORDER BY avg_enrollment DESC
    """
    
    # Query 2: Top conditions
    conditions_query = """
    SELECT 
        c.condition_name,
        COUNT(DISTINCT s.study_id) as trial_count,
        SUM(s.enrollment) as total_enrollment,
        ROUND(AVG(s.enrollment), 0) as avg_per_trial
    FROM conditions c
    JOIN studies s ON c.study_id = s.study_id
    WHERE s.enrollment > 0
    GROUP BY c.condition_name
    ORDER BY avg_per_trial DESC
    LIMIT 20
    """
    
    # Execute queries
    df_trends = pd.read_sql(trends_query, conn)
    df_conditions = pd.read_sql(conditions_query, conn)
    
    print("Trends shape:", df_trends.shape)
    print("Top conditions shape:", df_conditions.shape)
    
except Error as e:
    logger.error(f"Query error: {e}")
finally:
    conn.close()



# Visualize
if df_trends is not None:
    fig = plot_category_boxplot(df_trends, 'study_type', 'avg_enrollment')
    fig.show()


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Trends shape: (58, 7)
Top conditions shape: (20, 4)



pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Much more enrollment in Observational clinical trial.

In [73]:
display(df_conditions)

Unnamed: 0,condition_name,trial_count,total_enrollment,avg_per_trial
0,Gastroesophageal Reflux Disease (GERD),1,4238504.0,4238504.0
1,Neonatal Mortality,1,1750000.0,1750000.0
2,Community-acquired Pneumonia,4,4239041.0,1059760.0
3,"Colorectal, Cancer",1,1000000.0,1000000.0
4,Lung Cancers,1,1000000.0,1000000.0
5,Gastric Cancers,1,1000000.0,1000000.0
6,Head and Neck Tumor,1,670000.0,670000.0
7,CNS Tumor,1,670000.0,670000.0
8,Bronchial Cancer,1,670000.0,670000.0
9,Substance Abuse Detection,1,646620.0,646620.0


In the top ten: 6 of then are from **condition related with cancer**. And the top 2 are big clinical trials wiht huge participation. 

### 5.4. Geographic Insights: How are clinical trials distributed globally? Are there regional specializations in certain therapeutic areas?

In [76]:
global_dist_query='''SELECT 
    l.continent,
    COUNT(DISTINCT l.study_id) as unique_trials,
    COUNT(*) as total_locations,
    ROUND(AVG(s.enrollment), 0) as avg_enrollment_per_trial
FROM locations l
JOIN studies s ON l.study_id = s.study_id 
WHERE s.enrollment > 0
GROUP BY l.continent
ORDER BY unique_trials DESC;
'''


regional_specialization_query='''
WITH continent_conditions AS (
    SELECT 
        l.continent,
        c.condition_name,
        COUNT(DISTINCT s.study_id) as trial_count,
        SUM(s.enrollment) as total_enrollment,
        ROW_NUMBER() OVER (PARTITION BY l.continent ORDER BY COUNT(DISTINCT s.study_id) DESC) as rn
    FROM locations l
    JOIN studies s ON l.study_id = s.study_id
    JOIN conditions c ON s.study_id = c.study_id
    WHERE s.enrollment > 0
    GROUP BY l.continent, c.condition_name
)
SELECT * FROM continent_conditions 
WHERE rn <= 3
ORDER BY continent, trial_count DESC;

'''


In [80]:
conn = get_db_connection_with_retry()
df_global = pd.read_sql(global_dist_query, conn)
df_special = pd.read_sql(regional_specialization_query, conn)

# Global map/pie
fig1 = px.pie(df_global, values='unique_trials', names='continent', 
              title="Trials by Continent")

# Heatmap specializations
fig2 = px.density_heatmap(df_special, x='continent', y='condition_name',
                         z='trial_count', title="Regional Specializations")

fig1.show(); fig2.show()



pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



First graph shows **quantity distribution** across continents. 

**Specializations** (acording to previous graph):
- USA: Obesity
- Asia: Stroke 
- Europe: Artial Fibrilation
- Africa: HIV and Malaria
- Oceania: Advanced Solid Tumors and Ulcerative Colitis
- South America: Rheumatoid Arthitis


### 5.5. Duration Analysis: What is the typical duration of trials by phase and therapeutic area? Which trials take significantly longer than expected?

In [98]:
df_with_duration = add_duration_columns(dict_df_clinical_trials['studies'])

In [None]:

# Filter valid durations
df_duration = df_with_duration[
    df_with_duration['duration_years'].notna() & 
    (df_with_duration['duration_years'] > 0) &
    df_with_duration['phase'].notna() &
    df_with_duration['therapeutic_area'].notna()
].copy()

print(f"Valid duration trials: {len(df_duration):,}")

# 1. Summary by phase & therapeutic_area
duration_summary = df_duration.groupby(['phase', 'therapeutic_area']).agg(
    trial_count=('study_id', 'count'),
    avg_duration=('duration_years', 'mean'),
    median_duration=('duration_years', 'median'),
    duration_std=('duration_years', 'std')
).round(2).reset_index()

duration_summary = duration_summary[duration_summary['trial_count'] >= 5]
print("\nTop 10 Longest (avg duration):")
print(duration_summary.nlargest(10, 'avg_duration'))

# 2. FIXED Outlier detection
phase_stats = df_duration.groupby('phase')['duration_years'].agg(['mean', 'std']).reset_index()
phase_stats.columns = ['phase', 'phase_mean', 'phase_std']

df_duration = df_duration.merge(phase_stats, on='phase', how='left')
# upper limit for outliers mean  + 2√óstd 
df_outliers = df_duration[
    df_duration['duration_years'] > (df_duration['phase_mean'] + 2 * df_duration['phase_std'])
].copy()

print(f"\nOutliers (>2SD from phase mean): {len(df_outliers)}")

# 3. Visualizations
fig1 = plot_category_boxplot(df_duration, 'phase', 'duration_years', 
                            "Trial Duration by Phase")
fig1.show()

# Heatmap top areas
top_areas = duration_summary.nlargest(20, 'avg_duration')
fig2 = px.density_heatmap(top_areas, x='phase', y='therapeutic_area', z='avg_duration',
                         title="Avg Duration Heatmap (Years)", color_continuous_scale='Reds')
fig2.show()


Valid duration trials: 4,812

Top 10 Longest (avg duration):
           phase therapeutic_area  trial_count  avg_duration  median_duration  \
51        PHASE3         Oncology           74          5.11             4.20   
40        PHASE2         Oncology          243          4.61             3.82   
43        PHASE3       Cardiology           26          4.04             3.56   
6   EARLY_PHASE1        Neurology            6          3.92             2.12   
36        PHASE2       Hematology           35          3.85             2.58   
29        PHASE1         Oncology          206          3.79             3.38   
18            NA         Oncology          150          3.71             2.82   
47        PHASE3       Hematology           12          3.56             1.74   
59        PHASE4       Immunology           12          3.17             2.02   
48        PHASE3       Immunology           16          3.08             2.74   

    duration_std  
51          4.40  
40       

To response to question it was considered a minim of 5 different trials to report results by grouped sudies by phase and therapeutic area. In next table are denoted the typical duration (median_duration)

In [101]:
display(duration_summary)

Unnamed: 0,phase,therapeutic_area,trial_count,avg_duration,median_duration,duration_std
2,EARLY_PHASE1,Gastroenterology,5,1.59,0.83,1.89
6,EARLY_PHASE1,Neurology,6,3.92,2.12,4.22
7,EARLY_PHASE1,Oncology,13,2.15,1.92,1.46
9,EARLY_PHASE1,Unknown,23,1.99,1.31,1.75
10,,Cardiology,124,2.38,1.92,1.83
11,,Dermatology,43,1.58,1.09,1.89
12,,Endocrine,173,2.15,1.75,1.64
13,,Gastroenterology,131,2.31,1.92,1.72
14,,Hematology,84,1.82,1.21,1.83
15,,Immunology,42,1.95,1.33,1.93


In [129]:

# Top 10 longest individual trials
top_longest = df_duration.nlargest(30, 'duration_years')[
    ['nct_id', 'phase', 'therapeutic_area', 'duration_years', 'title']
]
print("\nTop 20 Longest Individual Trials:")
# Merge top_longest with duration_summary on phase + therapeutic_area
top_longest_with_summary = top_longest.merge(
    duration_summary[['phase', 'therapeutic_area', 'median_duration']],
    on=['phase', 'therapeutic_area'],
    suffixes=('', '_summary'),
    how='left'
)


# Rename for clarity
top_longest_with_summary = top_longest_with_summary.rename(columns={
    'median_duration': 'typical_median_duration'
}) [['nct_id', 'phase', 'therapeutic_area', 'duration_years',
       'typical_median_duration', 'title']]



display(top_longest_with_summary.head(20))


Top 20 Longest Individual Trials:


Unnamed: 0,nct_id,phase,therapeutic_area,duration_years,typical_median_duration,title
0,NCT00178932,,Unknown,25.54,1.42,Improving Outcome in Schizophrenia Through Ide...
1,NCT00008450,PHASE1,Immunology,21.37,1.25,Total-Body Irradiation Followed By Cyclosporin...
2,NCT00588523,PHASE2,Oncology,20.42,3.82,Intensive Chemotherapy and Autotransplantation...
3,NCT00047008,PHASE3,Oncology,19.89,4.2,Chemotherapy and Radiation Therapy With or Wit...
4,NCT00583050,,Unknown,19.0,1.42,Endovascular Exclusion of TAAA/AAA Utilizing F...
5,NCT00010244,PHASE3,Oncology,18.5,4.2,Comparison of Radiation Therapy Regimens in Tr...
6,NCT00004205,PHASE3,Oncology,18.33,4.2,Letrozole or Tamoxifen in Treating Postmenopau...
7,NCT00278915,PHASE2,Unknown,17.46,2.01,Faslodex in McCune-Albright Syndrome
8,NCT00574353,,Oncology,16.57,2.82,Study Using Fluorine-18-Labeled Fluoro-Misonid...
9,NCT00390325,PHASE2,Oncology,16.13,3.82,Sorafenib Tosylate in Treating Patients With M...


## 6. DATA QUERIES (SQL)
Next text is generated by AI when asking to group all sql queries performed in the clinical dashboard platform

SQL Queries Extracted

Query: SELECT COUNT(*) as total FROM studies

Counts total studies; used to detect if DB is empty (0 rows ‚Üí empty).‚Äã
Row Counts

Query: SELECT COUNT(*) as count FROM {selected_table} (dynamic: studies, conditions, etc.)

Fetches row count for any selected table; displays as metric.‚Äã
Describe Tables

Query: DESCRIBE {selected_table}

Lists columns/types for data availability analysis.‚Äã
Data Availability

Query: SELECT COUNT(*) as total, SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_count FROM {selected_table}

Per-column null/total counts ‚Üí availability % (e.g., non-null/total*100).‚Äã
Complete Records

Query: SELECT COUNT(*) as complete_count FROM {selected_table} WHERE {col1} IS NOT NULL AND {col2} IS NOT NULL ...

Counts rows with NO nulls across all columns.‚Äã
Summary Stats

Query: SELECT COUNT(*) as total, COUNT(DISTINCT {col}) as distinct_count FROM {selected_table} WHERE {col} IS NOT NULL

Total vs unique non-null values per key column (status, phase, etc.).‚Äã
Top Values

Query: SELECT {col}, COUNT(*) as count FROM {selected_table} WHERE {col}IS NOT NULL GROUP BY{col} ORDER BY count DESC LIMIT 5

Top 5 frequent values per column (e.g., status, condition_name).‚Äã
Studies Distributions

Queries:

    SELECT status, COUNT(*) as count FROM studies WHERE status IS NOT NULL GROUP BY status ORDER BY count DESC

    SELECT phase, COUNT(*) as count FROM studies WHERE phase IS NOT NULL GROUP BY phase ORDER BY count DESC

    SELECT gender, COUNT(*) as count FROM studies WHERE gender IS NOT NULL GROUP BY gender ORDER BY count DESC

Aggregates for bar/pie charts on status, phase, gender.‚Äã
Enrollment Stats

Queries:

    SELECT enrollment FROM studies WHERE enrollment IS NOT NULL (for histogram)

    SELECT AVG(enrollment) as avg_enrollment, MIN(enrollment) as min, MAX(enrollment) as max, COUNT(*) as total_studies FROM studies WHERE enrollment IS NOT NULL

    SELECT COUNT(*) as zero_enrollment FROM studies WHERE enrollment = 0

Stats (avg/min/max/zeros) for enrollment metrics/histogram.‚Äã
Top Entities

Queries:

    SELECT condition_name, COUNT(*) as count FROM conditions GROUP BY condition_name ORDER BY count DESC LIMIT 15

    SELECT intervention_type, COUNT(*) as count FROM interventions WHERE intervention_type IS NOT NULL GROUP BY intervention_type ORDER BY count DESC

    SELECT outcome_type, COUNT(*) as count FROM outcomes WHERE outcome_type IS NOT NULL GROUP BY outcome_type ORDER BY count DESC

    SELECT agency, COUNT(*) as count FROM sponsors GROUP BY agency ORDER BY count DESC LIMIT 15

    SELECT country, COUNT(*) as count FROM locations WHERE country IS NOT NULL GROUP BY country ORDER BY count DESC LIMIT 15

    SELECT allocation, COUNT(*) as count FROM study_design WHERE allocation IS NOT NULL GROUP BY allocation ORDER BY count DESC

    SELECT primary_purpose, COUNT(*) as count FROM study_design WHERE primary_purpose IS NOT NULL GROUP BY primary_purpose ORDER BY count DESC

Top-N (5-15) by frequency for viz (bars, lollipops).‚Äã
Time Trends

Queries:

    SELECT YEAR(start_date) as year, COUNT(*) as study_count FROM studies WHERE start_date IS NOT NULL GROUP BY YEAR(start_date) ORDER BY year

    SELECT YEAR(completion_date) as year, COUNT(*) as completed_count FROM studies WHERE completion_date IS NOT NULL GROUP BY YEAR(completion_date) ORDER BY year

    SELECT YEAR(start_date) as year, AVG(enrollment) as avg_enrollment, COUNT(*) as study_count FROM studies WHERE start_date IS NOT NULL AND enrollment IS NOT NULL GROUP BY YEAR(start_date) ORDER BY year

    SELECT YEAR(start_date) as year, phase, COUNT(*) as count FROM studies WHERE start_date IS NOT NULL AND phase IS NOT NULL GROUP BY YEAR(start_date), phase ORDER BY year, phase

    SELECT YEAR(s.start_date) as year, c.condition_name, COUNT(*) as count FROM studies s JOIN conditions c ON s.study_id = c.study_id WHERE s.start_date IS NOT NULL GROUP BY YEAR(s.start_date), c.condition_name HAVING COUNT(*) >= 3 ORDER BY year, count DESC

Yearly trends (line/area/bar) for starts, completions, enrollment avg, phase evolution, top conditions.‚Äã
Search Studies

Query: SELECT * FROM studies WHERE title LIKE %s OR description LIKE %s LIMIT 50

Full-text search on title/description; params prevent injection.‚Äã