In [1]:
from graphviz import Digraph
from IPython.display import Image, display
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import scipy.stats as stats
from scipy.stats import nct, t


In [2]:
df = pd.read_csv('data/cohorte_endo_processed.csv')

In [3]:

def probability_significant_difference(n_positives, n_negatives, alpha=0.05, effect_size=0):
    """
    Calculate the probability of observing significant differences between two distributions analytically.
    
    Parameters:
        n_positives (int): Number of positive cases.
        n_negatives (int): Number of negative cases.
        alpha (float): Significance level for the test (default is 0.05).
        effect_size (float): Effect size (Cohen's d), default is 0 (no true difference).
    
    Returns:
        float: Estimated probability of observing significant differences by chance.
    """
    # Degrees of freedom for Welch's t-test
    df = (n_positives - 1) + (n_negatives - 1)
    
    # Critical t-value for the given alpha
    t_critical = t.ppf(1 - alpha / 2, df)
    
    # Non-centrality parameter
    pooled_std = 1  # Assuming standard normal distributions (mean=0, std=1)
    ncp = effect_size * ((n_positives * n_negatives) / (n_positives + n_negatives))**0.5 / pooled_std
    
    # Compute power (probability of rejecting the null hypothesis)
    power = nct.cdf(-t_critical, df, ncp) + (1 - nct.cdf(t_critical, df, ncp))
    
    # If effect_size is zero, power represents the probability of false positives (alpha level)
    return power if effect_size > 0 else alpha


In [4]:
def calculate_power(n_positives, n_negatives, effect_size, alpha=0.05, sd=1):
    """
    Calculate the statistical power of a two-sample t-test.
    
    Parameters:
        n_positives (int): Number of observations in the first group.
        n_negatives (int): Number of observations in the second group.
        effect_size (float): The true difference in means between the groups.
        alpha (float): Significance level for the test (default is 0.05).
        sd (float): Standard deviation of the data (assuming equal SD for both groups).
    
    Returns:
        float: Statistical power of the test.
    """
    # Pooled standard deviation
    s_pooled = sd * ((1 / n_positives) + (1 / n_negatives)) ** 0.5
    
    # Degrees of freedom
    df = n_positives + n_negatives - 2
    
    # Non-centrality parameter
    delta = effect_size / s_pooled
    
    # Critical t-value for two-tailed test
    t_critical = stats.t.ppf(1 - alpha / 2, df)
    
    # Calculate power
    power = stats.nct.sf(t_critical, df, delta) + stats.nct.cdf(-t_critical, df, delta)
    
    return power

In [5]:
def compute_probabilities_for_targets(
    df, target_columns, alpha=0.05
):
    """
    Compute probabilities of observing significant differences for each target column in a DataFrame.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        target_columns (list of str): List of columns to analyze for differences (e.g., ['endometriosis', 'hormone_treatment']).
        alpha (float): Significance level for the test (default is 0.05).
        n_simulations (int): Number of simulations to perform (default is 1,000).

    Returns:
        pd.DataFrame: DataFrame with probabilities for each target column.
    """
    results = []

    for include_maybe in [True, False]:
        for target_column in target_columns:
            # Exclude rows with 'maybe' if include_maybe is False
            if not include_maybe and f"{target_column}_maybe" in df.columns:
                filtered_df = df[df[f"{target_column}_maybe"] == 0]
            else:
                filtered_df = df

            # Count positives and negatives in the target column
            n_positives = filtered_df[filtered_df[target_column] == 1].shape[0]
            n_negatives = filtered_df[filtered_df[target_column] == 0].shape[0]

            if n_positives > 0 and n_negatives > 0:
                # Compute the probability using the function
                probability = probability_significant_difference(
                    n_positives=n_positives,
                    n_negatives=n_negatives,
                    alpha=alpha,
                )
                power = calculate_power(n_positives, n_negatives, effect_size=0.5)
            else:
                probability = None  # Not enough data for a valid test
                power = None

            # Store the result
            results.append(
                {
                    "target_column": target_column,
                    "include_maybe": include_maybe,
                    "n_positives": n_positives,
                    "n_negatives": n_negatives,
                    "probability": probability,
                    "power": power,
                }
            )

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    return results_df


In [6]:
# Define the columns to group by
target_columns = [c for c in df.columns if c.startswith("symptom") or c.startswith("subtype")] + ["hormone_treatment", "endometriosis"]

# Compute probabilities for each subgroup
probabilities_df = compute_probabilities_for_targets(df, target_columns)

probabilities_df.sort_values(by="target_column", ascending=False).dropna()


Unnamed: 0,target_column,include_maybe,n_positives,n_negatives,probability,power
3,symptom_unknown,True,147,348,0.05,0.999074
20,symptom_unknown,False,147,348,0.05,0.999074
1,symptom_pain,True,243,252,0.05,0.999835
18,symptom_pain,False,243,252,0.05,0.999835
2,symptom_infertility,True,85,410,0.05,0.987033
19,symptom_infertility,False,85,410,0.05,0.987033
0,symptom_asymptomatic,True,74,421,0.05,0.977193
17,symptom_asymptomatic,False,74,421,0.05,0.977193
31,subtype_unknown,False,102,393,0.05,0.994309
14,subtype_unknown,True,102,393,0.05,0.994309


In [7]:
def create_probability_graph(probabilities_df, group_columns):
    """
    Create a GraphViz graph to visualize subgroups and their probabilities.

    Parameters:
        probabilities_df (pd.DataFrame): DataFrame with subgroup probabilities.
        group_columns (list of str): Columns defining the subgroup hierarchy.
    
    Returns:
        Digraph: GraphViz Digraph object.
    """
    # Initialize the graph
    graph = Digraph(format="png", engine="dot")
    graph.attr(rankdir="LR", size="12,8")

    # Iterate over each row in the DataFrame
    for _, row in probabilities_df.iterrows():
        # Construct the subgroup label from the group_columns
        subgroup_label = ", ".join(f"{col}={row[col]}" for col in group_columns)
        
        # Node label includes subgroup label and probability
        node_label = f"{subgroup_label}\nProbability: {row['probability']:.4f}" if row['probability'] is not None else f"{subgroup_label}\nNo valid test"
        
        # Determine node style based on 'include_maybe'
        node_color = "gray" if row['include_maybe'] else "black"
        
        # Add the node to the graph
        graph.node(subgroup_label, label=node_label, color=node_color, fontcolor=node_color)
        
        # Connect nodes logically (from without "maybe" to with "maybe")
        if row['include_maybe']:
            without_maybe_label = ", ".join(f"{col}={row[col]}" for col in group_columns)
            graph.edge(without_maybe_label, subgroup_label, label="Enable Maybe", color="gray")

    return graph

In [8]:
# Create the GraphViz graph
probability_graph = create_probability_graph(probabilities_df, group_columns)

# Display the graph in the juptyer notebook
probability_graph.render('probability_graph', format='png', cleanup=True)
display(Image('probability_graph.png'))

NameError: name 'group_columns' is not defined