In [1]:
import pandas as pd
import numpy as np

# Define common categories for all centers
tumor_sizes = ['T1 (<2cm)', 'T2 (2-5cm)', 'T3 (>5cm)', 'T4 (>7cm)']
lymph_nodes = ['N0', 'N1', 'N2', 'N3']
metastasis = ['M0', 'M1']
responses = ['Complete', 'Partial', 'Stable', 'Progressive']

# Number of synthetic patients drawn per center.
N_PER_CENTER = 100

# Seed the global (legacy) NumPy RNG *before* any data is drawn so the synthetic
# center data is identical on every "Restart & Run All".
# The value is deliberately different from the seed used later for the cohort
# split: both steps draw from the same global stream, so re-using one seed for
# both could correlate a row's sampled category with its cohort assignment.
DATA_SEED = 2024
np.random.seed(DATA_SEED)

# Dataset 1: Breast Cancer Center - Higher proportion of early stage disease
breast_center = pd.DataFrame({
    'Tumor_Size': np.random.choice(tumor_sizes, N_PER_CENTER, p=[0.4, 0.3, 0.2, 0.1]),
    'Lymph_Node_Status': np.random.choice(lymph_nodes, N_PER_CENTER, p=[0.5, 0.3, 0.1, 0.1]),
    'Metastasis': np.random.choice(metastasis, N_PER_CENTER, p=[0.8, 0.2]),
    'Treatment_Response': np.random.choice(responses, N_PER_CENTER, p=[0.4, 0.3, 0.2, 0.1])
})

# Dataset 2: Lung Cancer Center - Higher proportion of advanced disease
lung_center = pd.DataFrame({
    'Tumor_Size': np.random.choice(tumor_sizes, N_PER_CENTER, p=[0.1, 0.2, 0.3, 0.4]),
    'Lymph_Node_Status': np.random.choice(lymph_nodes, N_PER_CENTER, p=[0.2, 0.3, 0.3, 0.2]),
    'Metastasis': np.random.choice(metastasis, N_PER_CENTER, p=[0.6, 0.4]),
    'Treatment_Response': np.random.choice(responses, N_PER_CENTER, p=[0.2, 0.3, 0.3, 0.2])
})

# Dataset 3: Sarcoma Center - More evenly distributed stages
sarcoma_center = pd.DataFrame({
    'Tumor_Size': np.random.choice(tumor_sizes, N_PER_CENTER, p=[0.25, 0.25, 0.25, 0.25]),
    'Lymph_Node_Status': np.random.choice(lymph_nodes, N_PER_CENTER, p=[0.25, 0.25, 0.25, 0.25]),
    'Metastasis': np.random.choice(metastasis, N_PER_CENTER, p=[0.7, 0.3]),
    'Treatment_Response': np.random.choice(responses, N_PER_CENTER, p=[0.25, 0.25, 0.25, 0.25])
})

# Convert all columns to categorical (in place, column by column).
# Categories are inferred from the values observed in each center's frame.
for df in [breast_center, lung_center, sarcoma_center]:
    for col in df.columns:
        df[col] = df[col].astype('category')

In [5]:
# Create cohort 1 and 2 by randomly splitting each center's data.
# Seeding the global NumPy RNG here makes the np.random.rand draws used by
# split_into_cohorts repeatable; it only affects draws made after this line.
np.random.seed(42) # For reproducibility

# Function to split a dataframe into two cohorts
def split_into_cohorts(df, cohort_1_fraction=0.6):
    """
    Randomly partition the rows of ``df`` into two disjoint cohorts.

    One uniform [0, 1) number is drawn per row from the global NumPy RNG; rows
    whose draw is below ``cohort_1_fraction`` go to the first cohort, all other
    rows go to the second. The resulting sizes are therefore random, not an
    exact fraction of ``len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    cohort_1_fraction : float, default 0.6
        Per-row probability of landing in the first cohort.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Independent copies of the first and second cohort, keeping the
        original index labels.
    """
    draws = np.random.rand(len(df))
    goes_to_first = draws < cohort_1_fraction
    first = df.loc[goes_to_first].copy()
    second = df.loc[~goes_to_first].copy()
    return first, second

# Split every center in a fixed order (breast, lung, sarcoma); the order matters
# because all three splits consume the same seeded random stream.
center_splits = [
    split_into_cohorts(center)
    for center in (breast_center, lung_center, sarcoma_center)
]
(
    (breast_cohort_1, breast_cohort_2),
    (lung_cohort_1, lung_cohort_2),
    (sarcoma_cohort_1, sarcoma_cohort_2),
) = center_splits

# Stack the per-center pieces row-wise into the pooled cohort 1 and cohort 2
cohort_1 = pd.concat([first for first, _ in center_splits], axis=0)
cohort_2 = pd.concat([second for _, second in center_splits], axis=0)

print("Cohort 1 size:", len(cohort_1))
print("Cohort 2 size:", len(cohort_2))


Cohort 1 size: 177
Cohort 2 size: 123


In [7]:
# Per-center lists of cohort frames, ordered to line up with `cohort_names`:
# center 1 = breast, center 2 = lung, center 3 = sarcoma.
center_1_dfs = [breast_cohort_1, breast_cohort_2]
center_2_dfs = [lung_cohort_1, lung_cohort_2]
center_3_dfs = [sarcoma_cohort_1, sarcoma_cohort_2]
# Labels used as dictionary keys / 'Cohort' column values downstream.
cohort_names = ['Cohort 1', 'Cohort 2']


In [8]:
# Column names of the breast-center frame, as a plain Python list.
variables = list(breast_center.columns)

In [18]:
# Print value counts for all variables in each cohort of center 1.
# pd.Series.value_counts is applied column by column; the top-level
# pd.value_counts helper is deprecated in recent pandas releases.
for cohort in center_1_dfs:
    print(cohort.apply(pd.Series.value_counts).sort_index())
    print("\n")



             Tumor_Size  Lymph_Node_Status  Metastasis  Treatment_Response
Complete            NaN                NaN         NaN                21.0
M0                  NaN                NaN        46.0                 NaN
M1                  NaN                NaN        17.0                 NaN
N0                  NaN               29.0         NaN                 NaN
N1                  NaN               26.0         NaN                 NaN
N2                  NaN                4.0         NaN                 NaN
N3                  NaN                4.0         NaN                 NaN
Partial             NaN                NaN         NaN                20.0
Progressive         NaN                NaN         NaN                 5.0
Stable              NaN                NaN         NaN                17.0
T1 (<2cm)          27.0                NaN         NaN                 NaN
T2 (2-5cm)         20.0                NaN         NaN                 NaN
T3 (>5cm)          13.0  

In [None]:
from typing import List, Dict
import pandas as pd

def categorical_counts_partial(dataframes: List[pd.DataFrame], cohort_names: List[str]) -> Dict[str, List[Dict[str, Dict[str, int]]]]:
    """
    Get value counts for each variable in each cohort.

    Parameters
    ----------
    dataframes : list of pandas.DataFrame
        List of DataFrames representing different cohorts
    cohort_names : list of str
        List of names for each cohort, in the same order as ``dataframes``

    Returns
    -------
    dict
        Nested dictionary structure containing value counts for each variable in each cohort.
        The structure is:
        ```
        {
            'Cohort 1': [
                {'Column1': {'value1': count1, 'value2': count2, ...}},
                {'Column2': {'value1': count1, 'value2': count2, ...}},
                ...
            ],
            'Cohort 2': [
                ...
            ]
        }
        ```

    Raises
    ------
    ValueError
        If the number of DataFrames differs from the number of cohort names.
        Without this check ``zip`` would silently drop the unmatched entries.

    Notes
    -----
    Every column of every DataFrame is counted with ``Series.value_counts``;
    each cohort maps to a list with one single-key dictionary per column, in
    column order.
    """
    # Materialise both inputs so the length check also works for non-list iterables
    dataframes = list(dataframes)
    cohort_names = list(cohort_names)
    if len(dataframes) != len(cohort_names):
        raise ValueError("Number of dataframes must match number of cohort names")

    return {
        name: [{column: frame[column].value_counts().to_dict()} for column in frame.columns]
        for frame, name in zip(dataframes, cohort_names)
    }

# Count every variable level per cohort, separately for center 1 and center 2.
counts_center1 = categorical_counts_partial(center_1_dfs, cohort_names)
counts_center2 = categorical_counts_partial(center_2_dfs, cohort_names)

def combine_center_counts(*center_counts: dict) -> pd.DataFrame:
    """
    Combine counts from N centers into a single DataFrame.

    Parameters
    ----------
    *center_counts : dict
        One dictionary per center, each shaped like
        ``{cohort: [{variable: {level: count, ...}}, ...]}``.

    Returns
    -------
    pandas.DataFrame
        One row per (cohort, variable, level) with columns ``Cohort``,
        ``Variable``, ``Level`` and one ``Center <i>`` count column per input
        (1-based, in argument order), sorted by the three key columns.
        An empty frame with those columns is returned when there is nothing
        to combine.

    Notes
    -----
    Cohorts and variables are taken from the first center. For each variable
    the levels are the union over *all* centers (first-seen order), so a level
    observed only in a later center still gets a row. A cohort, variable or
    level that a center did not report contributes a count of 0 for that
    center. Variables are matched by name, not by list position.
    """
    key_cols = ['Cohort', 'Variable', 'Level']
    center_cols = [f'Center {i + 1}' for i in range(len(center_counts))]

    # Re-index each center's payload as cohort -> variable -> {level: count}
    # so variables can be matched by name instead of by list position.
    lookups = [
        {
            cohort: {
                var: counts
                for var_dict in var_dicts
                for var, counts in var_dict.items()
            }
            for cohort, var_dicts in center.items()
        }
        for center in center_counts
    ]

    rows = []
    if lookups:
        for cohort, reference_vars in lookups[0].items():
            for var in reference_vars:
                per_center = [lookup.get(cohort, {}).get(var, {}) for lookup in lookups]
                # Ordered union of levels: the first center's levels first,
                # then any level seen only in later centers.
                levels = dict.fromkeys(
                    level for counts in per_center for level in counts
                )
                for level in levels:
                    row = {'Cohort': cohort, 'Variable': var, 'Level': level}
                    for col, counts in zip(center_cols, per_center):
                        row[col] = counts.get(level, 0)
                    rows.append(row)

    # Passing `columns` keeps the frame well-formed (and sortable) even with no rows.
    return pd.DataFrame(rows, columns=key_cols + center_cols).sort_values(key_cols)

# Merge the two centers' counts into one table (one count column per center) and show it.
combined_counts = combine_center_counts(counts_center1, counts_center2)
display(combined_counts)

from scipy import stats

def get_center_columns(df):
    """
    List the per-center count columns of ``df``.

    A column qualifies when its name starts with the prefix ``'Center'``.
    The original column order is preserved.
    """
    center_columns = []
    for name in df.columns:
        if name.startswith('Center'):
            center_columns.append(name)
    return center_columns

def compute_chi_squared_test(data, center_cols):
    """
    Run a chi-squared test of independence on the per-center counts.

    The contingency table is ``data[center_cols]``: one row per row of
    ``data`` and one column per entry of ``center_cols``.

    Returns
    -------
    tuple
        The first two values reported by ``scipy.stats.chi2_contingency``:
        the test statistic and the p-value.
    """
    observed = data[center_cols].values
    outcome = stats.chi2_contingency(observed)
    statistic, p_value = outcome[0], outcome[1]
    return (statistic, p_value)

def create_chi_squared_results(combined_counts):
    """
    Build a table of chi-squared results, one row per (cohort, variable).

    For every cohort found in the ``Cohort`` column and every variable found
    within that cohort, the rows of that variable (one per level) are passed,
    together with the center columns, to ``compute_chi_squared_test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Cohort``, ``Variable``, ``Chi-squared`` and ``P-value``,
        in order of first appearance of each cohort and variable.
    """
    center_cols = get_center_columns(combined_counts)
    records = []

    for cohort_label in combined_counts['Cohort'].unique():
        in_cohort = combined_counts.loc[combined_counts['Cohort'] == cohort_label]
        for variable in in_cohort['Variable'].unique():
            level_rows = in_cohort.loc[in_cohort['Variable'] == variable]
            statistic, p_value = compute_chi_squared_test(level_rows, center_cols)
            records.append((cohort_label, variable, statistic, p_value))

    return pd.DataFrame(records, columns=['Cohort', 'Variable', 'Chi-squared', 'P-value'])

# Generate and display results: one chi-squared test per (cohort, variable),
# comparing the level distribution across the center columns.
chi_squared_df = create_chi_squared_results(combined_counts)
display(chi_squared_df.sort_values(['Cohort', 'Variable']))








Unnamed: 0,Cohort,Variable,Level,Center 1,Center 2
4,Cohort 1,Lymph_Node_Status,N0,29,13
5,Cohort 1,Lymph_Node_Status,N1,26,17
6,Cohort 1,Lymph_Node_Status,N2,4,20
7,Cohort 1,Lymph_Node_Status,N3,4,9
8,Cohort 1,Metastasis,M0,46,36
9,Cohort 1,Metastasis,M1,17,23
10,Cohort 1,Treatment_Response,Complete,21,11
11,Cohort 1,Treatment_Response,Partial,20,19
13,Cohort 1,Treatment_Response,Progressive,5,13
12,Cohort 1,Treatment_Response,Stable,17,16


Unnamed: 0,Cohort,Variable,Chi-squared,P-value
0,Cohort 1,Lymph_Node_Status,20.459549,0.0001363033
1,Cohort 1,Metastasis,1.483252,0.2232664
2,Cohort 1,Treatment_Response,6.61246,0.08533131
3,Cohort 1,Tumor_Size,38.233482,2.522379e-08
4,Cohort 2,Lymph_Node_Status,12.999029,0.004638705
5,Cohort 2,Metastasis,1.583497,0.2082577
6,Cohort 2,Treatment_Response,5.893079,0.1169296
7,Cohort 2,Tumor_Size,27.517615,4.58534e-06


In [37]:
def compute_local_counts(dfs: list[pd.DataFrame], cohort_names: List[str]) -> Dict[str, List[Dict[str, Dict[str, int]]]]:
    """
    Count the levels of every categorical column, cohort by cohort.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One dataframe per cohort.
    cohort_names : List[str]
        Cohort labels, in the same order as ``dfs``.

    Returns
    -------
    dict
        Counts per cohort, variable and category level:
        {
            'cohort_name': [
                {'variable_name': {'level': count, ...}},
                ...
            ]
        }
        Only columns with ``category`` dtype are included.

    Raises
    ------
    ValueError
        If ``dfs`` and ``cohort_names`` have different lengths.
    """
    if len(dfs) != len(cohort_names):
        raise ValueError("Number of dataframes must match number of cohort names")

    def _level_counts(frame):
        # One single-key dict per categorical column, in column order.
        categorical_cols = frame.select_dtypes(include=['category']).columns
        return [
            {column: frame[column].value_counts().to_dict()}
            for column in categorical_cols
        ]

    return {
        label: _level_counts(frame)
        for frame, label in zip(dfs, cohort_names)
    }

def combine_center_results(
    center_results: List[Dict[str, List[Dict[str, Dict[str, int]]]]]
) -> tuple[str, str]:
    """
    Combine results from multiple centers and compute chi-squared tests.

    Parameters
    ----------
    center_results : list
        List of dictionaries containing counts from each center, each shaped
        like ``{cohort: [{variable: {level: count, ...}}, ...]}``.

    Returns
    -------
    tuple of (str, str)
        (combined_counts, chi_squared_results), both serialised with
        ``DataFrame.to_json()``:
        - combined_counts: table with columns ``Cohort``, ``Variable``,
          ``Level`` and one ``Center <i>`` count column per center
        - chi_squared_results: table with columns ``Cohort``, ``Variable``,
          ``Chi-squared`` and ``P-value``

    Notes
    -----
    Cohorts and variables are taken from the first center. For each variable
    the levels are the union over all centers (first-seen order), so a level
    observed only in a later center is kept in the contingency table. A
    cohort, variable or level that a center did not report counts as 0 for
    that center. Variables are matched by name, not by list position.
    """
    key_cols = ['Cohort', 'Variable', 'Level']
    center_cols = [f'Center {i + 1}' for i in range(len(center_results))]

    # Re-index each center's payload as cohort -> variable -> {level: count}
    lookups = [
        {
            cohort: {
                var: counts
                for var_dict in var_dicts
                for var, counts in var_dict.items()
            }
            for cohort, var_dicts in center.items()
        }
        for center in center_results
    ]

    # Combine counts
    rows = []
    if lookups:
        for cohort, reference_vars in lookups[0].items():
            for var in reference_vars:
                per_center = [lookup.get(cohort, {}).get(var, {}) for lookup in lookups]
                # Ordered union of levels across every center
                levels = dict.fromkeys(
                    level for counts in per_center for level in counts
                )
                for level in levels:
                    row = {'Cohort': cohort, 'Variable': var, 'Level': level}
                    for col, counts in zip(center_cols, per_center):
                        row[col] = counts.get(level, 0)
                    rows.append(row)

    # Passing `columns` keeps the frame well-formed (and sortable) even with no rows.
    combined_df = pd.DataFrame(rows, columns=key_cols + center_cols).sort_values(key_cols)

    # Compute chi-squared tests: one contingency table (levels x centers)
    # per (cohort, variable) pair.
    results = []
    for cohort in combined_df['Cohort'].unique():
        cohort_data = combined_df[combined_df['Cohort'] == cohort]
        for var in cohort_data['Variable'].unique():
            var_data = cohort_data[cohort_data['Variable'] == var]
            # Keep only the statistic and the p-value
            chi2, p_val = stats.chi2_contingency(var_data[center_cols].values)[:2]
            results.append([cohort, var, chi2, p_val])

    chi_squared_df = pd.DataFrame(
        results,
        columns=['Cohort', 'Variable', 'Chi-squared', 'P-value']
    ).sort_values(['Cohort', 'Variable'])

    return combined_df.to_json(), chi_squared_df.to_json()

In [39]:
# At each center: compute the per-cohort level counts locally
local_results_1 = compute_local_counts(center_1_dfs, cohort_names)
local_results_2 = compute_local_counts(center_2_dfs, cohort_names)

# At central analyzer: merge the per-center counts and run the chi-squared tests.
# Both returned values are JSON strings (combine_center_results calls .to_json()).
combined_counts, chi_squared_results = combine_center_results([local_results_1, local_results_2])

In [43]:
# combine_center_results already returns JSON strings, so show the payload as-is;
# calling .to_json() on a str would raise AttributeError.
combined_counts

'{"Cohort":{"4":"Cohort 1","5":"Cohort 1","6":"Cohort 1","7":"Cohort 1","8":"Cohort 1","9":"Cohort 1","10":"Cohort 1","11":"Cohort 1","13":"Cohort 1","12":"Cohort 1","0":"Cohort 1","1":"Cohort 1","2":"Cohort 1","3":"Cohort 1","18":"Cohort 2","19":"Cohort 2","21":"Cohort 2","20":"Cohort 2","22":"Cohort 2","23":"Cohort 2","25":"Cohort 2","24":"Cohort 2","27":"Cohort 2","26":"Cohort 2","15":"Cohort 2","14":"Cohort 2","16":"Cohort 2","17":"Cohort 2"},"Variable":{"4":"Lymph_Node_Status","5":"Lymph_Node_Status","6":"Lymph_Node_Status","7":"Lymph_Node_Status","8":"Metastasis","9":"Metastasis","10":"Treatment_Response","11":"Treatment_Response","13":"Treatment_Response","12":"Treatment_Response","0":"Tumor_Size","1":"Tumor_Size","2":"Tumor_Size","3":"Tumor_Size","18":"Lymph_Node_Status","19":"Lymph_Node_Status","21":"Lymph_Node_Status","20":"Lymph_Node_Status","22":"Metastasis","23":"Metastasis","25":"Treatment_Response","24":"Treatment_Response","27":"Treatment_Response","26":"Treatment_Respo