In [219]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from scipy.stats import chi2_contingency, kruskal
import os
import networkx as nx
import warnings
warnings.filterwarnings("ignore")

## Statistical tests

In [220]:
import scipy.stats as stats

def compare_communities(
    df, community_col, feature, comm1, comm2, feature_type="auto", positive_value=None
):
    """
    Compare a feature between two communities.
    - df: DataFrame
    - community_col: column with community assignment
    - feature: feature to compare
    - comm1, comm2: community IDs to compare
    - feature_type: 'auto', 'numerical', or 'categorical'
    - positive_value: for binary/categorical, value to count as 'positive' (e.g., 'n-CLL')
    """
    # Subset data
    # group1 = df[df[community_col] == comm1][feature].dropna()
    # group2 = df[df[community_col] == comm2][feature].dropna()
    group1 = df[df[community_col] == comm1][feature]
    group2 = df[df[community_col] == comm2][feature]


    # Auto-detect feature type if needed
    if feature_type == "auto":
        if pd.api.types.is_numeric_dtype(group1) and pd.api.types.is_numeric_dtype(group2):
            feature_type = "numerical"
        else:
            feature_type = "categorical"

    print(f"\nComparing '{feature}' between communities {comm1} and {comm2} ({feature_type})")

    if feature_type == "numerical":
        # Mann-Whitney U test (non-parametric, robust for medians)
        group1 = group1.dropna()
        group2 = group2.dropna()
        stat, p = stats.mannwhitneyu(group1, group2, alternative='two-sided')
        med1, med2 = group1.median(), group2.median()
        print(f"Median (Comm {comm1}): {med1:.2f}, Median (Comm {comm2}): {med2:.2f}")
        print(f"Mann-Whitney U statistic: {stat:.3f}, p-value: {p:.3f}")
    else:
        # Categorical: compare proportions of positive_value
        if positive_value is None:
            # Use the most frequent value in both groups
            positive_value = pd.concat([group1, group2]).value_counts().idxmax()
        count1 = (group1 == positive_value).sum()
        count2 = (group2 == positive_value).sum()
        n1, n2 = len(group1), len(group2)
        pct1 = 100 * count1 / n1 if n1 else 0
        pct2 = 100 * count2 / n2 if n2 else 0
        print(f"% {positive_value} (Comm {comm1}): {pct1:.2f}% ({count1}/{n1}), (Comm {comm2}): {pct2:.2f}% ({count2}/{n2})")
        table = np.array([[count1, n1 - count1], [count2, n2 - count2]])
        # Use Fisher's exact if any cell < 5, else Chi-squared
        if (table < 5).any():
            stat, p = stats.fisher_exact(table)
            print(f"Fisher's exact test odds ratio: {stat:.3f}, p-value: {p:.3f}")
        else:
            stat, p, _, _ = stats.chi2_contingency(table)
            print(f"Chi-squared statistic: {stat:.3f}, p-value: {p:.3f}")

    if p < 0.05:
        print("Result: Significant difference (p < 0.05)")
    else:
        print("Result: No significant difference (p >= 0.05)")

## Load SLPA data

In [221]:
base_path = "../data/full"
algo = "w_slpa"
patient_df = pd.read_csv("../../data/thesis/cll_broad_2022_clinical_data_thesis.csv")
mutation_df = pd.read_csv("../../data/thesis/cll_broad_2022_mutations_thesis.csv")
community_df = pd.read_csv(os.path.join(base_path, algo, "community_assignments.csv"))

In [222]:
merged_df = pd.merge(patient_df, community_df, on='patientId', how='inner')
merged_df.shape

(1062, 30)

In [223]:
# Convert FFS_STATUS and OS_STATUS to binary event columns
merged_df['FFS_STATUS_EVENT'] = merged_df['FFS_STATUS'].str.startswith('1:').astype(int)
merged_df['OS_STATUS_EVENT'] = merged_df['OS_STATUS'].str.startswith('1:').astype(int)

In [224]:
compare_communities(merged_df, 'communityId', 'MUTATION_COUNT', 0, 1)
compare_communities(merged_df, 'communityId', 'AGE_SAMPLING', 0, 1)
compare_communities(merged_df, 'communityId', 'CLL_EPITYPE', 0, 1, feature_type='categorical', positive_value='n-CLL')
compare_communities(merged_df, 'communityId', 'TUMOR_MOLECULAR_SUBTYPE', 0, 1, feature_type='categorical', positive_value='U-CLL')


Comparing 'MUTATION_COUNT' between communities 0 and 1 (numerical)
Median (Comm 0): 29.00, Median (Comm 1): 26.00
Mann-Whitney U statistic: 8787.000, p-value: 0.248
Result: No significant difference (p >= 0.05)

Comparing 'AGE_SAMPLING' between communities 0 and 1 (numerical)
Median (Comm 0): 66.00, Median (Comm 1): 60.50
Mann-Whitney U statistic: 12338.500, p-value: 0.000
Result: Significant difference (p < 0.05)

Comparing 'CLL_EPITYPE' between communities 0 and 1 (categorical)
% n-CLL (Comm 0): 58.04% (166/286), (Comm 1): 28.79% (19/66)
Chi-squared statistic: 17.251, p-value: 0.000
Result: Significant difference (p < 0.05)

Comparing 'TUMOR_MOLECULAR_SUBTYPE' between communities 0 and 1 (categorical)
% U-CLL (Comm 0): 45.80% (131/286), (Comm 1): 27.27% (18/66)
Chi-squared statistic: 6.804, p-value: 0.009
Result: Significant difference (p < 0.05)


In [225]:
compare_communities(merged_df, 'communityId', 'AGE_SAMPLING', 1, 2)
compare_communities(merged_df, 'communityId', 'MUTATION_COUNT', 1, 2)
compare_communities(merged_df, 'communityId', 'TREATMENT_AFTER_SAMPLING', 1, 2, feature_type='categorical', positive_value='Chemo + Ab')


Comparing 'AGE_SAMPLING' between communities 1 and 2 (numerical)
Median (Comm 1): 60.50, Median (Comm 2): 62.00
Mann-Whitney U statistic: 10205.500, p-value: 0.399
Result: No significant difference (p >= 0.05)

Comparing 'MUTATION_COUNT' between communities 1 and 2 (numerical)
Median (Comm 1): 26.00, Median (Comm 2): 25.00
Mann-Whitney U statistic: 10296.000, p-value: 0.551
Result: No significant difference (p >= 0.05)

Comparing 'TREATMENT_AFTER_SAMPLING' between communities 1 and 2 (categorical)
% Chemo + Ab (Comm 1): 69.70% (46/66), (Comm 2): 32.02% (106/331)
Chi-squared statistic: 31.478, p-value: 0.000
Result: Significant difference (p < 0.05)


In [226]:
compare_communities(merged_df, 'communityId', 'AGE_SAMPLING', 2, 3)
compare_communities(merged_df, 'communityId', 'MUTATION_COUNT', 2, 3)
compare_communities(merged_df, 'communityId', 'CLL_EPITYPE', 2, 3, feature_type='categorical', positive_value='m-CLL')
compare_communities(merged_df, 'communityId', 'IGHV_MUTATION_STATUS', 2, 3, feature_type='categorical', positive_value='mutated')


Comparing 'AGE_SAMPLING' between communities 2 and 3 (numerical)
Median (Comm 2): 62.00, Median (Comm 3): 64.00
Mann-Whitney U statistic: 57882.500, p-value: 0.076
Result: No significant difference (p >= 0.05)

Comparing 'MUTATION_COUNT' between communities 2 and 3 (numerical)
Median (Comm 2): 25.00, Median (Comm 3): 23.00
Mann-Whitney U statistic: 60326.500, p-value: 0.006
Result: Significant difference (p < 0.05)

Comparing 'CLL_EPITYPE' between communities 2 and 3 (categorical)
% m-CLL (Comm 2): 23.26% (77/331), (Comm 3): 52.51% (199/379)
Chi-squared statistic: 62.366, p-value: 0.000
Result: Significant difference (p < 0.05)

Comparing 'IGHV_MUTATION_STATUS' between communities 2 and 3 (categorical)
% mutated (Comm 2): 40.48% (134/331), (Comm 3): 75.20% (285/379)
Chi-squared statistic: 86.603, p-value: 0.000
Result: Significant difference (p < 0.05)


## Overlapping analysis

In [227]:
import scipy.stats as stats

def compare_overlap_groups(
    merged, community_df, overlap_pairs, comm_id_1, comm_id_2, feature, feature_type="auto", positive_value=None
):
    """
    Compare a feature between:
      - patients in both comm_id_1 and comm_id_2 (overlap group)
      - patients only in comm_id_1
      - patients only in comm_id_2

    Prints stats for:
      - overlap vs only comm_id_1
      - overlap vs only comm_id_2

    Parameters:
      - merged: DataFrame with patient info
      - community_df: DataFrame with community assignments and is_overlap
      - overlap_pairs: DataFrame with patientId and comm_pair
      - comm_id_1, comm_id_2: community IDs
      - feature: feature to compare
      - feature_type: 'auto', 'numerical', or 'categorical'
      - positive_value: for categorical, value to count as 'positive'
    """
    comm_pair_str = f'[{comm_id_1}, {comm_id_2}]'
    # Find patient ids for each group
    overlap_ids = set(overlap_pairs[overlap_pairs['comm_pair'] == comm_pair_str]['patientId'])
    only_1_ids = set(community_df[(community_df['communityId'] == comm_id_1) & (~community_df['is_overlap'])]['patientId'])
    only_2_ids = set(community_df[(community_df['communityId'] == comm_id_2) & (~community_df['is_overlap'])]['patientId'])

    # Subset merged for each group
    df_overlap = merged[merged['patientId'].isin(overlap_ids)]
    df_only1 = merged[merged['patientId'].isin(only_1_ids)]
    df_only2 = merged[merged['patientId'].isin(only_2_ids)]

    def do_compare(groupA, groupB, labelA, labelB):
        gA = groupA[feature].dropna()
        gB = groupB[feature].dropna()
        # Auto-detect feature type
        ftype = feature_type
        if ftype == "auto":
            if pd.api.types.is_numeric_dtype(gA) and pd.api.types.is_numeric_dtype(gB):
                ftype = "numerical"
            else:
                ftype = "categorical"
        print(f"\nComparing '{feature}' between {labelA} and {labelB} ({ftype})")
        if ftype == "numerical":
            stat, p = stats.mannwhitneyu(gA, gB, alternative='two-sided')
            medA, medB = gA.median(), gB.median()
            print(f"Median ({labelA}): {medA:.2f}, Median ({labelB}): {medB:.2f}")
            print(f"Mann-Whitney U statistic: {stat:.3f}, p-value: {p:.4f}")
        else:
            pos_val = positive_value
            if pos_val is None:
                pos_val = pd.concat([gA, gB]).value_counts().idxmax()
            countA = (gA == pos_val).sum()
            countB = (gB == pos_val).sum()
            nA, nB = len(gA), len(gB)
            pctA = 100 * countA / nA if nA else 0
            pctB = 100 * countB / nB if nB else 0
            print(f"% {pos_val} ({labelA}): {pctA:.1f}% ({countA}/{nA}), ({labelB}): {pctB:.1f}% ({countB}/{nB})")
            table = np.array([[countA, nA - countA], [countB, nB - countB]])
            if (table < 5).any():
                stat, p = stats.fisher_exact(table)
                print(f"Fisher's exact test odds ratio: {stat:.3f}, p-value: {p:.4f}")
            else:
                stat, p, _, _ = stats.chi2_contingency(table)
                print(f"Chi-squared statistic: {stat:.3f}, p-value: {p:.4f}")
        if p < 0.05:
            print("Result: Significant difference (p < 0.05)")
        else:
            print("Result: No significant difference (p >= 0.05)")

    do_compare(df_overlap, df_only1, f"Overlap [{comm_id_1},{comm_id_2}]", f"Only [{comm_id_1}]")
    do_compare(df_overlap, df_only2, f"Overlap [{comm_id_1},{comm_id_2}]", f"Only [{comm_id_2}]")

In [228]:
# generate overlap pairs DataFrame
comm_count = community_df.groupby('patientId').size().reset_index(name='comm_count')
overlapping = comm_count[comm_count['comm_count'] > 1]['patientId']
community_df['is_overlap'] = community_df['patientId'].isin(overlapping)

overlap_pairs = community_df[community_df['is_overlap']].groupby('patientId')['communityId'].apply(sorted).reset_index()
overlap_pairs['comm_pair'] = overlap_pairs['communityId'].apply(lambda x: str(x))

In [229]:
# Example usage:
compare_overlap_groups(merged_df, community_df, overlap_pairs, 1, 2, 'AGE_SAMPLING')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 1, 2, 'MUTATION_COUNT')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 1, 2, 'CLL_EPITYPE', feature_type='categorical', positive_value='n-CLL')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 1, 2, 'IGHV_MUTATION_STATUS', feature_type='categorical', positive_value='mutated')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 1, 2, 'TUMOR_MOLECULAR_SUBTYPE', feature_type='categorical', positive_value='U-CLL')


Comparing 'AGE_SAMPLING' between Overlap [1,2] and Only [1] (numerical)
Median (Overlap [1,2]): 62.00, Median (Only [1]): 55.00
Mann-Whitney U statistic: 1001.000, p-value: 0.1864
Result: No significant difference (p >= 0.05)

Comparing 'AGE_SAMPLING' between Overlap [1,2] and Only [2] (numerical)
Median (Overlap [1,2]): 62.00, Median (Only [2]): 62.00
Mann-Whitney U statistic: 13176.000, p-value: 0.6671
Result: No significant difference (p >= 0.05)

Comparing 'MUTATION_COUNT' between Overlap [1,2] and Only [1] (numerical)
Median (Overlap [1,2]): 25.00, Median (Only [1]): 28.00
Mann-Whitney U statistic: 653.000, p-value: 0.2829
Result: No significant difference (p >= 0.05)

Comparing 'MUTATION_COUNT' between Overlap [1,2] and Only [2] (numerical)
Median (Overlap [1,2]): 25.00, Median (Only [2]): 25.00
Mann-Whitney U statistic: 12069.000, p-value: 0.9844
Result: No significant difference (p >= 0.05)

Comparing 'CLL_EPITYPE' between Overlap [1,2] and Only [1] (categorical)
% n-CLL (Over

In [230]:
compare_overlap_groups(merged_df, community_df, overlap_pairs, 2, 3, 'AGE_SAMPLING')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 2, 3, 'MUTATION_COUNT')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 2, 3, 'CLL_EPITYPE', feature_type='categorical', positive_value='n-CLL')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 2, 3, 'IGHV_MUTATION_STATUS', feature_type='categorical', positive_value='mutated')
compare_overlap_groups(merged_df, community_df, overlap_pairs, 2, 3, 'TUMOR_MOLECULAR_SUBTYPE', feature_type='categorical', positive_value='U-CLL')


Comparing 'AGE_SAMPLING' between Overlap [2,3] and Only [2] (numerical)
Median (Overlap [2,3]): 57.00, Median (Only [2]): 62.00
Mann-Whitney U statistic: 665.000, p-value: 0.0052
Result: Significant difference (p < 0.05)

Comparing 'AGE_SAMPLING' between Overlap [2,3] and Only [3] (numerical)
Median (Overlap [2,3]): 57.00, Median (Only [3]): 64.00
Mann-Whitney U statistic: 840.000, p-value: 0.0029
Result: Significant difference (p < 0.05)

Comparing 'MUTATION_COUNT' between Overlap [2,3] and Only [2] (numerical)
Median (Overlap [2,3]): 33.00, Median (Only [2]): 25.00
Mann-Whitney U statistic: 1541.000, p-value: 0.0235
Result: Significant difference (p < 0.05)

Comparing 'MUTATION_COUNT' between Overlap [2,3] and Only [3] (numerical)
Median (Overlap [2,3]): 33.00, Median (Only [3]): 23.00
Mann-Whitney U statistic: 2128.000, p-value: 0.0063
Result: Significant difference (p < 0.05)

Comparing 'CLL_EPITYPE' between Overlap [2,3] and Only [2] (categorical)
% n-CLL (Overlap [2,3]): 0.0% (0

## Load Leiden data

In [231]:
base_path = "../data/full"
algo = "reverse_hybrid"
patient_df = pd.read_csv("../../data/thesis/cll_broad_2022_clinical_data_thesis.csv")
mutation_df = pd.read_csv("../../data/thesis/cll_broad_2022_mutations_thesis.csv")
community_df = pd.read_csv(os.path.join(base_path, algo, "community_assignments.csv"))

In [232]:
merged_df = pd.merge(patient_df, community_df, on='patientId', how='inner')
merged_df.shape

(1062, 30)

In [233]:
# Convert FFS_STATUS and OS_STATUS to binary event columns
merged_df['FFS_STATUS_EVENT'] = merged_df['FFS_STATUS'].str.startswith('1:').astype(int)
merged_df['OS_STATUS_EVENT'] = merged_df['OS_STATUS'].str.startswith('1:').astype(int)

In [234]:
compare_communities(merged_df, 'communityId', 'AGE_SAMPLING', 3, 4)
compare_communities(merged_df, 'communityId', 'IGHV_MUTATION_STATUS', 3, 4, feature_type='categorical', positive_value='unmutated')
compare_communities(merged_df, 'communityId', 'CLL_EPITYPE', 3, 4, feature_type='categorical', positive_value='i-CLL')
compare_communities(merged_df, 'communityId', 'TUMOR_MOLECULAR_SUBTYPE', 3, 4, feature_type='categorical', positive_value='U-CLL')
compare_communities(merged_df, 'communityId', 'TREATMENT_AFTER_SAMPLING', 3, 4, feature_type='categorical', positive_value='Chemo + Ab')


Comparing 'AGE_SAMPLING' between communities 3 and 4 (numerical)
Median (Comm 3): 57.00, Median (Comm 4): 62.00
Mann-Whitney U statistic: 400.000, p-value: 0.079
Result: No significant difference (p >= 0.05)

Comparing 'IGHV_MUTATION_STATUS' between communities 3 and 4 (categorical)
% unmutated (Comm 3): 41.38% (12/29), (Comm 4): 56.76% (21/37)
Chi-squared statistic: 0.984, p-value: 0.321
Result: No significant difference (p >= 0.05)

Comparing 'CLL_EPITYPE' between communities 3 and 4 (categorical)
% i-CLL (Comm 3): 10.34% (3/29), (Comm 4): 10.81% (4/37)
Fisher's exact test odds ratio: 0.952, p-value: 1.000
Result: No significant difference (p >= 0.05)

Comparing 'TUMOR_MOLECULAR_SUBTYPE' between communities 3 and 4 (categorical)
% U-CLL (Comm 3): 24.14% (7/29), (Comm 4): 29.73% (11/37)
Chi-squared statistic: 0.052, p-value: 0.820
Result: No significant difference (p >= 0.05)

Comparing 'TREATMENT_AFTER_SAMPLING' between communities 3 and 4 (categorical)
% Chemo + Ab (Comm 3): 75.86%

In [235]:
compare_communities(merged_df, 'communityId', 'AGE_SAMPLING', 5, 6)
compare_communities(merged_df, 'communityId', 'MUTATION_COUNT', 5, 6)
compare_communities(merged_df, 'communityId', 'CLL_EPITYPE', 5, 6, feature_type='categorical', positive_value='n-CLL')
compare_communities(merged_df, 'communityId', 'TUMOR_MOLECULAR_SUBTYPE', 5, 6, feature_type='categorical', positive_value='U-CLL')
compare_communities(merged_df, 'communityId', 'IGHV_MUTATION_STATUS', 5, 6, feature_type='categorical', positive_value='mutated')
compare_communities(merged_df, 'communityId', 'TREATMENT_AFTER_SAMPLING', 5, 6, feature_type='categorical', positive_value='Chemo + Ab')


Comparing 'AGE_SAMPLING' between communities 5 and 6 (numerical)
Median (Comm 5): 62.00, Median (Comm 6): 62.00
Mann-Whitney U statistic: 6153.500, p-value: 0.884
Result: No significant difference (p >= 0.05)

Comparing 'MUTATION_COUNT' between communities 5 and 6 (numerical)
Median (Comm 5): 25.50, Median (Comm 6): 24.00
Mann-Whitney U statistic: 5924.000, p-value: 0.160
Result: No significant difference (p >= 0.05)

Comparing 'CLL_EPITYPE' between communities 5 and 6 (categorical)
% n-CLL (Comm 5): 55.56% (65/117), (Comm 6): 30.77% (32/104)
Chi-squared statistic: 12.747, p-value: 0.000
Result: Significant difference (p < 0.05)

Comparing 'TUMOR_MOLECULAR_SUBTYPE' between communities 5 and 6 (categorical)
% U-CLL (Comm 5): 40.17% (47/117), (Comm 6): 35.58% (37/104)
Chi-squared statistic: 0.317, p-value: 0.573
Result: No significant difference (p >= 0.05)

Comparing 'IGHV_MUTATION_STATUS' between communities 5 and 6 (categorical)
% mutated (Comm 5): 39.32% (46/117), (Comm 6): 50.00% (