In [37]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

In [38]:
# First seed of the consecutive seed range used for the repeated train/test splits.
EXPERIMENTS_START_SEED = 57

# Load the Child-Pugh inputs together with the re-bleeding outcome column.
df = pd.read_csv("childpugh.csv")

# Quick sanity check of the schema and the number of rows.
print(df.dtypes)
print(f"Length of Dataframe: {df.shape[0]}")

Bilirubin (Total)             float64
Albumin                       float64
INR                           float64
Ascites                         int64
Hepatic encephalopathy          int64
Re-Bleeding within 14 days      int64
dtype: object
Length of Dataframe: 913


In [39]:
def _lab_points(name, value, lower, upper, low_is_severe=False):
    """Return the 1-3 Child-Pugh points for a single laboratory value.

    Values inside the inclusive band [lower, upper] score 2 points. Outside
    the band the mild side scores 1 point and the severe side scores 3.
    ``low_is_severe`` selects which side is severe (True for albumin, where
    lower values are worse).

    Raises:
        ValueError: if ``value`` is missing (None/NaN).
    """
    # NaN fails every comparison, so without this guard a missing value would
    # silently fall through to the last branch and be scored as most severe.
    if pd.isna(value):
        raise ValueError(f"Cannot compute Child-Pugh score: '{name}' is missing")
    if lower <= value <= upper:
        return 2
    if value < lower:
        return 3 if low_is_severe else 1
    return 1 if low_is_severe else 3


def _grade_points(name, grade):
    """Return the 1-3 Child-Pugh points for a clinical grade coded 0, 1 or 2.

    Raises:
        ValueError: if ``grade`` is missing or is not one of 0, 1, 2.
    """
    if pd.isna(grade) or grade not in (0, 1, 2):
        raise ValueError(
            f"Cannot compute Child-Pugh score: '{name}' must be coded 0, 1 or 2, got {grade!r}"
        )
    # df.apply(axis=1) upcasts mixed int/float rows to float, hence the int().
    return int(grade) + 1


def calculate_child_pugh_score(row):
    """
    Calculate Child-Pugh score based on the 5 parameters.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Bilirubin (Total)' (mg/dL), 'Albumin' (g/dL),
        'INR', 'Ascites' and 'Hepatic encephalopathy'.
        Ascites is assumed to be coded 0=None, 1=Mild, 2=Severe and hepatic
        encephalopathy 0=None, 1=Grade 1-2, 2=Grade 3-4.

    Returns
    -------
    int
        Total score (5-15 points); each parameter contributes 1-3 points.

    Raises
    ------
    ValueError
        If any parameter is missing (None/NaN) or a grade is not 0, 1 or 2.
        Missing laboratory values are rejected rather than being silently
        scored as the most severe category.
    """
    return (
        # 1. Bilirubin: < 2 -> 1 pt, 2-3 -> 2 pt, > 3 -> 3 pt
        _lab_points('Bilirubin (Total)', row['Bilirubin (Total)'], 2, 3)
        # 2. Albumin: > 3.5 -> 1 pt, 2.8-3.5 -> 2 pt, < 2.8 -> 3 pt
        + _lab_points('Albumin', row['Albumin'], 2.8, 3.5, low_is_severe=True)
        # 3. INR: < 1.7 -> 1 pt, 1.7-2.3 -> 2 pt, > 2.3 -> 3 pt
        + _lab_points('INR', row['INR'], 1.7, 2.3)
        # 4. Ascites and 5. hepatic encephalopathy: grade + 1 points
        + _grade_points('Ascites', row['Ascites'])
        + _grade_points('Hepatic encephalopathy', row['Hepatic encephalopathy'])
    )

# Score every patient row and store the total as a new column.
df["child_pugh_score"] = df.apply(calculate_child_pugh_score, axis=1)

# Headline statistics of the score, followed by the full distribution.
print(
    f"Min: {df['child_pugh_score'].min()}, "
    f"Max: {df['child_pugh_score'].max()}, "
    f"Mean: {df['child_pugh_score'].mean():.2f}, "
    f"Median: {df['child_pugh_score'].median():.2f}"
)
print(df['child_pugh_score'].describe())
print(df['child_pugh_score'].value_counts().sort_index())

Min: 5, Max: 12, Mean: 7.62, Median: 8.00
count    913.000000
mean       7.616648
std        1.431103
min        5.000000
25%        6.000000
50%        8.000000
75%        9.000000
max       12.000000
Name: child_pugh_score, dtype: float64
child_pugh_score
5      52
6     189
7     195
8     185
9     219
10     60
11     12
12      1
Name: count, dtype: int64


In [40]:
# Performance metrics are summarised over n_iterations (default 100) repeated train/test splits
def bootstrap_model_evaluation(df, model, random_state_start, n_iterations=100, test_size=0.3,
                               score_col='child_pugh_score',
                               target_col='Re-Bleeding within 14 days'):
    """
    Evaluate a precomputed risk score over repeated stratified train/test splits.

    Despite the name, no resampling with replacement is performed: each
    iteration draws a fresh stratified random split of ``df`` using the seed
    ``random_state_start + i``. The decision threshold is chosen on the train
    part by maximising Youden's J statistic (TPR - FPR) and then applied to
    the test part. AUROC is computed on the raw test scores and does not
    depend on the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score_col`` and ``target_col``.
    model : object
        Unused; kept only so existing callers that pass it keep working.
    random_state_start : int
        Seed of the first split; iteration ``i`` uses ``random_state_start + i``.
    n_iterations : int, default 100
        Number of train/test splits to evaluate. Must be at least 1.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    score_col : str, default 'child_pugh_score'
        Column holding the numeric score (higher = predicted positive).
    target_col : str, default 'Re-Bleeding within 14 days'
        Column holding the binary outcome.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per iteration: seed, optimal_threshold, auroc, accuracy, f1.
    summary_stats : pandas.DataFrame
        Mean, std, min, max and the 2.5th / 97.5th percentiles of each metric
        across iterations (the '95%_CI_*' columns are percentile ranges of the
        split-to-split distribution).

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    KeyError
        If ``score_col`` or ``target_col`` is not a column of ``df``.
    """
    # Without at least one iteration the results frame would be empty and the
    # summary step would fail with an opaque KeyError on the metric columns.
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be at least 1, got {n_iterations}")

    # Prepare score (used directly as the prediction) and target
    Y_score = df[score_col]
    Y = df[target_col]

    results = []

    for i in range(n_iterations):
        seed = random_state_start + i

        # Create a stratified train/test split so both parts keep the outcome rate
        y_score_train, y_score_test, y_train, y_test = train_test_split(
            Y_score, Y, test_size=test_size, random_state=seed, stratify=Y
        )

        # Find optimal threshold using Youden's J statistic on TRAIN set.
        # NOTE: roc_curve's first threshold is a sentinel above every observed
        # score; it is only selected when no real threshold achieves J > 0,
        # in which case every test sample is predicted negative.
        fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
        youden_j = tpr_train - fpr_train
        optimal_idx = np.argmax(youden_j)
        optimal_threshold = thresholds_train[optimal_idx]

        # Apply optimal threshold to TEST set
        y_test_pred = (y_score_test >= optimal_threshold).astype(int)

        # Calculate metrics on TEST set
        results.append({
            'seed': seed,
            'optimal_threshold': optimal_threshold,
            'auroc': roc_auc_score(y_test, y_score_test),
            'accuracy': accuracy_score(y_test, y_test_pred),
            'f1': f1_score(y_test, y_test_pred)
        })

        if (i + 1) % 20 == 0:
            print(f"Completed {i + 1}/{n_iterations} iterations...")

    # Convert to DataFrame (one row per split)
    results_df = pd.DataFrame(results)

    # Calculate summary statistics across splits
    metric_cols = ['auroc', 'accuracy', 'f1']
    per_metric = results_df[metric_cols]
    summary_stats = pd.DataFrame({
        'mean': per_metric.mean(),
        'std': per_metric.std(),
        'min': per_metric.min(),
        'max': per_metric.max(),
        '95%_CI_lower': per_metric.quantile(0.025),
        '95%_CI_upper': per_metric.quantile(0.975)
    })

    return results_df, summary_stats


# Evaluate the Child-Pugh score over the repeated splits, starting at the shared seed
results_df, summary_stats = bootstrap_model_evaluation(
    df,
    model=None,
    random_state_start=EXPERIMENTS_START_SEED,
)

print("\nChild-Pugh Score Bootstrap Results (100 iterations):")
print("=" * 60)
print(summary_stats.round(4))
print("\n")
print("Detailed metrics:")

# One line per metric: mean ± std [2.5th percentile, 97.5th percentile].
# Labels are padded so the numbers line up in the printed report.
for row_key, label in (("auroc", "AUROC:   "), ("accuracy", "Accuracy:"), ("f1", "F1 Score:")):
    stats = summary_stats.loc[row_key]
    print(
        f"{label} {stats['mean']:.4f} ± {stats['std']:.4f} "
        f"[{stats['95%_CI_lower']:.4f}, {stats['95%_CI_upper']:.4f}]"
    )

# Persist the per-iteration metrics
results_df.to_csv('child_pugh_bootstrap_results.csv', index=False)


Completed 20/100 iterations...
Completed 40/100 iterations...
Completed 60/100 iterations...
Completed 80/100 iterations...
Completed 100/100 iterations...

Child-Pugh Score Bootstrap Results (100 iterations):
            mean     std     min     max  95%_CI_lower  95%_CI_upper
auroc     0.5737  0.0622  0.4153  0.7668        0.4681        0.6836
accuracy  0.5626  0.0919  0.3102  0.7153        0.4635        0.7099
f1        0.1701  0.0322  0.0588  0.2500        0.1009        0.2352


Detailed metrics:
AUROC:    0.5737 ± 0.0622 [0.4681, 0.6836]
Accuracy: 0.5626 ± 0.0919 [0.4635, 0.7099]
F1 Score: 0.1701 ± 0.0322 [0.1009, 0.2352]
