In [None]:
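# Read data
# TODO: this cell does not load anything yet. The cells below expect a pandas
# DataFrame named `df` with one row per patient and these columns:
#   time, event                      - survival duration and event indicator
#   cluster                          - group label used for every comparison
#   tumor_stage, tumor_site, subtype - categorical clinical variables
#   age                              - numeric
# They also use the names pd, plt, sns, KaplanMeierFitter, logrank_test and
# CoxPHFitter, which must be imported before those cells run.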


In [None]:
# ------------------------------
# Kaplan-Meier Estimation and Visualization
# ------------------------------
# A single fitter is re-fit once per cluster; each fitted curve is drawn onto
# the shared axes before the fitter is reused for the next cluster.
kmf = KaplanMeierFitter()

km_fig, km_ax = plt.subplots(figsize=(8, 6))
for group_label, group_rows in df.groupby('cluster'):
    kmf.fit(
        group_rows['time'],
        event_observed=group_rows['event'],
        label=group_label,
    )
    # ci_show=True draws the confidence band around each survival curve.
    kmf.plot_survival_function(ax=km_ax, ci_show=True)

km_ax.set_title("Kaplan-Meier Survival Curves by Cluster")
km_ax.set_xlabel("Time")
km_ax.set_ylabel("Survival Probability")
km_ax.legend(title="Cluster")
plt.show()

# ------------------------------
# Log-Rank Test for Comparing Survival Curves
# ------------------------------
# Compare the survival curves of every cluster present in the data with the
# omnibus (multivariate) log-rank test. Group labels are taken from the
# 'cluster' column itself rather than hard-coded, so the test works whatever
# the clusters are called and however many there are. With exactly two
# clusters this reduces to the ordinary two-sample log-rank test.
# NOTE: if the notebook has a dedicated import cell, move this import there.
from lifelines.statistics import multivariate_logrank_test

n_clusters = df['cluster'].nunique()
if n_clusters < 2:
    # Fail with a clear message instead of testing a single group.
    raise ValueError(
        f"Log-rank test needs at least two clusters, found {n_clusters}."
    )

logrank_results = multivariate_logrank_test(
    df['time'],
    df['cluster'],
    event_observed=df['event'],
)

print("Log-Rank Test p-value:", logrank_results.p_value)

# ------------------------------
# Cox Proportional Hazards Model for Hazard Ratios
# ------------------------------
# Encode "cluster" as indicator (dummy) columns for the Cox model.
# - The Series (not a one-column DataFrame) is passed to get_dummies so the
#   labels are always one-hot encoded, even when they are numeric (e.g. 0/1/2);
#   given a DataFrame, get_dummies leaves numeric columns untouched and the
#   cluster would be fitted as a single continuous covariate.
# - prefix='cluster' keeps the column names as 'cluster_<label>'.
# - drop_first=True makes the first cluster level the reference category.
# - dtype=float yields numeric 0/1 columns regardless of the pandas version's
#   default dummy dtype.
df_encoded = pd.get_dummies(
    df['cluster'], prefix='cluster', drop_first=True, dtype=float
)
# Combine the indicators with the survival columns (aligned on the index).
df_model = pd.concat([df[['time', 'event']], df_encoded], axis=1)

cph = CoxPHFitter()
cph.fit(df_model, duration_col='time', event_col='event')
cph.print_summary()

In [None]:
# ============================================================
# Creating Stacked Bar Plots for Each Categorical Variable
# ============================================================
# One panel per variable. In each panel the x-axis holds the clusters and the
# stacked segments show, within each cluster, the percentage of rows falling
# into each category of the variable (every bar sums to 100).

variables = ['tumor_stage', 'tumor_site', 'subtype']

# squeeze=False makes plt.subplots return an array of Axes even when there is
# only one variable; ravel() flattens it so it can be zipped with `variables`.
fig, axes = plt.subplots(
    1, len(variables), figsize=(6 * len(variables), 6), squeeze=False
)
axes = axes.ravel()

for ax, var in zip(axes, variables):
    # Row-normalised contingency table: proportions within each cluster,
    # scaled to percentages.
    ct = pd.crosstab(df['cluster'], df[var], normalize='index') * 100

    ct.plot(kind='bar', stacked=True, ax=ax)

    # Turn the column name into a readable title, e.g. 'tumor_stage' -> 'Tumor Stage'.
    pretty_name = var.replace('_', ' ').title()
    ax.set_ylabel('Percentage')
    ax.set_title(f'{pretty_name} Distribution by Cluster')

    # Anchor the legend just outside the right edge so it does not cover the bars.
    ax.legend(title=var, bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust spacing so the panels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
# ============================================================
# Boxplot: Age Distribution by Cluster
# ============================================================
# One box per cluster, drawn on an explicitly created axes.
age_fig, age_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='cluster', y='age', ax=age_ax)
age_ax.set(
    title='Age Distribution by Cluster',
    xlabel='Cluster',
    ylabel='Age',
)
plt.show()