Step 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from sksurv.ensemble import RandomSurvivalForest

Step 2: Load Dataset

In [2]:
# Relative path to the RADCURE clinical Excel workbook
file_path = "../Data/RADCURE_Clinical_v04_20241219.xlsx"
# Parse the workbook into a DataFrame (pandas reads the first sheet by default)
df = pd.read_excel(io=file_path)

Step 3: Data Overview

In [3]:
# Preview the first five records using the notebook's rich table rendering
display(df.head(5))

# Print the column listing with non-null counts and dtypes
df.info()

Unnamed: 0,patient_id,Age,Sex,ECOG PS,Smoking PY,Smoking Status,Ds Site,Subsite,T,N,...,Local,Date Local,Regional,Date Regional,Distant,Date Distant,2nd Ca,Date 2nd Ca,RADCURE-challenge,ContrastEnhanced
0,RADCURE-0005,62.6,Female,ECOG 0,50,Ex-smoker,Oropharynx,post wall,T4b,N2c,...,,NaT,,NaT,,NaT,,NaT,0,0
1,RADCURE-0006,87.3,Male,ECOG 2,25,Ex-smoker,Larynx,Glottis,T1b,N0,...,,NaT,,NaT,,NaT,,NaT,0,1
2,RADCURE-0007,49.9,Male,ECOG 1,15,Ex-smoker,Oropharynx,Tonsil,T3,N2b,...,,NaT,,NaT,,NaT,,NaT,0,1
3,RADCURE-0009,72.3,Male,ECOG 1,30,Ex-smoker,Unknown,,T0,N2c,...,,NaT,,NaT,,NaT,S (suspicious),2008-05-27,0,0
4,RADCURE-0010,59.7,Female,ECOG 0,0,Non-smoker,Oropharynx,Tonsillar Fossa,T4b,N0,...,,NaT,,NaT,,NaT,,NaT,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3346 entries, 0 to 3345
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   patient_id         3346 non-null   object        
 1   Age                3346 non-null   float64       
 2   Sex                3346 non-null   object        
 3   ECOG PS            3345 non-null   object        
 4   Smoking PY         3341 non-null   object        
 5   Smoking Status     3346 non-null   object        
 6   Ds Site            3346 non-null   object        
 7   Subsite            2972 non-null   object        
 8   T                  3334 non-null   object        
 9   N                  3333 non-null   object        
 10  M                  3332 non-null   object        
 11  Stage              3319 non-null   object        
 12  Path               3346 non-null   object        
 13  HPV                1717 non-null   object        
 14  Tx Modal

## Dataset Overview
- **Rows:** 3,346 patients
- **Columns:** 34 variables
- **Key Variables for Survival Analysis:**
  - **Survival Time:** `Length FU` (Follow-up duration in years)
  - **Event Indicator:** `Status` (Alive/Dead indicator)
  - **Covariates:** `Age`, `Sex`, `Smoking Status`, `Stage`, `Tx Modality`

Step 4: Data Cleaning & Preprocessing

In [5]:
# Convert Status column into a binary event indicator:
# 1 = 'Dead' (event observed), 0 = any other label (treated as censored).
# NOTE(review): the 'Dead' label is assumed, not verified here — confirm
# against df['Status'].unique() before trusting the event counts.
df['Event'] = df['Status'].eq('Dead').astype(int)

# Selecting relevant columns for analysis
columns_needed = ['Length FU', 'Event', 'Age', 'Sex', 'Smoking Status', 'Stage', 'Tx Modality']

# Keep complete cases only. Rows whose Status is missing are excluded too:
# their outcome is unknown, and the indicator above would otherwise code them
# as 0, silently treating them as censored observations.
df_selected = df.loc[df['Status'].notna(), columns_needed].dropna()

# Step 5: Display Cleaned Data
print("Cleaned Data Preview:")
display(df_selected.head())

# Save cleaned dataset as CSV (written next to the raw data, without the index column)
df_selected.to_csv("../Data/RADCURE_Cleaned.csv", index=False)

Cleaned Data Preview:


Unnamed: 0,Length FU,Event,Age,Sex,Smoking Status,Stage,Tx Modality
0,1.317808,1,62.6,Female,Ex-smoker,IVB,RT alone
1,1.520548,1,87.3,Male,Ex-smoker,I,RT alone
2,2.126027,0,49.9,Male,Ex-smoker,IVA,RT alone
3,5.791781,0,72.3,Male,Ex-smoker,IVA,RT alone
4,9.512329,0,59.7,Female,Non-smoker,IVB,RT alone


# Markdown Test: Data Cleaning Summary
"""
## Data Cleaning Summary
- Removed missing values in critical columns
- Converted `Status` to a binary event indicator
- Selected key covariates for analysis
- Shape after cleaning: {} rows, {} columns
""".format(df_selected.shape[0], df_selected.shape[1])

In [None]:
# Step 6: Kaplan-Meier Survival Analysis
# Column names shared by the survival-analysis steps in this cell
time_col = 'Length FU'  # Survival time column
event_col = 'Event'  # Event indicator column (1 = death observed, 0 = censored)

# Initialize the Kaplan-Meier fitter and fit it on the whole cleaned cohort
kmf = KaplanMeierFitter()
kmf.fit(durations=df_selected[time_col], event_observed=df_selected[event_col])

# Plot the survival curve on an explicit Axes so the labels are guaranteed
# to land on the same figure as the curve
fig, ax = plt.subplots(figsize=(8, 6))
kmf.plot_survival_function(ax=ax)
ax.set_title("Kaplan-Meier Survival Curve")
# `Length FU` is expressed in years: the previewed values are whole-day counts
# divided by 365 (e.g. 1.317808 * 365 = 481 days), so the axis is labelled in years.
ax.set_xlabel("Time (Years)")
ax.set_ylabel("Survival Probability")
# grid(True) switches the grid on explicitly; a bare grid() call only toggles it
ax.grid(True)
plt.show()

# Markdown Test: Kaplan-Meier Summary
"""
## Kaplan-Meier Survival Analysis
- Generated a survival curve for the dataset
- The curve shows the probability of survival over time
- This step helps visualize patient survival patterns
"""

# Step 7: Group-wise Kaplan-Meier Analysis
# Draw one survival curve per treatment modality on a shared Axes.
fig, ax = plt.subplots(figsize=(10, 6))

# sort=False keeps the modalities in order of first appearance, matching .unique()
for modality, modality_rows in df_selected.groupby('Tx Modality', sort=False):
    # Use a fresh fitter per group so the cohort-wide `kmf` fit from Step 6
    # is not overwritten by the last modality processed in this loop.
    modality_kmf = KaplanMeierFitter()
    modality_kmf.fit(durations=modality_rows[time_col],
                     event_observed=modality_rows[event_col],
                     label=modality)
    modality_kmf.plot_survival_function(ax=ax)

ax.set_title("Kaplan-Meier Survival Curves by Treatment Modality")
# `Length FU` is expressed in years (whole-day counts divided by 365), not months
ax.set_xlabel("Time (Years)")
ax.set_ylabel("Survival Probability")
ax.legend()
# grid(True) switches the grid on explicitly; a bare grid() call only toggles it
ax.grid(True)
plt.show()

# Markdown Test: Kaplan-Meier by Treatment Groups
"""
## Kaplan-Meier Analysis by Groups
- The survival curves represent different treatment modalities
- Differences in survival rates can be compared visually
- A log-rank test can further quantify these differences
"""

# Step 8: Log-Rank Test to Compare Groups
from lifelines.statistics import logrank_test

# The test below is pairwise, so exactly two modalities are compared: the first
# two in order of appearance in the cleaned data. They are named in the output
# so the p-value can be attributed to a specific comparison.
modalities = df_selected['Tx Modality'].unique()
if len(modalities) < 2:
    raise ValueError("Log-rank test needs at least two treatment modalities in df_selected")
if len(modalities) > 2:
    print("Note:", len(modalities), "treatment modalities present; only the first two are compared.")
modality_a, modality_b = modalities[0], modalities[1]

# Define two treatment groups
group1 = df_selected[df_selected['Tx Modality'] == modality_a]
group2 = df_selected[df_selected['Tx Modality'] == modality_b]

# Perform log-rank test on the follow-up times and event indicators of both groups
log_rank_result = logrank_test(
    group1[time_col], group2[time_col],
    event_observed_A=group1[event_col], event_observed_B=group2[event_col]
)

print("Log-Rank Test groups:", modality_a, "vs", modality_b)
print("Log-Rank Test p-value:", log_rank_result.p_value)

# Markdown Test: Log-Rank Test Results
"""
## Log-Rank Test Results
- The log-rank test compares survival distributions between two groups
- A small p-value (< 0.05) suggests a significant survival difference
"""