In [None]:
!pip install scikit-survival lifelines

In [79]:
import os
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from joblib import dump

In [None]:
import config

In [66]:
file_path = "f{DATASET_PATH}.tsv"
df = pd.read_csv(file_path, sep="\t")

display(df.head())

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Publication Version Type,Alcohol History Documented,Biopsy Site,Cancer Type,Cancer Type Detailed,Last Communication Contact from Initial Pathologic Diagnosis Date,...,Project State,Race Category,Number of Samples Per Patient,Sample Type,Sample type id,Sex,Years Smoked,Person Cigarette Smoking History Pack Year Value,Patient's Vital Status,Year of Diagnosis
0,paad_tcga_gdc,TCGA-2J-AAB1,TCGA-2J-AAB1-01,65,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Male,26.0,25.0,Dead,2012.0
1,paad_tcga_gdc,TCGA-2J-AAB4,TCGA-2J-AAB4-01,48,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,729.0,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
2,paad_tcga_gdc,TCGA-2J-AAB6,TCGA-2J-AAB6-01,75,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Male,,,Dead,2012.0
3,paad_tcga_gdc,TCGA-2J-AAB8,TCGA-2J-AAB8-01,71,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,80.0,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
4,paad_tcga_gdc,TCGA-2J-AAB9,TCGA-2J-AAB9-01,70,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Female,,,Dead,2012.0


In [67]:
df = df.rename(columns={
    "Overall Survival (Months)": "duration",
    "Overall Survival Status": "event"
})

df["event"] = df["event"].str.contains("DECEASED").astype(int)

initial_rows = len(df)
df = df.dropna(subset=["duration", "event"])

display(df.head())

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Publication Version Type,Alcohol History Documented,Biopsy Site,Cancer Type,Cancer Type Detailed,Last Communication Contact from Initial Pathologic Diagnosis Date,...,Project State,Race Category,Number of Samples Per Patient,Sample Type,Sample type id,Sex,Years Smoked,Person Cigarette Smoking History Pack Year Value,Patient's Vital Status,Year of Diagnosis
0,paad_tcga_gdc,TCGA-2J-AAB1,TCGA-2J-AAB1-01,65,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Male,26.0,25.0,Dead,2012.0
1,paad_tcga_gdc,TCGA-2J-AAB4,TCGA-2J-AAB4-01,48,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,729.0,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
2,paad_tcga_gdc,TCGA-2J-AAB6,TCGA-2J-AAB6-01,75,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Male,,,Dead,2012.0
3,paad_tcga_gdc,TCGA-2J-AAB8,TCGA-2J-AAB8-01,71,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,80.0,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
4,paad_tcga_gdc,TCGA-2J-AAB9,TCGA-2J-AAB9-01,70,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,,...,released,WHITE,1,Primary Tumor,1,Female,,,Dead,2012.0


In [68]:
leakage_cols = [
    "Death from Initial Pathologic Diagnosis Date",
    "Last Communication Contact from Initial Pathologic Diagnosis Date",
    "Disease Free (Months)",
    "Disease Free Status"
]
id_cols = ["Study ID", "Patient ID", "Sample ID", "Other Patient ID", "Other Sample ID"]

df = df.drop(columns=leakage_cols + id_cols, errors='ignore')

display(df.head())

Unnamed: 0,Diagnosis Age,American Joint Committee on Cancer Publication Version Type,Alcohol History Documented,Biopsy Site,Cancer Type,Cancer Type Detailed,Birth from Initial Pathologic Diagnosis Date,Disease Type,Ethnicity Category,Fraction Genome Altered,...,Project State,Race Category,Number of Samples Per Patient,Sample Type,Sample type id,Sex,Years Smoked,Person Cigarette Smoking History Pack Year Value,Patient's Vital Status,Year of Diagnosis
0,65,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,-23962,Pancreas-Adenocarcinoma-Other Subtype,,0.062,...,released,WHITE,1,Primary Tumor,1,Male,26.0,25.0,Dead,2012.0
1,48,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,-17794,Pancreas-Adenocarcinoma-Other Subtype,,0.1955,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
2,75,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,-27600,Pancreas-Adenocarcinoma Ductal Type,,0.3519,...,released,WHITE,1,Primary Tumor,1,Male,,,Dead,2012.0
3,71,7th,False,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,-26028,Pancreas-Adenocarcinoma Ductal Type,,0.0543,...,released,WHITE,1,Primary Tumor,1,Male,,,Alive,2012.0
4,70,7th,True,Pancreas,Pancreatic Adenocarcinoma,Pancreatic Adenocarcinoma,-25920,Pancreas-Adenocarcinoma Ductal Type,,0.0038,...,released,WHITE,1,Primary Tumor,1,Female,,,Dead,2012.0


In [69]:
initial_cols = len(df.columns)
df = df.dropna(axis=1, thresh=0.6 * len(df))
df = df.loc[:, df.nunique() > 1]

display(df.head())

Unnamed: 0,Diagnosis Age,American Joint Committee on Cancer Publication Version Type,Alcohol History Documented,Birth from Initial Pathologic Diagnosis Date,Disease Type,Ethnicity Category,Fraction Genome Altered,ICD-10 Classification,Morphology,Mutation Count,...,AJCC Pathologic Stage,AJCC Pathologic T-Stage,Primary Diagnosis,Race Category,Number of Samples Per Patient,Sample Type,Sample type id,Sex,Patient's Vital Status,Year of Diagnosis
0,65,7th,True,-23962,Pancreas-Adenocarcinoma-Other Subtype,,0.062,C25.0,8480/3,55.0,...,Stage IIB,T3,Pancreas-Adenocarcinoma-Other Subtype,WHITE,1,Primary Tumor,1,Male,Dead,2012.0
1,48,7th,False,-17794,Pancreas-Adenocarcinoma-Other Subtype,,0.1955,C25.8,8140/3,26.0,...,Stage IIB,T2,Pancreas-Adenocarcinoma-Other Subtype,WHITE,1,Primary Tumor,1,Male,Alive,2012.0
2,75,7th,False,-27600,Pancreas-Adenocarcinoma Ductal Type,,0.3519,C25.1,8500/3,49.0,...,Stage IIA,T3,Pancreas-Adenocarcinoma Ductal Type,WHITE,1,Primary Tumor,1,Male,Dead,2012.0
3,71,7th,False,-26028,Pancreas-Adenocarcinoma Ductal Type,,0.0543,C25.0,8500/3,40.0,...,Stage IIB,T3,Pancreas-Adenocarcinoma Ductal Type,WHITE,1,Primary Tumor,1,Male,Alive,2012.0
4,70,7th,True,-25920,Pancreas-Adenocarcinoma Ductal Type,,0.0038,C25.0,8500/3,13.0,...,Stage IIB,T3,Pancreas-Adenocarcinoma Ductal Type,WHITE,1,Primary Tumor,1,Female,Dead,2012.0


In [70]:
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
num_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()
num_cols = [c for c in num_cols if c not in ["duration", "event"]]

cat_cols = [c for c in df.select_dtypes(include="object").columns if c not in ["duration", "event"]]
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

display(df_encoded.head())

Unnamed: 0,Diagnosis Age,Birth from Initial Pathologic Diagnosis Date,Fraction Genome Altered,Mutation Count,duration,event,Number of Samples Per Patient,Sample type id,Year of Diagnosis,American Joint Committee on Cancer Publication Version Type_6th,...,AJCC Pathologic T-Stage_T4,AJCC Pathologic T-Stage_TX,Primary Diagnosis_Pancreas-Adenocarcinoma-Other Subtype,Primary Diagnosis_Pancreas-Colloid (mucinous non-cystic) Carcinoma,Primary Diagnosis_Pancreas-Undifferentiated Carcinoma,Race Category_BLACK OR AFRICAN AMERICAN,Race Category_WHITE,Sample Type_Primary Tumor,Sex_Male,Patient's Vital Status_Dead
0,65,-23962,0.062,55.0,2.1682,1,1,1,2012.0,False,...,False,False,True,False,False,False,True,True,True,True
1,48,-17794,0.1955,26.0,23.948752,0,1,1,2012.0,False,...,False,False,True,False,False,False,True,True,True,False
2,75,-27600,0.3519,49.0,9.625493,1,1,1,2012.0,False,...,False,False,False,False,False,False,True,True,True,True
3,71,-26028,0.0543,40.0,2.628121,0,1,1,2012.0,False,...,False,False,False,False,False,False,True,True,True,False
4,70,-25920,0.0038,13.0,20.597898,1,1,1,2012.0,False,...,False,False,False,False,False,False,True,True,False,True


In [71]:
imputer = IterativeImputer(max_iter=20, random_state=42)
X = df_encoded.drop(columns=["duration", "event"])
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

display(X.head())

Unnamed: 0,Diagnosis Age,Birth from Initial Pathologic Diagnosis Date,Fraction Genome Altered,Mutation Count,Number of Samples Per Patient,Sample type id,Year of Diagnosis,American Joint Committee on Cancer Publication Version Type_6th,American Joint Committee on Cancer Publication Version Type_7th,Alcohol History Documented_True,...,AJCC Pathologic T-Stage_T4,AJCC Pathologic T-Stage_TX,Primary Diagnosis_Pancreas-Adenocarcinoma-Other Subtype,Primary Diagnosis_Pancreas-Colloid (mucinous non-cystic) Carcinoma,Primary Diagnosis_Pancreas-Undifferentiated Carcinoma,Race Category_BLACK OR AFRICAN AMERICAN,Race Category_WHITE,Sample Type_Primary Tumor,Sex_Male,Patient's Vital Status_Dead
0,65.0,-23962.0,0.062,55.0,1.0,1.0,2012.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1,48.0,-17794.0,0.1955,26.0,1.0,1.0,2012.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
2,75.0,-27600.0,0.3519,49.0,1.0,1.0,2012.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
3,71.0,-26028.0,0.0543,40.0,1.0,1.0,2012.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,70.0,-25920.0,0.0038,13.0,1.0,1.0,2012.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [72]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [73]:
y = df_encoded[["duration", "event"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_df = pd.concat([y_train, X_train], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

In [None]:
cph = CoxPHFitter(penalizer=0.1, l1_ratio=0.5)
cph.fit(train_df, duration_col="duration", event_col="event")

In [75]:
train_cindex = cph.concordance_index_
test_pred = cph.predict_partial_hazard(test_df)
test_cindex = concordance_index(y_test["duration"], -test_pred, y_test["event"])

print(f"\nTrain C-index: {train_cindex:.3f}")
print(f"Test  C-index: {test_cindex:.3f}")


Train C-index: 0.843
Test  C-index: 0.747


In [None]:
if not os.path.exists('bin'):
    os.makedirs('bin')
else:
  pass

dump(cph, 'bin/model.joblib')
