In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot

import re
import sklearn
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from datetime import datetime
from lifelines import KaplanMeierFitter
from lifelines.plotting import plot_lifetimes 
from lifelines import CoxPHFitter

In [2]:
# Load the patient-level workbook; the path is resolved relative to the
# notebook's working directory.
cancer_workbook = 'Cancer patient data.xlsx'
df = pd.read_excel(cancer_workbook)
df.shape

(8468, 14)

In [None]:
# Remove rows whose recorded diagnosis duration is negative.
# NOTE(review): the original remark mentions "-360" -- presumably a single
# bad/sentinel value in the data; confirm against the workbook.
negative_duration_rows = df.index[df['diagnosis_days_min'] < 0]
df.drop(index=negative_duration_rows, inplace=True)
df.shape

In [None]:
# Split the cohort by vital status and recompute follow-up durations:
#   * alive patients are followed up to the fixed study cut-off date,
#   * deceased patients are followed up to their recorded death date.
# NOTE(review): assumes min_DT, max_DT and DEATH_DATE_off are datetime64
# columns -- confirm against the workbook.
STUDY_END = datetime(2016, 8, 25)           # cut-off date applied to patients still alive
ONE_DAY = pd.Timedelta(days=1)
ONE_YEAR = pd.Timedelta(days=365.2425)      # mean Gregorian year


def _with_durations(frame, end_date):
    """Return a copy of ``frame`` with elapsed whole days/years added.

    Parameters
    ----------
    frame : DataFrame containing the ``min_DT`` and ``max_DT`` date columns.
    end_date : a single datetime, or a Series of datetimes aligned with ``frame``.

    Returns
    -------
    DataFrame
        Copy of ``frame`` with ``diagnosis_days_{min,max}`` and
        ``diagnosis_years_{min,max}`` set to the floored number of days/years
        between the respective date column and ``end_date``. Values are floats
        so that missing dates come through as NaN.
    """
    # Work on a copy: assigning columns onto a .loc slice of df triggers
    # SettingWithCopyWarning and is not guaranteed to behave consistently.
    out = frame.copy()
    for suffix in ('min', 'max'):
        elapsed = end_date - out[suffix + '_DT']
        # Dividing by a Timedelta works on every pandas version, whereas
        # .astype('timedelta64[D]') / .astype('timedelta64[Y]') raise on pandas >= 2.0.
        out['diagnosis_days_' + suffix] = np.floor(elapsed / ONE_DAY)
        out['diagnosis_years_' + suffix] = np.floor(elapsed / ONE_YEAR)
    return out


live = _with_durations(df.loc[df['dead'] == 0], STUDY_END)
print('live num:', live.shape[0])

dead = df.loc[df['dead'] == 1]
dead = _with_durations(dead, dead['DEATH_DATE_off'])
print('dead num:', dead.shape[0])

df = pd.concat([live, dead])
print('final df:', df.shape[0])

In [None]:
# Overall Kaplan-Meier estimate for the whole cohort.
# NOTE(review): `df1` is not created in any cell visible here -- confirm where
# it comes from before relying on Restart & Run All.
durations = df1.loc[:, 'diagnosis_days_min']   # follow-up time in days
event_observed = df1.loc[:, 'dead']            # 1 = death observed, 0 = censored (as used in the split cell)

km = KaplanMeierFitter()
km.fit(
    durations=durations,
    event_observed=event_observed,
    label='Kaplan Meier Estimate',
)
km.plot()

In [None]:
# Kaplan-Meier curves fitted separately for each gender.
# NOTE(review): `Male` and `Female` are not created in any cell visible here,
# and they are read through a 'diagnosis_days' column rather than the
# 'diagnosis_days_min' column used for the overall curve -- confirm both.
kmf_m = KaplanMeierFitter()
kmf_m.fit(Male['diagnosis_days'], Male['dead'], label='Male')

kmf_f = KaplanMeierFitter()
kmf_f.fit(Female['diagnosis_days'], Female['dead'], label='Female')

kmf_m.plot()
kmf_f.plot()

In [None]:
# Optional inspection of the fitted gender estimators (uncomment a line to display it):
#kmf_m.event_table
#kmf_f.event_table
#kmf_m.survival_function_
#kmf_f.survival_function_

In [None]:
# Cumulative density estimates for the two gender groups.
# Tabulate both curves side by side: a left join on the shared 'timeline'
# key keeps every time point of the male curve.
male_cd = kmf_m.cumulative_density_
female_cd = kmf_f.cumulative_density_
df_cd = pd.merge(male_cd, female_cd, on='timeline', how='left')
print(df_cd.head())

# Draw the male curve first, then the female curve, then label the figure.
for gender_fitter in (kmf_m, kmf_f):
    gender_fitter.plot_cumulative_density()
plt.xlabel('Number of days')
plt.title('Cumulative Density for Gender')
plt.ylabel('Probability')

In [None]:
# Cox proportional-hazards model on sex, age group and race.
# NOTE(review): `df1` is not created in any cell visible here -- confirm its origin.
df2 = df1.loc[:, ['sex', 'age_Group', 'race', 'dead', 'diagnosis_days', 'diagnosis_years']]
df2.info()

# CoxPHFitter treats every column other than the duration and event columns as
# a covariate. 'diagnosis_days' is the same follow-up time as 'diagnosis_years'
# in another unit, so it is dropped here; otherwise the outcome itself would be
# fed back into the model as a predictor.
df_dummy = pd.get_dummies(df2.drop(columns=['diagnosis_days']), drop_first=True)

# Fit the model: duration in whole years, event indicator in 'dead'.
cph = CoxPHFitter()
cph.fit(df_dummy, duration_col='diagnosis_years', event_col='dead')
cph.print_summary()

# Coefficient plot for the fitted covariates.
cph.plot()

In [None]:
# Manually build the cohort used to read survival at the 400-day mark,
# considering the different scenarios:
#   * alive patients are kept only if they have at least 400 days of follow-up,
#   * patients who died more than 400 days after diagnosis are re-labelled as
#     not dead (dead = 0),
#   * patients who died within 400 days stay as events.
# NOTE(review): `df1` is not created in any cell visible here -- confirm its origin.
FOLLOW_UP_DAYS = 400
ONE_DAY = pd.Timedelta(days=1)

df_age = df1.copy()
# Whole days as floats (NaN where a date is missing). Dividing by a Timedelta
# avoids .astype('timedelta64[D]'), which raises on pandas >= 2.0.
df_age['diagnosis_days_a'] = np.floor((datetime(2016, 8, 25) - df_age['min_DT']) / ONE_DAY)
df_age['dead_duration_a'] = np.floor((df_age['DEATH_DATE_off'] - df_age['min_DT']) / ONE_DAY)

# Count noted by the author for this step: 3851
df_age_live = df_age.loc[df_age['dead'] == 0]
print('live from:', df_age_live.shape[0])

# Counts noted by the author for this step: 297 -> 190
# Take an explicit copy so the relabelling below modifies this frame for sure;
# chained assignment (frame['dead'][mask] = 0) on a slice is unreliable and is
# a silent no-op under pandas Copy-on-Write.
df_age_dead = df_age.loc[df_age['dead'] == 1].copy()
print('dead from:', df_age_dead.shape[0])
mask = (df_age_dead['dead_duration_a'] > FOLLOW_UP_DAYS)
df_age_dead.loc[mask, 'dead'] = 0

df_age_newdead = df_age_dead.loc[df_age_dead['dead'] == 1]
print('dead to:', df_age_newdead.shape[0])

# Count noted by the author for this step: 3393
df_age_live = df_age_live.loc[df_age_live['diagnosis_days_a'] >= FOLLOW_UP_DAYS]
print('live to:', df_age_live.shape[0])

# Count noted by the author for this step: 3690
final_df_age = pd.concat([df_age_live, df_age_dead])
print('final age group > 400 days from:', df1.shape[0], 'to:', final_df_age.shape[0])

In [None]:
# Kaplan-Meier survival per age band, read at the first timeline point on or
# after the 400-day mark.
T = final_df_age['diagnosis_days_min']      # follow-up time in days
E = final_df_age['dead']                    # event indicator
age_group = final_df_age['age_min_Group']

HORIZON_DAYS = 400

# (value stored in 'age_min_Group', label used for the fitted curve / printed column)
AGE_BANDS = [
    ('<=10', '<=10'),
    ('11-20', '_11to20'),
    ('21-30', '_21to30'),
    ('31-40', '_31to40'),
    ('41-50', '_41to50'),
    ('51-60', '_51to60'),
    ('61-70', '_61to70'),
    ('71-80', '_71to80'),
    ('81-90', '_81to90'),
]

# Boolean row selector per band, keyed by curve label.
age_masks = {label: (age_group == band) for band, label in AGE_BANDS}

kmf_age = KaplanMeierFitter()
survival_at_horizon = {}    # band -> survival probability at the first point >= HORIZON_DAYS

for band, label in AGE_BANDS:
    in_band = age_masks[label]
    if not in_band.any():
        # Fitting on an empty selection would raise; report and move on.
        print('no patients in age group', band)
        continue

    kmf_age.fit(T[in_band], E[in_band], label=label)

    # reset_index() returns a new frame, so the fitter's own
    # survival_function_ is left untouched.
    kmf_age_sf = kmf_age.survival_function_.reset_index()
    first_after_horizon = kmf_age_sf[kmf_age_sf['timeline'] >= HORIZON_DAYS].head(1)
    print(first_after_horizon)

    if not first_after_horizon.empty:
        # The survival column is named after the label passed to fit().
        survival_at_horizon[band] = first_after_horizon[label].iloc[0]

# Derive the conclusion from the fitted values instead of hard-coding the band.
if survival_at_horizon:
    lowest_band = min(survival_at_horizon, key=survival_at_horizon.get)
    print(f'Thus, {lowest_band} age group has the lowest survival after {HORIZON_DAYS} days since cancer diagnosis.')