In [1]:
# # pip install lifelines --user
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts

In [2]:
# Load the CSV into `data`, using its first column as the row index,
# then preview the first rows.
data = pd.read_csv("lung-temp.csv", index_col = 0)
data.head()

Unnamed: 0,inst,time,status,age,sex,ph.ecog,ph.karno,pat.karno,meal.cal,wt.loss
1,3.0,306,2,74,1,1.0,90.0,100.0,1175.0,
2,3.0,455,2,68,1,0.0,90.0,90.0,1225.0,15.0
3,3.0,1010,1,56,1,0.0,90.0,90.0,,15.0
4,5.0,210,2,57,1,1.0,90.0,60.0,1150.0,11.0
5,1.0,883,2,60,1,0.0,100.0,90.0,,0.0


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(228, 10)

In [4]:
# dtype of each column in the loaded frame.
data.dtypes

inst         float64
time           int64
status         int64
age            int64
sex            int64
ph.ecog      float64
ph.karno     float64
pat.karno    float64
meal.cal     float64
wt.loss      float64
dtype: object

In [5]:
# Columns carried into the survival analysis.
ANALYSIS_COLUMNS = ['time', 'status', 'age', 'sex', 'myc', 'smok1ng', 'gene_alterat1on_status', 'patholog1cal_stage', 'pstage_1or2']

# Fail fast, and say what IS available, when the loaded CSV does not carry
# these columns: a mismatch between the file and this list is otherwise only
# reported as a bare "not in index" error.
missing_columns = [name for name in ANALYSIS_COLUMNS if name not in data.columns]
if missing_columns:
    raise KeyError(
        f"{missing_columns} not in the loaded frame; "
        f"available columns: {list(data.columns)}. "
        "Check that the CSV read above is the intended dataset "
        "and that the column names are spelled as in the file."
    )

# Shift `status` and `sex` down by one. Using .assign() on the selection
# builds a new frame, so later item assignments on `data` do not trigger
# pandas' SettingWithCopyWarning.
# NOTE: `data` is overwritten here, so re-running this cell on its own
# subtracts 1 a second time -- re-run the load cell first.
data = data[ANALYSIS_COLUMNS].assign(
    status=lambda frame: frame["status"] - 1,
    sex=lambda frame: frame["sex"] - 1,
)
data.head()

KeyError: "['myc', 'smok1ng', 'gene_alterat1on_status', 'patholog1cal_stage', 'pstage_1or2'] not in index"

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Column labels currently present in the frame.
data.columns

In [None]:
# Replace missing values in these columns with the column mean.
MEAN_IMPUTED_COLUMNS = ["smok1ng", "gene_alterat1on_status", "patholog1cal_stage", "pstage_1or2"]

# One chain instead of four copy-pasted fillna lines plus an in-place drop:
#   1. fillna with a Series of per-column means fills only the listed columns;
#   2. dropna removes every row that still has a missing value;
#   3. astype casts `myc` to int64 (safe only after the drop, since NaN cannot
#      be stored in an int64 column).
# The result is a fresh frame, so nothing is mutated in place.
data = (
    data
    .fillna(data[MEAN_IMPUTED_COLUMNS].mean())
    .dropna()
    .astype({"myc": "int64"})
)

In [None]:
# Missing values per column, re-checked after the imputation/drop step.
data.isna().sum()

In [None]:
# Frame dimensions after the imputation/drop step.
data.shape

In [None]:
# Durations (T) and event indicator (E) passed to the Kaplan-Meier fits below.
T, E = data["time"], data["status"]

# Histogram of the durations, 50 bins.
plt.hist(T, bins=50)
plt.show()

In [None]:
# Non-parametric model: Kaplan-Meier curve fitted on all rows.
# fit() returns the fitter itself, so construction and fitting can be chained.
kmf = KaplanMeierFitter().fit(durations=T, event_observed=E)
kmf.plot_survival_function()

In [None]:
# Plot the fitted survival-function table and title the axes it was drawn on.
kmf.survival_function_.plot().set_title('Survival function')

In [None]:
# Plot the cumulative density from the fitted Kaplan-Meier model.
kmf.plot_cumulative_density()

In [None]:
# Median survival time reported by the fitted Kaplan-Meier model.
kmf.median_survival_time_

In [None]:
from lifelines.utils import median_survival_times

# Median survival time and the median computed from the fitted model's
# confidence-interval table; printed one per line.
median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_, median_confidence_interval_, sep="\n")

In [None]:
# Compare the Kaplan-Meier curves of the two `sex` groups on one axes.
ax = plt.subplot(111)

# Rows whose recoded `sex` value is 0 form the group labelled "Male" below;
# the complement is labelled "Female".
m = (data["sex"] == 0)

# The first group gets its own fitter so that both fits are still available
# when the at-risk table is drawn. Reusing a single fitter with
# at_risk_counts=True on the last plot call tabulates only the last group,
# even though the figure shows both curves.
kmf_male = KaplanMeierFitter().fit(durations=T[m], event_observed=E[m], label="Male")
kmf_male.plot_survival_function(ax=ax)

# `kmf` is refit on the complementary group, so it is left in the same state
# for later cells as before this change.
kmf.fit(durations=T[~m], event_observed=E[~m], label="Female")
kmf.plot_survival_function(ax=ax)

# Title the plotting axes explicitly rather than whichever axes is current.
ax.set_title("Survival of different gender group")

# At-risk table covering both groups, placed under the shared x-axis.
add_at_risk_counts(kmf_male, kmf, ax=ax)
plt.tight_layout()

In [None]:
# One Kaplan-Meier panel per distinct `myc` value, in ascending order.
myc_levels = np.sort(data["myc"].unique())

# Size the grid from the number of groups. A fixed plt.subplot(2, 2, i + 1)
# layout raises ValueError as soon as there is a fifth group.
n_cols = 2
n_rows = max(1, -(-len(myc_levels) // n_cols))  # ceiling division, at least one row
base_width, base_height = plt.rcParams["figure.figsize"]
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    squeeze=False,  # always a 2-D array of axes, even for a single row
    figsize=(base_width, base_height * max(n_rows, 2) / 2),  # keep per-panel height of the 2x2 layout
)
panels = axes.ravel()

for ax, myc_level in zip(panels, myc_levels):
    ix = data['myc'] == myc_level
    kmf.fit(T[ix], E[ix], label = myc_level)
    kmf.plot_survival_function(ax = ax, legend = False)
    # Set title and limits on the panel itself instead of the "current" axes.
    ax.set_title(myc_level)
    # Fixed x-range so panels share a scale.
    # NOTE(review): 1200 presumably covers the observed durations -- confirm.
    ax.set_xlim(0, 1200)

# Hide grid cells that have no group to show (odd number of groups).
for unused_panel in panels[len(myc_levels):]:
    unused_panel.set_visible(False)

plt.tight_layout()