In [1]:
import numpy as np
import pandas as pd

from scipy.stats import chi2
from lifelines import CoxPHFitter

Read the data

In [2]:
# Whitespace-delimited file; the separator regex is a raw string so that "\s" is not
# parsed as an (invalid) string escape sequence.
DatOriginal = pd.read_csv("DatasetsMedicalResearch/Treatment of hypernephroma.dat", sep=r"\s+")

  DatOriginal = pd.read_csv("DatasetsMedicalResearch/Treatment of hypernephroma.dat", sep="\s+")


In [3]:
# Show the first rows of the loaded table as a quick check of the parsed columns.
DatOriginal.head()

Unnamed: 0,nephrectomy,age,time,status
0,0,1,9,1
1,0,1,6,1
2,0,1,21,1
3,0,2,15,1
4,0,2,8,1


---

Function that calculates the test statistic and $P$-value to compare two nested models based on the log-likelihood ratio test.

In [4]:
def LoglikeRatioTest(minus2loglike_model1, minus2loglike_model2, df_model1, df_model2):
    """Likelihood-ratio test comparing two nested models.

    model1 must be nested in model2, i.e. model2 has more variables than model1.

    Parameters
    ----------
    minus2loglike_model1 : float
        Value of -2 log-likelihood for the smaller (nested) model.
    minus2loglike_model2 : float
        Value of -2 log-likelihood for the larger model.
    df_model1 : int
        Number of fitted parameters in the smaller model.
    df_model2 : int
        Number of fitted parameters in the larger model.

    Returns
    -------
    tuple
        ``(test_statistic, p_value)``, where the statistic is the reduction in
        -2 log-likelihood and the p-value is the upper tail of a chi-squared
        distribution with ``df_model2 - df_model1`` degrees of freedom.

    Raises
    ------
    ValueError
        If ``df_model2`` is not strictly greater than ``df_model1`` (the models
        were passed in the wrong order or are not nested).
    """
    extra_df = df_model2 - df_model1
    # A non-positive df difference would make chi2.sf silently return NaN.
    if np.any(np.less_equal(extra_df, 0)):
        raise ValueError(
            "model2 must have more parameters than model1 "
            "(got df_model1={}, df_model2={})".format(df_model1, df_model2)
        )
    test_statistic = minus2loglike_model1 - minus2loglike_model2
    p_value = chi2.sf(test_statistic, extra_df)
    return test_statistic, p_value

---

The data only has two prognostic variables: age (a three-level factor) and nephrectomy.

We can create a model for each possible combination of these variables, with and without their interaction, i.e., we can build 5 models (including the null model).

In [5]:
# Candidate Cox models, from single-factor models up to the model with the interaction.
formulas = [
    'C(age)', 'C(nephrectomy)', 'C(age)+C(nephrectomy)', 'C(age)+C(nephrectomy)+C(age)*C(nephrectomy)'
]

# One entry per model; the null model is added first (see the i == 0 branch below).
MinusTwoLogLike = []
Aic = []
DegreesFreedom = []

for i, f in enumerate(formulas):

    cph = CoxPHFitter()
    Model = cph.fit(DatOriginal, 'time', 'status', formula=f)
    minus2loglike = -2*Model.log_likelihood_

    if i == 0:
        # The null model is not fitted directly. Its -2 log-likelihood is recovered from the
        # first fitted model: the likelihood-ratio statistic against the null model is the
        # difference between the two -2 log-likelihoods.
        # A descriptive name is used instead of `_`, which IPython reserves for the last output.
        null_minus2loglike = minus2loglike + Model.log_likelihood_ratio_test().test_statistic
        MinusTwoLogLike.append(null_minus2loglike)
        DegreesFreedom.append(0)
        # With no parameters the AIC equals -2 log-likelihood.
        Aic.append(null_minus2loglike)

    MinusTwoLogLike.append(minus2loglike)
    # One summary row per estimated coefficient.
    DegreesFreedom.append(len(Model.summary))
    Aic.append(Model.AIC_partial_)

# Label for the null model row, aligned with the extra entry appended when i == 0.
formulas.insert(0, 'None')

ModelSummary = pd.DataFrame({'formula': formulas, '-2log-L': MinusTwoLogLike, 'AIC': Aic, 'df': DegreesFreedom})

In [6]:
# Display the model comparison table with numeric columns rounded to three decimals.
ModelSummary.round(3)

Unnamed: 0,formula,-2log-L,AIC,df
0,,176.586,176.586,0
1,C(age),170.801,174.801,2
2,C(nephrectomy),169.022,171.022,1
3,C(age)+C(nephrectomy),164.223,170.223,3
4,C(age)+C(nephrectomy)+C(age)*C(nephrectomy),160.833,170.833,5


Is there an interaction between age and nephrectomy?

In [7]:
# Likelihood-ratio test for the interaction term: the additive model C(age)+C(nephrectomy)
# (row 3) is nested in the model that also contains C(age)*C(nephrectomy) (row 4).
# Rows are selected in the order [smaller model, larger model] to match the function signature.
np.round(
    LoglikeRatioTest(
        *ModelSummary.loc[[3, 4], '-2log-L'],
        *ModelSummary.loc[[3, 4], 'df'],
    ),
    3,
)

# We do not reject the null hypothesis, so there is no evidence of an interaction between age and nephrectomy.

array([3.39 , 0.184])

Is age needed in the model?

In [8]:
# Likelihood-ratio test for age: the model C(nephrectomy) (row 2) is nested in the
# additive model C(age)+C(nephrectomy) (row 3), i.e. the smaller model omits C(age).
# Rows are selected in the order [smaller model, larger model] to match the function signature.
np.round(
    LoglikeRatioTest(
        *ModelSummary.loc[[2, 3], '-2log-L'],
        *ModelSummary.loc[[2, 3], 'df'],
    ),
    3,
)

# The null hypothesis is rejected at the 10% level only (P = 0.091, not significant at 5%); on that basis age is kept in the model.

array([4.799, 0.091])

Is nephrectomy needed in the model?

In [9]:
# Likelihood-ratio test for nephrectomy: the model C(age) (row 1) is nested in the
# additive model C(age)+C(nephrectomy) (row 3), i.e. the smaller model omits C(nephrectomy).
# Rows are selected in the order [smaller model, larger model] to match the function signature.
np.round(
    LoglikeRatioTest(
        *ModelSummary.loc[[1, 3], '-2log-L'],
        *ModelSummary.loc[[1, 3], 'df'],
    ),
    3,
)

# We reject the null hypothesis (P = 0.01), so nephrectomy is needed in the model.

array([6.577, 0.01 ])

We could have checked the model with the smallest AIC

In [11]:
# idxmin() returns the index *label* of the smallest AIC, which is what label-based .loc expects
# (argmin() returns a position and only matches the label on a default RangeIndex).
ModelSummary.loc[ModelSummary['AIC'].idxmin()]

formula    C(age)+C(nephrectomy)
-2log-L               164.223322
AIC                   170.223322
df                             3
Name: 3, dtype: object