# Building models for the survival of multiple myeloma patients

In [1]:
import numpy as np
import pandas as pd

from scipy.stats import chi2
from lifelines import CoxPHFitter

Read the data

In [2]:
# Whitespace-delimited file. The separator is a regular expression, so it is written as a
# raw string: in a plain literal "\s" is an invalid escape sequence and triggers a warning.
DatOriginal = pd.read_csv("DatasetsMedicalResearch/Survival of multiple myeloma patients.dat", sep=r"\s+")

  DatOriginal = pd.read_csv("DatasetsMedicalResearch/Survival of multiple myeloma patients.dat", sep="\s+")


In [3]:
# Preview the first five patients to check the columns that were read
DatOriginal.head(n=5)

Unnamed: 0,patient,time,status,age,sex,bun,ca,hb,pcells,protein
0,1,13,1,66,1,25,10,14.6,18,1
1,2,52,0,66,1,13,11,12.0,100,0
2,3,6,1,53,2,15,13,11.4,33,1
3,4,40,1,69,1,10,10,10.2,30,1
4,5,10,1,65,1,20,10,13.2,66,0


---

Function that calculates the test statistic and $P$-value to compare two nested models based on the log-likelihood ratio test.

In [4]:
def LoglikeRatioTest(minus2loglike_model1, minus2loglike_model2, df_model1, df_model2):
    """Likelihood-ratio test between two nested models.

    model1 must be nested in model2, i.e. model2 has more variables than model1.

    Parameters
    ----------
    minus2loglike_model1, minus2loglike_model2
        Value of -2 log L for each model.
    df_model1, df_model2
        Degrees of freedom (number of fitted parameters) of each model.

    Returns
    -------
    tuple
        (test_statistic, p_value): the difference between the two -2 log L values,
        and the chi-square survival function evaluated at that difference with
        df_model2 - df_model1 degrees of freedom.
    """
    extra_parameters = df_model2 - df_model1
    deviance_drop = minus2loglike_model1 - minus2loglike_model2
    return deviance_drop, chi2.sf(deviance_drop, extra_parameters)

---

In this case, we have seven explanatory variables, so it becomes impractical to fit models for all of their combinations. Thus, we will follow the strategy below.

1. First, we fit models that contain one variable at a time. The variables that appear important compared to the null model are then considered in the subsequent step.

In [5]:
# Step 1: fit one Cox model per explanatory variable and tabulate -2 log L, AIC and df,
# with an extra first row standing for the model without covariates.
formulas = ['age', 'sex', 'bun', 'ca', 'hb', 'pcells', 'protein']

MinusTwoLogLike, Aic, DegreesFreedom = [], [], []

for position, covariate in enumerate(formulas):

    Model = CoxPHFitter().fit(DatOriginal, 'time', 'status', formula=covariate)
    minus_two_loglike = -2*Model.log_likelihood_

    if position == 0:
        # No covariate-free model is fitted explicitly. Its -2 log L is rebuilt from the
        # first fit by adding that fit's likelihood-ratio statistic (presumably computed
        # by lifelines against the covariate-free model -- confirm in the lifelines docs).
        null_minus_two_loglike = minus_two_loglike + Model.log_likelihood_ratio_test().test_statistic
        MinusTwoLogLike.append(null_minus_two_loglike)
        DegreesFreedom.append(0)
        # With zero parameters the same value is stored as the AIC.
        Aic.append(null_minus_two_loglike)

    MinusTwoLogLike.append(minus_two_loglike)
    # The number of rows in the fit summary is used as the model's degrees of freedom.
    DegreesFreedom.append(len(Model.summary))
    Aic.append(Model.AIC_partial_)

# Label for the extra first row, so that the labels line up with the collected values.
formulas = ['None'] + formulas

ModelSummary = pd.DataFrame(
    {
        'formula': formulas,
        '-2log-L': MinusTwoLogLike,
        'AIC': Aic,
        'df': DegreesFreedom
    }
)

In [6]:
# P-value of every row of the summary against the model stored in row 0.
# Row 0 is compared with itself on 0 degrees of freedom; its P-value shows up empty in the table.
null_minus_two_loglike = ModelSummary.loc[0, '-2log-L']

PValues = [
    LoglikeRatioTest(
        null_minus_two_loglike,
        ModelSummary.loc[row, '-2log-L'],
        0,
        ModelSummary.loc[row, 'df'],
    )[1]  # keep the P-value only, discard the test statistic
    for row in range(len(formulas))
]

ModelSummary['Pvalue'] = PValues

In [7]:
# Show the single-covariate summary rounded to three decimals
ModelSummary.round(decimals=3)

Unnamed: 0,formula,-2log-L,AIC,df,Pvalue
0,,214.678498,214.678498,0,
1,age,214.54065,216.54065,1,0.710431
2,sex,214.645484,216.645484,1,0.855822
3,bun,205.316102,207.316102,1,0.002215
4,ca,214.20391,216.20391,1,0.490884
5,hb,209.661617,211.661617,1,0.025101
6,pcells,214.601188,216.601188,1,0.780977
7,protein,212.510535,214.510535,1,0.140913


From the previous $P$-values, considering a threshold of 0.15, we deduce that bun, hb and possibly protein appear to be important.

2. The variables that appear to be important from step 1 are then fitted together. In the presence of certain variables, others may cease to be important. Those that do not lead to a significant increase in the value of $-2\log \hat{L}$ when omitted can be discarded.

In [8]:
# Step 2: joint models built from the step-1 candidates (bun, hb, protein).
formulas = [
    'bun+hb', 'bun+protein', 'hb+protein',
    'bun+hb+protein'
]

records = []
for formula in formulas:
    Model = CoxPHFitter().fit(DatOriginal, 'time', 'status', formula=formula)
    records.append(
        {
            'formula': formula,
            '-2log-L': -2*Model.log_likelihood_,
            'AIC': Model.AIC_partial_,
            # number of rows in the fit summary, used as the model's degrees of freedom
            'df': len(Model.summary),
        }
    )

# Append the new models below the existing rows and renumber the index from 0.
ModelSummary = pd.concat([ModelSummary, pd.DataFrame(records)], ignore_index=True)

In [9]:
# Summary table including the step-2 models, rounded to three decimals
ModelSummary.round(decimals=3)

Unnamed: 0,formula,-2log-L,AIC,df,Pvalue
0,,214.678498,214.678498,0,
1,age,214.54065,216.54065,1,0.710431
2,sex,214.645484,216.645484,1,0.855822
3,bun,205.316102,207.316102,1,0.002215
4,ca,214.20391,216.20391,1,0.490884
5,hb,209.661617,211.661617,1,0.025101
6,pcells,214.601188,216.601188,1,0.780977
7,protein,212.510535,214.510535,1,0.140913
8,bun+hb,200.698995,204.698995,2,
9,bun+protein,200.988705,204.988705,2,


In [10]:
# Likelihood-ratio test for omitting protein:
# bun+hb (row 8) against bun+hb+protein (row 11)
reduced, full = ModelSummary.loc[8], ModelSummary.loc[11]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# protein is borderline and might be worth keeping, but let's omit it for now

array([2.798, 0.094])

In [11]:
# Likelihood-ratio test for omitting hb:
# bun+protein (row 9) against bun+hb+protein (row 11)
reduced, full = ModelSummary.loc[9], ModelSummary.loc[11]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# hb should not be omitted

array([3.087, 0.079])

In [12]:
# Likelihood-ratio test for omitting bun:
# hb+protein (row 10) against bun+hb+protein (row 11)
reduced, full = ModelSummary.loc[10], ModelSummary.loc[11]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# bun should not be omitted

array([1.0452e+01, 1.0000e-03])

For now, let us ommit the variable protein, so we consider just bun and hb.

We now compare the model that has bun and hb with the models that omit either of them.

In [13]:
# Likelihood-ratio test for omitting hb:
# bun (row 3) against bun+hb (row 8)
reduced, full = ModelSummary.loc[3], ModelSummary.loc[8]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# hb should not be omitted

array([4.617, 0.032])

In [23]:
# Likelihood-ratio test for omitting bun:
# hb (row 5) against bun+hb (row 8)
reduced, full = ModelSummary.loc[5], ModelSummary.loc[8]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# bun should not be omitted

array([8.963e+00, 3.000e-03])

Neither bun nor hb can be excluded from the model.

3. Variables that were not important on their own may become important in the presence of others. We add such variables one at a time to the model. Those that reduce $-2\log\hat{L}$ significantly are retained.

In [15]:
# Step 3: add each variable discarded in step 1, one at a time, to the bun+hb model.
formulas = [
    'bun+hb+age', 'bun+hb+sex', 'bun+hb+ca', 'bun+hb+pcells'
]

fitted_models = [
    CoxPHFitter().fit(DatOriginal, 'time', 'status', formula=formula)
    for formula in formulas
]

step3_rows = pd.DataFrame(
    {
        'formula': formulas,
        '-2log-L': [-2*fitted.log_likelihood_ for fitted in fitted_models],
        'AIC': [fitted.AIC_partial_ for fitted in fitted_models],
        # number of rows in each fit summary, used as the model's degrees of freedom
        'df': [len(fitted.summary) for fitted in fitted_models]
    }
)

# Append the new models below the existing rows and renumber the index from 0.
ModelSummary = pd.concat([ModelSummary, step3_rows], ignore_index=True)

In [16]:
# Summary table including the bun+hb+<variable> models, rounded to three decimals
ModelSummary.round(decimals=3)

Unnamed: 0,formula,-2log-L,AIC,df,Pvalue
0,,214.678498,214.678498,0,
1,age,214.54065,216.54065,1,0.710431
2,sex,214.645484,216.645484,1,0.855822
3,bun,205.316102,207.316102,1,0.002215
4,ca,214.20391,216.20391,1,0.490884
5,hb,209.661617,211.661617,1,0.025101
6,pcells,214.601188,216.601188,1,0.780977
7,protein,212.510535,214.510535,1,0.140913
8,bun+hb,200.698995,204.698995,2,
9,bun+protein,200.988705,204.988705,2,


In [17]:
# Likelihood-ratio test for adding age:
# bun+hb (row 8) against bun+hb+age (row 12)
reduced, full = ModelSummary.loc[8], ModelSummary.loc[12]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# age should not be added

array([0.246, 0.62 ])

In [18]:
# Likelihood-ratio test for adding sex:
# bun+hb (row 8) against bun+hb+sex (row 13)
reduced, full = ModelSummary.loc[8], ModelSummary.loc[13]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# sex should not be added

array([0.393, 0.531])

In [19]:
# Likelihood-ratio test for adding ca:
# bun+hb (row 8) against bun+hb+ca (row 14)
reduced, full = ModelSummary.loc[8], ModelSummary.loc[14]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# ca should not be added

array([0.001, 0.979])

In [20]:
# Likelihood-ratio test for adding pcells:
# bun+hb (row 8) against bun+hb+pcells (row 15)
reduced, full = ModelSummary.loc[8], ModelSummary.loc[15]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# pcells should not be added

array([0.225, 0.635])

---

When we compared the models bun+hb+protein and bun+hb, we got a $P$-value of 0.094. So there is slight evidence that the models are different, and we can decide to keep protein instead of omitting it.

Let's keep it now, and add each of the variables that were considered unimportant.

In [25]:
# Add each variable discarded in step 1, one at a time, to the bun+hb+protein model.
formulas = [
    'bun+hb+protein+age', 'bun+hb+protein+sex', 'bun+hb+protein+ca', 'bun+hb+protein+pcells'
]

rows = []
for formula in formulas:
    Model = CoxPHFitter().fit(DatOriginal, 'time', 'status', formula=formula)
    # (formula, -2 log L, AIC, df); df is the number of rows in the fit summary
    rows.append((formula, -2*Model.log_likelihood_, Model.AIC_partial_, len(Model.summary)))

extended_rows = pd.DataFrame(rows, columns=['formula', '-2log-L', 'AIC', 'df'])

# Append the new models below the existing rows and renumber the index from 0.
ModelSummary = pd.concat([ModelSummary, extended_rows], ignore_index=True)

In [26]:
# Summary table including the bun+hb+protein+<variable> models, rounded to three decimals
ModelSummary.round(decimals=3)

Unnamed: 0,formula,-2log-L,AIC,df,Pvalue
0,,214.678498,214.678498,0,
1,age,214.54065,216.54065,1,0.710431
2,sex,214.645484,216.645484,1,0.855822
3,bun,205.316102,207.316102,1,0.002215
4,ca,214.20391,216.20391,1,0.490884
5,hb,209.661617,211.661617,1,0.025101
6,pcells,214.601188,216.601188,1,0.780977
7,protein,212.510535,214.510535,1,0.140913
8,bun+hb,200.698995,204.698995,2,
9,bun+protein,200.988705,204.988705,2,


In [27]:
# Likelihood-ratio test for adding age:
# bun+hb+protein (row 11) against bun+hb+protein+age (row 16)
reduced, full = ModelSummary.loc[11], ModelSummary.loc[16]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# age should not be added

array([0.299, 0.584])

In [28]:
# Likelihood-ratio test for adding sex:
# bun+hb+protein (row 11) against bun+hb+protein+sex (row 17)
reduced, full = ModelSummary.loc[11], ModelSummary.loc[17]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# sex should not be added

array([0.327, 0.568])

In [29]:
# Likelihood-ratio test for adding ca:
# bun+hb+protein (row 11) against bun+hb+protein+ca (row 18)
reduced, full = ModelSummary.loc[11], ModelSummary.loc[18]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# ca should not be added

array([0.04 , 0.842])

In [30]:
# Likelihood-ratio test for adding pcells:
# bun+hb+protein (row 11) against bun+hb+protein+pcells (row 19)
reduced, full = ModelSummary.loc[11], ModelSummary.loc[19]
np.round(
    LoglikeRatioTest(reduced['-2log-L'], full['-2log-L'], reduced['df'], full['df']),
    3
)
# pcells should not be added

array([0.   , 0.985])

Thus, we prefer the model that includes the variables bun, hb, and protein.

Furthermore, we can also look at the AIC, which gives the same suggestion.

Ultimately, one might wish to include protein in the model so as not to miss anything.

In [34]:
# Row of the summary table with the smallest AIC
best_position = ModelSummary['AIC'].argmin()
ModelSummary.iloc[best_position]

formula    bun+hb+protein
-2log-L        197.901273
AIC            203.901273
df                      3
Pvalue                NaN
Name: 11, dtype: object