## **Combination of Deep Neural Network & Cox Model**
#### **For the Final Project of the Master Degree in JNU**

### 1 Related Package

In [1]:
%pip install pandas lifelines


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### 2 Read Dataset

In [2]:
import pandas as pd
import numpy as np

file_path = 'data/gbsg.csv'

# 数据读取
if '.csv' in file_path:
    data_df = pd.read_csv(file_path)
else:
    data_df = pd.read_excel(file_path)

# 计算持续时间
# data_df['duration'] = data_df['stop'] - data_df['start']
print(data_df.columns)

# 原始数据
ori_data_df = data_df
# 用于神经网络的数据
dnn_data_df = data_df

Index(['Unnamed: 0', 'pid', 'age', 'meno', 'size', 'grade', 'nodes', 'pgr',
       'er', 'hormon', 'rfstime', 'status'],
      dtype='object')


In [3]:
dnn_data_df

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
0,1,132,49,0,18,2,2,0,0,0,1838,0
1,2,1575,55,1,20,3,16,0,0,0,403,1
2,3,1140,56,1,40,3,3,0,0,0,1603,0
3,4,769,45,0,25,3,1,0,4,0,177,0
4,5,130,65,1,30,2,5,0,36,1,1855,0
...,...,...,...,...,...,...,...,...,...,...,...,...
681,682,586,51,0,30,3,2,1152,38,1,1760,0
682,683,1273,64,1,26,2,2,1356,1144,1,1152,0
683,684,1525,57,1,35,3,1,1490,209,1,1342,0
684,685,736,44,0,21,2,3,1600,70,0,629,0


### Deep Neural Network

In [14]:
import torch  
import torch.nn as nn

from sklearn.model_selection import train_test_split  
from torch.utils.data import DataLoader, TensorDataset  
from sklearn.preprocessing import StandardScaler 

class FeatureAugDNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(FeatureAugDNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 64) 
        self.fc3 = nn.Linear(64, 8)   
        self.fc4 = nn.Linear(8, num_classes) 
        self.relu = nn.ReLU()

    def forward(self, x):    
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))  
        x = self.relu(self.fc4(x))  
        out = x  
        
        return out  

co_var = ['age', 'meno', 'size', 'grade', 'nodes', 'pgr', 'er', 'hormon']
Y_df = dnn_data_df['status']
X_df = dnn_data_df[co_var]


X_data_array = X_df.values
Y_data_array = Y_df.values

batch_size = 8 

# 归一化
scaler = StandardScaler() 
X_data_array = scaler.fit_transform(X_data_array)  


X_train, X_test, y_train, y_test = train_test_split(X_data_array, Y_data_array, test_size=0.2, random_state=2024)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)  
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)  
y_test_tensor = torch.tensor(y_test, dtype=torch.long) 

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False) 


model = FeatureAugDNN(input_dim=len(co_var), num_classes=2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.0001)


# 训练模型  
num_epochs = 800

for epoch in range(num_epochs):  
    for i, (inputs, labels) in enumerate(train_loader):  
        # 前向传播 
        outputs = model(inputs)  
        loss = criterion(outputs, labels) 
        # 反向传播和优化  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step() 
    if (epoch+1) % 20 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')  

# 在测试集上评估模型（这里只是打印损失，你可以添加更多的评估指标如准确率）  
model.eval()  # 设置模型为评估模式  
with torch.no_grad():  # 不需要计算梯度  
    test_loss = 0  
    for inputs, labels in DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32):  
        outputs = model(inputs)  
        test_loss += criterion(outputs, labels).item()  

    test_loss /= len(DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32))  
    print(f'Test Loss: {test_loss}')  

Epoch [20/800], Loss: 0.6843211054801941
Epoch [40/800], Loss: 0.6626784205436707
Epoch [60/800], Loss: 0.62518310546875
Epoch [80/800], Loss: 0.5534219145774841
Epoch [100/800], Loss: 0.5028792023658752
Epoch [120/800], Loss: 0.4611772894859314
Epoch [140/800], Loss: 0.4217883050441742
Epoch [160/800], Loss: 0.38898882269859314
Epoch [180/800], Loss: 0.3619275689125061
Epoch [200/800], Loss: 0.3373410105705261
Epoch [220/800], Loss: 0.3144283890724182
Epoch [240/800], Loss: 0.29469531774520874
Epoch [260/800], Loss: 0.2773706614971161
Epoch [280/800], Loss: 0.2622169256210327
Epoch [300/800], Loss: 0.24893078207969666
Epoch [320/800], Loss: 0.2379922717809677
Epoch [340/800], Loss: 0.22580529749393463
Epoch [360/800], Loss: 0.2141113579273224
Epoch [380/800], Loss: 0.20340166985988617
Epoch [400/800], Loss: 0.1942615509033203
Epoch [420/800], Loss: 0.18589818477630615
Epoch [440/800], Loss: 0.1780509203672409
Epoch [460/800], Loss: 0.1702110469341278
Epoch [480/800], Loss: 0.163034170

In [15]:
all_outputs_tensor = torch.tensor(X_data_array, dtype=torch.float32) 
all_outputs = model(all_outputs_tensor)
print(all_outputs.shape) 

# 将tensor转换为numpy数组（pandas可以直接处理numpy数组）  
all_outputs_array = all_outputs.detach().numpy()
# 创建一个DataFrame，并指定列名  
all_outputs_df = pd.DataFrame(all_outputs_array, columns=['new_var1', 'new_var2'])  


torch.Size([686, 2])


### Cox Model

In [18]:
concat_df_cols = pd.concat([ori_data_df, all_outputs_df], axis=1)
concat_df_cols

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status,new_var1,new_var2
0,1,132,49,0,18,2,2,0,0,0,1838,0,1.467206,0.075188
1,2,1575,55,1,20,3,16,0,0,0,403,1,0.000000,1.465647
2,3,1140,56,1,40,3,3,0,0,0,1603,0,0.793895,0.000000
3,4,769,45,0,25,3,1,0,4,0,177,0,2.377056,0.000000
4,5,130,65,1,30,2,5,0,36,1,1855,0,0.000000,1.382362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,682,586,51,0,30,3,2,1152,38,1,1760,0,6.339674,0.000000
682,683,1273,64,1,26,2,2,1356,1144,1,1152,0,4.612369,0.944075
683,684,1525,57,1,35,3,1,1490,209,1,1342,0,7.892196,0.000000
684,685,736,44,0,21,2,3,1600,70,0,629,0,6.055097,0.000000


In [19]:
from lifelines import CoxPHFitter  

  
# 假设你的CSV文件有以下列：'duration', 'event', 'var1', 'var2', ... 其中'duration'是时间，'event'是事件发生（1）或未发生（0）  
# 你需要根据你的数据集来指定这些列名  
duration_col = 'rfstime'  
event_col = 'status'  
covariates = ['age', 'meno', 'size', 'grade', 'nodes', 'pgr', 'er', 'hormon', 'new_var1', 'new_var2', 'rfstime', 'status']  # 替换为你的协变量列名  
  
# 使用CoxPHFitter进行拟合  
cph = CoxPHFitter()  
# cph.fit(ori_data_df[covariates], duration_col=duration_col, event_col=event_col)  
cph.fit(concat_df_cols[covariates], duration_col=duration_col, event_col=event_col)
  
# 输出模型的摘要信息  
cph.print_summary()
  
# # 你也可以预测个体的风险  
# # 例如，预测一个新的观测值的风险，其中var1=1, var2=2, var3=3  
# individual_risk = cph.predict_partial_hazard(pd.DataFrame({'trt': [1], 'fev': [2], 'fev2': [3]}))  
# print(f"Predicted partial hazard: {individual_risk[0]}")  
  
# # 你还可以预测个体的生存函数或风险函数等  
# # 例如，预测生存函数  
# survival_function = cph.predict_survival_function(pd.DataFrame({'trt': [1], 'fev': [2], 'fev2': [3]}))  
# print(survival_function.iloc[:, 0])

0,1
model,lifelines.CoxPHFitter
duration col,'rfstime'
event col,'status'
baseline estimation,breslow
number of observations,686
number of events observed,299
partial log-likelihood,-1660.40
time fit was run,2024-06-02 13:57:12 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.0,1.0,0.01,-0.02,0.02,0.98,1.02,0.0,0.1,0.92,0.12
meno,-0.25,0.78,0.22,-0.68,0.18,0.51,1.2,0.0,-1.14,0.25,1.97
size,0.0,1.0,0.0,-0.0,0.01,1.0,1.01,0.0,0.87,0.38,1.38
grade,0.15,1.17,0.12,-0.09,0.39,0.92,1.48,0.0,1.24,0.21,2.23
nodes,0.01,1.01,0.01,-0.01,0.03,0.99,1.03,0.0,0.69,0.49,1.03
pgr,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.18,0.86,0.22
er,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.72,0.47,1.09
hormon,-0.07,0.93,0.16,-0.39,0.25,0.68,1.29,0.0,-0.42,0.67,0.57
new_var1,-0.88,0.42,0.13,-1.12,-0.63,0.33,0.53,0.0,-6.95,<0.005,37.95
new_var2,0.39,1.47,0.09,0.21,0.57,1.23,1.76,0.0,4.19,<0.005,15.15

0,1
Concordance,0.76
Partial AIC,3340.80
log-likelihood ratio test,255.41 on 10 df
-log2(p) of ll-ratio test,160.80
