## **Combination of Deep Neural Network & Cox Model**
#### **For the Final Project of the Master Degree in JNU**

### 1 Related Package

In [None]:
%pip install pandas lifelines

### 2 Read Dataset

In [156]:
import pandas as pd
import numpy as np

file_path = 'data/rhdnase_data.xlsx'

# 数据读取
if '.csv' in file_path:
    data_df = pd.read_csv(file_path)
else:
    data_df = pd.read_excel(file_path)

# 计算持续时间
data_df['duration'] = data_df['stop'] - data_df['start']
print(data_df.columns)

# 原始数据
ori_data_df = data_df
# 用于神经网络的数据
dnn_data_df = data_df

Index(['id', 'trt', 'fev', 'fev2', 'start', 'stop', 'event', 'etype', 'enum',
       'enum1', 'emum2', 'duration'],
      dtype='object')


In [157]:
dnn_data_df

Unnamed: 0,id,trt,fev,fev2,start,stop,event,etype,enum,enum1,emum2,duration
0,493301,1,28.8,28.1,0,168,0,1,1,1,0,168
1,493303,1,64.0,63.0,0,169,0,1,1,1,0,169
2,493305,0,67.2,68.7,0,65,1,1,1,1,0,65
3,493305,0,67.2,68.7,65,75,1,2,2,1,1,10
4,493305,0,67.2,68.7,75,168,0,1,3,2,1,93
...,...,...,...,...,...,...,...,...,...,...,...,...
1321,1024306,0,48.0,46.9,0,169,0,1,1,1,0,169
1322,1024309,0,51.2,52.7,0,170,0,1,1,1,0,170
1323,1024311,1,96.0,95.5,0,169,0,1,1,1,0,169
1324,1024312,1,54.4,54.1,0,169,0,1,1,1,0,169


### Comp

In [179]:
filtered_df_list = []
for i in range(0, 181, 10):
    filtered_df = ori_data_df[(ori_data_df['start'] < i) & (ori_data_df['stop'] > i) & (ori_data_df['event'] == 1)]
    filtered_df_list.append(filtered_df)

filtered_df_final = pd.concat(filtered_df_list, axis=0, ignore_index=True)
filtered_df_final

Unnamed: 0,id,trt,fev,fev2,start,stop,event,etype,enum,enum1,emum2,duration
0,493305,0,67.2,68.7,0,65,1,1,1,1,0,65
1,493313,0,32.0,32.4,0,90,1,1,1,1,0,90
2,589302,0,28.8,29.2,8,22,1,2,2,1,1,14
3,589303,0,112.0,110.7,0,60,1,1,1,1,0,60
4,589305,0,70.4,71.7,0,50,1,1,1,1,0,50
...,...,...,...,...,...,...,...,...,...,...,...,...
2647,1005324,0,41.6,41.6,152,162,1,2,2,1,1,10
2648,918313,1,41.6,40.8,157,171,1,2,4,2,2,14
2649,972310,1,89.6,89.1,164,171,1,2,6,3,3,7
2650,980774,0,41.6,40.4,158,171,1,2,8,4,4,13


### Deep Neural Network

In [126]:
import torch  
import torch.nn as nn

from sklearn.model_selection import train_test_split  
from torch.utils.data import DataLoader, TensorDataset  
from sklearn.preprocessing import StandardScaler 

class FeatureAugDNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(FeatureAugDNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 64) 
        self.fc3 = nn.Linear(64, 8)   
        self.fc4 = nn.Linear(8, num_classes) 
        self.relu = nn.ReLU()

    def forward(self, x, is_fc3=False):    
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        fc3_x = self.relu(self.fc3(x))  
        x = self.relu(self.fc4(fc3_x))  
        out = x  

        if is_fc3 is False:
            return out  
        else:
            return fc3_x

co_var = ['trt', 'fev', 'fev2']
Y_df = dnn_data_df['event']
X_df = dnn_data_df[co_var]


X_data_array = X_df.values
Y_data_array = Y_df.values

batch_size = 8 

# 归一化
scaler = StandardScaler() 
X_data_array = scaler.fit_transform(X_data_array)  


X_train, X_test, y_train, y_test = train_test_split(X_data_array, Y_data_array, test_size=0.2, random_state=2024)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)  
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)  
y_test_tensor = torch.tensor(y_test, dtype=torch.long) 

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False) 


model = FeatureAugDNN(input_dim=len(co_var), num_classes=2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.1)


# 训练模型  
num_epochs = 800

for epoch in range(num_epochs):  
    for i, (inputs, labels) in enumerate(train_loader):  
        # 前向传播 
        outputs = model(inputs, is_fc3=False)  
        loss = criterion(outputs, labels) 
        # 反向传播和优化  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step() 
    if (epoch+1) % 20 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')  

# 在测试集上评估模型（这里只是打印损失，你可以添加更多的评估指标如准确率）  
model.eval()  # 设置模型为评估模式  
with torch.no_grad():  # 不需要计算梯度  
    test_loss = 0  
    for inputs, labels in DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32):  
        outputs = model(inputs, is_fc3=False)  
        test_loss += criterion(outputs, labels).item()  

    test_loss /= len(DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32))  
    print(f'Test Loss: {test_loss}')  

Epoch [20/800], Loss: 0.7923089265823364
Epoch [40/800], Loss: 0.7626185417175293
Epoch [60/800], Loss: 0.7413303852081299
Epoch [80/800], Loss: 0.7249782681465149
Epoch [100/800], Loss: 0.7089015245437622
Epoch [120/800], Loss: 0.6969414353370667
Epoch [140/800], Loss: 0.6897838711738586
Epoch [160/800], Loss: 0.6839803457260132
Epoch [180/800], Loss: 0.6788749694824219
Epoch [200/800], Loss: 0.6764603853225708
Epoch [220/800], Loss: 0.6741312146186829
Epoch [240/800], Loss: 0.6724860668182373
Epoch [260/800], Loss: 0.6691049337387085
Epoch [280/800], Loss: 0.666946530342102
Epoch [300/800], Loss: 0.6619800329208374
Epoch [320/800], Loss: 0.6567709445953369
Epoch [340/800], Loss: 0.655096173286438
Epoch [360/800], Loss: 0.6535453200340271
Epoch [380/800], Loss: 0.6530414819717407
Epoch [400/800], Loss: 0.6522413492202759
Epoch [420/800], Loss: 0.6507343649864197
Epoch [440/800], Loss: 0.6525105834007263
Epoch [460/800], Loss: 0.6522207856178284
Epoch [480/800], Loss: 0.651651740074157

In [139]:
all_outputs_tensor = torch.tensor(X_data_array, dtype=torch.float32) 
all_outputs = model(all_outputs_tensor, is_fc3=False)
print(all_outputs.shape) 

# 将tensor转换为numpy数组（pandas可以直接处理numpy数组）  
all_outputs_array = all_outputs.detach().numpy()
# 创建一个DataFrame，并指定列名  
all_outputs_df = pd.DataFrame(all_outputs_array, columns=['new_var1', 'new_var2', 
                                                        #   'new_var3', 'new_var4', 
                                                        #   'new_var5', 'new_var6', 
                                                        #   'new_var7', 'new_var8'
                                                          ])  


torch.Size([1326, 2])


### Cox Model

In [182]:
# with deep neural network
# concat_df_cols = pd.concat([ori_data_df, all_outputs_df], axis=1)
# concat_df_cols

# with comp
concat_df_cols = pd.concat([ori_data_df, filtered_df_final], axis=0)
concat_df_cols

Unnamed: 0,id,trt,fev,fev2,start,stop,event,etype,enum,enum1,emum2,duration
0,493301,1,28.8,28.1,0,168,0,1,1,1,0,168
1,493303,1,64.0,63.0,0,169,0,1,1,1,0,169
2,493305,0,67.2,68.7,0,65,1,1,1,1,0,65
3,493305,0,67.2,68.7,65,75,1,2,2,1,1,10
4,493305,0,67.2,68.7,75,168,0,1,3,2,1,93
...,...,...,...,...,...,...,...,...,...,...,...,...
2647,1005324,0,41.6,41.6,152,162,1,2,2,1,1,10
2648,918313,1,41.6,40.8,157,171,1,2,4,2,2,14
2649,972310,1,89.6,89.1,164,171,1,2,6,3,3,7
2650,980774,0,41.6,40.4,158,171,1,2,8,4,4,13


In [183]:
from lifelines import CoxPHFitter  

  
duration_col = 'duration'  
event_col = 'event'  
# covariates = ['age', 'meno', 'size', 'grade', 'nodes', 'pgr', 'er', 'hormon', 'rfstime', 'status']  # 替换为你的协变量列名  
covariates = ['trt', 'fev', 'fev2', 
              # 'new_var1', 'new_var2', 
              # 'new_var3', 'new_var4', 
              # 'new_var5', 'new_var6', 
              # 'new_var7', 'new_var8',
              'duration', 'event']  # 替换为你的协变量列名  

# duration_col = '间隔时间'  
# event_col = 'event'  
# covariates = [
#               '间隔时间', 'event', 
#               '物料推送气缸推送状态', '物料推送气缸收回状态', 
#               '物料推送数', '物料待抓取数', 
#               '放置容器数', '容器上传检测数', '填装检测数',
#               '填装定位器固定状态', '填装定位器放开状态', 
#               '物料抓取数', '填装旋转数', '填装下降数', 
#               '填装数', '加盖检测数',
#               '加盖定位数', '推盖数', '加盖下降数', '加盖数', 
#               '拧盖检测数', '拧盖定位数', 
#               '拧盖下降数', '拧盖旋转数',
#               '拧盖数', '合格数', '不合格数'
#        ]  
  
# 使用CoxPHFitter进行拟合  
cph = CoxPHFitter()  
# cph.fit(ori_data_df[covariates], duration_col=duration_col, event_col=event_col)  
cph.fit(concat_df_cols[covariates], duration_col=duration_col, event_col=event_col)
  
# 输出模型的摘要信息  
cph.print_summary()
  
# # 你也可以预测个体的风险  
# # 例如，预测一个新的观测值的风险，其中var1=1, var2=2, var3=3  
# individual_risk = cph.predict_partial_hazard(pd.DataFrame({'trt': [1], 'fev': [2], 'fev2': [3]}))  
# print(f"Predicted partial hazard: {individual_risk[0]}")  
  
# # 你还可以预测个体的生存函数或风险函数等  
# # 例如，预测生存函数  
# survival_function = cph.predict_survival_function(pd.DataFrame({'trt': [1], 'fev': [2], 'fev2': [3]}))  
# print(survival_function.iloc[:, 0])

0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'event'
baseline estimation,breslow
number of observations,3978
number of events observed,3373
partial log-likelihood,-25312.70
time fit was run,2024-06-03 15:38:35 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
trt,-0.11,0.9,0.03,-0.17,-0.04,0.84,0.96,0.0,-3.03,<0.005,8.65
fev,0.03,1.03,0.02,-0.0,0.07,1.0,1.07,0.0,1.79,0.07,3.78
fev2,-0.04,0.96,0.02,-0.08,-0.01,0.92,0.99,0.0,-2.24,0.02,5.32

0,1
Concordance,0.54
Partial AIC,50631.40
log-likelihood ratio test,145.76 on 3 df
-log2(p) of ll-ratio test,101.87
