In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim

查看设备

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
print(f'The model will be running on {device} device')

The model will be running on cuda:0 device


定义一个NN类

In [3]:
class Net(nn.Module):
    def __init__(self, feature_count, class_count):
        super (Net, self).__init__()
        
        self.hidden_layers = nn.Sequential(nn.Linear(feature_count, 1000),
                                            nn.ReLU(True),
                                            nn.Linear(1000, 5000),
                                            nn.ReLU(True),
                                            nn.Linear(5000, 5000),
                                            nn.ReLU(True),
                                            nn.Linear(5000, 5000),
                                            nn.ReLU(True),
                                            nn.Linear(5000, 1000),
                                            nn.ReLU(True),
                                            nn.Linear(1000, class_count))
        
    def forward(self, x):
        outputs = self.hidden_layers(x)
        return outputs

定义训练函数

In [4]:
def train(whole_train_set, model_name):
    train_set, valid_set = random_split(whole_train_set, [int(0.8*len(whole_train_set)),
                                                            len(whole_train_set) - int(0.8*len(whole_train_set))])
    train_loader = DataLoader(train_set, batch_size = int(0.2*len(train_set)), shuffle = True)
    valid_loader = DataLoader(valid_set, batch_size = int(0.5*len(valid_set)), shuffle = True)
    
    model = Net(500, 2)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    best_accuracy = 0
    
    for epoch in range(200):
        running_train_loss = 0.0  
        running_val_loss = 0.0
        correct, total = 0, 0 
        for i, data in enumerate(train_loader, 0):
            X = data[0].to(device)
            y = data[1].to(device)
            optimizer.zero_grad()
            
            outputs = model(X)  
            loss = criterion(outputs, y.long())
            loss.backward()
            optimizer.step()
            running_train_loss += loss
        
        train_loss = running_train_loss/len(train_loader)
        
        with torch.no_grad():
            model.eval()
            
            for i, data in enumerate(valid_loader, 0):
                X = data[0].to(device)
                y = data[1].to(device)
            
                outputs = model(X)
                loss = criterion(outputs, y.long())
                _, y_pred = torch.max(outputs, dim = 1)
                running_val_loss += loss
                total += outputs.size(0)
                correct += (y == y_pred).sum().item()
            
        val_loss = running_val_loss/len(valid_loader)
        
        accuracy = 100*correct/total
        
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), f'./best_model/{model_name}_best.pth')
            print('The model has been saved for the best accuracy %d %%'%(accuracy))
            best_accuracy = accuracy
        
        if epoch == 0:
            print('The model is working fine!')

        if (epoch + 1)%100 == 0:
            print('Completed training epoch', epoch + 1, 'Training Loss is: %.4f' %train_loss, 'Validation Loss is: %.4f' %val_loss, 'Accuracy is %d %%' % (accuracy))

定义测试函数

In [5]:
def test(whole_test_set, model_name):
    model = Net(500, 2)
    model.to(device)
    model.load_state_dict(torch.load(f'./best_model/{model_name}_best.pth'))
    model.eval()
    
    X = whole_test_set.dataset[:][0].to(device)
    y = whole_test_set.dataset[:][1].to(device)
    
    y_pred = model(X)
    print("AUC:{:.4f} on test data.".format(roc_auc_score(y.cpu().detach().numpy(), 
                                                            y_pred.cpu().detach().numpy()[:, 1])))

导入数据

In [6]:
metadata = pd.read_csv('../COAD/metadata.csv', index_col = 2)['pathologic_stage_label']
trans_data  = pd.read_csv('./selected_transcriptome.csv', index_col = 0)

数据预处理

In [7]:
def StageNormalize(stage):
    stage = str(stage)
    if re.search('Stage IV', stage):
        return 3
    elif re.search('Stage III', stage):
        return 2
    elif re.search('Stage II', stage):
        return 1
    elif re.search('Stage I', stage):
        return 0
    else:
        return np.nan

metadata = metadata.apply(StageNormalize)

In [8]:
metadata.head()

transcriptom_id
TCGA-AA-3841-01A    1.0
TCGA-D5-6924-01A    1.0
TCGA-AA-3861-01A    1.0
TCGA-AA-3510-01A    1.0
TCGA-AA-A024-01A    1.0
Name: pathologic_stage_label, dtype: float64

In [9]:
metadata.dropna(inplace = True)
metadata.isnull().any()

False

Stage I vs Stage II

In [10]:
processed_X  = trans_data.loc[metadata[(metadata == 0) | (metadata == 1)].index]
labels = metadata[(metadata == 0) | (metadata == 1)].apply(lambda x: 1 if x != 0 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_1_vs_stage_2')
test(whole_test_set, 'stage_1_vs_stage_2')

The model has been saved for the best accuracy 72 %
The model is working fine!
The model has been saved for the best accuracy 76 %
Completed training epoch 100 Training Loss is: 0.0277 Validation Loss is: 0.4988 Accuracy is 72 %
Completed training epoch 200 Training Loss is: 0.0006 Validation Loss is: 0.8040 Accuracy is 65 %
AUC:0.8082 on test data.


Stage I vs Stage III

In [11]:
processed_X  = trans_data.loc[metadata[(metadata == 0) | (metadata == 2)].index]
labels = metadata[(metadata == 0) | (metadata == 2)].apply(lambda x: 1 if x != 0 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_1_vs_stage_3')
test(whole_test_set, 'stage_1_vs_stage_3')

The model has been saved for the best accuracy 29 %
The model is working fine!
The model has been saved for the best accuracy 70 %
The model has been saved for the best accuracy 73 %
The model has been saved for the best accuracy 75 %
Completed training epoch 100 Training Loss is: 0.0861 Validation Loss is: 1.8167 Accuracy is 75 %
The model has been saved for the best accuracy 78 %
Completed training epoch 200 Training Loss is: 0.0018 Validation Loss is: 1.8367 Accuracy is 73 %
AUC:0.8987 on test data.


Stage I vs Stage IV

In [12]:
processed_X  = trans_data.loc[metadata[(metadata == 0) | (metadata == 3)].index]
labels = metadata[(metadata == 0) | (metadata == 3)].apply(lambda x: 1 if x != 0 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_1_vs_stage_4')
test(whole_test_set, 'stage_1_vs_stage_4')

The model has been saved for the best accuracy 59 %
The model is working fine!
The model has been saved for the best accuracy 81 %
Completed training epoch 100 Training Loss is: 0.0215 Validation Loss is: 0.5630 Accuracy is 77 %
Completed training epoch 200 Training Loss is: 0.0004 Validation Loss is: 0.7789 Accuracy is 74 %
AUC:0.7677 on test data.


Stage II vs Stage III

In [13]:
processed_X  = trans_data.loc[metadata[(metadata == 1) | (metadata == 2)].index]
labels = metadata[(metadata == 1) | (metadata == 2)].apply(lambda x: 1 if x != 1 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_2_vs_stage_3')
test(whole_test_set, 'stage_2_vs_stage_3')

The model has been saved for the best accuracy 63 %
The model is working fine!
The model has been saved for the best accuracy 68 %
The model has been saved for the best accuracy 74 %
The model has been saved for the best accuracy 75 %
Completed training epoch 100 Training Loss is: 0.1123 Validation Loss is: 1.0499 Accuracy is 58 %
Completed training epoch 200 Training Loss is: 0.0016 Validation Loss is: 2.0027 Accuracy is 62 %
AUC:0.7893 on test data.


Stage II vs Stage IV

In [14]:
processed_X  = trans_data.loc[metadata[(metadata == 1) | (metadata == 3)].index]
labels = metadata[(metadata == 1) | (metadata == 3)].apply(lambda x: 1 if x != 1 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_2_vs_stage_4')
test(whole_test_set, 'stage_2_vs_stage_4')

The model has been saved for the best accuracy 75 %
The model is working fine!
The model has been saved for the best accuracy 79 %
Completed training epoch 100 Training Loss is: 0.0194 Validation Loss is: 1.0740 Accuracy is 68 %
Completed training epoch 200 Training Loss is: 0.0006 Validation Loss is: 1.5907 Accuracy is 68 %
AUC:0.7229 on test data.


Stage III vs Stage IV

In [16]:
processed_X  = trans_data.loc[metadata[(metadata == 2) | (metadata == 3)].index]
labels = metadata[(metadata == 2) | (metadata == 3)].apply(lambda x: 1 if x != 2 else 0)
processed_X = np.array(processed_X).astype(np.float32)
labels = np.array(labels)

processed_data = TensorDataset(torch.from_numpy(processed_X), torch.from_numpy(labels))
whole_train_set, whole_test_set = random_split(processed_data, [int(0.8*len(processed_data)),
                                                                    len(processed_data) - int(0.8*len(processed_data))])

train(whole_train_set, 'stage_3_vs_stage_4')
test(whole_test_set, 'stage_3_vs_stage_4')

The model has been saved for the best accuracy 68 %
The model is working fine!
Completed training epoch 100 Training Loss is: 0.0044 Validation Loss is: 1.4286 Accuracy is 57 %
Completed training epoch 200 Training Loss is: 0.0002 Validation Loss is: 1.8590 Accuracy is 60 %
AUC:0.5963 on test data.
