In [49]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import TensorDataset, DataLoader, random_split, ConcatDataset, Subset, Dataset

In [50]:
def metrics_output(preds,labels):
    true_labels = np.array(labels)
    predicted_probs = np.array(preds)
    binary_predictions = (predicted_probs >= 0.5).astype(int)
    auc = roc_auc_score(true_labels, predicted_probs)
    conf_matrix = confusion_matrix(true_labels, binary_predictions)
    tn, fp, fn, tp = conf_matrix.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = accuracy_score(true_labels, binary_predictions)
    f1 = f1_score(true_labels, binary_predictions)
    mcc = matthews_corrcoef(true_labels, binary_predictions)  
    return (auc, sensitivity, specificity, accuracy, f1, mcc)

In [51]:
class TransformerEncoderClassification(nn.Module):
    def __init__(self):
        super(TransformerEncoderClassification, self).__init__()
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=8, nhead=4), #8 确定，4定为能使d_model divisible by nhead 
            num_layers= 3,
        ) 
        self.fc = nn.Linear(24, 2)# 4 batch, 24 samples every batch

    def forward(self, x):
        x = x.permute(1, 0, 2)  
        x = self.transformer_encoder(x) 
        x = x.permute(1, 0, 2) 
        x = x.flatten(1) 
        x = self.fc(x)
        return x
    
model = TransformerEncoderClassification() 
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.005)



In [52]:
loaded_datasets_info = torch.load('/Users/jiaming/Desktop/Lab2/datas/saved_datasets_scaled.pth')
loaded_train_dataset = loaded_datasets_info['train_dataset'] # all data, can use batch to split into batches
loaded_val_dataset = loaded_datasets_info['val_dataset']
loaded_test_dataset = loaded_datasets_info['test_dataset']

In [53]:
# split x and y from loader
X_train = []
y_train = []
for x, y in loaded_train_dataset:
    X_train.append(x.reshape(-1, 3, 8))
    y_train.append(y)
    
X_train = torch.cat(X_train, dim=0)
y_train = torch.cat(y_train, dim=0) 

print(X_train.shape) # 824 samples
print(y_train.shape)

############################################

X_val = []
y_val = []
for x, y in loaded_val_dataset:
    X_val.append(x.reshape(-1, 3, 8))
    y_val.append(y)
    
X_val = torch.cat(X_val, dim=0)
y_val = torch.cat(y_val, dim=0) 

print(X_val.shape) # 206 samples
print(y_val.shape)

############################################

X_test = []
y_test = []
for x, y in loaded_test_dataset:
    X_test.append(x.reshape(-1, 3, 8))
    y_test.append(y)

X_test = torch.cat(X_test, dim=0)
y_test = torch.cat(y_test, dim=0) 

print(X_test.shape) # 258 samples
print(y_test.shape)



torch.Size([824, 3, 8])
torch.Size([824])
torch.Size([206, 3, 8])
torch.Size([206])
torch.Size([258, 3, 8])
torch.Size([258])


In [54]:
# package up data and label
class MyDataSet(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.length = label.shape[0]
        
    def __getitem__(self, index):
        return self.data[index], self.label[index]
    def __len__(self):
        return self.length

In [55]:
train_dataset = MyDataSet(X_train, y_train) 
test_dataset = MyDataSet(X_test, y_test) 
val_dataset = MyDataSet(X_val, y_val) 

In [56]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_dataset = ConcatDataset([train_dataset, val_dataset]) # 824(train)+206(val)=1030
train_loader = DataLoader(train_dataset, batch_size=103) # 103样本每批，总用10批

In [57]:
for fold_idx, (train_index, val_index) in enumerate(kf.split(train_dataset)):
    predicted_probabilities = []
    true_labels = []
    predicted_labels = []  
    
    train_loader_fold = DataLoader(Subset(train_dataset, train_index), batch_size=206) #824 #206/batch 4batch
    val_loader_fold = DataLoader(Subset(train_dataset, val_index), batch_size=103) #206 103/batch 2batch
    
    for epoch in range(15):    
        model = TransformerEncoderClassification() 
        for batch_idx, (features, targets) in enumerate(train_loader_fold):     
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets.long()) 
            loss.backward()
            optimizer.step()
            
        with torch.set_grad_enabled(False):
            for batch_idx, (features, targets) in enumerate(val_loader_fold):
                logits = model(features.float())
                _, y_pred = torch.max(logits.data, dim=1) 
                predicted_labels.extend(y_pred.tolist()) 
                logits = torch.nn.functional.softmax(logits, dim = 1)#以后都指定一下为1, 否则会Warning, 隐式选择dim=1
                predicted_probabilities.extend(logits[:,1].tolist()) 
                true_labels.extend(targets.tolist())
        #每一个都储存（没有必要，只要最后一次的就好）
        #torch.save(model.state_dict(), f'/Users/jiaming/Desktop/Lab2/datas/ROC/models/Transformer/transformer_{fold_idx}.pth')
        #np.save(f'/Users/jiaming/Desktop/Lab2/datas/ROC/models/Transformer/transformer_y_val_pred_{fold_idx}.npy', predicted_probabilities)
        #np.save(f'/Users/jiaming/Desktop/Lab2/datas/ROC/models/Transformer/transformer_y_val_{fold_idx}.npy', true_labels)

In [58]:
# 只储存了最后一次
torch.save(model.state_dict(), '/Users/jiaming/Desktop/Lab2/datas/ROC/models/Transformer/transformer.pth')

In [59]:
roc_auc, metrics_sn, metrics_sp, metrics_ACC, metrics_F1, metrics_MCC = metrics_output(predicted_probabilities,true_labels)
print(roc_auc, metrics_sn, metrics_sp, metrics_ACC, metrics_F1, metrics_MCC)

0.5132602670697909 0.46802721088435373 0.5512345679012346 0.5116504854368932 0.47694974003466206 0.019307539849644045


In [60]:
# 只储存了最后一次
np.save('/Users/jiaming/Desktop/Lab2/datas/ROC/val/Transformer/transformer_y_val_pred.npy', predicted_probabilities)
np.save('/Users/jiaming/Desktop/Lab2/datas/ROC/val/Transformer/transformer_y_val.npy', true_labels)

In [61]:
test_loader = DataLoader(test_dataset, batch_size=129) #258 129/批 2批 反正是测试，两次结束就好

In [62]:
predicted_labels = []  
true_labels = [] 
predicted_probabilities = []
with torch.set_grad_enabled(False): 
    for batch_idx, (features, targets) in enumerate(test_loader):
        logits = model(features.float())
        _, y_pred = torch.max(logits.data, dim=1)
        predicted_labels.extend(y_pred.tolist()) 
        logits = torch.nn.functional.softmax(logits, dim = 1)
        predicted_probabilities.extend(logits[:,1].tolist()) 
        true_labels.extend(targets.tolist())

In [63]:
roc_auc, metrics_sn, metrics_sp, metrics_ACC, metrics_F1, metrics_MCC = metrics_output(predicted_probabilities,true_labels)
print(roc_auc, metrics_sn, metrics_sp, metrics_ACC, metrics_F1, metrics_MCC)

0.49789562289562295 0.6060606060606061 0.42063492063492064 0.5155038759689923 0.5614035087719299 0.02716253925766122


In [64]:
np.save('/Users/jiaming/Desktop/Lab2/datas/ROC/test/Transformer/transformer_y_test_pred.npy', predicted_probabilities)
np.save('/Users/jiaming/Desktop/Lab2/datas/ROC/test/Transformer/transformer_y_test.npy', true_labels)