# Imports

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import time
from torch.utils.data import Subset
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score

# Settings and Dataset

In [None]:
##########################
### SETTINGS
##########################

# Device
# Pick a backend that actually exists on this machine instead of assuming one:
#   * CUDA: keep the preference for the second GPU ("cuda:1"), but fall back
#     to "cuda:0" on single-GPU hosts where index 1 is not a valid device.
#   * MPS:  only when this PyTorch build exposes the backend and reports it
#     as available.
#   * CPU:  final fallback so the notebook still runs everywhere.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hyperparameters
random_seed = 123     # seed for the train/test split and for torch.manual_seed
learning_rate = 0.01  # step size handed to the Adam optimizer
num_epochs = 10       # full passes over the training loader
batch_size = 128      # samples per DataLoader batch

# Architecture
num_classes = 2       # number of output logits of the classifier


In [None]:
# Read the positive and negative sample tables and drop the first column of
# each in the same expression (presumably a saved index column -- confirm
# against the CSV files).
pos_table = pd.read_csv(
    '/Users/jiaming/Desktop/revision/datas/pos_encoding_OH_ND.csv'
).iloc[:, 1:]
neg_table = pd.read_csv(
    '/Users/jiaming/Desktop/revision/datas/neg_encoding_OH_ND.csv'
).iloc[:, 1:]

In [None]:
# Plain NumPy views of the two tables.
pos_np, neg_np = pos_table.to_numpy(), neg_table.to_numpy()

# Float tensors with a singleton axis inserted at position 1, which is the
# channel axis expected by Conv1d.
pos = torch.FloatTensor(pos_np)[:, None]
neg = torch.FloatTensor(neg_np)[:, None]

# Stack positives first, then negatives; labels follow the same order
# (1 for every positive row, 0 for every negative row).
raw_datas = np.concatenate((pos, neg))
raw_labels = np.concatenate(([1] * len(pos), [0] * len(neg)))

In [None]:
# Hold out 20% of the samples as the test split; the fixed random_state keeps
# the shuffle reproducible between runs.
data_train, data_test, labels_train, labels_test = train_test_split(
    raw_datas,
    raw_labels,
    test_size=0.2,
    random_state=random_seed,
)

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Map-style dataset pairing each sample with its label by position.

    Args:
        data: indexable container of samples (anything supporting ``len``
            and ``[]``).
        labels: indexable container of labels, aligned with ``data``.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length. Without this
            check the mismatch would only surface later as an ``IndexError``
            or as silently unused trailing labels.
    """

    def __init__(self, data, labels):
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, got %d and %d"
                % (len(data), len(labels))
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

# Wrap the two splits as datasets. The training split is kept whole here and
# is only divided into train/validation folds later on.
train_dataset_unsplit = CustomDataset(data=data_train, labels=labels_train)
test_dataset = CustomDataset(data=data_test, labels=labels_test)

# Kfold for train_dataset

In [None]:
# from sklearn.model_selection import KFold
# 
# kf = KFold(n_splits=5)
# 
# i = 1
# for train_dataset, val_dataset in kf.split(X=train_dataset):
#     print("iteration ", i)
#     print(train_dataset, " having :" , len(train_dataset))
#     print(val_dataset, " having :" , len(val_dataset))
#     print("-------------------------")
#     i += 1
# 

In [None]:
# train_loader = DataLoader(dataset=train_dataset, 
#                           batch_size=batch_size, 
#                           shuffle=True)
# 
# test_loader = DataLoader(dataset=test_dataset, 
#                          batch_size=batch_size, 
#                          shuffle=False)
# 
# # Checking the dataset 
# for images, labels in train_loader: # load one batch just to inspect its shapes; the break is required, otherwise the loop keeps going through the batches
#     print('Image batch dimensions:', images.shape)
#     print('Image label dimensions:', labels.shape)
#     break

# ResNet with identity blocks

In [None]:
##########################
### MODEL
##########################


class ConvNet(torch.nn.Module):
    """Small 1-D residual network with two identity-shortcut blocks.

    Each residual block maps 1 channel -> 4 channels (kernel 1) -> 1 channel
    (kernel 3, padding 1), so the output has the same shape as the input and
    can be added to the shortcut directly. The result is flattened per sample
    and passed through one linear layer that produces the class logits.

    Args:
        num_classes: number of output logits.
        input_length: length of the last axis of every input sample. Because
            the network keeps a single channel, this is also the number of
            input features of the final linear layer. Defaults to 205, the
            value that was previously hard-coded.

    Input shape:  ``(batch, 1, input_length)``
    Output shape: ``(batch, num_classes)``
    """

    def __init__(self, num_classes, input_length=205):
        super().__init__()
        self.input_length = input_length

        #########################
        ### Residual blocks
        #########################
        # Built in the same order as before (block_1, block_2, linear_1) so
        # parameter names and seeded initialisation are unchanged.
        self.block_1 = self._make_residual_body()
        self.block_2 = self._make_residual_body()

        #########################
        ### Fully connected
        #########################
        # in_features = 1 channel * input_length positions per sample.
        self.linear_1 = torch.nn.Linear(input_length, num_classes)

    @staticmethod
    def _make_residual_body():
        """Build the conv-BN-ReLU-conv-BN body shared by both residual blocks."""
        return torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1,
                            out_channels=4,
                            kernel_size=1,
                            stride=1,
                            padding=0),
            torch.nn.BatchNorm1d(4),
            torch.nn.ReLU(inplace=True),
            torch.nn.Conv1d(in_channels=4,
                            out_channels=1,
                            kernel_size=3,
                            stride=1,
                            padding=1),
            torch.nn.BatchNorm1d(1),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape (batch, 1, length)."""

        #########################
        ### 1st residual block
        #########################
        shortcut = x
        x = self.block_1(x)
        x = torch.nn.functional.relu(x + shortcut)

        #########################
        ### 2nd residual block
        #########################
        shortcut = x
        x = self.block_2(x)
        x = torch.nn.functional.relu(x + shortcut)

        #########################
        ### Fully connected
        #########################
        # Flatten per sample rather than reshaping to a fixed width: an input
        # of the wrong length now fails in the linear layer instead of being
        # silently folded into extra batch rows.
        logits = self.linear_1(x.flatten(start_dim=1))
        return logits
    
    
# Seed first so the freshly built network gets reproducible initial weights.
torch.manual_seed(random_seed)

# Build the classifier and move it to the selected device in one expression.
model = ConvNet(num_classes=num_classes).to(device)

# Adam over every parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Training

In [None]:
def compute_accuracy(model, data_loader, device=None):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: module mapping a feature batch to logits of shape
            ``(batch, num_classes)``.
        data_loader: iterable yielding ``(features, targets)`` tensor pairs.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function no longer depends on a global variable.

    Returns:
        A 0-dim float tensor in ``[0, 100]``, or NaN when ``data_loader``
        yields no samples (accuracy is undefined in that case; previously
        this raised ``AttributeError``).

    The model's train/eval mode is left untouched; callers switch modes.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Keep the running count as a tensor so it stays on `device` throughout.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    # Gradients are never needed for scoring; skipping them saves memory.
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            # Only the index of the largest logit matters, not its value.
            predicted_labels = logits.argmax(dim=1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()

    if num_examples == 0:
        return torch.tensor(float("nan"), device=device)
    return correct_pred.float() / num_examples * 100

In [None]:
################################################

from sklearn.model_selection import KFold

# 5-fold cross-validation on the training split.
#
# Every fold gets a freshly initialised model and optimizer, is trained for
# `num_epochs` epochs on its training subset, and is then scored once on its
# validation subset with the final-epoch weights. Earlier, the epoch loop sat
# outside the fold loop, so only the last fold was ever trained, and the
# validation predictions of all epochs were pooled into one metric.
kf = KFold(n_splits=5)

# The held-out test loader does not depend on the fold, so build it once.
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         shuffle=False)

# Validation metrics of every fold, keyed by metric name.
fold_metrics = {"AUC": [], "ACC": [], "MCC": [], "Sn": [], "Sp": []}

start_time = time.time()
for i, (train_dataset_indx, val_dataset_indx) in enumerate(
        kf.split(X=train_dataset_unsplit), start=1):

    print("iteration ", i)
    print("train_dataset_indx", " having :", len(train_dataset_indx))
    print("val_dataset_indx", " having :", len(val_dataset_indx))
    print("-------------------------")

    train_dataset = Subset(train_dataset_unsplit, train_dataset_indx)
    val_dataset = Subset(train_dataset_unsplit, val_dataset_indx)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=False)

    # Checking the dataset: look at a single batch only, hence the break.
    for images, labels in train_loader:
        print('Image batch dimensions:', images.shape)
        print('Image label dimensions:', labels.shape)
        break

    # Fresh weights and optimizer state per fold, so that no fold is
    # validated on samples the model already saw in an earlier fold.
    torch.manual_seed(random_seed)
    model = ConvNet(num_classes=num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    fold_start_time = time.time()
    for epoch in range(num_epochs):
        model = model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):

            features = features.to(device)
            targets = targets.to(device)

            ### FORWARD AND BACK PROP
            logits = model(features)
            cost = torch.nn.functional.cross_entropy(logits, targets)
            optimizer.zero_grad()

            cost.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            ### LOGGING
            if batch_idx % 10 == 0:  # log every 10th batch
                print('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f'
                      % (epoch + 1, num_epochs, batch_idx,
                         len(train_loader), cost.item()))

        # eval mode so batch-norm uses its running statistics during scoring
        model = model.eval()
        with torch.no_grad():  # save memory during inference
            print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
                  epoch + 1, num_epochs,
                  compute_accuracy(model, train_loader)))

    # ---- validation of this fold, using the final-epoch weights only ----
    model = model.eval()
    predicted_probabilities = []  # predicted probability of class 1
    true_labels = []              # ground-truth labels

    with torch.no_grad():
        for features, targets in val_loader:
            features = features.to(device)
            logits = model(features)
            # The network is trained with cross-entropy over two logits, so
            # softmax (not an element-wise sigmoid) gives class probabilities
            # that sum to 1; thresholding P(class 1) at 0.5 then matches argmax.
            probabilities = torch.softmax(logits, dim=1)[:, 1]

            predicted_probabilities.extend(probabilities.tolist())
            true_labels.extend(targets.tolist())

    predicted_probabilities = np.array(predicted_probabilities)
    predicted_labels = (predicted_probabilities > 0.5).astype(int)

    AUC_val = roc_auc_score(true_labels, predicted_probabilities)
    ACC_val = accuracy_score(true_labels, predicted_labels)
    MCC_val = matthews_corrcoef(true_labels, predicted_labels)
    # Sn = sensitivity = recall of the positive class.
    Sn_val = recall_score(true_labels, predicted_labels)
    # Sp = specificity = recall of the negative class (this is not precision).
    Sp_val = recall_score(true_labels, predicted_labels, pos_label=0)

    for metric_name, metric_value in (("AUC", AUC_val), ("ACC", ACC_val),
                                      ("MCC", MCC_val), ("Sn", Sn_val),
                                      ("Sp", Sp_val)):
        fold_metrics[metric_name].append(metric_value)

    print("Validation Metrics (fold %d):" % i)
    print("AUC_val: {:.4f}".format(AUC_val))
    print("ACC_val: {:.4f}".format(ACC_val))
    print("MCC_val: {:.4f}".format(MCC_val))
    print("Sn_val: {:.4f}".format(Sn_val))
    print("Sp_val: {:.4f}".format(Sp_val))
    print('Time elapsed: %.2f min' % ((time.time() - fold_start_time)/60))

# Summary over all folds.
print("Validation Metrics (mean over %d folds):" % len(fold_metrics["AUC"]))
for metric_name, metric_values in fold_metrics.items():
    print("{}_val: {:.4f}".format(metric_name, np.mean(metric_values)))

print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score

# Final evaluation of the current `model` on the held-out test set.
model = model.eval()  # batch-norm uses its running statistics in eval mode
predicted_probabilities = []  # predicted probability of class 1
true_labels = []              # ground-truth labels

with torch.no_grad():  # save memory during inference
    for features, targets in test_loader:
        features = features.to(device)

        logits = model(features)
        # The network is trained with cross-entropy over two logits, so
        # softmax (not an element-wise sigmoid) gives class probabilities
        # that sum to 1; thresholding P(class 1) at 0.5 then matches argmax.
        probabilities = torch.softmax(logits, dim=1)[:, 1]

        predicted_probabilities.extend(probabilities.tolist())
        true_labels.extend(targets.tolist())

predicted_probabilities = np.array(predicted_probabilities)
predicted_labels = (predicted_probabilities > 0.5).astype(int)

# AUC is computed from the probabilities, the other metrics from hard labels.
AUC_test = roc_auc_score(true_labels, predicted_probabilities)
ACC_test = accuracy_score(true_labels, predicted_labels)
MCC_test = matthews_corrcoef(true_labels, predicted_labels)
# Sn = sensitivity = recall of the positive class.
Sn_test = recall_score(true_labels, predicted_labels)
# Sp = specificity = recall of the negative class (this is not precision).
Sp_test = recall_score(true_labels, predicted_labels, pos_label=0)

print("Test Metrics:")
print("AUC_test: {:.4f}".format(AUC_test))
print("ACC_test: {:.4f}".format(ACC_test))
print("MCC_test: {:.4f}".format(MCC_test))
print("Sn_test: {:.4f}".format(Sn_test))
print("Sp_test: {:.4f}".format(Sp_test))
