# Imports

In [34]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import time
from torch.utils.data import Subset
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score

# Settings and Dataset

In [35]:
##########################
### SETTINGS
##########################

# Device
device = torch.device("cuda:1" if torch.cuda.is_available() else "mps")

# Hyperparameters
random_seed = 123
learning_rate = 0.00001
num_epochs = 50
batch_size = 128

# Architecture
num_classes = 2

In [36]:
pos_table = pd.read_csv('/Users/jiaming/Desktop/revision/datas/pos_encoding_OH_ND.csv')
neg_table = pd.read_csv('/Users/jiaming/Desktop/revision/datas/neg_encoding_OH_ND.csv')

pos_table = pos_table.iloc[:,1:]
neg_table = neg_table.iloc[:,1:]

In [37]:
pos_geo = pd.read_csv('/Users/jiaming/Desktop/revision/datas/pos_domain_encoding.csv')
neg_geo = pd.read_csv('/Users/jiaming/Desktop/revision/datas/neg_domain_encoding.csv')

pos_geo = pos_geo.iloc[:,1:]
neg_geo = neg_geo.iloc[:,1:]

In [38]:
pos_cb = pd.concat([pos_table, pos_geo], axis=1)
neg_cb = pd.concat([neg_table, neg_geo], axis=1)

pos_np = pos_cb.to_numpy()
neg_np = neg_cb.to_numpy()

pos = torch.FloatTensor(pos_np).unsqueeze(1)
neg = torch.FloatTensor(neg_np).unsqueeze(1)

print(pos.shape)
print(neg.shape)

raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

torch.Size([1892, 1, 245])
torch.Size([18920, 1, 245])


In [39]:
data_train, data_test, labels_train, labels_test = train_test_split(raw_datas, raw_labels, test_size=0.2, random_state=random_seed)

In [40]:
data_train[np.isnan(data_train)] = 0
data_test[np.isnan(data_test)] = 0

In [41]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.labels[idx]
        return x, y

train_dataset_unsplit = CustomDataset(data_train, labels_train)
test_dataset = CustomDataset(data_test, labels_test)

# Kfold for train_dataset

In [42]:
# from sklearn.model_selection import KFold
# 
# kf = KFold(n_splits=5)
# 
# i = 1
# for train_dataset, val_dataset in kf.split(X=train_dataset):
#     print("iteration ", i)
#     print(train_dataset, " having :" , len(train_dataset))
#     print(val_dataset, " having :" , len(val_dataset))
#     print("-------------------------")
#     i += 1
# 

In [43]:
# train_loader = DataLoader(dataset=train_dataset, 
#                           batch_size=batch_size, 
#                           shuffle=True)
# 
# test_loader = DataLoader(dataset=test_dataset, 
#                          batch_size=batch_size, 
#                          shuffle=False)
# 
# # Checking the dataset 
# for images, labels in train_loader: # 随意load一次, 可无限循环, 必须break
#     print('Image batch dimensions:', images.shape)
#     print('Image label dimensions:', labels.shape)
#     break

# ResNet with identity blocks

In [44]:
##########################
### MODEL
##########################


class ConvNet(torch.nn.Module):

    def __init__(self, num_classes):
        super(ConvNet, self).__init__()
        
        #########################
        ### 1st residual block
        #########################
        
        self.block_1 = torch.nn.Sequential(
                torch.nn.Conv1d(in_channels=1,
                                out_channels=4,
                                kernel_size=1,
                                stride=1,
                                padding=0),
                torch.nn.BatchNorm1d(4),
                torch.nn.ReLU(inplace=True),
                torch.nn.Conv1d(in_channels=4,
                                out_channels=1,
                                kernel_size=3,
                                stride=1,
                                padding=1),
                torch.nn.BatchNorm1d(1)
        )
        
        self.block_2 = torch.nn.Sequential(
                torch.nn.Conv1d(in_channels=1,
                                out_channels=4,
                                kernel_size=1,
                                stride=1,
                                padding=0),
                torch.nn.BatchNorm1d(4),
                torch.nn.ReLU(inplace=True),
                torch.nn.Conv1d(in_channels=4,
                                out_channels=1,
                                kernel_size=3,
                                stride=1,
                                padding=1),
                torch.nn.BatchNorm1d(1)
        )

        #########################
        ### Fully connected
        #########################        
        self.linear_1 = torch.nn.Linear(245, num_classes) # 205 = 26240/batch_size

        
    def forward(self, x):
        
        #########################
        ### 1st residual block
        #########################
        shortcut = x
        x = self.block_1(x)
        x = torch.nn.functional.relu(x + shortcut)
        
        #########################
        ### 2nd residual block
        #########################
        shortcut = x
        x = self.block_2(x)
        x = torch.nn.functional.relu(x + shortcut)
        
        #########################
        ### Fully connected
        #########################
        logits = self.linear_1(x.view(-1, 245))
        return logits
    
    
torch.manual_seed(random_seed)
model = ConvNet(num_classes=num_classes)
model = model.to(device)
    
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)    

# Training

In [45]:
def compute_accuracy(model, data_loader):
    correct_pred, num_examples = 0, 0
    for i, (features, targets) in enumerate(data_loader):            
        features = features.to(device)
        targets = targets.to(device)
        logits = model(features)
        _, predicted_labels = torch.max(logits, dim=1) 
        num_examples += targets.size(0)
        correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [46]:
################################################

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

i = 1
for train_dataset_indx, val_dataset_indx in kf.split(X=train_dataset_unsplit):
    
    # print(len(train_dataset_unsplit))
    print("iteration ", i)
    print("train_dataset_indx", " having :" , len(train_dataset_indx))
    print("val_dataset_indx", " having :" , len(val_dataset_indx))
    print("-------------------------")
    i += 1
    
    train_dataset = Subset(train_dataset_unsplit, train_dataset_indx)
    val_dataset = Subset(train_dataset_unsplit, val_dataset_indx)
    
    # train_dataset = train_dataset_unsplit[train_dataset_indx,:,:]
    # val_dataset = train_dataset_unsplit[val_dataset_indx,:,:]
    
    # for i in range(len(train_dataset_unsplit)):
    #     features, label = train_dataset_unsplit[i]
    #     print(f"Sample {i}: Features - {features}, Label - {label}")
    
################################################ 

    train_loader = DataLoader(dataset=train_dataset, 
                          batch_size=batch_size, 
                          shuffle=True)
    
    val_loader = DataLoader(dataset=val_dataset, 
                         batch_size=batch_size, 
                         shuffle=False)
    
    test_loader = DataLoader(dataset=test_dataset, 
                         batch_size=batch_size, 
                         shuffle=False)

    # Checking the dataset 
    for images, labels in train_loader: 
        print('Image batch dimensions:', images.shape)
        print('Image label dimensions:', labels.shape)
        break
        
################################################ 
predicted_probabilities = []  
true_labels = []  

start_time = time.time()
for epoch in range(num_epochs):
    model = model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):
        
        features = features.to(device)
        targets = targets.to(device)
        
        ### FORWARD AND BACK PROP
        logits = model(features)
        cost = torch.nn.functional.cross_entropy(logits, targets)
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if batch_idx % 10 == 0: 
            print ('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f' 
                   %(epoch+1, num_epochs, batch_idx, 
                     len(train_loader), cost))

    model = model.eval() 
    
    with torch.set_grad_enabled(False): 
        print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
              epoch+1, num_epochs, 
              compute_accuracy(model, train_loader)))
        
        for batch_idx, (features, targets) in enumerate(val_loader):
            features = features.to(device)
            targets = targets.to(device)
    
            logits = model(features)
            probabilities = torch.sigmoid(logits)[:,1] 
            
            predicted_probabilities.extend(probabilities.tolist()) 
            true_labels.extend(targets.tolist()) 
    


AUC_val = roc_auc_score(true_labels, predicted_probabilities)
predicted_probabilities = np.array(predicted_probabilities)
ACC_val = accuracy_score(true_labels, (predicted_probabilities > 0.5).astype(int)) 
MCC_val = matthews_corrcoef(true_labels, (predicted_probabilities > 0.5).astype(int))
Sn_val = recall_score(true_labels, (predicted_probabilities > 0.5).astype(int))
Sp_val = precision_score(true_labels, (predicted_probabilities > 0.5).astype(int))
    
print("Test Metrics:")
print("AUC_val: {:.4f}".format(AUC_val))
print("ACC_val: {:.4f}".format(ACC_val))
print("MCC_val: {:.4f}".format(MCC_val))
print("Sn_val: {:.4f}".format(Sn_val))
print("Sp_val: {:.4f}".format(Sp_val))
print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))
    
print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))

iteration  1
train_dataset_indx  having : 13319
val_dataset_indx  having : 3330
-------------------------
Image batch dimensions: torch.Size([128, 1, 245])
Image label dimensions: torch.Size([128])
iteration  2
train_dataset_indx  having : 13319
val_dataset_indx  having : 3330
-------------------------
Image batch dimensions: torch.Size([128, 1, 245])
Image label dimensions: torch.Size([128])
iteration  3
train_dataset_indx  having : 13319
val_dataset_indx  having : 3330
-------------------------
Image batch dimensions: torch.Size([128, 1, 245])
Image label dimensions: torch.Size([128])
iteration  4
train_dataset_indx  having : 13319
val_dataset_indx  having : 3330
-------------------------
Image batch dimensions: torch.Size([128, 1, 245])
Image label dimensions: torch.Size([128])
iteration  5
train_dataset_indx  having : 13320
val_dataset_indx  having : 3329
-------------------------
Image batch dimensions: torch.Size([128, 1, 245])
Image label dimensions: torch.Size([128])
Epoch: 001

Epoch: 014/050 | Batch 030/105 | Cost: 0.3638
Epoch: 014/050 | Batch 040/105 | Cost: 0.2783
Epoch: 014/050 | Batch 050/105 | Cost: 0.3256
Epoch: 014/050 | Batch 060/105 | Cost: 0.2963
Epoch: 014/050 | Batch 070/105 | Cost: 0.3090
Epoch: 014/050 | Batch 080/105 | Cost: 0.3289
Epoch: 014/050 | Batch 090/105 | Cost: 0.3047
Epoch: 014/050 | Batch 100/105 | Cost: 0.2333
Epoch: 014/050 training accuracy: 90.88%
Epoch: 015/050 | Batch 000/105 | Cost: 0.2403
Epoch: 015/050 | Batch 010/105 | Cost: 0.2854
Epoch: 015/050 | Batch 020/105 | Cost: 0.2863
Epoch: 015/050 | Batch 030/105 | Cost: 0.2228
Epoch: 015/050 | Batch 040/105 | Cost: 0.2963
Epoch: 015/050 | Batch 050/105 | Cost: 0.2965
Epoch: 015/050 | Batch 060/105 | Cost: 0.3027
Epoch: 015/050 | Batch 070/105 | Cost: 0.2940
Epoch: 015/050 | Batch 080/105 | Cost: 0.2694
Epoch: 015/050 | Batch 090/105 | Cost: 0.3663
Epoch: 015/050 | Batch 100/105 | Cost: 0.2652
Epoch: 015/050 training accuracy: 90.92%
Epoch: 016/050 | Batch 000/105 | Cost: 0.254

Epoch: 029/050 | Batch 030/105 | Cost: 0.1983
Epoch: 029/050 | Batch 040/105 | Cost: 0.1283
Epoch: 029/050 | Batch 050/105 | Cost: 0.2052
Epoch: 029/050 | Batch 060/105 | Cost: 0.1660
Epoch: 029/050 | Batch 070/105 | Cost: 0.2170
Epoch: 029/050 | Batch 080/105 | Cost: 0.1673
Epoch: 029/050 | Batch 090/105 | Cost: 0.2032
Epoch: 029/050 | Batch 100/105 | Cost: 0.2314
Epoch: 029/050 training accuracy: 90.56%
Epoch: 030/050 | Batch 000/105 | Cost: 0.2558
Epoch: 030/050 | Batch 010/105 | Cost: 0.1287
Epoch: 030/050 | Batch 020/105 | Cost: 0.1854
Epoch: 030/050 | Batch 030/105 | Cost: 0.2453
Epoch: 030/050 | Batch 040/105 | Cost: 0.1481
Epoch: 030/050 | Batch 050/105 | Cost: 0.1736
Epoch: 030/050 | Batch 060/105 | Cost: 0.1546
Epoch: 030/050 | Batch 070/105 | Cost: 0.2378
Epoch: 030/050 | Batch 080/105 | Cost: 0.2657
Epoch: 030/050 | Batch 090/105 | Cost: 0.1329
Epoch: 030/050 | Batch 100/105 | Cost: 0.1804
Epoch: 030/050 training accuracy: 90.59%
Epoch: 031/050 | Batch 000/105 | Cost: 0.211

Epoch: 044/050 | Batch 030/105 | Cost: 0.2344
Epoch: 044/050 | Batch 040/105 | Cost: 0.1549
Epoch: 044/050 | Batch 050/105 | Cost: 0.2250
Epoch: 044/050 | Batch 060/105 | Cost: 0.0711
Epoch: 044/050 | Batch 070/105 | Cost: 0.1920
Epoch: 044/050 | Batch 080/105 | Cost: 0.2117
Epoch: 044/050 | Batch 090/105 | Cost: 0.2056
Epoch: 044/050 | Batch 100/105 | Cost: 0.1605
Epoch: 044/050 training accuracy: 90.94%
Epoch: 045/050 | Batch 000/105 | Cost: 0.1731
Epoch: 045/050 | Batch 010/105 | Cost: 0.1230
Epoch: 045/050 | Batch 020/105 | Cost: 0.2082
Epoch: 045/050 | Batch 030/105 | Cost: 0.1977
Epoch: 045/050 | Batch 040/105 | Cost: 0.1137
Epoch: 045/050 | Batch 050/105 | Cost: 0.1359
Epoch: 045/050 | Batch 060/105 | Cost: 0.1544
Epoch: 045/050 | Batch 070/105 | Cost: 0.1347
Epoch: 045/050 | Batch 080/105 | Cost: 0.1961
Epoch: 045/050 | Batch 090/105 | Cost: 0.1926
Epoch: 045/050 | Batch 100/105 | Cost: 0.1685
Epoch: 045/050 training accuracy: 90.93%
Epoch: 046/050 | Batch 000/105 | Cost: 0.173

In [47]:
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score

model = model.eval()
predicted_probabilities = []  
true_labels = [] 

with torch.set_grad_enabled(False): # save memory during inference
    for batch_idx, (features, targets) in enumerate(test_loader):
            features = features.to(device)
            targets = targets.to(device)
    
            logits = model(features)
            probabilities = torch.sigmoid(logits)[:,1] 
            
            predicted_probabilities.extend(probabilities.tolist())
            true_labels.extend(targets.tolist()) 
    

    AUC_test = roc_auc_score(true_labels, predicted_probabilities)
    predicted_probabilities = np.array(predicted_probabilities)
    ACC_test = accuracy_score(true_labels, (predicted_probabilities > 0.5).astype(int)) 
    MCC_test = matthews_corrcoef(true_labels, (predicted_probabilities > 0.5).astype(int))
    Sn_test = recall_score(true_labels, (predicted_probabilities > 0.5).astype(int))
    Sp_test = precision_score(true_labels, (predicted_probabilities > 0.5).astype(int))
    
    print("Test Metrics:")
    print("AUC_test: {:.4f}".format(AUC_test))
    print("ACC_test: {:.4f}".format(ACC_test))
    print("MCC_test: {:.4f}".format(MCC_test))
    print("Sn_test: {:.4f}".format(Sn_test))
    print("Sp_test: {:.4f}".format(Sp_test))


Test Metrics:
AUC_test: 0.9471
ACC_test: 0.9176
MCC_test: 0.5864
Sn_test: 0.7610
Sp_test: 0.5197


In [48]:
model = model.to("cpu")
torch.save(model.state_dict(), 'ResNet_weights.pth')