In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset   

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

from math import sqrt

In [2]:
train = pd.read_csv('/Users/clause/Desktop/ZD/train.csv', index_col = 0)
train = shuffle(train)
x, label = train.iloc[:,:-1].values, train.iloc[:,-1].values
x = torch.tensor(x.astype(np.float32))
label = torch.tensor(label.astype(np.float32))

test = pd.read_csv('/Users/clause/Desktop/ZD/test.csv', index_col = 0)
X_test, y_test = test.iloc[:,:-1], test.iloc[:,-1]

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.layer_1 = nn.Linear(8, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_3 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        self.batchnorm3 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.relu(self.layer_3(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

In [4]:
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        self.len = len(y_data)
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    
    def __len__(self):
        return len(self.X_data)
    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
    
    def __len__(self):
        return len(self.X_data)

In [5]:
# k fold     
def get_k_fold_data(k, i, X, y): 
    # Return the training and verification data needed for the i-th fold cross-validation, 
    # X_train is the training data, X_valid is the verification data
    assert k > 1
    # All data / nuber of fold
    fold_size = X.shape[0] // k  
    
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size) # slice(start,end,step) 
        # idx is the valid of every folds
        X_part, y_part = X[idx, :], y[idx]
        if j == i: # valid of #i fold
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0) #dim=0, add row, link by column
            y_train = torch.cat((y_train, y_part), dim=0)
    #print(X_train.size(),X_valid.size())
    return X_train, y_train, X_valid,y_valid 

In [6]:
def k_fold(k, X_train, y_train, num_epochs=50,learning_rate=0.001, weight_decay=0, batch_size=64):
    
    # train set part
    train_loss_sum, valid_loss_sum = 0, 0
    train_acc_sum ,valid_acc_sum = 0,0
    train_mcc_sum ,valid_mcc_sum = 0,0
    
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train) # get data after k-fold
        net =  Net()
        # train
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,\
                                   weight_decay, batch_size) 
        t_acc = float(train_ls[-1][1])
        v_acc = float(valid_ls[-1][1])
        t_mcc  = float(train_ls[-1][2])
        v_mcc = float(valid_ls[-1][2])
        
        print('*'*25,'fold',i+1,'*'*25)
        print('train_loss:%.6f'%train_ls[-1][0],'train_acc:%.4f'%t_acc,'train_mcc:%.4f\n'%t_mcc, \
              'valid loss:%.6f'%valid_ls[-1][0],'valid_acc:%.4f'%v_acc,'valid_mcc:%.4f'%v_mcc)
        
        train_loss_sum += train_ls[-1][0]
        valid_loss_sum += valid_ls[-1][0]
        train_acc_sum += t_acc
        valid_acc_sum += v_acc
        train_mcc_sum += t_mcc
        valid_mcc_sum += v_mcc
    
    print('*'*25,'final result','*'*25) 
    print('train_loss_sum:%.4f'%(train_loss_sum/k),'train_acc_sum:%.4f'%(train_acc_sum/k),'train_mcc_sum:%.4f\n'%(train_mcc_sum/k),\
          'valid_loss_sum:%.4f'%(valid_loss_sum/k),'valid_acc_sum:%.4f'%(valid_acc_sum/k),'valid_mcc_sum:%.4f'%(valid_mcc_sum/k))
    
    # test set part
    test_data = testData(torch.FloatTensor(X_test.values))
    test_loader = DataLoader(dataset = test_data, batch_size = 1)
    y_pred_list = []
    net.eval()
    with torch.no_grad():
        for X_batch in test_loader:
            y_test_pred= net(X_batch)
            y_test_pred= torch.sigmoid(y_test_pred)
            y_pred_tag = torch.round(y_test_pred)
            y_pred_list.append(y_pred_tag.cpu().numpy())
        
    y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
    cm, acc, mcc = evaluate(y_test, y_pred_list)
    print()
    print('Confusion Matrix of test set:')
    print(cm)
    print(f' Accurency:{acc}, MCC:{mcc}')

In [7]:
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate,weight_decay, batch_size):
    
    train_ls, test_ls = [], []
    
    dataset = trainData(train_features, train_labels) 
    train_iter = DataLoader(dataset, batch_size, shuffle=True)
    
    optimizer = torch.optim.Adam(params=net.parameters(), lr= learning_rate, weight_decay=weight_decay)
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        epoch_acc = 0
        for X, y in train_iter:  # train by batch
            output  = net(X)
            optimizer.zero_grad()
            loss = loss_func(output,y.unsqueeze(1))
            loss.backward()
            optimizer.step()
        
        # get loss & accuracy of each epoch
        train_ls.append(mcc(0,net, train_features, train_labels)) 
        if test_labels is not None:
            test_ls.append(mcc(1,net, test_features, test_labels))
    
    return train_ls, test_ls

In [8]:
def mcc(flag,net,x,y):
    if flag == 1: ### valid 数据集
        net.eval()
    else:
        net.train()
    #np.seterr(divide='ignore', invalid='ignore')
    output = net(x)
    result = torch.round(torch.sigmoid(output))
    loss = loss_func(output,y.unsqueeze(1))
    
    cm, acc, mcc = evaluate(y.detach().numpy(), result.detach().numpy())
    
    #print(cm)
    return (loss.data.item(), acc, mcc)


In [9]:
def evaluate(x, y):
    cm = confusion_matrix(x, y)
    acc = (cm[0,0]+cm[1,1])/(cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1])
    upper = cm[0,0]*cm[1,1]-cm[0,1]*cm[1,0]
    lower = sqrt((cm[0,0]+cm[0,1])*(cm[0,0]+cm[1,0])*(cm[1,1]+cm[0,1])*(cm[1,1]+cm[1,0]))
    
    if (cm[0,0]+cm[0,1])*(cm[0,0]+cm[1,0])*(cm[1,1]+cm[0,1])*(cm[1,1]+cm[1,0]) == 0:
        mcc = 0
    else:
        mcc = upper / lower
    return cm, acc, mcc

In [10]:
loss_func = nn.BCEWithLogitsLoss()
k_fold(5, x, label)

************************* fold 1 *************************
train_loss:0.268360 train_acc:0.8806 train_mcc:0.7622
 valid loss:0.376408 valid_acc:0.8520 valid_mcc:0.7056
************************* fold 2 *************************
train_loss:0.280465 train_acc:0.8744 train_mcc:0.7489
 valid loss:0.392208 valid_acc:0.8420 valid_mcc:0.6844
************************* fold 3 *************************
train_loss:0.291938 train_acc:0.8672 train_mcc:0.7345
 valid loss:0.393149 valid_acc:0.8405 valid_mcc:0.6815
************************* fold 4 *************************
train_loss:0.279358 train_acc:0.8805 train_mcc:0.7612
 valid loss:0.389929 valid_acc:0.8330 valid_mcc:0.6665
************************* fold 5 *************************
train_loss:0.279183 train_acc:0.8815 train_mcc:0.7632
 valid loss:0.388477 valid_acc:0.8415 valid_mcc:0.6829
************************* final result *************************
train_loss_sum:0.2799 train_acc_sum:0.8768 train_mcc_sum:0.7540
 valid_loss_sum:0.3880 valid_ac