In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [23]:
# Directory holding the pre-split CSV files; edit this one line to relocate the data.
DATA_DIR = '/Users/clause/Desktop/QML'

# index_col=0 makes the first CSV column the row index.
# NOTE: a later cell defines a function that is also named `train`; re-run this
# cell if that definition has replaced the DataFrame.
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col=0)

In [24]:
# Preview the first five rows to sanity-check the load.
train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,0.1
0,0.652778,0.0,0.0,1.237158,-0.485387,-0.186567,-0.639733,0.0,0
1,0.438889,0.0,0.0,-0.017201,-0.324686,0.515736,-0.158885,1.0,0
2,0.466667,0.0,0.0,0.484542,0.234839,-0.537718,-0.701237,1.0,0
3,0.166667,0.0,0.0,0.484542,0.047138,-0.537718,0.584752,1.0,0
4,0.177778,0.0,0.0,-1.271561,-0.488966,-0.186567,-0.530704,1.0,0


In [25]:
# The label column shows up as '0.1' in the loaded frames; give it a readable name.
# Rebinding the result keeps this cell free of in-place mutation.
train = train.rename(columns={'0.1': 'target'})
test = test.rename(columns={'0.1': 'target'})

In [26]:
# Every column except the last is a feature; the last column is the label.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

In [6]:
# Training hyper-parameters.
EPOCHS = 50            # passes over the training loader
BATCH_SIZE = 64        # samples per training batch
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer

# Seed the random number generators behind weight initialisation, shuffling and
# dropout so that Restart & Run All reproduces the same numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [7]:
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Both containers are stored as given and are indexed with the same key,
    so they are expected to be index-aligned.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The feature container alone determines the dataset size.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label
    
class testData(Dataset):
    """Map-style dataset that serves feature rows only (no labels)."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        n_rows = len(self.X_data)
        return n_rows

    def __getitem__(self, index):
        row = self.X_data[index]
        return row


In [8]:
# Convert the pandas features/labels to float32 tensors and wrap them in the
# notebook's Dataset classes; the test set is wrapped without labels.
train_data = trainData(
    torch.tensor(X_train.values, dtype=torch.float32),
    torch.tensor(y_train.values, dtype=torch.float32),
)

test_data = testData(torch.tensor(X_test.values, dtype=torch.float32))

In [9]:
# Training batches are drawn in shuffled order; test rows are served one per
# batch in their stored order (shuffle is left at its default).
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [10]:
class MLP(nn.Module):
    """Fully connected network that returns one raw output value per sample.

    The network is three ``Linear -> ReLU -> BatchNorm1d`` stages followed by
    dropout and a single-unit linear output layer.  No sigmoid is applied in
    ``forward``; the caller is expected to apply it (or use a loss that does).

    Args:
        in_features: Number of input features per sample (default 8).
        hidden_features: Width of each of the three hidden layers (default 64).
        dropout_p: Dropout probability applied before the output layer
            (default 0.1).
    """

    def __init__(self, in_features=8, hidden_features=64, dropout_p=0.1):
        super().__init__()

        # Sub-modules are registered in the original order so that
        # ``print(model)`` and the ``state_dict()`` keys are unchanged.
        self.layer_1 = nn.Linear(in_features, hidden_features)
        self.layer_2 = nn.Linear(hidden_features, hidden_features)
        self.layer_3 = nn.Linear(hidden_features, hidden_features)
        self.layer_out = nn.Linear(hidden_features, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(hidden_features)
        self.batchnorm2 = nn.BatchNorm1d(hidden_features)
        self.batchnorm3 = nn.BatchNorm1d(hidden_features)

    def forward(self, inputs):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` raw outputs."""
        # Batch normalisation is applied after each activation, not before it.
        x = self.batchnorm1(self.relu(self.layer_1(inputs)))
        x = self.batchnorm2(self.relu(self.layer_2(x)))
        x = self.batchnorm3(self.relu(self.layer_3(x)))
        x = self.dropout(x)
        return self.layer_out(x)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first.

        Not used by ``forward``; kept for callers that may rely on it.
        """
        num_features = 1
        for dim in x.size()[1:]:
            num_features *= dim
        return num_features
    

In [11]:

######## k-fold split ############
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into training and validation parts for fold ``i`` of ``k``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows.
    The last fold additionally takes the ``X.shape[0] % k`` left-over rows, so
    every sample lands in exactly one fold (previously those rows were dropped
    from both the training and the validation data).  Fold ``i`` is returned
    as the validation set and all remaining rows, in their original order, as
    the training set.

    Args:
        k: Number of folds; must be greater than 1.
        i: Zero-based index of the validation fold, ``0 <= i < k``.
        X: Feature tensor, sliced along dim 0.
        y: Label tensor, sliced along dim 0 with the same row ranges as ``X``.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i={i} is outside the range 0..{k - 1}')

    n_rows = X.shape[0]
    fold_size = n_rows // k  # rows per fold, before the remainder is assigned

    start = i * fold_size
    # The final fold runs to the end of the data so the remainder is not lost.
    stop = n_rows if i == k - 1 else start + fold_size

    X_valid, y_valid = X[start:stop], y[start:stop]
    # Training data is everything before and after the validation slice,
    # concatenated along the row dimension.
    X_train = torch.cat((X[:start], X[stop:]), dim=0)
    y_train = torch.cat((y[:start], y[stop:]), dim=0)
    return X_train, y_train, X_valid, y_valid
 
 
def k_fold(k, X_train, y_train, num_epochs=3, learning_rate=0.001, weight_decay=0.1, batch_size=5,
           net_factory=None):
    """Run k-fold cross-validation and print per-fold and averaged metrics.

    For each fold a fresh network is created, trained with ``train`` on the
    fold's training split and evaluated on its validation split.  The loss
    and accuracy of the last epoch are printed per fold, and their averages
    over the ``k`` folds are printed at the end.  Nothing is returned.

    Args:
        k: Number of folds.
        X_train: Feature tensor passed to ``get_k_fold_data``.
        y_train: Label tensor passed to ``get_k_fold_data``.
        num_epochs: Epochs per fold, forwarded to ``train``.
        learning_rate: Forwarded to ``train``.
        weight_decay: Forwarded to ``train``.
        batch_size: Forwarded to ``train``.
        net_factory: Zero-argument callable returning a new, untrained
            network.  When omitted, the global name ``Net`` is used.
    """
    if net_factory is None:
        # NOTE(review): `Net` is not defined in the visible part of this
        # notebook; define it before calling or pass `net_factory` explicitly.
        net_factory = Net

    train_loss_sum, valid_loss_sum = 0, 0
    train_acc_sum, valid_acc_sum = 0, 0

    for i in range(k):
        # Training and validation tensors for the i-th fold.
        data = get_k_fold_data(k, i, X_train, y_train)
        net = net_factory()  # a fresh model for every fold
        # train() returns one (loss, accuracy) tuple per epoch for each split.
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)

        # Banner text reads "fold <i+1>".
        print('*'*25,'第',i+1,'折','*'*25)
        # train_acc is taken from train_ls (it used to print the validation value).
        print('train_loss:%.6f'%train_ls[-1][0],'train_acc:%.4f\n'%train_ls[-1][1],
              'valid loss:%.6f'%valid_ls[-1][0],'valid_acc:%.4f'%valid_ls[-1][1])
        train_loss_sum += train_ls[-1][0]
        valid_loss_sum += valid_ls[-1][0]
        train_acc_sum += train_ls[-1][1]
        valid_acc_sum += valid_ls[-1][1]

    # Banner text reads "final k-fold cross-validation result".
    print('#'*10,'最终k折交叉验证结果','#'*10)
    # Despite the "_sum" labels, the printed values are means over the k folds.
    print('train_loss_sum:%.4f'%(train_loss_sum/k),'train_acc_sum:%.4f\n'%(train_acc_sum/k),
          'valid_loss_sum:%.4f'%(valid_loss_sum/k),'valid_acc_sum:%.4f'%(valid_acc_sum/k))
 
 
######### Training function ##########
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate,weight_decay, batch_size):
    """Train ``net`` with Adam and record per-epoch metrics.

    After every epoch the network is evaluated with ``log_rmse`` on the full
    training data and, when ``test_labels`` is not ``None``, on the
    test/validation data.

    The loss is computed with the module-level ``loss_func``, which must be
    defined before this function is called (the notebook's own definition is
    commented out).

    Args:
        net: Model to optimise in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Validation inputs (ignored when ``test_labels`` is None).
        test_labels: Validation targets, or ``None`` to skip validation.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        batch_size: Mini-batch size.

    Returns:
        ``(train_ls, test_ls)``: two lists with one ``log_rmse`` result per
        epoch; ``test_ls`` is empty when ``test_labels`` is ``None``.
    """
    train_ls, test_ls = [], []  # per-epoch metrics for each split

    # `trainData` is the (features, labels) Dataset class defined earlier in
    # this notebook; the previously referenced `TraindataSet` does not exist here.
    dataset = trainData(train_features, train_labels)
    train_iter = DataLoader(dataset, batch_size, shuffle=True)

    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for _ in range(num_epochs):
        for X, y in train_iter:  # one optimisation step per mini-batch
            output = net(X)
            loss = loss_func(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Metrics after each epoch; flag 0 = training data, flag 1 = validation data.
        train_ls.append(log_rmse(0, net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(1, net, test_features, test_labels))
    return train_ls, test_ls
 
def log_rmse(flag,net,x,y):
    """Evaluate ``net`` on ``(x, y)`` and return ``(loss, accuracy_percent)``.

    Despite its name this function does not compute a log-RMSE: it returns the
    value of the module-level ``loss_func`` and the classification accuracy,
    where the predicted class is the arg-max of the network output along dim 1.

    Args:
        flag: ``1`` switches the network to eval mode for the forward pass
            (used for validation data); any other value leaves the mode as is.
        net: Model to evaluate.
        x: Input tensor for the whole split.
        y: Integer class labels for ``x``.

    Returns:
        ``(loss, accuracy)`` as Python numbers; accuracy is a percentage.

    Side effects:
        The network is always left in training mode on exit.
    """
    if flag == 1:  # validation split
        net.eval()
    try:
        # Metrics only: no autograd graph is needed for this full-split pass.
        with torch.no_grad():
            output = net(x)
            predicted = torch.max(output, 1)[1].view(y.size())
            corrects = (predicted == y).sum().item()
            accuracy = corrects * 100.0 / len(y)
            loss = loss_func(output, y)
    finally:
        # Hand the model back in training mode, even if evaluation failed.
        net.train()

    return (loss.item(), accuracy)
 
#loss_func = nn.CrossEntropyLoss() ### declare the loss function
#k_fold(5,x,label) ### k=5, 5-fold cross-validation

In [12]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cpu


In [13]:
# Build the classifier and move it to the selected device.
# Module.to() returns the module itself, so the two steps can be chained.
model = MLP().to(device)

print(model)

MLP(
  (layer_1): Linear(in_features=8, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_3): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [14]:
# Binary cross-entropy computed directly on the model's raw (pre-sigmoid) outputs.
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters.
# NOTE(review): weight_decay=0.1 is a strong penalty, and the recorded training
# loss rises after epoch 5 — confirm this value is intended.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)

In [15]:
def MLP_acc(y_pred, y_test):
    """Percentage of correct binary predictions, rounded to a whole number.

    Args:
        y_pred: Raw model outputs; a sigmoid is applied and the result is
            rounded to 0 or 1 to obtain the predicted label.
        y_test: 0/1 targets.  May have a different shape from ``y_pred`` as
            long as it holds the same number of elements.

    Returns:
        0-dim float tensor with the accuracy in percent, rounded to the
        nearest whole number.
    """
    if y_test.shape != y_pred.shape and y_test.numel() == y_pred.numel():
        # E.g. (N,) targets against (N, 1) outputs: without aligning the
        # shapes, `==` would broadcast to an (N, N) matrix and the number of
        # "correct" entries would be meaningless.
        y_test = y_test.reshape(y_pred.shape)

    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    return torch.round(acc * 100)

In [16]:
# Mini-batch training. After each epoch the per-batch loss and the per-batch
# accuracy are averaged over the number of batches and printed.
model.train()
for e in range(1, EPOCHS + 1):
    epoch_loss, epoch_acc = 0, 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Column-vector targets so they line up with the (batch, 1) model output.
        targets = y_batch.unsqueeze(1)

        y_pred = model(X_batch)
        loss = criterion(y_pred, targets)
        acc = MLP_acc(y_pred, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss:{epoch_loss/len(train_loader):.5f} | Acc:{epoch_acc/len(train_loader): .3f}')

Epoch 001: | Loss:0.44812 | Acc: 79.338
Epoch 002: | Loss:0.42257 | Acc: 81.280
Epoch 003: | Loss:0.41679 | Acc: 81.605
Epoch 004: | Loss:0.41955 | Acc: 81.541
Epoch 005: | Loss:0.41693 | Acc: 81.745
Epoch 006: | Loss:0.41910 | Acc: 82.274
Epoch 007: | Loss:0.42109 | Acc: 82.401
Epoch 008: | Loss:0.42413 | Acc: 82.306
Epoch 009: | Loss:0.43154 | Acc: 82.682
Epoch 010: | Loss:0.44352 | Acc: 82.236
Epoch 011: | Loss:0.44631 | Acc: 82.720
Epoch 012: | Loss:0.44567 | Acc: 82.331
Epoch 013: | Loss:0.44522 | Acc: 83.006
Epoch 014: | Loss:0.44640 | Acc: 82.752
Epoch 015: | Loss:0.44798 | Acc: 82.433
Epoch 016: | Loss:0.44653 | Acc: 82.503
Epoch 017: | Loss:0.44701 | Acc: 82.522
Epoch 018: | Loss:0.44550 | Acc: 82.459
Epoch 019: | Loss:0.44446 | Acc: 82.694
Epoch 020: | Loss:0.44201 | Acc: 82.503
Epoch 021: | Loss:0.44390 | Acc: 82.924
Epoch 022: | Loss:0.44446 | Acc: 82.656
Epoch 023: | Loss:0.44303 | Acc: 82.898
Epoch 024: | Loss:0.44372 | Acc: 82.516
Epoch 025: | Loss:0.44272 | Acc: 82.592


In [17]:
def pred(loader):
    """Predict a 0/1 label for every sample served by ``loader``.

    Uses the notebook-level ``model`` and ``device``.  The model is switched
    to eval mode and gradients are disabled; each raw output goes through a
    sigmoid and is rounded to 0 or 1.

    Predictions are flattened across batches, so the result is one flat list
    for any batch size (previously a batch size above 1 produced a list of
    per-batch lists).  The number of predictions is printed.

    Args:
        loader: Iterable yielding feature batches (tensors).

    Returns:
        Flat list of floats (``0.0`` or ``1.0``), one per sample, in the
        order the loader served them.
    """
    batch_tags = []
    model.eval()
    with torch.no_grad():
        for X_batch in loader:
            X_batch = X_batch.to(device)
            probabilities = torch.sigmoid(model(X_batch))
            # Flatten each batch to 1-D so that all batches can be concatenated.
            batch_tags.append(torch.round(probabilities).cpu().reshape(-1))

    # torch.cat rejects an empty sequence, hence the explicit empty case.
    y_pred_list = torch.cat(batch_tags).tolist() if batch_tags else []
    print(len(y_pred_list))
    return y_pred_list

In [18]:
from math import sqrt
def evl(x, y_pred_list):
    """Print the confusion matrix, accuracy and Matthews correlation coefficient.

    Args:
        x: True binary labels.
        y_pred_list: Predicted binary labels, aligned with ``x``.

    Raises:
        ValueError: If the confusion matrix is not 2x2, e.g. because only one
            class occurs in the labels and predictions.
    """
    cm = confusion_matrix(x, y_pred_list)
    if cm.shape != (2, 2):
        raise ValueError(f'evl expects a binary problem, got a confusion matrix of shape {cm.shape}')

    # Plain Python ints: the four-factor product below can exceed the int64
    # range of the numpy entries for large counts.
    tn, fp, fn, tp = (int(v) for v in cm.ravel())

    acc = (tn + tp) / (tn + fp + fn + tp)

    mcc_denominator = (tn + fp) * (tn + fn) * (tp + fp) * (tp + fn)
    # If any row or column of the matrix sums to zero the MCC is undefined;
    # report 0.0 instead of dividing by zero.
    if mcc_denominator:
        mcc = (tn * tp - fp * fn) / sqrt(mcc_denominator)
    else:
        mcc = 0.0

    print('Confusion Matrix:')
    print(cm)
    print(f' Accurency:{acc}, MCC:{mcc}')

In [19]:
# Predict on the test loader, then print confusion matrix, accuracy and MCC
# against the true test labels.
y_pred_list2 = pred(loader=test_loader)
evl(x=y_test, y_pred_list=y_pred_list2)

4522
Confusion Matrix:
[[3135  848]
 [  84  455]]
 Accurency:0.7938965059708094, MCC:0.45161558183571765
