# Train Model

## Import Libraries

In [1]:
# import library
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

# sklearn
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import f1_score

# torch
import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid, binary_cross_entropy, nll_loss
from torch.optim import Adam, SGD

# bio library
import biosppy
from biosppy import storage
from biosppy.signals import ecg

# 载入CNN模型
from ResNet import ResNet

DATA_DIR = "Data"
RESULT_DIR = "Result"

## Load Data

In [2]:
# Load Data
X_train_df = pd.read_csv(os.path.join(DATA_DIR, "X_train.csv"), header=0, index_col=0)
X_test_df = pd.read_csv(os.path.join(DATA_DIR, "X_test.csv"), header=0, index_col=0)
y_train_df = pd.read_csv(os.path.join(DATA_DIR, "y_train.csv"), header=0, index_col=0)

X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values.ravel()

# 获取有效长度
X_train_len = []
for row in X_train:
    tail_id = np.where(np.isnan(row))[0]
    if tail_id.shape[0] > 0:
        X_train_len.append(tail_id[0])
    else:
        X_train_len.append(X_train.shape[1])

X_test_len = []
for row in X_test:
    tail_id = np.where(np.isnan(row))[0]
    if tail_id.shape[0] > 0:
        X_test_len.append(tail_id[0])
    else:
        X_test_len.append(X_test.shape[1])

X_train_len, X_test_len = np.array(X_train_len), np.array(X_test_len)

# 划分训练集和验证集
splitted_X_train, splitted_X_valid, splitted_X_train_len, splitted_X_valid_len, splitted_y_train, splitted_y_valid = train_test_split(X_train, X_train_len, y_train, test_size=0.3, random_state=10)

## Process Data

In [3]:
def window_split(x: np.ndarray, y: np.ndarray, x_len: np.ndarray, window_size:  int = 6000, stride: int = 300) -> np.ndarray:
    """
    滑动窗口切分数据
    :param x: 数据
    :param y: 标签
    :param window_size: 窗口大小
    :param stride: 步长
    :return: 切分后的数据
    """
    new_x = []
    new_y = []
    pids = []
    for pid, (x_row, y_row, x_row_len) in enumerate(zip(x, y, x_len)):
        if x_row_len < window_size:
            new_x_row = np.concatenate((x_row[:x_row_len], np.zeros((window_size - x_row_len))), axis=0)
            new_x.append(new_x_row)
            new_y.append(y_row)
            pids.append(pid)
        else:
            for i in range(0, (x_row_len - window_size) // stride + 1):
                new_x_row = x_row[i * stride: i * stride + window_size]
                new_x.append(new_x_row)
                new_y.append(y_row)
                pids.append(pid)
    return np.array(new_x), np.array(new_y), np.array(pids)

new_X_train, new_y_train, train_pids = window_split(splitted_X_train, splitted_y_train, splitted_X_train_len, stride=300)
new_X_valid, new_y_valid, valid_pids = window_split(splitted_X_valid, splitted_y_valid, splitted_X_valid_len, stride=300)

## Train Model

In [4]:
# 处理数据
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cpu_device = torch.device("cpu")

new_X_train = torch.from_numpy(new_X_train).float().to(device)
new_y_train = torch.from_numpy(new_y_train).long().to(device)
new_X_valid = torch.from_numpy(new_X_valid).float().to(device)
new_y_valid = torch.from_numpy(new_y_valid).long().to(device)

dataset = torch.utils.data.TensorDataset(new_X_train, new_y_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

model = ResNet(input_channels=1, output_features=32, output_dim=4).to(device)
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

def score(model, x, y):
    """
    预测
    :param model: 模型
    :param x: 数据
    :return: 预测结果
    """
    model.eval()
    with torch.no_grad():
        y_preds = []
        for i in range(0, x.shape[0], 64):
            x_batch = x[i:i + 64, :]
            x_batch = x_batch.reshape([x_batch.shape[0], 1, x_batch.shape[1]])
            y_pred = model(x_batch)
            y_pred = torch.argmax(torch.softmax(y_pred, dim=-1), dim=-1).to(cpu_device).detach().numpy()
            y_preds.append(y_pred)
        y_preds = np.concatenate(y_preds, axis=0)
        score = f1_score(y.to(cpu_device).detach().numpy(), y_preds, average="micro")
    model.train()
    return score

# 训练模型
model.train()
epochs = 200
f1s = []
with trange(epochs, desc="Training") as pbar:
    pbar_dict = {}
    pbar_dict["loss"] = round(0, 3)
    pbar_dict["f1"] = 0
    for epoch in pbar:
        for X_batch, y_batch in dataloader:
            optimizer.zero_grad()
            # 处理X_batch
            X_batch = X_batch.reshape([X_batch.shape[0], 1, X_batch.shape[1]]) # 输入需要是(sequence_len, 1, batch_size)

            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            
            pbar_dict["loss"] = round(loss.detach().item(), 3)
            optimizer.step()

            pbar.set_postfix(pbar_dict)
        f1 = score(model, new_X_valid, new_y_valid)
        f1s.append(f1)

score(model, new_X_valid, new_y_valid)
# torch.save(model.state_dict(), os.path.join("Model", "Resnet_32F.pt"))

Training: 100%|██████████| 200/200 [1:26:28<00:00, 25.94s/it, loss=0.004, f1=0]


0.8101317715959004

In [5]:
score(model, new_X_valid, new_y_valid)

0.8101317715959004

In [6]:
# np.save(os.path.join("Data", "new_X_train_1.npy"), new_X_train.to(cpu_device).detach().numpy())
# np.save(os.path.join("Data", "new_y_train.npy"), new_y_train.to(cpu_device).detach().numpy())
# np.save(os.path.join("Data, pids.npy"), pids)