In [15]:
import os
import numpy as np
import torch
from torch import nn,einsum
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from functions import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle


In [16]:
# set path
#data_path = "./jpegs_256/"    # define UCF-101 RGB data path
data_path = "/home/hanpeiheng/dataset/yawdd_temp/"
action_name_path = './data.pkl' #全部动作的名字标签
save_model_path = "../vivit_ckpt/"
#fnames = os.listdir(data_path)
#fnames

# EncoderCNN architecture
CNN_fc_hidden1, CNN_fc_hidden2 = 256, 128
CNN_embed_dim = 256      # latent dim extracted by 2D CNN
img_x, img_y = 80,80  # resize video 2d frame size
img_size = 80
ptc_size=20
dropout_p = 0.3          # dropout probability 丢失概率设置为0.0，这意味着不会丢弃任何神经元。这表明该模型可能不太复杂，过拟合可能不是一个主要问题。

# DecoderRNN architecture
RNN_hidden_layers = 3 # 三个隐藏层
RNN_hidden_nodes = 128 # 每个隐藏层512个节点
RNN_FC_dim = 128 # 一个全连接层，其维度为256

# training parameters
k = 3             # number of target category 目标类别的数量为101个
epochs = 100        # training epochs 训练轮数
batch_size = 31     # 每批次训练的样本数量为30
learning_rate = 1e-4 # 学习率为0.0001  学习率参数很重要，自己搜搜看吧
log_interval = 10   # interval for displaying training info 训练过程中打印训练信息的间隔为10
lam=1e-4
weight_decay_global=1e-6
step_size=10
gamma=0.9

# Select which frame to begin & end in videos
begin_frame, end_frame, skip_frame = 30, 90, 2 #begin_frame表示从第1帧开始，end_frame表示结束帧数为29，skip_frame表示每隔1帧进行采样，即不对连续帧进行处理。这些参数通常用于视频分类或动作识别任务中




In [17]:
# 输入参数：包括日志输出间隔(log_interval)、模型(model)，设备(device)、训练数据加载器(train_loader)、优化器(optimizer)和当前epoch
def train(log_interval, model, device, train_loader, optimizer, epoch):
    # set model as training mode
    en = model[0]
    de = model[1]
    en.train()
    de.train()

    losses = [] # 损失率
    scores = [] # 准确率
    N_count = 0   # counting total trained sample in one epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        # distribute data to device
        # 对于每个批次(batch)，它将数据(X, y)分发到给定的设备上(device)
        X, y = X.to(device), y.to(device).view(-1, )

        N_count += X.size(0)
        
        # 模型的前向传递通过使用卷积神经网络编码器(cnn_encoder)对输入数据进行特征提取
        # 并使用循环神经网络解码器(rnn_decoder)对特征进行分类输出(output)
        optimizer.zero_grad()
        output = de(en(X))   # output has dim = (batch, number of classes)
        
        # 计算输出(output)与标签(y)之间的交叉熵损失(loss)
        re_loss = 0
        for name, param in model[0].named_parameters():
            if param.requires_grad:
                re_loss += torch.sum(torch.abs(param))
        for name, param in model[1].named_parameters():
            if param.requires_grad:
                re_loss += torch.sum(torch.abs(param))
        loss = F.cross_entropy(output, y) + re_loss * lam
        losses.append(loss.item()) # 将该损失值添加到损失列表(losses)

        # to compute accuracy
        # 使用 torch.max() 函数检索输出张量(output)中最大值的索引作为预测标签(y_pred)
        y_pred = torch.max(output, 1)[1]  # y_pred != output
        # 将预测标签与实际标签(y)进行比较。使用 accuracy_score() 函数计算预测准确率(step_score)
        # 并将其添加到准确度列表(scores)中
        step_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy())
        scores.append(step_score)         # computed on CPU
        
        # 反向传播将梯度更新到各个网络层
        # 并执行优化步骤(optimizer.step())来更新模型参数
        loss.backward()
        optimizer.step()

        # show information
        # 如果达到了日志输出间隔(log_interval)，则打印训练信息，包括当前epoch、批次进度、损失和准确率
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch , N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    return sum(losses)/len(losses), sum(scores)/len(scores)

# 输入参数：接受一个模型、设备、优化器和测试数据集作为参数
def validation(model, device, optimizer, test_loader):
    # set model as testing mode
    # 在没有梯度计算的情况下进行测试
    en = model[0]
    de = model[1]
    en.eval() # 将模型设置为评估（testing）模式
    de.eval() # 将模型设置为评估（testing）模式

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        # 循环遍历测试数据集中的所有批次
        # 将每个批次的输入数据X和标签y分配到指定的设备上
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device).view(-1, )
            # 使用模型对输入数据进行前向传递并计算输出结果
            output = de(en(X))
            # 使用交叉熵损失函数计算输出结果和实际标签的损失
            re_loss = 0
            for name, param in model[0].named_parameters():
                if param.requires_grad:
                    re_loss += torch.sum(torch.abs(param))
            
            for name, param in model[1].named_parameters():
                if param.requires_grad:
                    re_loss += torch.sum(torch.abs(param))
            
                    
            loss = F.cross_entropy(output, y, reduction='sum') + re_loss * lam
            test_loss += loss.item()                 # sum up batch loss 将所有批次的损失相加以计算平均损失
            # 将输出结果转换为预测标签
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) get the index of the max log-probability

            # collect all y and y_pred in all batches
            # 将所有批次的实际标签和预测标签收集到all_y和all_y_pred列表中
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss /= len(test_loader.dataset)

    # compute accuracy
    # 计算测试准确率
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())

    # show information
    print('Test set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))

    # save Pytorch models of best record
    # 将当前模型的状态字典（state_dict）保存到指定路径
    torch.save(en.state_dict(), os.path.join(save_model_path, 'en_epoch{}.pth'.format(epoch)))  # save spatial_encoder
    torch.save(de.state_dict(), os.path.join(save_model_path, 'de_epoch{}.pth'.format(epoch)))  # save spatial_encoder
    # torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_epoch{}.pth'.format(epoch + 1)))  # save motion_encoder
    torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch{}.pth'.format(epoch)))      # save optimizer
    print("Epoch {} model saved!".format(epoch))
    
    # 返回测试损失和测试准确率
    return test_loss, test_score


# Detect devices
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

# Data loading parameters
# 参数介绍 # ↓
# batch_size: 批处理大小，指定每个批次(batch)中包含的样本数量
# shuffle: 是否对数据进行洗牌，即随机打乱顺序。如果设置为 True，则每个 epoch(整个数据集被遍历一次)都会重新打乱数据
# num_workers: 使用多少个子进程来加载数据。默认值是 0，表示在主进程中加载数据。如果设置为大于 0 的数值，则使用多个子进程同时读取数据，可以加快数据加载速度
# pin_memory: 是否将数据放置在 CUDA 主机内存上。如果使用 GPU 计算，这个参数可以提高数据加载速度
# if use_cuda else {} 根据变量 use_cuda 是否为 True 来决定是否将字典设为空字典 {}。这里的意思是，
# 如果 use_cuda 为 True，那么就使用前面定义的 batch_size、shuffle、num_workers 和 pin_memory 这些参数；否则，就使用空字典
# 这个判断是由于在使用 GPU 时，需要将参数数据字典放在显存上，而在 CPU 上使用时则不需要
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}

In [18]:
# load UCF101 actions names
# 加载101个视频动作名字
with open(action_name_path, 'rb') as f:
    action_names = pickle.load(f)
    
# convert labels -> category
# LabelEncoder 是 scikit-learn 库中的一个工具类，可以将分类变量编码为数值变量
# 定义了一个 LabelEncoder 的实例对象 le，然后使用 fit() 方法来拟合数据（即学习标签与类别之间的对应关系）。fit() 方法的参数为 action_names，即标签名列表
# 例如，如果 action_names 中包含三个不同的标签名：'A'、'B'、'C'，那么执行 fit() 后，le.classes_ 属性会被设置为 ['A', 'B', 'C']，
le = LabelEncoder()
le.fit(action_names)

# show how many classes there are
list(le.classes_)

# convert category -> 1-hot
# OneHotEncoder() 是 scikit-learn 库中的一个工具类，可以将分类变量编码为 one-hot 向量
# 使用前面定义的 LabelEncoder 对标签(label)进行编码，得到类别序号(category index)。
# 这一步使用了 le.transform(action_names) 方法将 action_names 列表中的每个标签映射为对应的类别序号
# 接着上面的例子就是le.transform(['A', 'B', 'C']) 会返回一个数组 [0, 1, 2]，表示 'A' 被编码为 0，'B' 编码为 1，'C' 编码为 2。
# 并通过 reshape(-1, 1) 将结果转换为列向量如下：
#array([[0],
#       [1],
#      [2]])
action_category = le.transform(action_names).reshape(-1, 1)
# 创建一个 OneHotEncoder 的实例对象 enc
# 并使用 fit() 方法拟合数据（即学习各类别之间的关系）
enc = OneHotEncoder()
enc.fit(action_category)

# # example
# y = ['HorseRace', 'YoYo', 'WalkingWithDog']
# y_onehot = labels2onehot(enc, le, y)
# y2 = onehot2labels(le, y_onehot)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [19]:
actions = []
fnames = os.listdir(data_path)

all_names = []
for f in fnames:
    loc1 = f.find('s-')
    if loc1==-1:
        loc1=f.find('d-')
        
        if loc1 == -1:
            loc1=f.find('e-')
            
    actions.append(f[(loc1 + 2): ])

    all_names.append(f)

all_X_list = all_names
all_y_list = labels2cat(le, actions)    

train_list, test_list, train_label, test_label = train_test_split(all_X_list, all_y_list, test_size=0.2, random_state=42)


transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.229, 0.224, 0.225])])

selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()

# 反斜线 \ 是 Python 中的行连接符，用于将一行代码分成多行来提高代码的可读性和易维护性
train_set, valid_set = Dataset_CRNN(data_path, train_list, train_label, selected_frames, transform=transform), \
                       Dataset_CRNN(data_path, test_list, test_label, selected_frames, transform=transform)

train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

# Create model
# EncoderCNN和DecoderRNN都是functions.py的自定义类
en = ViViT_en(image_size=img_size, patch_size=ptc_size, num_frames=30,dropout=dropout_p).to(device)
de = LSTM_de(CNN_embed_dim=32, h_RNN_layers=2, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=3).to(device)
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    en = nn.DataParallel(en) #分配到可用的GPU上
    de = nn.DataParallel(de) #分配到可用的GPU上
# 作为一个整体被优化器所优化
params = list(en.parameters())+list(de.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate,weight_decay=weight_decay_global)
StepLR = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)


Using 4 GPUs!


In [20]:
# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# start training
for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(log_interval, [en,de], device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation([en,de], device, optimizer, valid_loader)

    # save results
    # train_scores_avg=sum(train_scores)/len(train_scores)
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)

    # save all train test results
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)
    C = np.array(epoch_test_losses)
    D = np.array(epoch_test_scores)
    np.save('./vivit_epoch_training_losses.npy', A)
    np.save('./vivit_epoch_training_scores.npy', B)
    np.save('./vivit_epoch_test_loss.npy', C)
    np.save('./vivit_epoch_test_score.npy', D)

# plot
fig = plt.figure(figsize=(10, 4)) #创建一个10x4英寸的新窗口
plt.subplot(121) # 参数121表示将整个图形窗口分成1行2列，在第1个位置上添加子图，设为当前绘图区域
#  绘制折线图，在当前子图中绘制以训练时期为横坐标、损失值为纵坐标的折线图
# 参数np.arange(1, epochs + 1)用于生成一个1到epochs的整数序列，表示训练时期的编号
# A[:, -1]表示将数组A的最后一列作为y轴的数据，即每个训练时期的最后一个batch的损失值
plt.plot(np.arange(1, epochs + 1), A)  # train loss (on epoch end)
plt.plot(np.arange(1, epochs + 1), C)         #  test loss (on epoch end)
plt.title("model loss")
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc="upper left")
# 2nd figure 
#代码解释和上面的同理
plt.subplot(122)
plt.plot(np.arange(1, epochs + 1), B)  # train accuracy (on epoch end)
plt.plot(np.arange(1, epochs + 1), D)         #  test accuracy (on epoch end)
plt.title("training scores")
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc="upper left")
title = "./fig_yawdd_vivit.png"
plt.savefig(title, dpi=600)
# plt.close(fig)
plt.show()

Test set (157 samples): Average loss: 1.1929, Accuracy: 42.04%

Epoch 0 model saved!
Test set (157 samples): Average loss: 1.1845, Accuracy: 42.04%

Epoch 1 model saved!
Test set (157 samples): Average loss: 1.1799, Accuracy: 42.04%

Epoch 2 model saved!


KeyboardInterrupt: 

In [None]:
print(torch.cuda.is_available())

True
