# ----------------------------------------定义训练函数-----------------------------------------------
- 整合之前的函数进行训练

In [24]:
import argparse
import numpy as np
import os
import pickle
import import_ipynb
# import Ipynb_importer


import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# from Data_Loader_3 import get_loader
from Bulid_Vocab_2 import Vocabulary
from Model_4 import EncoderCNN, DecoderRNN

importing Jupyter notebook from Model_4.ipynb
Namespace(model_path='models/', crop_size=224, vocab_path='data/vocab.pkl', image_dir='data/resized2014', caption_path='data/annotations/captions_train2014.json', log_step=10, save_step=1000, embed_size=256, hidden_size=512, num_layers=1, num_epochs=1, batch_size=128, num_workers=2, learning_rate=0.001)
EncoderCNN(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2)

In [23]:
from Data_Loader import get_loader

Namespace(model_path='models/', crop_size=224, vocab_path='data/vocab.pkl', image_dir='data/resized2014', caption_path='data/annotations/captions_train2014.json', log_step=10, save_step=1000, embed_size=256, hidden_size=512, num_layers=1, num_epochs=1, batch_size=128, num_workers=None, learning_rate=0.001)
loading annotations into memory...
Done (t=0.51s)
creating index...
index created!
torch.Size([128, 3, 224, 224]) torch.Size([128, 22]) 128
tensor([[   1,   33,   30,  ...,   47,   19,    2],
        [   1,    4, 4666,  ...,    2,    0,    0],
        [   1,   48, 1089,  ...,    2,    0,    0],
        ...,
        [   1,   51, 1312,  ...,    0,    0,    0],
        [   1,   51,  744,  ...,    0,    0,    0],
        [   1,    4,   92,  ...,    0,    0,    0]])


In [25]:
"""定义使用GPU进行计算"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
"""定义训练函数"""
def main(args):
    # 检查模型路径是否存在, 不存在则创建
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # 定义图片的增广等操作
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size), # 随即裁剪到指定大小
        transforms.RandomHorizontalFlip(), # 随机水平翻转
        transforms.ToTensor(), # 转成tensor
        transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)) # 标准化
        
    ])
    
    # 加载词典
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # 构建用于训练的数据包
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, 
                             shuffle=True, num_workers=args.num_workers)
    
    # 搭建模型
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # 定义损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # 开始训练模型
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # 设置小批量
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # 把文本打包成一系列字节，为了处理不同长度的字符
            
            # 前向和反向传播，更新参数
            features = encoder(images) # 提取输入图像的特征
            outputs = decoder(features, captions, lengths) # 根据特征和源序列，及其有效填充长度计算预测结果
            loss = criterion(outputs, targets) # 计算误差
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()
            
            # 打印训练进度
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss:{:.4f}, Perplexity:{:5.4f}'.format(epoch, args.num_epochs, i, 
                                                                                           total_step, loss.item(), np.exp(loss.item())))
            
            # 保存模型的检查点
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
            
            

In [28]:
"""定义参数，运行模型（在直接调用的情况下）"""
if __name__ == '__main__':
    
    #   创建一个命令行参数解析器对象，用于储存参数
    parser = argparse.ArgumentParser() 
    
    # 给定一些基础参数，包括文件路径等
    parser.add_argument('--model_path', type=str, default='models/', help='保存训练模型的地方')
    parser.add_argument('--crop_size', type=int, default=224, help='随机裁剪图片的大小')
    parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='之前生成词典的路径')
    parser.add_argument('--image_dir', type=str, default='data/resized2014', help='已经处理好大小的训练图片的路径')
    parser.add_argument('--caption_path', type=str, default='data/annotations/captions_train2014.json',help='训练集标签的路径')
    parser.add_argument('--log_step', type=int, default=10, help='打印训练进度的设定值')
    parser.add_argument('--save_step', type=int, default=1000, help='保存模型节点的设定值')
    
    # 设定模型的参数值
    parser.add_argument('--embed_size', type=int, default=256, help='词嵌入向量的维度，也就是用多少维来表示一个词元')
    parser.add_argument('--hidden_size', type=int, default=512, help='隐藏状态的维度')
    parser.add_argument('--num_layers', type=int, default=1, help='LSTM层的数量')
    
    # 设定训练的参数
    parser.add_argument('--num_epochs', type=int, default=5, help='epoch数')
    parser.add_argument('--batch_size', type=int, default=128, help='批量大小')
    parser.add_argument('--num_workers', type=int, default=2, help='并行运算大小')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='学习率')
    
    # 解析传入的命令行参数
    args = parser.parse_args()
    print(args)
    
    # 调用定义的main函数，传入已经解析后的参数
    main(args)

usage: ipykernel_launcher.py [-h] [--model_path MODEL_PATH] [--crop_size CROP_SIZE] [--vocab_path VOCAB_PATH]
                             [--image_dir IMAGE_DIR] [--caption_path CAPTION_PATH] [--log_step LOG_STEP]
                             [--save_step SAVE_STEP] [--embed_size EMBED_SIZE] [--hidden_size HIDDEN_SIZE]
                             [--num_layers NUM_LAYERS] [--num_epochs NUM_EPOCHS] [--batch_size BATCH_SIZE]
                             [--num_workers NUM_WORKERS] [--learning_rate LEARNING_RATE]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\De\AppData\Roaming\jupyter\runtime\kernel-ba5b0948-ea27-4431-b02d-6f7505e38844.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [31]:
"""创建一个命令行参数解析器对象，用于储存参数"""
parser = argparse.ArgumentParser() 

# 给定一些基础参数，包括文件路径等
parser.add_argument('--model_path', type=str, default='models/', help='保存训练模型的地方')
parser.add_argument('--crop_size', type=int, default=224, help='随机裁剪图片的大小')
parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='之前生成词典的路径')
parser.add_argument('--image_dir', type=str, default='data/resized2014', help='已经处理好大小的训练图片的路径')
parser.add_argument('--caption_path', type=str, default='data/annotations/captions_train2014.json',help='训练集标签的路径')
parser.add_argument('--log_step', type=int, default=10, help='打印训练进度的设定值')
parser.add_argument('--save_step', type=int, default=1000, help='保存模型节点的设定值')

# 设定模型的参数值
parser.add_argument('--embed_size', type=int, default=256, help='词嵌入向量的维度，也就是用多少维来表示一个词元')
parser.add_argument('--hidden_size', type=int, default=512, help='隐藏状态的维度')
parser.add_argument('--num_layers', type=int, default=1, help='LSTM层的数量')

# 设定训练的参数
parser.add_argument('--num_epochs', type=int, default=1, help='epoch数')
parser.add_argument('--batch_size', type=int, default=128, help='批量大小')
parser.add_argument('--num_workers', type=int, default=0, help='并行运算大小')
parser.add_argument('--learning_rate', type=float, default=0.001, help='学习率')

# 解析传入的命令行参数
args = parser.parse_args(args=[])
print(args)

Namespace(model_path='models/', crop_size=224, vocab_path='data/vocab.pkl', image_dir='data/resized2014', caption_path='data/annotations/captions_train2014.json', log_step=10, save_step=1000, embed_size=256, hidden_size=512, num_layers=1, num_epochs=1, batch_size=128, num_workers=0, learning_rate=0.001)


# -------------测试代码能否训练

In [32]:
# 检查模型路径是否存在, 不存在则创建
if not os.path.exists(args.model_path):
    os.makedirs(args.model_path)

In [33]:
# 定义图片的增广等操作
transform = transforms.Compose([
    transforms.RandomCrop(args.crop_size), # 随即裁剪到指定大小
    transforms.RandomHorizontalFlip(), # 随机水平翻转
    transforms.ToTensor(), # 转成tensor
    transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)) # 标准化

])

In [34]:
# 加载词典
with open(args.vocab_path, 'rb') as f:
    vocab = pickle.load(f)

In [35]:
# 构建用于训练的数据包
data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, 
                         shuffle=True, num_workers=args.num_workers)

loading annotations into memory...
Done (t=0.50s)
creating index...
index created!


In [36]:
# 搭建模型
encoder = EncoderCNN(args.embed_size).to(device)
decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

In [37]:
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=args.learning_rate)


In [38]:

# 开始训练模型
total_step = len(data_loader)
for epoch in range(args.num_epochs):
    for i, (images, captions, lengths) in enumerate(data_loader):

        # 设置小批量
        images = images.to(device)
        captions = captions.to(device)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # 把文本打包成一系列字节，为了处理不同长度的字符

        # 前向和反向传播，更新参数
        features = encoder(images) # 提取输入图像的特征
        outputs = decoder(features, captions, lengths) # 根据特征和源序列，及其有效填充长度计算预测结果
        loss = criterion(outputs, targets) # 计算误差
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()

        # 打印训练进度
        if i % args.log_step == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss:{:.4f}, Perplexity:{:5.4f}'.format(epoch, args.num_epochs, i, 
                                                                                       total_step, loss.item(), np.exp(loss.item())))

        # 保存模型的检查点
        if (i+1) % args.save_step == 0:
            torch.save(decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
            torch.save(encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
            
            

Epoch [0/1], Step [0/3236], Loss:9.1966, Perplexity:9863.5826
Epoch [0/1], Step [10/3236], Loss:5.8161, Perplexity:335.6715
Epoch [0/1], Step [20/3236], Loss:5.3998, Perplexity:221.3660
Epoch [0/1], Step [30/3236], Loss:4.9297, Perplexity:138.3336
Epoch [0/1], Step [40/3236], Loss:4.5931, Perplexity:98.8026
Epoch [0/1], Step [50/3236], Loss:4.2968, Perplexity:73.4654
Epoch [0/1], Step [60/3236], Loss:4.3751, Perplexity:79.4479
Epoch [0/1], Step [70/3236], Loss:4.1304, Perplexity:62.2026
Epoch [0/1], Step [80/3236], Loss:3.9044, Perplexity:49.6190
Epoch [0/1], Step [90/3236], Loss:4.0232, Perplexity:55.8785
Epoch [0/1], Step [100/3236], Loss:3.9092, Perplexity:49.8611
Epoch [0/1], Step [110/3236], Loss:3.8121, Perplexity:45.2466
Epoch [0/1], Step [120/3236], Loss:3.5910, Perplexity:36.2694
Epoch [0/1], Step [130/3236], Loss:3.6123, Perplexity:37.0503
Epoch [0/1], Step [140/3236], Loss:3.6877, Perplexity:39.9513
Epoch [0/1], Step [150/3236], Loss:3.6752, Perplexity:39.4575
Epoch [0/1], S

Epoch [0/1], Step [1320/3236], Loss:2.5824, Perplexity:13.2287
Epoch [0/1], Step [1330/3236], Loss:2.4119, Perplexity:11.1557
Epoch [0/1], Step [1340/3236], Loss:2.4681, Perplexity:11.7997
Epoch [0/1], Step [1350/3236], Loss:2.4057, Perplexity:11.0858
Epoch [0/1], Step [1360/3236], Loss:2.3511, Perplexity:10.4972
Epoch [0/1], Step [1370/3236], Loss:2.4997, Perplexity:12.1790
Epoch [0/1], Step [1380/3236], Loss:2.4144, Perplexity:11.1835
Epoch [0/1], Step [1390/3236], Loss:2.3850, Perplexity:10.8586
Epoch [0/1], Step [1400/3236], Loss:2.5625, Perplexity:12.9684
Epoch [0/1], Step [1410/3236], Loss:2.4460, Perplexity:11.5416
Epoch [0/1], Step [1420/3236], Loss:2.4147, Perplexity:11.1864
Epoch [0/1], Step [1430/3236], Loss:2.4680, Perplexity:11.7993
Epoch [0/1], Step [1440/3236], Loss:2.5056, Perplexity:12.2514
Epoch [0/1], Step [1450/3236], Loss:2.4803, Perplexity:11.9454
Epoch [0/1], Step [1460/3236], Loss:2.5564, Perplexity:12.8897
Epoch [0/1], Step [1470/3236], Loss:2.5190, Perplexity:

Epoch [0/1], Step [2630/3236], Loss:2.2105, Perplexity:9.1206
Epoch [0/1], Step [2640/3236], Loss:2.3842, Perplexity:10.8504
Epoch [0/1], Step [2650/3236], Loss:2.2508, Perplexity:9.4953
Epoch [0/1], Step [2660/3236], Loss:2.1967, Perplexity:8.9950
Epoch [0/1], Step [2670/3236], Loss:2.2068, Perplexity:9.0862
Epoch [0/1], Step [2680/3236], Loss:2.2006, Perplexity:9.0301
Epoch [0/1], Step [2690/3236], Loss:2.3353, Perplexity:10.3329
Epoch [0/1], Step [2700/3236], Loss:2.2844, Perplexity:9.8202
Epoch [0/1], Step [2710/3236], Loss:2.2502, Perplexity:9.4896
Epoch [0/1], Step [2720/3236], Loss:2.3878, Perplexity:10.8899
Epoch [0/1], Step [2730/3236], Loss:2.3558, Perplexity:10.5470
Epoch [0/1], Step [2740/3236], Loss:2.2775, Perplexity:9.7524
Epoch [0/1], Step [2750/3236], Loss:2.3189, Perplexity:10.1649
Epoch [0/1], Step [2760/3236], Loss:2.3845, Perplexity:10.8538
Epoch [0/1], Step [2770/3236], Loss:2.0946, Perplexity:8.1219
Epoch [0/1], Step [2780/3236], Loss:2.3633, Perplexity:10.6265
E