In [1]:
from utils import config, data
from utils.batcher import *
from utils.train_util import *
from utils.initialize import loadCheckpoint, save_model

from utils.write_result import *
from datetime import datetime as dt
from tqdm import tqdm
from beam.transormer_beam_search import *
from tensorboardX import SummaryWriter
import argparse

import os
# Expose only physical GPU 1 to CUDA for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Make CUDA kernel launches synchronous so errors are reported at the failing call.
# NOTE(review): this is a debugging aid and slows training - confirm it should stay enabled.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 

def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy, so ``--flag False`` would silently
    enable the flag. This helper accepts the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected True/False, got %r' % (value,))


parser = argparse.ArgumentParser()

# Model variants.
parser.add_argument('--key_attention', type=_str2bool, default=False, help = 'True/False')
parser.add_argument('--intra_encoder', type=_str2bool, default=False, help = 'True/False')
parser.add_argument('--intra_decoder', type=_str2bool, default=False, help = 'True/False')
parser.add_argument('--transformer', type=_str2bool, default=True, help = 'True/False')
parser.add_argument('--train_rl', type=_str2bool, default=False, help = 'True/False')
parser.add_argument('--keywords', type=str, default='POS_FOP_keywords',
                    help = 'POS_FOP_keywords / DEP_FOP_keywords / TextRank_keywords')

# Optimisation and initialisation.
parser.add_argument('--lr', type=float, default=0.0001)
parser.add_argument('--rand_unif_init_mag', type=float, default=0.02)
parser.add_argument('--trunc_norm_init_std', type=float, default=0.001)
parser.add_argument('--mle_weight', type=float, default=1.0)
# NOTE(review): "gound" looks like a typo for "ground"; the name is kept because
# code outside this notebook may read config.gound_truth_prob.
parser.add_argument('--gound_truth_prob', type=float, default=0.1)

# Sequence lengths, schedule and decoding.
parser.add_argument('--max_enc_steps', type=int, default=1000)
parser.add_argument('--max_dec_steps', type=int, default=50)
parser.add_argument('--min_dec_steps', type=int, default=8)
parser.add_argument('--max_epochs', type=int, default=20)
parser.add_argument('--vocab_size', type=int, default=50000)
parser.add_argument('--beam_size', type=int, default=6)
parser.add_argument('--batch_size', type=int, default=8)

# Model size.
parser.add_argument('--hidden_dim', type=int, default=512)
parser.add_argument('--emb_dim', type=int, default=512)
parser.add_argument('--gradient_accum', type=int, default=1)

# Checkpoint and embeddings.
parser.add_argument('--load_ckpt', type=str, default=None, help='0002000')
parser.add_argument('--word_emb_type', type=str, default='word2Vec', help='word2Vec/glove/FastText')
parser.add_argument('--pre_train_emb', type=_str2bool, default=False)

# args=[] makes argparse ignore the kernel's own argv, so only the defaults above apply.
opt = parser.parse_args(args=[])
# NOTE: this rebinds `config`, which until here was the module imported from utils.
config = re_config(opt)
# The RL loss gets whatever weight the MLE loss does not use.
config.rl_weight = 1 - config.mle_weight

# Derive the run name and the TensorBoard directory from the chosen architecture
# and embedding type.
if config.transformer:
    loggerName = 'Transformer_%s' % (config.word_emb_type)
    tensorboard_dir = 'runs/Transformer/%s/exp' % config.word_emb_type
else:
    loggerName = 'Pointer_generator_%s' % (config.word_emb_type)
    tensorboard_dir = 'runs/Pointer-Generator/%s/exp' % config.word_emb_type

# Tag the run name with the optional attention variants that are switched on.
if config.intra_encoder and config.intra_decoder:
    loggerName += '_Intra_Atten'
if config.key_attention:
    loggerName += '_Key_Atten'

logger = getLogger(loggerName)
writer = SummaryWriter(tensorboard_dir)

I0409 01:38:59.294201 140206455236416 file_utils.py:35] PyTorch version 1.4.0 available.
2020-04-09 01:39:00 - Transformer_word2Vec - INFO: - logger已啟動
I0409 01:39:00.136677 140206455236416 train_util.py:92] logger已啟動


In [2]:
# Build the data loaders and the vocabulary from the run configuration.
# Judging by the names (and the "train : ..., test : ..." line the helper logs) these are
# the training split, the held-out split and the vocabulary - TODO confirm in utils.batcher.
train_loader, validate_loader, vocab = getDataLoader(logger, config)

2020-04-09 01:39:05 - Transformer_word2Vec - INFO: - train : 37771, test : 4197
I0409 01:39:05.943351 140206455236416 batcher.py:171] train : 37771, test : 4197


In [3]:
from transformer import Model
import torch.nn as nn
import torch as T
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_

# Step counter restored from a checkpoint; stays None when training starts from scratch.
load_step = None
model = Model(pre_train_emb=config.pre_train_emb,
              word_emb_type=config.word_emb_type,
              vocab=vocab, config=config)

model = model.cuda()

optimizer = T.optim.Adam(model.parameters(), lr=config.lr)
# Alternative optimizer kept for reference:
# optimizer = T.optim.Adagrad(model.parameters(),lr=config.lr, initial_accumulator_value=0.1)

# Build the path from the run *name*. Formatting the logger object itself yields its repr
# (e.g. "<Logger ...>"), a directory that never exists, so the checkpoint was silently skipped.
# Presumably save_model (called with title=loggerName) writes to
# <save_model_path>/<title>/<step>.tar - verify against utils.initialize.
load_model_path = config.save_model_path + '/%s/%s.tar' % (loggerName, config.load_ckpt)

if os.path.exists(load_model_path):
    model, optimizer, load_step = loadCheckpoint(logger, load_model_path, model, optimizer)
elif config.load_ckpt is not None:
    # A checkpoint was explicitly requested but is missing: say so instead of
    # quietly training from scratch.
    logger.info('checkpoint not found, training from scratch: %s' % load_model_path)

In [4]:
def train_one(model, config, batch):
    """Run a forward pass of ``model`` on ``batch`` and return its NLL loss.

    Despite the name, nothing is back-propagated or stepped here: the function
    unpacks the batch with ``get_input_from_batch`` / ``get_output_from_batch``
    (both called with ``batch_first=False``), calls the model and hands the
    prediction to ``model.nll_loss``.

    Args:
        model: callable taking ``(enc_batch, dec_batch, enc_padding_mask,
            dec_padding_mask, enc_batch_extend_vocab, extra_zeros)`` and
            exposing ``nll_loss(pred, target_batch, dec_lens)``.
        config: run configuration, forwarded to the batch helpers.
        batch: batch object as yielded by the data loader.

    Returns:
        Whatever ``model.nll_loss`` returns; callers use ``.item()`` and
        ``.backward()`` on it.
    """
    # Encoder side: only the first five of the ten returned values are needed.
    (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros,
     _, _, _, _, _) = get_input_from_batch(batch, config, batch_first=False)

    # Decoder side: decoder inputs plus the targets the loss is computed against.
    (dec_batch, dec_padding_mask, dec_lens,
     max_dec_len, target_batch) = get_output_from_batch(batch, config, batch_first=False)

    pred = model(
        enc_batch,
        dec_batch,
        enc_padding_mask,
        dec_padding_mask,
        enc_batch_extend_vocab,
        extra_zeros,
    )
    # Alternative objective: model.label_smoothing_loss(pred, target_batch)
    return model.nll_loss(pred, target_batch, dec_lens)


In [5]:
@torch.autograd.no_grad()
def validate(validate_loader, config, model):
    """Return the mean per-batch loss of ``model`` over ``validate_loader``.

    The model is put in eval mode for the pass and switched back to train mode
    afterwards, even when a batch raises, so a failed validation cannot leave
    training running in eval mode.

    Args:
        validate_loader: iterable of batches accepted by ``train_one``.
        config: run configuration, forwarded to ``train_one``.
        model: model exposing ``eval()`` and ``train()``.

    Returns:
        The arithmetic mean of ``loss.item()`` over all batches, or ``nan``
        when the loader yields no batches (previously a ZeroDivisionError).
    """
    model.eval()
    try:
        losses = [train_one(model, config, batch).item() for batch in validate_loader]
    finally:
        model.train()
    if not losses:
        return float('nan')
    return sum(losses) / len(losses)

In [6]:
@torch.autograd.no_grad()
def calc_running_avg_loss(loss, running_avg_loss, decay=0.99, clip=12):
    """Update an exponentially decayed running average of the loss.

    Args:
        loss: the newest loss value.
        running_avg_loss: the previous running average; ``0`` means "no
            history yet", in which case ``loss`` is taken as the average.
        decay: weight kept by the previous average (the new loss gets
            ``1 - decay``).
        clip: upper bound applied to the result; ``None`` disables clipping.
            Defaults to 12, the value that used to be hard-coded.

    Returns:
        The updated running average, capped at ``clip``.
    """
    if running_avg_loss == 0:
        # First update: there is nothing to blend with yet.
        updated = loss
    else:
        updated = running_avg_loss * decay + (1 - decay) * loss
    if clip is not None:
        updated = min(updated, clip)
    return updated

In [7]:
from random import randint
@torch.autograd.no_grad()
def decode(writer, logger, step, config, model, batch, mode):
    """Beam-search decode one batch and log ROUGE / BLEU / sample text for it.

    Relies on the notebook-level names ``vocab``, ``START``, ``END`` and
    ``UNKNOWN_TOKEN``.

    Args:
        writer: summary writer handed to the ``write_*`` helpers.
        logger: unused here; kept for interface compatibility.
        step: global step the metrics are logged under.
        config: run configuration; ``config.is_predicting`` is set to True
            only for the duration of the beam search.
        model: model to decode with. Its train/eval mode is restored on exit,
            so calling this mid-training no longer leaves the model in eval
            mode for the following training steps.
        batch: a single batch, or, when ``mode == 'test'``, a sized loader
            from which one batch is drawn uniformly at random.
        mode: label passed to the ``write_*`` helpers; ``'test'`` additionally
            enables the random-batch selection described above.

    Returns:
        The value returned by ``write_rouge`` (the caller logs it as the
        ROUGE-L F-score).

    Raises:
        ValueError: if ``mode == 'test'`` and the loader is empty.
    """
    if mode == 'test':
        loader = batch
        # len() of the loader itself avoids building a throw-away iterator.
        num_batches = len(loader)
        if num_batches == 0:
            raise ValueError("decode(mode='test') needs a non-empty loader")
        chosen_idx = randint(0, num_batches - 1)
        for idx, candidate in enumerate(loader):
            if idx == chosen_idx:
                batch = candidate
                break

    was_training = model.training
    model.eval()
    config.is_predicting = True
    try:
        pred_ids = beam_search(config, batch, model, START, END, UNKNOWN_TOKEN)
    finally:
        # Always undo the temporary state, even if the beam search fails.
        config.is_predicting = False
        model.train(was_training)

    article_sents, decoded_sents, keywords_list, \
    ref_sents, long_seq_index = prepare_result(vocab, batch, pred_ids)

    rouge_l = write_rouge(writer, step, mode, article_sents, decoded_sents,
                          keywords_list, ref_sents, long_seq_index)

    write_bleu(writer, step, mode, article_sents, decoded_sents,
               keywords_list, ref_sents, long_seq_index)

    write_group(writer, step, mode, article_sents, decoded_sents,
                keywords_list, ref_sents, long_seq_index)

    return rouge_l

In [8]:
from random import randint
@torch.autograd.no_grad()
def avg_acc(writer, logger, epoch, config, model, dataloader, mode='test'):
    """Beam-search decode every batch of ``dataloader`` and log the mean ROUGE-L.

    Relies on the notebook-level names ``vocab``, ``START``, ``END`` and
    ``UNKNOWN_TOKEN``.

    Args:
        writer: summary writer; receives one ``scalar_avg/acc`` point.
        logger: unused here; kept for interface compatibility.
        epoch: x-axis value the average is logged under.
        config: run configuration; ``config.is_predicting`` stays True for the
            whole pass (it used to be reset after the first batch).
        model: model to decode with; its train/eval mode is restored on exit.
        dataloader: iterable of batches.
        mode: ``'train'`` or ``'test'``; selects the scalar name
            (``training_avg_acc`` / ``testing_avg_acc``). Any other value is
            logged as ``<mode>_avg_acc``. Defaults to ``'test'``, which keeps
            the scalar name this function always used.

    Returns:
        The mean of the per-batch scores returned by ``write_rouge``, or
        ``nan`` when the loader yields no batches.
    """
    scores = []
    was_training = model.training
    model.eval()
    config.is_predicting = True
    try:
        for batch in dataloader:
            pred_ids = beam_search(config, batch, model, START, END, UNKNOWN_TOKEN)

            article_sents, decoded_sents, keywords_list, \
            ref_sents, long_seq_index = prepare_result(vocab, batch, pred_ids)

            # write=False: only compute the score, do not log per-batch values.
            rouge_l = write_rouge(writer, None, None, article_sents, decoded_sents,
                                  keywords_list, ref_sents, long_seq_index, write = False)
            scores.append(rouge_l)
    finally:
        config.is_predicting = False
        model.train(was_training)

    avg_rouge_l = sum(scores) / len(scores) if scores else float('nan')

    scalar_names = {'train': 'training_avg_acc', 'test': 'testing_avg_acc'}
    scalar_name = scalar_names.get(mode, '%s_avg_acc' % mode)
    writer.add_scalars('scalar_avg/acc', {scalar_name: avg_rouge_l}, epoch)

    return avg_rouge_l

In [9]:
write_train_para(writer, config)
logger.info('------Training START--------')
running_avg_loss = 0
step = 0
# The RL term is a constant-zero placeholder in this notebook, so build it once
# instead of allocating a new CUDA tensor on every step.
rl_loss = T.FloatTensor([0]).cuda()

for epoch in range(config.max_epochs):
    # decode() and avg_acc() call model.eval(); make sure each epoch starts in train mode.
    model.train()
    for batch in train_loader:
        step += 1
        # One forward pass per batch (the loss used to be computed twice, with the
        # first result thrown away).
        mle_loss = train_one(model, config, batch)
        (config.mle_weight * mle_loss + config.rl_weight * rl_loss).backward()  # back-propagate; gradients accumulate

        # Gradient accumulation: gradients from successive batches are summed and
        # only applied every `gradient_accum` steps.
        if step % config.gradient_accum == 0:
            optimizer.step()       # update parameters from the accumulated gradients
            optimizer.zero_grad()  # clear them for the next accumulation window

        if step % 1000 == 0:
            with T.autograd.no_grad():
                train_batch_loss = mle_loss.item()
                val_avg_loss = validate(validate_loader, config, model)
                # NOTE(review): the running average is only refreshed here, i.e. every
                # 1000 steps, so decay=0.99 smooths over logged points rather than over
                # every training step - confirm this is intended.
                running_avg_loss = calc_running_avg_loss(train_batch_loss, running_avg_loss)
                logger.info('epoch %d: %d, training batch loss = %f, running_avg_loss loss = %f, validation loss = %f'
                            % (epoch, step, train_batch_loss, running_avg_loss, val_avg_loss))
                writer.add_scalars('scalar/Loss',
                   {'train_batch_loss': train_batch_loss
                   }, step)
                writer.add_scalars('scalar_avg/loss',
                   {'train_avg_loss': running_avg_loss,
                    'test_avg_loss': val_avg_loss
                   }, step)

        if step % 5000 == 0:
            save_model(config, logger, model, optimizer, step, vocab, running_avg_loss, \
                       r_loss=0, title = loggerName)

        if step % 1000 == 0:
            # Decode the current training batch and one random validation batch.
            train_rouge_l_f = decode(writer, logger, step, config, model, batch, mode = 'train')
            test_rouge_l_f = decode(writer, logger, step, config, model, validate_loader, mode = 'test')
            # Evaluation is over: go back to train mode before the next optimisation step.
            model.train()

            writer.add_scalars('scalar/Rouge-L',
               {'train_rouge_l_f': train_rouge_l_f,
                'test_rouge_l_f': test_rouge_l_f
               }, step)
            logger.info('epoch %d: %d, train_rouge_l_f = %f, test_rouge_l_f = %f'
                            % (epoch, step, train_rouge_l_f, test_rouge_l_f))

    train_avg_acc = avg_acc(writer, logger, epoch, config, model, train_loader, mode = 'train')
    test_avg_acc = avg_acc(writer, logger, epoch, config, model, validate_loader, mode = 'test')
    # The first value is the training score; it used to be mislabelled as test_avg_acc.
    logger.info('epoch %d: %d, train_avg_acc = %f, test_avg_acc = %f' % (epoch, step, train_avg_acc, test_avg_acc))

logger.info(u'------Training END--------')
removeLogger(logger)


2020-04-09 01:39:09 - Transformer_word2Vec - INFO: - ------Training START--------
I0409 01:39:09.021363 140206455236416 <ipython-input-9-86ea97d748bc>:2] ------Training START--------
2020-04-09 01:41:24 - Transformer_word2Vec - INFO: - epoch 0: 1000, training batch loss = 5.593308, running_avg_loss loss = 5.593308, validation loss = 5.092603
I0409 01:41:24.982884 140206455236416 <ipython-input-9-86ea97d748bc>:28] epoch 0: 1000, training batch loss = 5.593308, running_avg_loss loss = 5.593308, validation loss = 5.092603
2020-04-09 01:41:25 - Transformer_word2Vec - INFO: - epoch 0: 1000, train_rouge_l_f = 0.000000, test_rouge_l_f = 0.000000
I0409 01:41:25.183252 140206455236416 <ipython-input-9-86ea97d748bc>:50] epoch 0: 1000, train_rouge_l_f = 0.000000, test_rouge_l_f = 0.000000
2020-04-09 01:43:37 - Transformer_word2Vec - INFO: - epoch 0: 2000, training batch loss = 4.691309, running_avg_loss loss = 5.584288, validation loss = 4.805816
I0409 01:43:37.971808 140206455236416 <ipython-inp

In [10]:
import torch
# Scratch example: a 2x5 tensor of uniform random values in [0, 1).
x = torch.rand(2, 5)
x

tensor([[0.2019, 0.1671, 0.1338, 0.8487, 0.1730],
        [0.8708, 0.2498, 0.9451, 0.5590, 0.1955]])

In [11]:
# input.scatter_(dim, index, src)
# Writes values from src into input, at the positions along dim that index selects.
scatter_index = torch.tensor([[1]])
torch.zeros(400, 50000).scatter_(1, scatter_index, x)


tensor([[0.0000, 0.2019, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])