In [1]:
from utils import config
from utils.bert import data

from utils.bert.batcher import *
from utils.bert.train_util import *
from utils.bert.initialize import loadCheckpoint, save_model
from utils.bert.write_result import *

from datetime import datetime as dt
from tqdm import tqdm

from tensorboardX import SummaryWriter
import argparse
from torch.distributions import Categorical

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 

def str2bool(v):
    """Convert a command-line string into a boolean, for use as an argparse ``type``.

    Args:
        v: Either a real ``bool`` (returned unchanged) or a string. The strings
            'yes', 'true', 't', 'y', '1' map to True and 'no', 'false', 'f',
            'n', '0' map to False; matching is case-insensitive.

    Returns:
        bool: The parsed value.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is a string that is not one of the
            recognised spellings.
    """
    # A real bool has no .lower(); pass it through instead of crashing.
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# Command-line style configuration for the notebook. Boolean options use
# str2bool: argparse's type=bool calls bool() on the raw string, so any
# non-empty value (including "False") would be parsed as True.
parser = argparse.ArgumentParser()
parser.add_argument('--model_type', type=str, default='transformer', choices=['seq2seq', 'transformer'])
parser.add_argument('--copy', type=str2bool, default=True, choices=[True, False])
parser.add_argument("-encoder", default='bert', type=str, choices=['bert', 'Transformer'])
parser.add_argument("-max_pos", default=800, type=int)
parser.add_argument("-use_bert_emb", type=str2bool, nargs='?',const=True,default=True, choices=[False, True])

parser.add_argument("-lr_bert", default=2e-2, type=float, help='2e-3')
parser.add_argument("-lr_dec", default=2e-2, type=float, help='2e-3')
parser.add_argument("-share_emb", type=str2bool, nargs='?', const=True, default=False)
parser.add_argument("-finetune_bert", type=str2bool, default=True)

# Core parameters noted from the original BERT Base paper:
#   dropout = 0.1
#   num_layers = 12
#   num_heads = 8
#   emb_dim (d_model) = 768
#   ff_embed_dim = 4 * emb_dim = 3072
#
# bert_config = BertConfig(self.encoder.model.config.vocab_size, hidden_size=768,
#                          num_hidden_layers=12, num_attention_heads=8,
#                          intermediate_size=3072,
#                          hidden_dropout_prob=0.1,
#                          attention_probs_dropout_prob=0.1)
parser.add_argument("-enc_dropout", default=0.1, type=float)
parser.add_argument("-enc_layers", default=8, type=int)
parser.add_argument("-enc_hidden_size", default=768, type=int)
parser.add_argument("-enc_heads", default=4, type=int)
parser.add_argument("-enc_ff_size", default=3072, type=int)

parser.add_argument("-dec_dropout", default=0.1, type=float)
parser.add_argument("-dec_layers", default=8, type=int)
parser.add_argument("-dec_hidden_size", default=768, type=int)
parser.add_argument("-dec_heads", default=4, type=int)
parser.add_argument("-dec_ff_size", default=2048, type=int)
parser.add_argument("-sep_optim", type=str2bool, nargs='?',const=True,default=True, choices=[False, True])

parser.add_argument("-param_init", default=0, type=float)
parser.add_argument("-param_init_glorot", type=str2bool, nargs='?',const=True,default=True)
parser.add_argument("-optim", default='adam', type=str)
parser.add_argument("-lr", default=1, type=float)
parser.add_argument("-beta1", default= 0.9, type=float)
parser.add_argument("-beta2", default=0.999, type=float)
parser.add_argument("-warmup_steps", default=8000, type=int)
parser.add_argument("-warmup_steps_bert", default=8000, type=int)
parser.add_argument("-warmup_steps_dec", default=8000, type=int)
parser.add_argument("-max_grad_norm", default=0, type=float)

parser.add_argument("-block_trigram", type=str2bool, nargs='?', const=True, default=True)


parser.add_argument('--train_rl', type=str2bool, default=False, help = 'True/False')
parser.add_argument('--keywords', type=str, default='POS_keys', 
                    help = 'POS_keys / DEP_keys / Noun_adj_keys / TextRank_keys')

parser.add_argument('--mle_weight', type=float, default=1.0)
parser.add_argument("-label_smoothing", default=0.0, type=float)
parser.add_argument("-generator_shard_size", default=32, type=int)
parser.add_argument("-alpha",  default=0.6, type=float)

parser.add_argument('--max_enc_steps', type=int, default=800)
parser.add_argument('--max_dec_steps', type=int, default=30)
parser.add_argument('--min_dec_steps', type=int, default=10)
parser.add_argument('--max_epochs', type=int, default=15)
parser.add_argument('--vocab_size', type=int, default=50000)
parser.add_argument('--beam_size', type=int, default=16)
parser.add_argument('--batch_size', type=int, default=8)

# parser.add_argument('--hidden_dim', type=int, default=512)
# parser.add_argument('--emb_dim', type=int, default=512)
parser.add_argument('--gradient_accum', type=int, default=1)

parser.add_argument('--load_ckpt', type=str, default='0000010', help='0000010')
# parser.add_argument('--word_emb_type', type=str, default='glove', help='word2Vec/glove/FastText')
# parser.add_argument('--pre_train_emb', type=bool, default=False, help = 'True/False') # if pre_train_emb is false, the emb type is NoPretrain

# args=[] makes the notebook always run with the defaults declared above.
opt = parser.parse_args(args=[])
# NOTE: this rebinds `config`, shadowing the `utils.config` module imported at
# the top of the cell; from here on `config` is whatever re_config returns.
config = re_config(opt)

loggerName, writerPath = getName(config)    
logger = getLogger(loggerName)
writer = SummaryWriter(writerPath)

I0519 16:08:43.188616 140527635740480 file_utils.py:35] PyTorch version 1.4.0 available.
2020-05-19 16:08:44 - Pointer_Sep_BertEnc_Transformer_BertEmb - INFO: - logger已啟動
I0519 16:08:44.422532 140527635740480 train_util.py:119] logger已啟動


loggerName Pointer_Sep_BertEnc_Transformer_BertEmb
writerPath runs/Mix6_mainCat_20/Pointer_Sep_BertEnc_Transformer/BertEmb/exp


In [2]:
# Build the train/validation loaders together with the vocabulary and the
# special-symbol table used later when building the predictor.
train_loader, validate_loader, vocab, symbols = getDataLoader(logger, config)
tokenizer = vocab.tokenizer
# len() on the loader gives the batch count without creating a throwaway iterator.
train_batches = len(train_loader)
test_batches = len(validate_loader)
# Validate/checkpoint interval: batches per epoch rounded down to a multiple of
# 1000. With fewer than 1000 batches that rounds to 0, which would make the
# training loop's `step % save_steps` raise ZeroDivisionError, so fall back to
# once per epoch (never less than 1).
save_steps = (train_batches // 1000) * 1000 or max(train_batches, 1)

I0519 16:08:45.467981 140527635740480 tokenization.py:157] loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../temp/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


vocab_file ../temp/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
{'BOS': 1, 'EOS': 2, 'PAD': 0, 'EOQ': 3, 'SEP': 102, 'CLS': 101, 'UNK': 100}


E0519 16:09:05.190528 140527635740480 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.

I0519 16:09:05.195516 140527635740480 ultratb.py:1111] 
Unfortunately, your original traceback can not be constructed.



Traceback (most recent call last):
  File "/home/eagleuser/.conda/envs/Leyan/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-1b9bb1c260d5>", line 1, in <module>
    train_loader, validate_loader, vocab, symbols = getDataLoader(logger, config)
  File "/home/eagleuser/Users/leyan/Summarize_parallel/utils/bert/batcher.py", line 208, in getDataLoader
    total_df = pd.read_excel(config.xls_path)
  File "/home/eagleuser/.conda/envs/Leyan/lib/python3.6/site-packages/pandas/util/_decorators.py", line 178, in wrapper
    return func(*args, **kwargs)
  File "/home/eagleuser/.conda/envs/Leyan/lib/python3.6/site-packages/pandas/util/_decorators.py", line 178, in wrapper
    return func(*args, **kwargs)
  File "/home/eagleuser/.conda/envs/Leyan/lib/python3.6/site-packages/pandas/io/excel.py", line 307, in read_excel
    io = ExcelFile(io, engine=engine)
  File "/home/eagleuser/.cond

KeyboardInterrupt: 

In [None]:
from utils.transformer.loss import *
from utils.transformer.optimizers import Optimizer
from transformer import *
from utils.transformer.predictor import build_predictor
import torch.nn as nn
import torch
from parallel import DataParallelModel, DataParallelCriterion
# https://gist.github.com/thomwolf/7e2407fbd5945f07821adae3d9fd1312

def _noam_optimizer(learning_rate, warmup):
    """Create an Optimizer sharing the config's optim/grad-norm/beta settings with 'noam' decay."""
    return Optimizer(
        config.optim, learning_rate, config.max_grad_norm,
        beta1=config.beta1, beta2=config.beta2,
        decay_method='noam',
        warmup_steps=warmup)


model = AbsSummarizer(config)

# Checkpoint location: <save_model_path>/<loggerName>/<load_ckpt>.tar
load_model_path = config.save_model_path + '/%s/%s.tar' % (loggerName, config.load_ckpt)

if not os.path.exists(load_model_path):
    # No checkpoint on disk: create fresh optimizer(s); `optimizer` is always a list.
    if config.sep_optim:
        # Separate optimizers: parameters under 'encoder.model' vs. everything else.
        optim_bert = _noam_optimizer(config.lr_bert, config.warmup_steps_bert)
        optim_dec = _noam_optimizer(config.lr_dec, config.warmup_steps_dec)

        named_params = list(model.named_parameters())
        bert_params = [(n, p) for n, p in named_params if n.startswith('encoder.model')]
        dec_params = [(n, p) for n, p in named_params if not n.startswith('encoder.model')]
        optim_bert.set_parameters(bert_params)
        optim_dec.set_parameters(dec_params)

        optimizer = [optim_bert, optim_dec]
    else:
        # One optimizer over every named parameter.
        single_optim = _noam_optimizer(config.lr, config.warmup_steps)
        single_optim.set_parameters(list(model.named_parameters()))
        optimizer = [single_optim]
else:
    # Resume model, optimizer and step counter from the checkpoint.
    model, optimizer, load_step = loadCheckpoint(config, logger, load_model_path, model)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
config.device_ids = [0]

model = get_cuda(model)
# net = nn.DataParallel(model, device_ids=config.device_ids)
# model = nn.DataParallel(model).cuda()
model.to(device) 

In [None]:
# Wrap the model with DataParallelModel (imported from `parallel`); merge_res()
# unpacks its output as one (pred, attn) pair per replica.
# NOTE(review): presumably one replica per visible GPU - confirm against parallel.py.
parallel_model = DataParallelModel(model) # Encapsulate the model

# Select the criterion from the config and the model's vocabulary size, then
# wrap it with DataParallelCriterion; train_one() calls `parallel_loss` on the
# per-replica predictions.
criterion = choose_criterion(config, model.vocab_size)
parallel_loss = DataParallelCriterion(criterion)

In [None]:

# loss, num_correct, target = compute_loss(None, criterion, pred, dec_batch[:,1:], num_tokens, tokenizer)
# # --------------------------------------------------------------------------------




In [None]:
def merge_res(res):
    """Combine the per-replica outputs of the data-parallel model.

    Works for any number of replicas; for two replicas the result is the same
    as the previous two-replica-only implementation.

    Args:
        res: Sequence of ``(pred, attn)`` tensor pairs, one pair per replica.

    Returns:
        tuple: ``(preds, attn, merge_pred)`` where ``preds`` is the tuple of
        per-replica prediction tensors (left untouched), ``attn`` is all
        attention tensors concatenated along dim 0 and moved to CPU, and
        ``merge_pred`` is all predictions concatenated along dim 0 and moved
        to CPU.

    Raises:
        ValueError: If ``res`` is empty or its items are not pairs.
    """
    # Single pass over `res`, so it also works if `res` is a one-shot iterable.
    preds, attns = zip(*res)
    merge_pred = torch.cat(preds, dim=0).cpu()
    attn = torch.cat(attns, dim=0).cpu()
    return preds, attn, merge_pred

def compute_loss(preds, target, merge_pred, num_tokens, tokenizer):
    """Evaluate the parallel criterion and the ``compute_correct`` statistic for one batch.

    Args:
        preds: Per-replica predictions passed to ``parallel_loss``.
        target: Ground-truth tensor, used by both the loss and ``compute_correct``.
        merge_pred: Merged predictions passed to ``compute_correct``.
        num_tokens: Token count forwarded to ``compute_correct``.
        tokenizer: Tokenizer forwarded to ``compute_correct``.

    Returns:
        tuple: ``(loss, num_correct, target)``.
    """
    # The loss is weighted by the configured MLE weight.
    batch_loss = parallel_loss(config.mle_weight, preds, target)
    correct = compute_correct(merge_pred, target, num_tokens, tokenizer)
    return batch_loss, correct, target

def get_package(inputs):
    """Unpack one batch into the flat 15-element tuple that ``train_one`` consumes.

    Args:
        inputs: A batch object accepted by ``get_input_from_batch`` and
            ``get_output_from_batch``.

    Returns:
        tuple: Encoder fields, decoder fields, ``num_tokens`` (count of non-zero
        ids in ``dec_batch[:, 1:]``) and ``normalization`` (the same count as a
        Python number).
    """
    # Encoder data; five of the returned values are not needed here.
    (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros,
     _, _, _, _, _,
     enc_seg, enc_cls, enc_cls_mask) = get_input_from_batch(inputs, config, batch_first=True)

    # Decoder data: input and target.
    (dec_batch, dec_padding_mask, dec_lens,
     max_dec_len, target_batch) = get_output_from_batch(inputs, config, batch_first=True)

    # Non-zero ids after dropping the first decoder position.
    num_tokens = dec_batch[:, 1:].ne(0).sum()
    normalization = 0 + num_tokens.item()

    encoder_part = (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
                    extra_zeros, enc_seg, enc_cls, enc_cls_mask)
    decoder_part = (dec_batch, dec_padding_mask, dec_lens, max_dec_len, target_batch)
    return encoder_part + decoder_part + (num_tokens, normalization)

def train_one(package):
    """Run one forward pass through ``parallel_model`` and return the normalised loss.

    Args:
        package: The 15-element tuple produced by ``get_package``.

    Returns:
        The value of ``parallel_loss`` (weighted by ``config.mle_weight``) on the
        per-replica predictions against ``dec_batch[:, 1:]``, divided by the
        package's ``normalization`` entry.
    """
    (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros,
     enc_seg, enc_cls, enc_cls_mask,
     dec_batch, dec_padding_mask, dec_lens, max_dec_len, target_batch,
     num_tokens, normalization) = package

    # Forward pass; the wrapper's output is split into per-replica predictions.
    replica_outputs = parallel_model(
        enc_batch, dec_batch, enc_seg, enc_cls,
        enc_padding_mask, dec_padding_mask, enc_cls_mask,
        extra_zeros, enc_batch_extend_vocab)
    preds, attn, merge_pred = merge_res(replica_outputs)

    # Gold sequence is the decoder batch without its first position.
    gold = dec_batch[:, 1:]
    raw_loss = parallel_loss(config.mle_weight, preds, gold)
    return raw_loss / normalization
    
    

import pandas as pd
import time
from utils.seq2seq.write_result import total_evaulate, total_output

@torch.autograd.no_grad()
def decode_write_all(writer, logger, epoch, config, model, dataloader, mode):
    """Decode every batch of ``dataloader``, score the output and write the summary.

    For each batch a predictor is built with length limits derived from the gold
    target length, the batch is translated, and ``total_evaulate`` scores the
    decoded sentences. The per-batch frames are concatenated once at the end and
    everything is handed to ``total_output``.

    Args:
        writer: Unused by this function; kept for interface compatibility.
        logger: Logger forwarded to ``build_predictor``.
        epoch: Unused by this function; kept for interface compatibility.
        config: Run configuration. ``min_length`` and ``max_length`` are
            overwritten on it for every batch.
        model: Model used by the predictor.
        dataloader: Iterable of batches supporting ``len()``.
        mode: Label forwarded to ``total_output`` (callers pass 'train' or 'test').

    Returns:
        tuple: ``(avg_rouge_l, outFrame)`` as returned by ``total_output``.

    Note:
        Relies on the notebook-level names ``tokenizer``, ``symbols`` and
        ``writerPath``.
    """
    # Fetch batches dynamically
    num = len(dataloader)
    avg_rouge_1, avg_rouge_2, avg_rouge_l  = [], [], []
    avg_self_bleu1, avg_self_bleu2, avg_self_bleu3, avg_self_bleu4 = [], [], [], []
    avg_bleu1, avg_bleu2, avg_bleu3, avg_bleu4 = [], [], [], []
    avg_meteor = []
    batch_frames = []  # per-batch result frames, concatenated once after the loop
    avg_time = 0

    for idx, inputs in enumerate(dataloader):
        start = time.time()
        # Length limits depend on this batch's gold target length, so the
        # predictor is rebuilt for every batch.
        gold_tgt_len = inputs.dec_tgt.size(1)
        setattr(config, 'min_length', gold_tgt_len + 20)
        setattr(config, 'max_length', gold_tgt_len + 60)
        predictor = build_predictor(config, tokenizer, symbols, model, logger)

        # Encoder data
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, _, \
        _, _, _, _, enc_seg, enc_cls, enc_cls_mask = \
            get_input_from_batch(inputs, config, batch_first = True)

        # Decoder data (input and target). The values are not used below; the
        # call is kept exactly as in the original in case it touches the batch.
        dec_batch, dec_padding_mask, dec_lens, max_dec_len, target_batch = \
            get_output_from_batch(inputs, config, batch_first = True)

        # The predictor reads these attributes from the batch object.
        setattr(inputs, 'src', enc_batch)
        setattr(inputs, 'segs', enc_seg)
        setattr(inputs, 'mask_src', enc_padding_mask)

        inputs_data = predictor.translate_batch(inputs)
        translations = predictor.from_batch(inputs_data) # translation = (pred_sents, gold_sent, raw_src)
        article_sents = [t[2] for t in translations]
        decoded_sents = [t[0] for t in translations]
        ref_sents = [t[1] for t in translations]

        keywords_list = [str(word_list) for word_list in inputs.key_words]
        avg_time += time.time() - start

        rouge_1, rouge_2, rouge_l, self_Bleu_1, self_Bleu_2, self_Bleu_3, self_Bleu_4, \
            Bleu_1, Bleu_2, Bleu_3, Bleu_4, Meteor, batch_frame = total_evaulate(article_sents, keywords_list, decoded_sents, ref_sents)
        # Progress marker every 1000 batches (the per-batch debug print was removed).
        if idx % 1000 == 0 and idx > 0:
            print(idx)
        batch_frames.append(batch_frame)
        # ----------------------------------------------------
        avg_rouge_1.extend(rouge_1)
        avg_rouge_2.extend(rouge_2)
        avg_rouge_l.extend(rouge_l)

        avg_self_bleu1.extend(self_Bleu_1)
        avg_self_bleu2.extend(self_Bleu_2)
        avg_self_bleu3.extend(self_Bleu_3)
        avg_self_bleu4.extend(self_Bleu_4)

        avg_bleu1.extend(Bleu_1)
        avg_bleu2.extend(Bleu_2)
        avg_bleu3.extend(Bleu_3)
        avg_bleu4.extend(Bleu_4)
        avg_meteor.extend(Meteor)
        # ----------------------------------------------------

    # Average decode time per sample; an empty loader yields 0 instead of dividing by zero.
    total_samples = num * config.batch_size
    avg_time = avg_time / total_samples if total_samples else 0

    # One concat instead of re-copying the growing frame on every batch. A single
    # batch keeps its frame as-is, matching the previous behaviour.
    if not batch_frames:
        outFrame = None
    elif len(batch_frames) == 1:
        outFrame = batch_frames[0]
    else:
        outFrame = pd.concat(batch_frames, axis=0, ignore_index=True)

    avg_rouge_l, outFrame = total_output(mode, writerPath, outFrame, avg_time, avg_rouge_1, avg_rouge_2, avg_rouge_l, \
        avg_self_bleu1, avg_self_bleu2, avg_self_bleu3, avg_self_bleu4, \
        avg_bleu1, avg_bleu2, avg_bleu3, avg_bleu4, avg_meteor
    )

    return avg_rouge_l, outFrame

In [None]:
# @torch.no_grad()
@torch.autograd.no_grad()
def validate(validate_loader, config, model):
    """Estimate the validation loss on roughly the first 1/40 of the loader.

    Args:
        validate_loader: Iterable of batches supporting ``len()``.
        config: Unused here; kept for interface compatibility.
        model: Model switched to eval mode. The forward pass itself goes through
            ``train_one``, which uses the notebook-level ``parallel_model``.

    Returns:
        float: Mean of the per-batch losses that were evaluated.

    Raises:
        ValueError: If the loader yields no batches.

    Note:
        The model is left in eval mode; the training loop switches back to train
        mode at the start of every step.
    """
    model.eval()
    losses = []
    # len() on the loader avoids building a throwaway iterator just to count batches.
    val_num = len(validate_loader)
    for idx, batch in enumerate(validate_loader):
        loss = train_one(get_package(batch))
        losses.append(loss.item())
        # Stop after about 1/40 of the batches to keep validation cheap.
        if idx >= val_num / 40:
            break
    if not losses:
        raise ValueError('validate_loader yielded no batches')
    avg_loss = sum(losses) / len(losses)
    return avg_loss

@torch.autograd.no_grad()
def calc_running_avg_loss(loss, running_avg_loss, decay=0.99):
    """Update an exponentially decayed running loss, capped at 12.

    Args:
        loss: Loss of the current iteration.
        running_avg_loss: Previous running average; 0 means "not started yet".
        decay: Weight kept from the previous running average.

    Returns:
        The new running average, clipped to at most 12.
    """
    if running_avg_loss != 0:
        smoothed = running_avg_loss * decay + (1 - decay) * loss
    else:
        # First iteration: seed the average with the current loss.
        smoothed = loss
    return min(smoothed, 12)

In [None]:
# step = 0
# for epoch in range(config.max_epochs):
#     for inputs in train_loader:
#         step += 1            
#         loss_st = time.time()
#         package = get_package(inputs)
#         mle_loss = train_one(package)
# #         print(step, mle_loss)
#         mle_loss.backward()  # 反向传播，计算当前梯度

#         model.zero_grad() # 清空过往梯度

In [None]:
# config.batch_size = 8
# train_loader, validate_loader, vocab, symbols = getDataLoader(logger, config)
# tokenizer = vocab.tokenizer
# train_batches = len(iter(train_loader))
# test_batches = len(iter(validate_loader))


# train_avg_acc, train_outFrame = decode_write_all(writer, logger, 10, config, model, train_loader, mode = 'train')
# test_avg_acc, test_outFrame = decode_write_all(writer, logger, 10, config, model, validate_loader, mode = 'test')
# logger.info('epoch %d: train_avg_acc = %f, test_avg_acc = %f' % (10, train_avg_acc, test_avg_acc))

In [None]:
import time
import traceback
loss_st, loss_cost = 0,0
decode_st, decode_cost = 0,0
from pytorchtools import EarlyStopping

write_train_para(writer, config)
logger.info('------Training START--------')
running_avg_loss, running_avg_rl_loss = 0, 0
sum_total_reward = 0
step = 0
# Defined up front so later code never reads an unbound name:
epoch = 0                # logged in `finally` even if the loop never starts
train_batch_loss = 0     # logged at save steps, which may precede the first print step
val_avg_loss = None      # set by the first validation pass
# save_steps = 10
print_step = 2000
early_stopping = EarlyStopping(patience=3, verbose=True)
try:
    for epoch in range(config.max_epochs):
        for inputs in train_loader:
            step += 1
            loss_st = time.time()
            package = get_package(inputs)
            parallel_model.module.train()
            mle_loss = train_one(package)
            mle_loss.backward()  # backpropagate: compute the current gradients

            # Gradient accumulation: gradients from consecutive batches are summed
            # and only applied (then cleared) every `gradient_accum` steps.
            if step % (config.gradient_accum) == 0:
                for opt_idx, o in enumerate(optimizer):
                    o.step()  # update the parameters from the accumulated gradients
                    opt_name = 'lr_bert' if opt_idx == 0 else 'lr_dec'
                    writer.add_scalars('scalar/%s' % opt_name,
                           {'lr': o.learning_rate
                           }, step)
                # Clear gradients only AFTER the optimizers used them. Zeroing
                # straight after backward() would make every update see zero gradients.
                model.zero_grad()

            if step % print_step == 0:
                with torch.autograd.no_grad():
                    train_batch_loss = mle_loss.item()
                    running_avg_loss = calc_running_avg_loss(train_batch_loss, running_avg_loss)
                    running_avg_reward = sum_total_reward / step
                    writer.add_scalars('scalar/Loss',
                       {'train_batch_loss': train_batch_loss
                       }, step)
                    writer.add_scalars('scalar_avg/loss',
                       {'train_avg_loss': running_avg_loss
                       }, step)

            if step % save_steps == 0:
                parallel_model.module.eval()
                logger.info('epoch : %s' % epoch)
                # Refresh here as well: a save step does not have to coincide with a print step.
                train_batch_loss = mle_loss.item()
                val_avg_loss = validate(validate_loader, config, model) # call batch by validate_loader
                logger.info('epoch %d: %d, training batch loss = %f, running_avg_loss loss = %f, validation loss = %f'
                                    % (epoch, step, train_batch_loss, running_avg_loss, val_avg_loss))
                writer.add_scalars('scalar_avg/loss',
                       {'train_avg_loss': running_avg_loss,
                        'test_avg_loss': val_avg_loss
                       }, step)
                # (Original note: load the saved model parameters first and only then
                # parallelize; otherwise the earlier loading code cannot be used.)
                save_model(config, logger, parallel_model, optimizer, step, vocab, running_avg_loss, \
                           r_loss=0, title = loggerName)
                # time.time() is in seconds; convert so the "ms" label below is correct.
                loss_cost = (time.time() - loss_st) * 1000
                logger.info('epoch %d|step %d| compute loss cost = %f ms'
                                % (epoch, step, loss_cost))
                writer.add_scalars('scalar_avg/epoch_loss',
                   {'train_avg_loss': running_avg_loss,
                    'test_avg_loss': val_avg_loss
                   }, epoch)
                last_save_step = step
        logger.info('-------------------------------------------------------------')

        # Update early-stopping patience once a validation loss exists.
        # NOTE(review): some versions of pytorchtools.EarlyStopping expect
        # (val_loss, model) - confirm against the local pytorchtools module.
        if val_avg_loss is not None:
            early_stopping(val_avg_loss)
            if early_stopping.early_stop:
                logger.info("Early stopping epoch %s"%(epoch))
                break

except Exception as e:
    # Report the failure with its traceback; the final evaluation in `finally` still runs.
    print(e)
    traceback.print_exc()
else:
    logger.info(u'------Training SUCCESS--------')
finally:
    logger.info(u'------Training END--------')
    logger.info("stopping epoch %s"%(epoch))
    # Reload the data with batch size 8 and evaluate on the validation set.
    config.batch_size = 8
    train_loader, validate_loader, vocab, symbols = getDataLoader(logger, config)
    tokenizer = vocab.tokenizer
    train_batches = len(train_loader)
    test_batches = len(validate_loader)


#     train_avg_acc, train_outFrame = decode_write_all(writer, logger, epoch, config, model, train_loader, mode = 'train')
    test_avg_acc, test_outFrame = decode_write_all(writer, logger, epoch, config, model, validate_loader, mode = 'test')
#     logger.info('epoch %d: train_avg_acc = %f, test_avg_acc = %f' % (epoch, train_avg_acc, test_avg_acc))
    logger.info('epoch %d: test_avg_acc = %f' % (epoch, test_avg_acc))
    removeLogger(logger)    

In [None]:
# Preview the first rows of the test-set results frame returned by decode_write_all.
test_outFrame.head()

In [None]:
# train_avg_acc, train_outFrame = decode_write_all(writer, logger, epoch, config, model, train_loader, mode = 'train')
# test_avg_acc, test_outFrame = decode_write_all(writer, logger, epoch, config, model, validate_loader, mode = 'test')
# logger.info('epoch %d: train_avg_acc = %f, test_avg_acc = %f' % (epoch, train_avg_acc, test_avg_acc)) 