In [1]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import time
import re


import torch as T



import torch.nn as nn
import torch.nn.functional as F


from model import Model


from data_util import config
from data_util import bert_data as data
from data_util.bert_batcher import Batcher
from data_util.bert_data import Vocab

# from data_util import data
# from data_util.batcher import Batcher
# from data_util.data import Vocab


from train_util import *
from torch.distributions import Categorical
from rouge import Rouge
from numpy import random
import argparse
import torchsnooper
import logging
transformers_logger = logging.getLogger("transformers.tokenization_utils")
transformers_logger.setLevel(logging.ERROR)
transformers_logger.disabled = True

# -------- Test Packages -------
from bert_beam_search import *
import shutil
from tensorboardX import SummaryWriter
from nltk.translate.bleu_score import corpus_bleu

# from pytorch_pretrained_bert import BertModel
from transformers import BertModel, BertTokenizer 
from transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLConfig

# Notebook-level overrides applied on top of the values defined in data_util.config.
config.batch_size = 2
# NOTE(review): 768 equals the "hidden_size" shown in the bert-base-uncased model
# config printed further down this notebook -- presumably chosen to match it.
config.emb_dim = 768
# NOTE(review): 512 equals the "max_position_embeddings" of that same BERT config.
config.max_enc_steps = 512
config.lr = 0.0001 # 0.001 -- presumably the previously used learning rate; TODO confirm

# config.keywords = "TextRank_keywords"
# config.max_key_num = 8
# help(config)

I0317 01:43:50.612472 140157232645952 file_utils.py:35] PyTorch version 1.3.1 available.


We have  30526 bert tokens now
We have added 3 XL tokens


In [2]:
# Render every plain-valued setting exposed by `config` as one
# "## name : value" line; the resulting text is printed here and is also
# handed to the TensorBoard writer in the training cell.
setting_lines = []
for attr_name in dir(config):
    attr_value = getattr(config, attr_name)
    # Only exact str/int/float/bool values are reported.
    if type(attr_value) not in (str, int, float, bool):
        continue
    # Leave out file locations, dunder attributes and anything labelled "info".
    if any(marker in str(attr_name) for marker in ('path', '__', 'info')):
        continue
    setting_lines.append('## %s : %s\n' % (attr_name, attr_value))

info_str = ''.join(setting_lines)
print(info_str)

## batch_size : 2
## beam_size : 16
## ber_layer : 11
## data_type : Cameras_new5
## emb_dim : 768
## eps : 1e-12
## gound_truth_prob : 0.1
## hidden_dim : 512
## intra_decoder : True
## intra_encoder : True
## key_attention : False
## keywords : POS_FOP_keywords
## loggerName : Text-Summary
## lr : 0.0001
## max_dec_steps : 50
## max_enc_steps : 512
## max_epochs : 100
## max_iterations : 500000
## max_key_num : 8
## min_dec_steps : 4
## rand_unif_init_mag : 0.02
## trunc_norm_init_std : 0.0001
## vocab_size : 100000
## word_emb_type : word2Vec



# Logger

In [3]:
from datetime import datetime as dt

def getLogger(loggerName, loggerPath):
    """Create (or reconfigure) a named logger that writes to a file and to the console.

    Args:
        loggerName: Name passed to ``logging.getLogger``.
        loggerPath: Path of the log file. Missing parent directories are
            created; a bare file name (no directory part) is also accepted.

    Returns:
        The ``logging.Logger`` for ``loggerName`` at DEBUG level, with exactly
        one stream handler and one file handler attached.
    """
    logger = logging.getLogger(loggerName)  # an empty name would configure the root logger
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # Re-running the notebook cell calls this again for the same name. Drop the
    # handlers left by the previous call so each record is emitted only once.
    for stale_handler in logger.handlers[:]:
        stale_handler.close()
        logger.removeHandler(stale_handler)

    # File output. os.path.dirname() returns '' for a bare file name and
    # os.makedirs('') raises, so only create the directory when there is one.
    directory = os.path.dirname(loggerPath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # The start/stop messages are non-ASCII, so do not rely on the locale encoding.
    fh = logging.FileHandler(loggerPath, encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    # Console output.
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)

    # Attach both handlers (console first, then file).
    logger.addHandler(ch)
    logger.addHandler(fh)

    logger.info(u'logger已啟動')
    return logger

def removeLogger(logger):
    """Log a shutdown message, then close and detach every handler of ``logger``.

    Args:
        logger: The ``logging.Logger`` to shut down.
    """
    logger.info(u'logger已關閉')
    # Walk a snapshot, because removeHandler() mutates logger.handlers.
    for attached in tuple(logger.handlers):
        attached.close()
        logger.removeHandler(attached)

# View batch data

In [4]:
def test_batch(max_batches=1):
    """Print the reference summary and keywords of examples from the training batcher.

    Args:
        max_batches: Maximum number of batches to display. The default of 1
            shows the first batch only, as this cell always did.
    """
    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.train_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)
    for _ in range(max_batches):
        batch = batcher.next_batch()
        if batch is None:  # the batcher has nothing (more) to offer
            break
        for ex in batch.example_list:
            print("original_summary_sents : ", str(ex.original_summary))
            print("key_words : ", str(ex.key_words))
test_batch()

original_summary_sents :  <s> other than that the camera take amazing underwater shot </s>
key_words :  ['battery', 'auxiliary', 'battery', 'backup', 'underwater', 'amazing']
original_summary_sents :  <s> this camera is the best affordable action camera there is on the market </s>
key_words :  ['action', 'affordable', 'screen', 'lcd', 'clip', 'small']


# Get Bin Information

In [None]:
 # Load the number of examples in each data split from the bin-info file.
 # Its lines have the form "<split> : <count>", e.g. "train : 16497".
 with open(config.bin_info, 'r', encoding='utf-8') as f:
     lines = f.readlines()

 split_counts = {}
 for line in lines:
     print(line)
     split_name, separator, count_text = line.partition(":")
     # Look the splits up by name so blank lines or a different line order
     # in the file cannot silently assign a count to the wrong variable.
     if separator and split_name.strip() in ("train", "test", "valid"):
         split_counts[split_name.strip()] = int(count_text)

 train_num = split_counts["train"]
 test_num = split_counts["test"]
 val_num = split_counts["valid"]


train : 16497

test : 2061

valid : 2062



# View model summary
#### 只有torchsummaryX成功
#### 日後將以此模擬呈現結構

In [None]:
# from torchsummary import summary # 不支援RNN
# from model import Encoder,Model
# # https://www.cnblogs.com/lindaxin/p/8052043.html
# device = T.device("cuda" if T.cuda.is_available() else "cpu") # PyTorch v0.4.0
# encoder = Encoder().to(device)    

# vocab = Vocab(config.vocab_path, config.vocab_size)
# batcher = Batcher(config.train_data_path, vocab, mode='train',
#                        batch_size=config.batch_size, single_pass=False)
# batch = batcher.next_batch()
# enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(batch)
# enc_batch = Model().embeds(enc_batch) # Get embeddings for encoder input

# # summary(encoder, enc_batch, enc_lens, show_hierarchical=True) 
# # summary(encoder, [enc_batch, enc_lens])

In [None]:
# from modelsummary import summary # 未知問題
# from model import Encoder,Model
# # https://www.cnblogs.com/lindaxin/p/8052043.html
# device = T.device("cuda" if T.cuda.is_available() else "cpu") # PyTorch v0.4.0
# encoder = Encoder().to(device)    

# vocab = Vocab(config.vocab_path, config.vocab_size)
# batcher = Batcher(config.train_data_path, vocab, mode='train',
#                        batch_size=config.batch_size, single_pass=False)
# batch = batcher.next_batch()
# enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(batch)
# enc_batch = Model().embeds(enc_batch) # Get embeddings for encoder input

# # summary(encoder, enc_batch, enc_lens, show_hierarchical=True) 
# # summary(encoder, enc_batch, enc_lens, show_input=False)

In [None]:
from torchsummaryX import summary
from model import Encoder,Model

# Device selection idiom available since PyTorch v0.4.0.
device = T.device("cuda" if T.cuda.is_available() else "cpu")
encoder = Encoder().to(device)

# Pull a single training batch to use as example input.
vocab = Vocab(config.vocab_path, config.vocab_size)
batcher = Batcher(config.train_data_path, vocab, mode='train',
                  batch_size=config.batch_size, single_pass=False)
batch = batcher.next_batch()
(enc_batch, enc_lens, enc_padding_mask,
 enc_key_batch, enc_key_lens, enc_key_padding_mask,
 enc_batch_extend_vocab, extra_zeros, context) = get_enc_data(batch)

# BertModel takes `input_ids` as a LongTensor of shape [batch_size, sequence_length].
# Move it to the `device` chosen above rather than calling .cuda() directly,
# which would fail on a machine without CUDA.
enc_batch = enc_batch.long().to(device)

bert_model = get_cuda(BertModel.from_pretrained('bert-base-uncased'))

# Keep the last two outputs of the BERT forward pass. The shape printed below
# is [batch, tokens, 768], i.e. the first of them holds per-token states.
# NOTE(review): this relies on the model returning exactly two outputs -- confirm
# before enabling extra outputs such as hidden states or attentions.
all_hidden_states, _ = bert_model(enc_batch)[-2:]
print(all_hidden_states.shape)

# Per-layer table for the encoder, fed with the BERT output and the sequence lengths.
summary(encoder, all_hidden_states, enc_lens)


I0317 01:44:00.505785 140157232645952 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/eagleuser/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
I0317 01:44:00.602400 140157232645952 configuration_utils.py:199] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "ou

torch.Size([2, 119, 768])
           Kernel Shape Output Shape   Params  Mult-Adds
Layer                                                   
0_lstm                -  [163, 1024]  5251072    5242880
1_reduce_h  [1024, 512]     [2, 512]   524800     524288
2_reduce_c  [1024, 512]     [2, 512]   524800     524288
--------------------------------------------------------
                       Totals
Total params          6300672
Trainable params      6300672
Non-trainable params        0
Mult-Adds             6291456


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_lstm,-,"[163, 1024]",5251072,5242880
1_reduce_h,"[1024, 512]","[2, 512]",524800,524288
2_reduce_c,"[1024, 512]","[2, 512]",524800,524288


# Train object

In [None]:
from torch.autograd import Variable
import torch
def write_enc_graph(log_dir='runs/Pointer-Generator/word2Vec/Encoder'):
    """Trace the encoder on one training batch and write its graph with a SummaryWriter.

    Args:
        log_dir: Directory handed to ``SummaryWriter``. The default is the
            location this function has always written to.
    """
    encoder_writer = SummaryWriter(log_dir)
    try:
        device = T.device("cuda" if T.cuda.is_available() else "cpu")
        encoder = Encoder().to(device)

        # One training batch serves as the example input for tracing.
        vocab = Vocab(config.vocab_path, config.vocab_size)
        batcher = Batcher(config.train_data_path, vocab, mode='train',
                          batch_size=config.batch_size, single_pass=False)
        batch = batcher.next_batch()
        (enc_batch, enc_lens, enc_padding_mask,
         enc_key_batch, enc_key_lens, enc_key_padding_mask,
         enc_batch_extend_vocab, extra_zeros, context) = get_enc_data(batch)
        # Get embeddings for the encoder input.
        enc_batch = Model(False,'word2Vec',vocab).embeds(enc_batch)

        # enc_lens is converted from a numpy array to a tensor on `device`
        # before it is passed to add_graph.
        enc_lens = torch.from_numpy(enc_lens).to(device)

        encoder_writer.add_graph(encoder, (enc_batch, enc_lens), verbose=True)
    finally:
        # Close the writer even when preparing the inputs or tracing fails.
        encoder_writer.close()

def write_dec_graph(log_dir='runs/Pointer-Generator/word2Vec/Decoder'):
    """Trace one decoder step on a training batch and write its graph with a SummaryWriter.

    The decoder is first run once for decoding step 0; the graph is then
    traced with the states returned by that call.
    NOTE(review): presumably this is because the initial ``sum_temporal_srcs``
    and ``prev_s`` are ``None`` and cannot be traced -- confirm.

    Args:
        log_dir: Directory handed to ``SummaryWriter``. The default is the
            location this function has always written to.
    """
    decoder_writer = SummaryWriter(log_dir)
    try:
        device = T.device("cuda" if T.cuda.is_available() else "cpu")

        vocab = Vocab(config.vocab_path, config.vocab_size)
        model = Model(False,'word2Vec',vocab)

        batcher = Batcher(config.train_data_path, vocab, mode='train',
                          batch_size=config.batch_size, single_pass=False)
        batch = batcher.next_batch()
        (enc_batch, enc_lens, enc_padding_mask,
         enc_key_batch, enc_key_lens, enc_key_padding_mask,
         enc_batch_extend_vocab, extra_zeros, context) = get_enc_data(batch)
        enc_batch = model.embeds(enc_batch)  # Get embeddings for encoder input
        enc_out, enc_hidden = model.encoder(enc_batch, enc_lens)

        # Input and target batches for the decoder (same call as in MLE training).
        dec_batch, max_dec_len, dec_lens, target_batch = get_dec_data(batch)
        if min(max_dec_len, config.max_dec_steps) < 1:
            return  # no decoding step exists, so there is nothing to trace

        s_t = (enc_hidden[0], enc_hidden[1])  # Decoder hidden states
        # x_t is the batch input of the decoder at a time step.
        # NOTE(review): the fill value 2 is presumably the start-token id -- confirm.
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(2))
        # Used for intra-decoder attention (section 2.2 in DEEP REINFORCED MODEL
        # - https://arxiv.org/pdf/1705.04304.pdf)
        prev_s = None
        sum_temporal_srcs = None

        # Only decoding step 0 is traced.
        # 1 where rand > 0.25 (so with probability 0.75): use the ground-truth
        # token; 0 otherwise: keep the previous decoder input.
        use_gound_truth = get_cuda((T.rand(len(enc_out)) > 0.25)).long()
        x_t = use_gound_truth * dec_batch[:, 0] + (1 - use_gound_truth) * x_t
        x_t = model.embeds(x_t)
        final_dist, s_t, ct_e, sum_temporal_srcs, prev_s = model.decoder(
            x_t, s_t, enc_out, enc_padding_mask, context,
            extra_zeros, enc_batch_extend_vocab, sum_temporal_srcs, prev_s,
            enc_key_batch, enc_key_lens)

        # enc_key_lens is converted from a numpy array to a tensor on `device`
        # before it is passed to add_graph.
        enc_key_lens = torch.from_numpy(enc_key_lens).to(device)

        decoder_writer.add_graph(
            model.decoder,
            (x_t, s_t, enc_out, enc_padding_mask, context, extra_zeros,
             enc_batch_extend_vocab, sum_temporal_srcs, prev_s,
             enc_key_batch, enc_key_lens),
            verbose=True)
    finally:
        # Close the writer on every path: success, early return, or error.
        decoder_writer.close()

In [None]:
# https://blog.csdn.net/u012869752/article/details/72513141
# Inside a Jupyter kernel sys.argv is not empty, so the parser below is given
# an explicit empty argument list and only the defaults apply.
import traceback
from glob import glob
from bert_emb_run import *


def _str2bool(value):
    """argparse converter for boolean options.

    ``type=bool`` would turn every non-empty string, including "False", into True.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % (value,))


# nvidia-smi -pm 1
if __name__ == "__main__":
    # Bound before the try block so the cleanup in `finally` cannot raise a
    # NameError (and hide the real error) when setup fails early.
    logger = None
    writer = None
    try:
        # --------------------------Training ----------------------------------
        parser = argparse.ArgumentParser()
        parser.add_argument('--train_mle', type=_str2bool, default=True)
        parser.add_argument('--train_rl', type=_str2bool, default=False)
        parser.add_argument('--mle_weight', type=float, default=1.0)
        # Example checkpoint value: '/0015000_3.29_0.00.tar'
        parser.add_argument('--load_model', type=str, default=None)
        parser.add_argument('--new_lr', type=float, default=None)
        parser.add_argument('--multi_device', type=_str2bool, default=True)
        parser.add_argument('--view', type=_str2bool, default=True)
        parser.add_argument('--pre_train_emb', type=_str2bool, default=True)
        parser.add_argument('--word_emb_type', type=str, default='bert')
        parser.add_argument('--train_action', type=_str2bool, default=True)
        opt = parser.parse_args(args=[])

        loggerPath = "LOG//%s"%(opt.word_emb_type)
        logger = getLogger(config.loggerName,loggerPath)

        if opt.load_model is None:
            # Fresh run: clear the previous TensorBoard data. Removing this
            # directory also removes every run directory below it.
            shutil.rmtree('runs/Pointer-Generator/bert', ignore_errors=True)

        writer = SummaryWriter('runs/Pointer-Generator/bert/exp-4')
        writer.add_text('Train_Para/',info_str,0)

        if opt.train_action: train_action(opt, logger, writer, train_num)

    except Exception:
        # Not re-raised, so the cell still finishes and cleans up; the full
        # traceback is reported instead of only the outermost line number.
        if logger is not None:
            logger.exception('training run failed')
        else:
            traceback.print_exc()
    finally:
        if logger is not None:
            removeLogger(logger)
        # To inspect the run: tensorboard --logdir ./runs
        if writer is not None:
            writer.close()
        

2020-03-17 01:44:23 - Text-Summary - INFO: - logger已啟動
I0317 01:44:23.236703 140157232645952 <ipython-input-3-00f9962e7fdd>:30] logger已啟動
2020-03-17 01:44:23 - Text-Summary - INFO: - ------Training Setting--------
I0317 01:44:23.242853 140157232645952 bert_emb_run.py:59] ------Training Setting--------
2020-03-17 01:44:23 - Text-Summary - INFO: - Traing Type :Cameras_new5
I0317 01:44:23.246093 140157232645952 bert_emb_run.py:61] Traing Type :Cameras_new5
2020-03-17 01:44:23 - Text-Summary - INFO: - Training mle: True, mle weight: 1.00
I0317 01:44:23.247637 140157232645952 bert_emb_run.py:63] Training mle: True, mle weight: 1.00
2020-03-17 01:44:23 - Text-Summary - INFO: - use pre_train_bert vocab_size 100000 

I0317 01:44:23.249203 140157232645952 bert_emb_run.py:70] use pre_train_bert vocab_size 100000 

2020-03-17 01:44:23 - Text-Summary - INFO: - intra_encoder: True intra_decoder: True 

I0317 01:44:23.251284 140157232645952 bert_emb_run.py:75] intra_encoder: True intra_decoder: True