# Load pyrouge

In [1]:
# Install pyrouge three times over: upgrade from PyPI, then the bheinzerling
# GitHub master archive, then PyPI again; `pip show` prints whichever ended up
# installed. None of the installs is version-pinned.
!pip install pyrouge --upgrade
!pip install https://github.com/bheinzerling/pyrouge/archive/master.zip
!pip install pyrouge
!pip show pyrouge
# Clone andersjo/pyrouge into ./pyrouge (relative to the current directory).
!git clone https://github.com/andersjo/pyrouge.git
# NOTE(review): Rouge155 is imported here but not referenced by any later cell
# visible in this notebook.
from pyrouge import Rouge155
# Relative path: presumably points into the clone made above, so this only
# works when run from the same working directory -- TODO confirm.
!pyrouge_set_rouge_path 'pyrouge/tools/ROUGE-1.5.5'

Collecting pyrouge
[?25l  Downloading https://files.pythonhosted.org/packages/11/85/e522dd6b36880ca19dcf7f262b22365748f56edc6f455e7b6a37d0382c32/pyrouge-0.1.3.tar.gz (60kB)
[K     |█████▍                          | 10kB 14.6MB/s eta 0:00:01[K     |██████████▉                     | 20kB 21.0MB/s eta 0:00:01[K     |████████████████▎               | 30kB 25.9MB/s eta 0:00:01[K     |█████████████████████▋          | 40kB 28.7MB/s eta 0:00:01[K     |███████████████████████████     | 51kB 16.4MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 5.0MB/s 
[?25hBuilding wheels for collected packages: pyrouge
  Building wheel for pyrouge (setup.py) ... [?25l[?25hdone
  Created wheel for pyrouge: filename=pyrouge-0.1.3-cp37-none-any.whl size=191621 sha256=33cb8cfdfe38615c88335e1df807bb6eb845af4ad0ae34179d2e84b508fca190
  Stored in directory: /root/.cache/pip/wheels/75/d3/0c/e5b04e15b6b87c42e980de3931d2686e14d36e045058983599
Successfully built pyrouge
Installing collected

In [2]:
# Third-party packages imported by later cells: transformers (BertConfig,
# BertTokenizer), tensorboardX (SummaryWriter) and easydict (the `args` object).
# Versions are not pinned, so a fresh run may resolve different releases than
# the ones shown in the recorded output below.
!pip install transformers
!pip install tensorboardX
!pip install easydict

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 15.3MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 40.5MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |

In [3]:
# Clone the KorBertSum repository under /content and make it the working
# directory. Re-running this cell will hit an already-existing clone directory.
%cd /content
!git clone https://github.com/HaloKim/KorBertSum.git
%cd /content/KorBertSum

/content
Cloning into 'KorBertSum'...
remote: Enumerating objects: 11117, done.[K
remote: Counting objects: 100% (10815/10815), done.[K
remote: Compressing objects: 100% (7591/7591), done.[K
remote: Total 11117 (delta 3243), reused 10768 (delta 3214), pack-reused 302[K
Receiving objects: 100% (11117/11117), 18.77 MiB | 14.11 MiB/s, done.
Resolving deltas: 100% (3418/3418), done.
/content/KorBertSum


# Code

In [4]:
import os

# Switch the process working directory to the repo's src/ folder. The
# `models.*` / `others.*` imports in the next cell are presumably resolved
# relative to this directory -- TODO confirm.
os.chdir('/content/KorBertSum/src')

In [5]:
import torch
import numpy as np
from models import data_loader, model_builder
from models.model_builder import Summarizer
from others.logging import logger, init_logger
from models.data_loader import load_dataset
from transformers import BertConfig
from tensorboardX import SummaryWriter
from models.reporter import ReportMgr
from models.stats import Statistics
import easydict
from transformers import BertTokenizer

# Trainer and preprocessing helpers

In [6]:
def _tally_parameters(model):
    n_params = sum([p.nelement() for p in model.parameters()])
    return n_params

def build_trainer(args, device_id, model,
                  optim):
    """
    Build a `Trainer` wired to a tensorboardX writer and a `ReportMgr`.

    Args:
        args: option namespace. Reads `accum_count`, `world_size`,
            `gpu_ranks` (indexed by `device_id`), `model_path` (used as the
            tensorboard log directory) and `report_every`.
        device_id (int): index into `args.gpu_ranks`; any negative value
            selects the CPU path (`gpu_rank = 0`, `n_gpu = 0`).
        model: model handed to the `Trainer`; may be None, in which case the
            parameter count is not logged.
        optim: optimizer handed to the `Trainer` unchanged (may be None for
            inference-only use).

    Returns:
        Trainer: the configured trainer instance.

    Side effects:
        Prints the resolved `gpu_rank`, creates a `SummaryWriter` on
        `args.model_path`, and logs the model's parameter count.
    """
    grad_accum_count = args.accum_count
    n_gpu = args.world_size

    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        # CPU run: no GPU rank and no multi-GPU gradient gathering.
        gpu_rank = 0
        n_gpu = 0

    print('gpu_rank %d' % gpu_rank)

    tensorboard_log_dir = args.model_path
    writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)

    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager)

    # Explicit None check: truthiness of a model object is not what is meant here
    # (container-style modules that define __len__ can be falsy when empty).
    if model is not None:
        n_params = _tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)

    return trainer


class Trainer(object):
    """
    Holds a model/optimizer pair and provides gradient-accumulation steps,
    checkpoint saving, report-manager plumbing and sentence ranking (`summ`).

    Args:
        args: option namespace; this class reads `save_checkpoint_steps`
            and `model_path` from it and stores it in saved checkpoints.
        model: the model to run. May be None; when given it is switched to
            training mode on construction.
        optim: optimizer object exposing `step()`; stored as-is.
        grad_accum_count (int): number of batches to accumulate gradients
            over before an optimizer step. Must be > 0.
        n_gpu (int): number of GPUs; values > 1 enable the multi-GPU
            gradient gather path.
        gpu_rank (int): rank of this process (stored, not otherwise used
            inside this class).
        report_manager: object used by the reporting helpers, or None to
            disable reporting.
    """

    def __init__(self,  args, model,  optim,
                  grad_accum_count=1, n_gpu=1, gpu_rank=1,
                  report_manager=None):
        # Basic attributes.
        self.args = args
        self.save_checkpoint_steps = args.save_checkpoint_steps
        self.model = model
        self.optim = optim
        self.grad_accum_count = grad_accum_count
        self.n_gpu = n_gpu
        self.gpu_rank = gpu_rank
        self.report_manager = report_manager
        # No saver is wired up by the constructor; `_maybe_save` reads this
        # attribute, so it must exist to avoid an AttributeError.
        self.model_saver = None

        self.loss = torch.nn.BCELoss(reduction='none')
        assert grad_accum_count > 0
        # Set model in training mode.
        if model is not None:
            self.model.train()

    def summ(self, test_iter, step, cal_lead=False, cal_oracle=False):
        """
        Rank the sentences of each batch and return the ranking of the LAST batch.

        Args:
            test_iter: iterable of batches exposing `src`, `labels`, `segs`,
                `clss`, `mask`, `mask_cls` and `batch_size`.
            step: accepted for interface compatibility; not used.
            cal_lead (bool): if True, skip the model and select sentences in
                document order.
            cal_oracle (bool): if True (and `cal_lead` is False), skip the
                model and select the positions whose label equals 1.

        Returns:
            For the model path, a numpy array of sentence indices per example
            sorted by descending score; for lead/oracle, a list of index lists.
            Only the value computed for the final batch is returned. An empty
            `test_iter` yields an empty list.

        Note:
            No trigram blocking is applied here, regardless of
            `args.block_trigram`.
        """
        # Only the model path needs eval mode; lead/oracle never call the model.
        if not cal_lead and not cal_oracle:
            self.model.eval()

        # Defined up front so an empty iterator returns [] instead of raising
        # UnboundLocalError at the return statement.
        selected_ids = []

        with torch.no_grad():
            for batch in test_iter:
                src = batch.src
                labels = batch.labels
                segs = batch.segs
                clss = batch.clss
                mask = batch.mask
                mask_cls = batch.mask_cls

                if cal_lead:
                    selected_ids = [list(range(batch.clss.size(1)))] * batch.batch_size
                elif cal_oracle:
                    selected_ids = [[j for j in range(batch.clss.size(1)) if labels[i][j] == 1]
                                    for i in range(batch.batch_size)]
                else:
                    # The model returns its own mask alongside the scores; adding
                    # it shifts the scores of unmasked positions up before sorting.
                    sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)
                    sent_scores = sent_scores + mask.float()
                    sent_scores = sent_scores.cpu().data.numpy()
                    # Negate so argsort yields highest score first.
                    selected_ids = np.argsort(-sent_scores, 1)
        return selected_ids

    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        """
        Run forward/backward over `true_batchs` and step the optimizer.

        With `grad_accum_count == 1` gradients are zeroed and the optimizer is
        stepped for every batch; otherwise gradients are zeroed once, accumulated
        over all batches, and a single step is taken at the end. Each batch's
        summed loss is pushed into `total_stats` and `report_stats`.

        NOTE(review): the multi-GPU branches reference `distributed`, which is
        not imported anywhere in the visible notebook; with `n_gpu > 1` this
        would raise NameError unless it is provided elsewhere.
        """
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

            loss = self.loss(sent_scores, labels.float())
            # Zero out the loss at masked positions, then reduce to a scalar.
            loss = (loss*mask.float()).sum()
            # After .sum() the loss is a scalar, so numel() is 1 and this
            # division leaves the value unchanged.
            (loss/loss.numel()).backward()
            # loss.div(float(normalization)).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # 4. Update the parameters and statistics.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [p.grad.data for p in self.model.parameters()
                         if p.requires_grad
                         and p.grad is not None]
                distributed.all_reduce_and_rescale_tensors(
                    grads, float(1))
            self.optim.step()

    def _save(self, step):
        """
        Write `model_step_<step>.pt` under `args.model_path` if it does not exist.

        The checkpoint dict holds the model state dict, the option namespace and
        the optimizer object itself.

        Returns:
            (checkpoint, checkpoint_path) when a file was written; None when the
            target path already existed (the "Saving checkpoint" line is logged
            in both cases).
        """
        real_model = self.model

        model_state_dict = real_model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'opt': self.args,
            'optim': self.optim,
        }
        checkpoint_path = os.path.join(self.args.model_path, 'model_step_%d.pt' % step)
        logger.info("Saving checkpoint %s" % checkpoint_path)
        # Never overwrite an existing checkpoint for the same step.
        if (not os.path.exists(checkpoint_path)):
            torch.save(checkpoint, checkpoint_path)
            return checkpoint, checkpoint_path

    def _start_report_manager(self, start_time=None):
        """
        Simple function to start report manager (if any)
        """
        if self.report_manager is not None:
            if start_time is None:
                self.report_manager.start()
            else:
                self.report_manager.start_time = start_time

    def _maybe_gather_stats(self, stat):
        """
        Gather statistics in multi-processes cases

        Args:
            stat: a Statistics object to gather, or None (None is returned
                unchanged)

        Returns:
            stat: the gathered stats when `n_gpu > 1`, otherwise the input
        """
        if stat is not None and self.n_gpu > 1:
            return Statistics.all_gather_stats(stat)
        return stat

    def _maybe_report_training(self, step, num_steps, learning_rate,
                               report_stats):
        """
        Forward training stats to the report manager, if one is set.

        Returns whatever `report_manager.report_training` returns, or None
        when there is no report manager.
        """
        if self.report_manager is not None:
            return self.report_manager.report_training(
                step, num_steps, learning_rate, report_stats,
                multigpu=self.n_gpu > 1)

    def _report_step(self, learning_rate, step, train_stats=None,
                     valid_stats=None):
        """
        Forward per-step stats to the report manager, if one is set.

        Returns whatever `report_manager.report_step` returns, or None when
        there is no report manager.
        """
        if self.report_manager is not None:
            return self.report_manager.report_step(
                learning_rate, step, train_stats=train_stats,
                valid_stats=valid_stats)

    def _maybe_save(self, step):
        """
        Save the model if a model saver is set (none is set by the constructor)
        """
        if self.model_saver is not None:
            self.model_saver.maybe_save(step)

class BertData():
    """
    Turns a list of sentences into the id/segment/CLS-position lists used as
    model input, using the `bert-base-multilingual-cased` tokenizer.
    """

    # Per-sentence cap applied before sentences are joined.
    MAX_SENT_LEN = 2000
    # Maximum number of sentences kept per document.
    MAX_SENTS = 1000
    # Sub-token cap applied before the leading [CLS] and trailing [SEP] are
    # added (510 + 2 = 512 tokens in total).
    MAX_SUBTOKENS = 510

    def __init__(self):
        # NOTE(review): do_lower_case=True is combined with a *cased* checkpoint;
        # confirm this matches how the summarizer checkpoint was trained.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
        self.sep_vid = self.tokenizer.vocab['[SEP]']
        self.cls_vid = self.tokenizer.vocab['[CLS]']
        self.pad_vid = self.tokenizer.vocab['[PAD]']

    def preprocess(self, src):
        """
        Build model inputs for one document.

        Args:
            src: list of sentences. Each sentence is passed to `' '.join`, so a
                list of tokens is joined with spaces. NOTE(review): if a
                sentence is a plain string (as `txt2input` in this file passes),
                `' '.join` space-separates its individual characters -- confirm
                this is intended.

        Returns:
            None when `src` is empty or fewer than 3 sentences of length > 1
            remain; otherwise the tuple
            `(src_subtoken_idxs, labels, segments_ids, cls_ids, src_txt, tgt_txt)`
            where `labels` and `tgt_txt` are always None (inference only).
        """
        if (len(src) == 0):
            return None

        original_src_txt = [' '.join(s) for s in src]
        # Keep only sentences with more than one element; remember their
        # original positions so src_txt lines up with the kept sentences.
        idxs = [i for i, s in enumerate(src) if (len(s) > 1)]

        src = [src[i][:self.MAX_SENT_LEN] for i in idxs]
        src = src[:self.MAX_SENTS]

        if (len(src) < 3):
            return None
        # The former `len(labels) == 0` check was removed: `labels` is only
        # assigned further down, so reading it here raised UnboundLocalError
        # for every document that reached this point.

        src_txt = [' '.join(sent) for sent in src]
        # Sentence boundaries are marked inline so each sentence gets its own
        # [CLS] ... [SEP] pair after tokenization.
        text = ' [SEP] [CLS] '.join(src_txt)
        src_subtokens = self.tokenizer.tokenize(text)
        src_subtokens = src_subtokens[:self.MAX_SUBTOKENS]
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']

        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
        # Positions of [SEP] (with a virtual one at -1) give each segment's length.
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == self.sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        # Alternate segment ids 0/1 sentence by sentence.
        segments_ids = []
        for i, s in enumerate(segs):
            segments_ids += s * [i % 2]
        cls_ids = [i for i, t in enumerate(src_subtoken_idxs) if t == self.cls_vid]
        labels = None
        src_txt = [original_src_txt[i] for i in idxs]
        tgt_txt = None
        return src_subtoken_idxs, labels, segments_ids, cls_ids, src_txt, tgt_txt

def _lazy_dataset_loader(pt_file):
  yield  pt_file

# Params

In [16]:
# Run configuration, exposed with attribute access (args.mode, args.lr, ...).
args = easydict.EasyDict(dict(
    # -- what to run --
    encoder='classifier',
    mode='test',
    # -- locations (user-specific absolute Drive paths: adjust before running) --
    bert_data_path='/content/drive/MyDrive/Colab_Notebooks/Study/BertSum-master/bert_data/korean',
    model_path='./models/bert_classifier',
    result_path='./results',
    temp_dir='./temp',
    # -- batching / model shape --
    batch_size=1000,
    use_interval=True,
    hidden_size=128,
    ff_size=512,
    heads=4,
    inter_layers=2,
    rnn_size=512,
    # -- initialisation / optimisation --
    param_init=0,
    param_init_glorot=True,
    dropout=0.1,
    optim='adam',
    lr=2e-3,
    # -- reporting / checkpointing / evaluation switches --
    report_every=1,
    save_checkpoint_steps=5,
    block_trigram=True,
    recall_eval=False,
    # -- devices ('-1' for visible_gpus selects the CPU in test()/build_trainer) --
    accum_count=1,
    world_size=1,
    visible_gpus='-1',
    gpu_ranks='0',
    # -- logging and the checkpoint to load --
    log_file='/content/drive/MyDrive/Colab_Notebooks/Study/BertSum-master/logs/log.log',
    test_from='/content/drive/MyDrive/Colab_Notebooks/Study/BertSum-master/models/bert_classifier/model_step_1000.pt',
))

# Option names that test() copies from the loaded checkpoint's saved options
# onto `args`, overriding the values above.
model_flags = [
    'hidden_size',
    'ff_size',
    'heads',
    'inter_layers',
    'encoder',
    'ff_actv',
    'use_interval',
    'rnn_size',
]


# Test code

In [17]:
def test(args, input_list, device_id, pt, step):
  """
  Load a checkpoint, build the summarizer and rank the sentences of `input_list`.

  Args:
    args: option namespace (see the Params cell). Flags listed in
      `model_flags` are overwritten in place with the values stored in the
      checkpoint.
    input_list: pre-processed examples; handed to the data loader as the
      single "dataset file".
    device_id: ignored -- recomputed from `args.visible_gpus` (0 for cuda,
      -1 for cpu). Kept for interface compatibility.
    pt (str): checkpoint path; the empty string means "use args.test_from".
    step: ignored -- re-derived from the checkpoint file name
      (`..._<step>.<ext>`), falling back to 0 when it cannot be parsed.

  Returns:
    tuple: (`trainer.summ(...)` result, `input_list`).
  """
  init_logger(args.log_file)
  device = "cpu" if args.visible_gpus == '-1' else "cuda"
  device_id = 0 if device == "cuda" else -1

  test_from = pt if pt != '' else args.test_from

  # Derive the step from the checkpoint actually being loaded. Only the
  # failures a malformed/missing name can cause are caught, so unrelated
  # errors are no longer swallowed by a bare `except`.
  try:
    step = int(test_from.split('.')[-2].split('_')[-1])
  except (AttributeError, IndexError, ValueError):
    step = 0

  logger.info('Loading checkpoint from %s' % test_from)
  # NOTE(security): torch.load unpickles the file; only load checkpoints from
  # a trusted source. map_location keeps every tensor on CPU storage.
  checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
  opt = vars(checkpoint['opt'])
  # The architecture flags must match what the checkpoint was trained with.
  for k, v in opt.items():
    if k in model_flags:
      setattr(args, k, v)

  config = BertConfig.from_pretrained('bert-base-multilingual-cased')
  model = Summarizer(args, device, load_pretrained_bert=False, bert_config = config)
  model.load_cp(checkpoint)
  model.eval()

  test_iter = data_loader.Dataloader(args, _lazy_dataset_loader(input_list),
                                args.batch_size, device,
                                shuffle=False, is_test=True)
  trainer = build_trainer(args, device_id, model, None)
  result = trainer.summ(test_iter,step)
  return result, input_list

# Convert the comma-separated rank string into a list of ints. Guarded so the
# cell can be re-run: after the first run `gpu_ranks` is already a list and
# calling `.split` on it would raise AttributeError.
if isinstance(args.gpu_ranks, str):
  args.gpu_ranks = [int(i) for i in args.gpu_ranks.split(',')]
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

In [18]:
def txt2input(text):
  """
  Convert raw text (one sentence per line) into the single-example list
  expected by `test`.

  Args:
    text (str): input document; split on '\n' and empty lines are dropped
      (lines containing only whitespace are kept).

  Returns:
    list: one dict with keys `src`, `labels`, `segs`, `clss`, `src_txt`,
    `tgt_txt`. `labels` is the fixed placeholder `[0, 1, 2]` and `tgt_txt`
    is None.

  Raises:
    ValueError: if `BertData.preprocess` returns None, which it does when
      the text is empty or fewer than 3 lines longer than one character
      remain.
  """
  data = list(filter(None, text.split('\n')))
  bertdata = BertData()
  txt_data = bertdata.preprocess(data)
  if txt_data is None:
    # Previously this fell through to `txt_data[0]` and failed with an
    # opaque "'NoneType' object is not subscriptable".
    raise ValueError('text must contain at least 3 non-trivial lines to be summarized')

  src, _, segs, clss, src_txt, _ = txt_data
  data_dict = {"src":src,
               "labels":[0,1,2],
               "segs":segs,
               "clss":clss,
               "src_txt":src_txt,
               "tgt_txt":None}
  return [data_dict]

In [19]:
text = '''
환자로 찾아왔던 공군 여성 장교를 성폭행하려다 미수에 그친 국군수도병원 의사가 재판에서 징역 3년 6개월을 선고받았다. 징역형을 선고받은 의사는 과거에 대통령 주치의를 역임했던 해당 분야의 권위자였다. 

 
지난달 대위로 전역한 전 공군장교A씨는 지난 2017년 국군병원 근무 중 육군 부사관에게 성추행을 당한 정신적 충격으로 당시 국군수도병원 신경과에 근무하던 70세 B씨에게 치료를 받았다.  
 
3년 뒤인 지난해 국군수도병원에서 다시 만난 B씨는A씨에게 3년 전 일을 거론한 뒤 조언을 하고 싶다며 식사를 제안했다. 며칠 후 저녁을 함께한 뒤 B씨는A씨를 근처 자신의 집으로 끌고 들어가 성폭행을 시도했다.  
 
간신히 집밖으로 달아난 A씨는 외상 후 스트레스 장애, 해리성 기억상실증, 마비 등 증상을 겪다 일주일 후 부대에 신고했다.  
 
B씨는 강제 추행 장면이 담긴 아파트 CCTV에 찍힌 영상을 본 뒤 범행을 인정했다.  
 
B씨는 지난해 12월 강제추행, 강간치상 등 혐의로 구속돼 재판에 넘겨졌고 결국 징역형을 선고받았다.  
 
징역 10년을 구형했던 군 검찰은 1심에 불복해 항소할 방침이다.  

[출처: 중앙일보] 여장교 성폭행 시도 70대 의사, 알고보니 대통령 주치의 출신
'''

In [20]:
# Pre-process the article, then rank its sentences with the checkpoint named in
# args.test_from (pt='' selects it). The device_id (-1) and step (None)
# arguments are recomputed inside test().
input_data = txt2input(text)
sum_list = test(args, input_data, -1, '', None)
# sum_list is (ranking, input_list); [0][0] is the ranking for the first (only) example.
sum_list[0][0]

[2021-06-10 03:28:52,721 INFO] Loading checkpoint from /content/drive/MyDrive/Colab_Notebooks/Study/BertSum-master/models/bert_classifier/model_step_1000.pt
[2021-06-10 03:29:50,660 INFO] * number of parameters: 177854209


gpu_rank 0


array([4, 6, 2, 0, 5, 1, 3, 7])

# Result

In [21]:
# Show the three top-ranked lines of the article.
# NOTE(review): this indexes the newline-split text, while the model's indices
# refer to the sentences kept by BertData.preprocess (which drops lines of
# length <= 1). The two agree only if no line was dropped -- TODO confirm.
[list(filter(None, text.split('\n')))[i] for i in sum_list[0][0][:3]]

['3년 뒤인 지난해 국군수도병원에서 다시 만난 B씨는A씨에게 3년 전 일을 거론한 뒤 조언을 하고 싶다며 식사를 제안했다. 며칠 후 저녁을 함께한 뒤 B씨는A씨를 근처 자신의 집으로 끌고 들어가 성폭행을 시도했다.  ',
 '간신히 집밖으로 달아난 A씨는 외상 후 스트레스 장애, 해리성 기억상실증, 마비 등 증상을 겪다 일주일 후 부대에 신고했다.  ',
 '지난달 대위로 전역한 전 공군장교A씨는 지난 2017년 국군병원 근무 중 육군 부사관에게 성추행을 당한 정신적 충격으로 당시 국군수도병원 신경과에 근무하던 70세 B씨에게 치료를 받았다.  ']