## [LG 전자] 자연어 처리 # 4 : 문서요약 - (2)

* Summarunner를 이용한 추출 요약
* 예상 난이도 ⭐️⭐️⭐️⭐️⭐️

## 강의 복습

강의자료 : 자연어처리 4, AGENDA 03 - 인공신경망 기반의 추출요약

## 실습 요약

1. 본 실습에서는 SummaRunner를 활용하여 추출요약 모델을 구축합니다.
2. SummaRunner는 TextRank와 달리 학습을 통해 문서요약을 수행하는 지도학습 기반의 모델입니다.


------

### STEP 0. 환경 구축하기
* 필요한 library들을 import 합니다

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
!pip install gensim==3.8.3

In [2]:
import sys
import json
import random
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False
#%matplotlib inline #생성한 figure를 notebook에서 볼 수있게 해주는 코드

import gensim
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#check torch version & device
print ("Python version:[%s]."%(sys.version))
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device)) # device에 cuda:0가 프린트 된다면 GPU를 사용하는 상태입니다

Python version:[3.7.0 (default, Oct  9 2018, 10:31:47) 
[GCC 7.3.0]].
PyTorch version:[1.7.1].
device:[cuda:0].


In [None]:
# konlpy, Mecab 형태소 분석기 설치 스크립트 실행
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [4]:
# set random seed 

def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
random_seed = 42
set_seed(random_seed)

### STEP 1. 데이터 준비하기
금일 실습에서는 **AIHUB**에서 제공하는 **한글 뉴스기사 요약 데이터**를 활용합니다.
* 데이터셋 출처
  * https://aihub.or.kr/aidata/8054


In [5]:
# github에서 데이터 불러오기
!git clone https://github.com/KU-DIC/LG_natural_language_processing_day23

In [5]:
# 데이터셋 읽기
with open('./LG_natural_language_processing_day23/data/sum_data.json','r',encoding='utf-8') as f:
  data = json.load(f)

In [6]:
# 분석에 사용할 형태로 가공하기
label = [] # extractive
document = [] # text

for cur_news in tqdm(data['documents']):
  # 리뷰 문장
  document.append(cur_news['text'])
  label.append(cur_news['extractive'])


100%|██████████| 243983/243983 [00:00<00:00, 2186948.63it/s]


In [7]:
# 데이터 프레임 형태로 변환하기
df = {
    "label" : label,
    "document" : document
}
df = pd.DataFrame(df)

### STEP 2. 전처리 진행 (Preprocessing)

In [8]:
news_sentences = df['document'].to_list()

In [9]:
import re
def preprocess(text):
  text = re.sub('[-=+,#/\?:^$~@*\"※~▲△&%ㆍ·!』\\‘|\(\)\[\]\<\>`\'…》]','', text)
  text = re.sub('[ㅠㅎㅋ]','', text)
  return text

In [10]:
fixed_document = []
for cur_sentences in tqdm(news_sentences):
  fix_sent = []
  for sent in cur_sentences:
    fix_sent.extend(sent)
  fixed_document.append(fix_sent)

normalize_document = []
for cur_sentences in tqdm(fixed_document):
  norm_sent = []
  for sent in cur_sentences:
    sentence = preprocess(sent['sentence'])
    norm_sent.append(sentence)
  normalize_document.append(norm_sent)

df['normalize_document'] = normalize_document

100%|██████████| 12199/12199 [00:00<00:00, 204510.65it/s]
100%|██████████| 12199/12199 [00:00<00:00, 26746.34it/s]


In [11]:
label_sentence = []
for id, cur_doc in enumerate(df['normalize_document']):
    sent = []
    for idx in df['label'].iloc[id]:
        sent.append(cur_doc[idx])
    label_sentence.append(sent)

df['label_sentence'] = label_sentence

### STEP 3. 토큰화 진행 (Tokenization)

* 문서 요약 실습에서는 단순 띄어쓰기 단위로 토큰화를 진행합니다
* 이유 : 문서요약 데이터는 하나의 데이터당 굉장히 많은 문장을 가지므로, 토큰화 과정에만 약 30분의 시간이 소요됨

In [12]:
# Okt(Open Korea Text)
# from konlpy.tag import Okt  
# okt = Okt() 

# tokenized_document = []
# for cur_doc in tqdm(normalize_document):
#   tokenized_sentence = []
#   for cur_sent in cur_doc:
#     sent = okt.morphs(cur_sent)
#     tokenized_sentence.append(sent)
#   tokenized_document.append(tokenized_sentence)

# df['tokenized_document'] = tokenized_document

In [13]:
tokenized_document = []
for cur_doc in tqdm(normalize_document):
  tokenized_sentence = []
  for cur_sent in cur_doc:
    sent = cur_sent.split(' ')
    tokenized_sentence.append(sent)
  tokenized_document.append(tokenized_sentence)

100%|██████████| 12199/12199 [00:00<00:00, 50383.26it/s]


In [14]:
df['tokenized_document'] = tokenized_document

In [15]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size =0.1, random_state= 42)

drop_list = [train_df.iloc[i].name for i in[845, 3035, 3936, 5246, 5417, 8988, 9710]]
train_df = train_df.drop(drop_list)

drop_list = [test_df.iloc[i].name for i in [408,583,1000]]
test_df = test_df.drop(drop_list)
len(train_df) , len(test_df)

(10972, 1217)

In [16]:
aggregate_document =[]
for cur_doc in tqdm(train_df['tokenized_document']):
    _aggregate_document = [i for i in cur_doc]
    aggregate_document.append(_aggregate_document[0])

train_df['aggregate_document'] = aggregate_document

100%|██████████| 10972/10972 [00:00<00:00, 1117584.72it/s]


In [17]:
labels = []
for idx, cur_doc in enumerate(train_df['normalize_document'].to_list()):
    label_idx = train_df['label'].iloc[idx]
    label = [0 for _ in range(len(cur_doc))]

    for l_idx in label_idx :
        label[l_idx]=1
    labels.append(label)
train_df['labels'] = labels


labels = []
for idx, cur_doc in enumerate(test_df['normalize_document'].to_list()):
    label_idx = test_df['label'].iloc[idx]
    label = [0 for _ in range(len(cur_doc))]

    for l_idx in label_idx :
        label[l_idx]=1
    labels.append(label)
test_df['labels'] = labels

In [21]:
train_df.head()

Unnamed: 0,label,document,normalize_document,label_sentence,tokenized_document,aggregate_document,labels
185126,"[4, 5, 9]","[[{'index': 0, 'sentence': '한꺼번에 뜬 수많은 금융권 채용 ...",[한꺼번에 뜬 수많은 금융권 채용 공고를 마주한 취업준비생 입장에서 서류 접수는 빨...,[특히 개인의 학력전공공인점수 등을 공개하지 않는 블라인드 채용이 자리 잡으면서 면...,"[[한꺼, 번, 에, 뜬, 수많은, 금융, 권, 채용, 공고, 를, 마주, 한, 취...","[한꺼, 번, 에, 뜬, 수많은, 금융, 권, 채용, 공고, 를, 마주, 한, 취업...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
134491,"[6, 12, 14]","[[{'index': 0, 'sentence': '판교2밸리 적용 스마트시티 기술 ...","[판교2밸리 적용 스마트시티 기술 조만간 선정 자료국토교통부 제공, 판교 제2테크노...",[현재 검토되는 주요 스마트시티 기술 중 교통 분야는 자율주행버스를 비롯해 스마트 ...,"[[판교, 2, 밸리, 적용, 스마트, 시티, 기술, 조만간, 선정, 자료, 국토교...","[판교, 2, 밸리, 적용, 스마트, 시티, 기술, 조만간, 선정, 자료, 국토교통...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, ..."
124873,"[6, 0, 2]","[[{'index': 0, 'sentence': '설문대여성문화센터(소장 김정완)는...",[설문대여성문화센터소장 김정완는 내달 2일부터 30일까지 2019 예술단체 발굴 지...,[이번 전시에선 한국화 24점이 전시되며 강보라미 강명지 김혜정 작가 등 총 7명의...,"[[설문, 대, 여성, 문화센터, 소장, 김정, 완는, 내달, 2일, 부터, 30일...","[설문, 대, 여성, 문화센터, 소장, 김정, 완는, 내달, 2일, 부터, 30일,...","[1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
160900,"[0, 1, 2]","[[{'index': 0, 'sentence': '경기도 화성시 ㈜착한건축자재백화점...",[경기도 화성시 ㈜착한건축자재백화점의 심재옥 대표는 지진 발생 시 피해 최소화를 위...,[경기도 화성시 ㈜착한건축자재백화점의 심재옥 대표는 지진 발생 시 피해 최소화를 위...,"[[경기도, 화성시, ㈜, 착한, 건축, 자재, 백화점, 의, 심재, 옥, 대표, ...","[경기도, 화성시, ㈜, 착한, 건축, 자재, 백화점, 의, 심재, 옥, 대표, 는...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
191075,"[2, 4, 10]","[[{'index': 0, 'sentence': '23일 오후 서울 성북구 고려대학...",[23일 오후 서울 성북구 고려대학교 서울캠퍼스 정경대 후문에 붙은 안녕들 하십니까...,[23일 서울 성북구 고려대 서울캠퍼스 정경대 후문에 그래서 안녕들 하십니까라는 제...,"[[23일, 오후, 서울, 성북구, 고려대학교, 서울, 캠퍼스, 정경, 대, 후문,...","[23일, 오후, 서울, 성북구, 고려대학교, 서울, 캠퍼스, 정경, 대, 후문, ...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]"


### STEP 4. 벡터화 진행 (Vectorization)
* 해당 실습에서는 별도의 벡터화를 수행한 후 모델링을 진행하지 않음
* 벡터화 과정을 모델 내부에서 수행하며, 이를 embedding이라 부름

In [18]:
%%time
from gensim.models import FastText

model = FastText(
    sentences = aggregate_document,
    size=100,
    window=5,
    min_count=1,
    workers=4)

CPU times: user 12.6 s, sys: 665 ms, total: 13.3 s
Wall time: 11.5 s


In [21]:
# vocab

In [19]:
vocab = list(model.wv.vocab)
train_sentence = train_df['tokenized_document'].to_list()
test_sentence = test_df['tokenized_document'].to_list()

In [20]:
word2index = {'<PAD>':0, '<UNK>':1}

for v in vocab: # v는 vocab 객체 하나를 의미함
  if word2index.get(v) is None:
    word2index[v] = len(word2index) # 단어별 index 부여

index2word = {}
for idx, vo in word2index.items():
  index2word[vo] = idx

# 한줄로 구현하기
# index2word = {v:idx for idx, v in word2index.items()}

In [21]:
fasttext_vector = []
for key in word2index.keys():
  if key in '<PAD>' or '<UNK>': # 두가지 단어는 vocab에 속하지 않음 
    fasttext_vector.append(np.random.randn(100,)) # random한 값으로 초기화 하여 제공
  else:
    fasttext_vector.append(model.wv[key])
  
fasttext_matrix = np.vstack(fasttext_vector)

In [22]:
print('vocab 개수 : ',len(vocab))
print('word matrix shape : ',fasttext_matrix.shape) # '<PAD>''<UNK>'를 추가해주었기 때문에 vocab보다 2개 많은 상태

vocab 개수 :  48306
word matrix shape :  (48308, 100)


### STEP 5. 모델 구축하기 (Modeling)
* SummaRunner를 활용하여 문서 요약 모델 구축하기
* 딥러닝 모델을 위해서는 vocab, dataset, model을 구축해야함
* 그동안 배웠던 내용을 모두 함수로 묶어 진행함 (목적 : clean code를 위한 모듈화)

In [23]:
# vocab을 생성하는 Class

class Feature:
    def __init__(self, word2id):
        self.word2id = word2id
        self.id2word = {idx: word for word, idx in word2id.items()}
        assert len(self.word2id) == len(self.id2word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"

    def __len__(self):
        return len(self.word2id)

    def i2w(self, idx):
        return self.id2word[idx]

    def w2i(self, w):
        if w in self.word2id:
            return self.word2id[w]
        else:
            return self.UNK_IDX

    ###################
    # Create Features #
    ###################
    def make_features(
        self,
        docs,
        labels_list,
        summaries_list,
        sent_trunc=128,
        doc_trunc=100,
        split_token="\n",
    ):

        # trunc document
        # 문서 내 doc_trunc 문장 개수까지 가져옴
        sents_list, targets, doc_lens, summaries = [], [], [], []
        for doc, labels, summary in zip(docs, labels_list, summaries_list):
            sents = doc #.split(split_token)
            labels = labels #.split(split_token)
            labels = [int(l) for l in labels]
            max_sent_num = min(doc_trunc, len(sents))
            sents = sents[:max_sent_num]
            labels = labels[:max_sent_num]
            oracle = [sent for sent, label in zip(sents, labels) if label == 1]

            sents_list.extend(sents)
            targets.extend(labels)
            doc_lens.append(len(sents))
            summaries.append(summary)

        # trunc or pad sent
        # 문장 내 sent_trunc 단어 개수까지 가져옴
        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = sent.split()
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)

        features = []
        for sent in batch_sents:
            feature = [self.PAD_IDX for _ in range(max_sent_len - len(sent))] + [
                self.w2i(w) for w in sent
            ]
            features.append(feature)

        return features, targets, doc_lens, summaries


In [24]:
# Dataset을 위한 Class

import torch
from torch.utils.data import Dataset, DataLoader

class SummarRunnerDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        doc = self.examples[idx]["doc"]
        labels = self.examples[idx]["labels"]
        summaries = self.examples[idx]["summaries"]

        return doc, labels, summaries

In [25]:
# model 정의
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# from .encoder import SentenceEncoder, DocumentEncoder
# from .encoder import Encoder


# Device configuration
DEVICE = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")

class SentenceEncoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        pretrained_vectors: np.ndarray = None,
    ):
        super().__init__()

        self.vocab_size = (vocab_size,)
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directs = 1
        if bidirectional:
            self.num_directs = 2

        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if pretrained_vectors is not None:
            self.embed.weight.data.copy_(pretrained_vectors)
        else:
            nn.init.xavier_uniform_(self.embed.weight)

        self.bilstm = nn.LSTM(
            self.embed_dim,
            self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.bidirectional,
            # dropout=dropout,
        )
        # self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

    def avg_pool1d(self, sequences, seq_lens):
        out = []
        for idx, tensor in enumerate(sequences):
            tensor = tensor[: seq_lens[idx], :]
            tensor = torch.t(tensor).unsqueeze(0)
            out.append(F.avg_pool1d(tensor, tensor.size(2)))

        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, docs):
        sent_lens = torch.sum(torch.sign(docs), dim=1).data

        x = self.embed(docs)
        output, _ = self.bilstm(x)
        output = self.avg_pool1d(output, sent_lens)
        # output = self.linear(output)

        return output


class DocumentEncoder(nn.Module):
    def __init__(
        self,
        input_dim: int = 128,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directs = 1
        if bidirectional:
            self.num_directs = 2

        self.bilstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # dropout=dropout,
        )
        # self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

    def pad_doc(self, sents, doc_lens):
        pad_dim = sents.size(1)
        max_doc_len = max(doc_lens)
        sent_input = []
        start = 0
        for doc_len in doc_lens:
            stop = start + doc_len
            valid = sents[start:stop]
            start = stop
            if doc_len == max_doc_len:
                sent_input.append(valid.unsqueeze(0))
            else:
                pad = Variable(torch.zeros(max_doc_len - doc_len, pad_dim)).to(DEVICE)
                sent_input.append(torch.cat([valid, pad]).unsqueeze(0))

        sent_input = torch.cat(sent_input, dim=0)
        return sent_input

    def avg_pool1d(self, sequences, seq_lens):
        out = []
        for idx, tensor in enumerate(sequences):
            tensor = tensor[: seq_lens[idx], :]
            tensor = torch.t(tensor).unsqueeze(0)
            out.append(F.avg_pool1d(tensor, tensor.size(2)))

        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, sents, doc_lens):
        # make sent features(pad with zeros)
        x = self.pad_doc(sents, doc_lens)
        output, hidden = self.bilstm(x)
        # output = self.linear(output)
        return output


class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        pretrained_vectors: np.ndarray = None,
    ):
        super().__init__()

        self.sent_encoder = SentenceEncoder(
            vocab_size,
            embed_dim,
            hidden_dim,
            num_layers,
            bidirectional=True,
            dropout_p=dropout_p,
            pretrained_vectors=pretrained_vectors,
        )

        self.doc_encoder = DocumentEncoder(
            2 * hidden_dim,
            hidden_dim,
            num_layers,
            bidirectional=True,
            dropout_p=dropout_p,
        )

    def forward(self, docs, doc_lens):
        encoded_sents = self.sent_encoder(docs)
        encoded_docs = self.doc_encoder(encoded_sents, doc_lens)
        return encoded_docs


class SummaRunner(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        num_class: int = 1,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        pos_dim: int = 50,
        pos_num: int = 100,
        seg_num: int = 10,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        maxlen: int = 50,
        pretrained_vectors: np.ndarray = None,
    ):
        super(SummaRunner, self).__init__()

        self.hidden_dim = hidden_dim
        self.abs_pos_embed = nn.Embedding(pos_num, pos_dim)  # absolute postion
        self.rel_pos_embed = nn.Embedding(seg_num, pos_dim)  # relative position

        self.encoder = Encoder(
            vocab_size, embed_dim, hidden_dim, num_layers, bidirectional, dropout_p
        )

        self.fc = nn.Linear(2 * hidden_dim, 2 * hidden_dim)

        # Parameters of Classification Layer
        # P(y_j = 1|h_j, s_j, d), Eq.6 in SummaRuNNer paper
        self.content = nn.Linear(2 * hidden_dim, 1, bias=False)
        self.salience = nn.Bilinear(2 * hidden_dim, 2 * hidden_dim, 1, bias=False)
        self.novelty = nn.Bilinear(2 * hidden_dim, 2 * hidden_dim, 1, bias=False)
        self.abs_pos = nn.Linear(pos_dim, 1, bias=False)
        self.rel_pos = nn.Linear(pos_dim, 1, bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))

    def avg_pool1d(self, sequences, seq_lens):
        out = []
        for idx, tensor in enumerate(sequences):
            tensor = tensor[: seq_lens[idx], :]
            tensor = torch.t(tensor).unsqueeze(0)
            out.append(F.avg_pool1d(tensor, tensor.size(2)))

        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, docs, doc_lens):
        sent_out = self.encoder(docs, doc_lens)
        docs = self.avg_pool1d(sent_out, doc_lens)

        probs = []
        for index, doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index, :doc_len, :]
            doc = torch.tanh(self.fc(docs[index])).unsqueeze(0)
            s = Variable(torch.zeros(1, 2 * self.hidden_dim)).to(DEVICE)
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)
                # get position embeddings
                abs_index = Variable(torch.LongTensor([[position]])).to(DEVICE)
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)

                rel_index = int(round((position + 1) * 9.0 / doc_len))
                rel_index = Variable(torch.LongTensor([[rel_index]])).to(DEVICE)
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)

                # classification layer
                content = self.content(h)
                salience = self.salience(h, doc)
                novelty = -1 * self.novelty(h, torch.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                # P(y_j = 1|h_j, s_j, d) Eq.6 in SummaRuNNer paper
                prob = torch.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
                s = s + torch.mm(prob, h)
                probs.append(prob)

        return torch.cat(probs).squeeze()

In [26]:
# make example
# 모델이 원하는 스타일의 데이터로 바꾸어주는 부분

train_examples = []

for cur_example in zip(train_df['normalize_document'],train_df['labels'],train_df['label_sentence']):
    example = {
        'doc': cur_example[0],
        'labels': cur_example[1],
        'summaries': cur_example[2]
    }
    train_examples.append(example)

test_examples = []

for cur_example in zip(test_df['normalize_document'],test_df['labels'],test_df['label_sentence']):
    example = {
        'doc': cur_example[0],
        'labels': cur_example[1],
        'summaries': cur_example[2]
    }
    test_examples.append(example)

In [64]:
len(test_examples)

1217

In [27]:
def accuracy(logits, labels):
    preds = torch.round(logits)
    corrects = (preds == labels).sum().float()
    acc = corrects / labels.numel()
    return acc

In [33]:
# train

EPOCH = 5
BATCH_SIZE = 30
LR = 0.001

In [34]:
def collate_fn(batch, feature, doc_trunc=100):
    docs = [entry[0] for entry in batch]
    labels_list = [entry[1] for entry in batch]
    summaries = [entry[2] for entry in batch]

    features, targets, doc_lens, summaries = feature.make_features(
        docs, labels_list, summaries, doc_trunc=doc_trunc
    )
    
    features = torch.LongTensor(features)
    targets = torch.FloatTensor(targets)
    return features, targets, doc_lens, summaries

In [35]:
from functools import partial

feature = Feature(word2index)

train_dataset = SummarRunnerDataset(train_examples)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=partial(collate_fn, feature=feature),
    )

In [36]:
model = SummaRunner(
    vocab_size=len(word2index),
    num_class=1,
    embed_dim=100,
    hidden_dim=128,
    pos_dim=50,
    pos_num=100,
    seg_num=10,
    num_layers=1,
    bidirectional=True,
    dropout_p=0.3,
    maxlen=100,
    pretrained_vectors=fasttext_matrix
)

model = model.to(device)
    
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [37]:
from torch.nn.utils import clip_grad_norm_

model.train()
for epoch in tqdm(range(EPOCH)):
    batch_loss = []
    for step, batch in enumerate(train_loader):
        features, targets, doc_lens, _ = batch
        
        features = features.to(device)
        targets = targets.to(device)
        
        model.zero_grad()
        probs = model(features, doc_lens)
        
        loss = loss_function(probs, targets)
        batch_loss.append(loss.item())
        loss.backward()
        
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()
        if step % 100 == 0:
            train_acc = accuracy(probs, targets)
            print(' ')
            print(f'Step: {step}, Loss: {loss.item()}, Acc: {train_acc}')

Batch ID: 0, Loss: 0.7848403453826904, Acc: 0.5013550519943237
Batch ID: 100, Loss: 0.41332948207855225, Acc: 0.8138957619667053
Batch ID: 200, Loss: 0.3994556963443756, Acc: 0.8362282514572144
Batch ID: 300, Loss: 0.4436092674732208, Acc: 0.8115501403808594
Batch ID: 0, Loss: 0.4189411997795105, Acc: 0.8075881004333496
Batch ID: 100, Loss: 0.4233170449733734, Acc: 0.8163771629333496
Batch ID: 200, Loss: 0.3919476568698883, Acc: 0.8312655091285706
Batch ID: 300, Loss: 0.43431755900382996, Acc: 0.8085106611251831
Batch ID: 0, Loss: 0.4080554246902466, Acc: 0.8157181739807129
Batch ID: 100, Loss: 0.42411670088768005, Acc: 0.8138957619667053
Batch ID: 200, Loss: 0.3765488266944885, Acc: 0.8387096524238586
Batch ID: 300, Loss: 0.4104790687561035, Acc: 0.8145896792411804
Batch ID: 0, Loss: 0.37991562485694885, Acc: 0.8130081295967102
Batch ID: 100, Loss: 0.41270336508750916, Acc: 0.826302707195282
Batch ID: 200, Loss: 0.359308660030365, Acc: 0.8411910533905029
Batch ID: 300, Loss: 0.3865181

In [38]:
# test
test_dataset = SummarRunnerDataset(test_examples)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=partial(collate_fn, feature=feature),
    )

In [65]:
model.eval()

model_prediction = []
test_label = []
with torch.no_grad():
    for step, batch in enumerate(test_loader):
        features, targets, doc_lens, _ = batch

        features = features.to(device)
        targets = targets.to(device)
        
        probs = model(features, doc_lens)
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            # print(doc_len)
            stop = start + doc_len
            prob = probs[start:stop]
            # print(prob)
            topk = min(3, doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            topk_indices.sort()

            model_prediction.append(topk_indices)

In [66]:
len(model_prediction)

1217

In [71]:
all_pred = []
for idx, cur_doc in enumerate(test_df['normalize_document']):
    pred_idx = model_prediction[idx]
    pred_doc = []
    for cur_idx in pred_idx:
        pred_doc.append(cur_doc[cur_idx])
    all_pred.append(pred_doc)

In [72]:
all_label = test_df['label_sentence'].to_list()

In [73]:
len(all_pred), len(all_label)

(1217, 1217)

In [74]:
'''
rouge score 계산을 위한 코드이며, 
해당 실습에서는 score 계산에 필요한 전체적인 흐름을 따라갈 목적으로서 해당 코드를 사용합니다.
즉, rouge score의 계산원리를 모두 익히지 않아도 됩니다.
'''

import os
import re
import platform
import itertools
import collections
import pkg_resources  # pip install py-rouge
from io import open

if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except:
        print("please install eunjeon module")
else:  # Ubuntu일 경우
    from konlpy.tag import Mecab


class Rouge:
    DEFAULT_METRICS = {"rouge-n"}
    DEFAULT_N = 1
    STATS = ["f", "p", "r"]
    AVAILABLE_METRICS = {"rouge-n", "rouge-l", "rouge-w"}
    AVAILABLE_LENGTH_LIMIT_TYPES = {"words", "bytes"}
    REMOVE_CHAR_PATTERN = re.compile("[^A-Za-z0-9가-힣]")

    def __init__(
        self,
        metrics=None,
        max_n=None,
        limit_length=True,
        length_limit=1000,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        use_tokenizer=True,
        alpha=0.5,
        weight_factor=1.0,
    ):
        self.metrics = metrics[:] if metrics is not None else Rouge.DEFAULT_METRICS
        for m in self.metrics:
            if m not in Rouge.AVAILABLE_METRICS:
                raise ValueError("Unknown metric '{}'".format(m))

        self.max_n = max_n if "rouge-n" in self.metrics else None
        # Add all rouge-n metrics
        if self.max_n is not None:
            index_rouge_n = self.metrics.index("rouge-n")
            del self.metrics[index_rouge_n]
            self.metrics += ["rouge-{}".format(n) for n in range(1, self.max_n + 1)]
        self.metrics = set(self.metrics)

        self.limit_length = limit_length
        if self.limit_length:
            if length_limit_type not in Rouge.AVAILABLE_LENGTH_LIMIT_TYPES:
                raise ValueError("Unknown length_limit_type '{}'".format(length_limit_type))

        self.length_limit = length_limit
        if self.length_limit == 0:
            self.limit_length = False
        self.length_limit_type = length_limit_type

        self.use_tokenizer = use_tokenizer
        if use_tokenizer:
            self.tokenizer = Mecab()

        self.apply_avg = apply_avg
        self.apply_best = apply_best
        self.alpha = alpha
        self.weight_factor = weight_factor
        if self.weight_factor <= 0:
            raise ValueError("ROUGE-W weight factor must greater than 0.")

    def tokenize_text(self, text):
        return self.tokenizer.morphs(text)

    @staticmethod
    def split_into_sentences(text):
        return text.split("\n")

    @staticmethod
    def _get_ngrams(n, text):
        ngram_set = collections.defaultdict(int)
        max_index_ngram_start = len(text) - n
        for i in range(max_index_ngram_start + 1):
            ngram_set[tuple(text[i : i + n])] += 1
        return ngram_set

    @staticmethod
    def _split_into_words(sentences):
        return list(itertools.chain(*[_.split() for _ in sentences]))

    @staticmethod
    def _get_word_ngrams_and_length(n, sentences):
        assert len(sentences) > 0
        assert n > 0

        tokens = Rouge._split_into_words(sentences)
        return Rouge._get_ngrams(n, tokens), tokens, len(tokens) - (n - 1)

    @staticmethod
    def _get_unigrams(sentences):
        assert len(sentences) > 0

        tokens = Rouge._split_into_words(sentences)
        unigram_set = collections.defaultdict(int)
        for token in tokens:
            unigram_set[token] += 1
        return unigram_set, len(tokens)

    @staticmethod
    def _compute_p_r_f_score(
        evaluated_count,
        reference_count,
        overlapping_count,
        alpha=0.5,
        weight_factor=1.0,
    ):
        precision = 0.0 if evaluated_count == 0 else overlapping_count / float(evaluated_count)
        if weight_factor != 1.0:
            precision = precision ** (1.0 / weight_factor)
        recall = 0.0 if reference_count == 0 else overlapping_count / float(reference_count)
        if weight_factor != 1.0:
            recall = recall ** (1.0 / weight_factor)
        f1_score = Rouge._compute_f_score(precision, recall, alpha)
        return {"f": f1_score, "p": precision, "r": recall}

    @staticmethod
    def _compute_f_score(precision, recall, alpha=0.5):
        return (
            0.0
            if (recall == 0.0 or precision == 0.0)
            else precision * recall / ((1 - alpha) * precision + alpha * recall)
        )

    @staticmethod
    def _compute_ngrams(evaluated_sentences, reference_sentences, n):
        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_ngrams, _, evaluated_count = Rouge._get_word_ngrams_and_length(
            n, evaluated_sentences
        )
        reference_ngrams, _, reference_count = Rouge._get_word_ngrams_and_length(
            n, reference_sentences
        )

        # Gets the overlapping ngrams between evaluated and reference
        overlapping_ngrams = set(evaluated_ngrams.keys()).intersection(set(reference_ngrams.keys()))
        overlapping_count = 0
        for ngram in overlapping_ngrams:
            overlapping_count += min(evaluated_ngrams[ngram], reference_ngrams[ngram])

        return evaluated_count, reference_count, overlapping_count

    @staticmethod
    def _compute_ngrams_lcs(evaluated_sentences, reference_sentences, weight_factor=1.0):
        def _lcs(x, y):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(int)
            dirs = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        vals[i, j] = vals[i - 1, j - 1] + 1
                        dirs[i, j] = "|"
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"

            return vals, dirs

        def _wlcs(x, y, weight_factor):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(float)
            dirs = collections.defaultdict(int)
            lengths = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        length_tmp = lengths[i - 1, j - 1]
                        vals[i, j] = (
                            vals[i - 1, j - 1]
                            + (length_tmp + 1) ** weight_factor
                            - length_tmp ** weight_factor
                        )
                        dirs[i, j] = "|"
                        lengths[i, j] = length_tmp + 1
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                        lengths[i, j] = 0
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"
                        lengths[i, j] = 0

            return vals, dirs

        def _mark_lcs(mask, dirs, m, n):
            while m != 0 and n != 0:
                if dirs[m, n] == "|":
                    m -= 1
                    n -= 1
                    mask[m] = 1
                elif dirs[m, n] == "^":
                    m -= 1
                elif dirs[m, n] == "<":
                    n -= 1
                else:
                    raise UnboundLocalError("Illegal move")

            return mask

        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_unigrams_dict, evaluated_count = Rouge._get_unigrams(evaluated_sentences)
        reference_unigrams_dict, reference_count = Rouge._get_unigrams(reference_sentences)

        # Has to use weight factor for WLCS
        use_WLCS = weight_factor != 1.0
        if use_WLCS:
            evaluated_count = evaluated_count ** weight_factor
            reference_count = 0

        overlapping_count = 0.0
        for reference_sentence in reference_sentences:
            reference_sentence_tokens = reference_sentence.split()
            if use_WLCS:
                reference_count += len(reference_sentence_tokens) ** weight_factor
            hit_mask = [0 for _ in range(len(reference_sentence_tokens))]

            for evaluated_sentence in evaluated_sentences:
                evaluated_sentence_tokens = evaluated_sentence.split()

                if use_WLCS:
                    _, lcs_dirs = _wlcs(
                        reference_sentence_tokens,
                        evaluated_sentence_tokens,
                        weight_factor,
                    )
                else:
                    _, lcs_dirs = _lcs(reference_sentence_tokens, evaluated_sentence_tokens)
                _mark_lcs(
                    hit_mask,
                    lcs_dirs,
                    len(reference_sentence_tokens),
                    len(evaluated_sentence_tokens),
                )

            overlapping_count_length = 0
            for ref_token_id, val in enumerate(hit_mask):
                if val == 1:
                    token = reference_sentence_tokens[ref_token_id]
                    if evaluated_unigrams_dict[token] > 0 and reference_unigrams_dict[token] > 0:
                        evaluated_unigrams_dict[token] -= 1
                        reference_unigrams_dict[ref_token_id] -= 1

                        if use_WLCS:
                            overlapping_count_length += 1
                            if (
                                ref_token_id + 1 < len(hit_mask) and hit_mask[ref_token_id + 1] == 0
                            ) or ref_token_id + 1 == len(hit_mask):
                                overlapping_count += overlapping_count_length ** weight_factor
                                overlapping_count_length = 0
                        else:
                            overlapping_count += 1

        if use_WLCS:
            reference_count = reference_count ** weight_factor

        return evaluated_count, reference_count, overlapping_count

    def get_scores(self, hypothesis, references):
        if isinstance(hypothesis, str):
            hypothesis, references = [hypothesis], [references]

        if type(hypothesis) != type(references):
            raise ValueError("'hyps' and 'refs' are not of the same type")

        if len(hypothesis) != len(references):
            raise ValueError("'hyps' and 'refs' do not have the same length")
        scores = {}
        has_rouge_n_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].isdigit()]) > 0
        )
        if has_rouge_n_metric:
            scores.update(self._get_scores_rouge_n(hypothesis, references))
            # scores = {**scores, **self._get_scores_rouge_n(hypothesis, references)}

        has_rouge_l_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].lower() == "l"]) > 0
        )
        if has_rouge_l_metric:
            scores.update(self._get_scores_rouge_l_or_w(hypothesis, references, False))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, False)}

        has_rouge_w_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].lower() == "w"]) > 0
        )
        if has_rouge_w_metric:
            scores.update(self._get_scores_rouge_l_or_w(hypothesis, references, True))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, True)}

        return scores

    def _get_scores_rouge_n(self, all_hypothesis, all_references):
        metrics = [metric for metric in self.metrics if metric.split("-")[-1].isdigit()]

        if self.apply_avg or self.apply_best:
            scores = {metric: {stat: 0.0 for stat in Rouge.STATS} for metric in metrics}
        else:
            scores = {
                metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))]
                for metric in metrics
            }

        for sample_id, (hypothesis, references) in enumerate(zip(all_hypothesis, all_references)):
            assert isinstance(hypothesis, str)
            has_multiple_references = False
            if isinstance(references, list):
                has_multiple_references = len(references) > 1
                if not has_multiple_references:
                    references = references[0]

            # Prepare hypothesis and reference(s)
            hypothesis = self._preprocess_summary_as_a_whole(hypothesis)
            references = (
                [self._preprocess_summary_as_a_whole(reference) for reference in references]
                if has_multiple_references
                else [self._preprocess_summary_as_a_whole(references)]
            )

            # Compute scores
            for metric in metrics:
                suffix = metric.split("-")[-1]
                n = int(suffix)

                # Aggregate
                if self.apply_avg:
                    # average model
                    total_hypothesis_ngrams_count = 0
                    total_reference_ngrams_count = 0
                    total_ngrams_overlapping_count = 0

                    for reference in references:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams(hypothesis, reference, n)
                        total_hypothesis_ngrams_count += hypothesis_count
                        total_reference_ngrams_count += reference_count
                        total_ngrams_overlapping_count += overlapping_ngrams

                    score = Rouge._compute_p_r_f_score(
                        total_hypothesis_ngrams_count,
                        total_reference_ngrams_count,
                        total_ngrams_overlapping_count,
                        self.alpha,
                    )

                    for stat in Rouge.STATS:
                        scores[metric][stat] += score[stat]
                else:
                    # Best model
                    if self.apply_best:
                        best_current_score = None
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            if best_current_score is None or score["r"] > best_current_score["r"]:
                                best_current_score = score

                        for stat in Rouge.STATS:
                            scores[metric][stat] += best_current_score[stat]
                    # Keep all
                    else:
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            for stat in Rouge.STATS:
                                scores[metric][sample_id][stat].append(score[stat])

        # Compute final score with the average or the the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for metric in metrics:
                for stat in Rouge.STATS:
                    scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _get_scores_rouge_l_or_w(self, all_hypothesis, all_references, use_w=False):
        metric = "rouge-w" if use_w else "rouge-l"
        if self.apply_avg or self.apply_best:
            scores = {metric: {stat: 0.0 for stat in Rouge.STATS}}
        else:
            scores = {
                metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))]
            }

        for sample_id, (hypothesis_sentences, references_sentences) in enumerate(
            zip(all_hypothesis, all_references)
        ):
            assert isinstance(hypothesis_sentences, str)
            has_multiple_references = False
            if isinstance(references_sentences, list):
                has_multiple_references = len(references_sentences) > 1
                if not has_multiple_references:
                    references_sentences = references_sentences[0]

            # Prepare hypothesis and reference(s)
            hypothesis_sentences = self._preprocess_summary_per_sentence(hypothesis_sentences)
            references_sentences = (
                [
                    self._preprocess_summary_per_sentence(reference)
                    for reference in references_sentences
                ]
                if has_multiple_references
                else [self._preprocess_summary_per_sentence(references_sentences)]
            )

            # Compute scores
            # Aggregate
            if self.apply_avg:
                # average model
                total_hypothesis_ngrams_count = 0
                total_reference_ngrams_count = 0
                total_ngrams_overlapping_count = 0

                for reference_sentences in references_sentences:
                    (
                        hypothesis_count,
                        reference_count,
                        overlapping_ngrams,
                    ) = Rouge._compute_ngrams_lcs(
                        hypothesis_sentences,
                        reference_sentences,
                        self.weight_factor if use_w else 1.0,
                    )
                    total_hypothesis_ngrams_count += hypothesis_count
                    total_reference_ngrams_count += reference_count
                    total_ngrams_overlapping_count += overlapping_ngrams

                score = Rouge._compute_p_r_f_score(
                    total_hypothesis_ngrams_count,
                    total_reference_ngrams_count,
                    total_ngrams_overlapping_count,
                    self.alpha,
                    self.weight_factor if use_w else 1.0,
                )
                for stat in Rouge.STATS:
                    scores[metric][stat] += score[stat]
            else:
                # Best model
                if self.apply_best:
                    best_current_score = None
                    best_current_score_wlcs = None
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            total_hypothesis_ngrams_count,
                            total_reference_ngrams_count,
                            total_ngrams_overlapping_count,
                            self.alpha,
                            self.weight_factor if use_w else 1.0,
                        )

                        if use_w:
                            reference_count_for_score = reference_count ** (
                                1.0 / self.weight_factor
                            )
                            overlapping_ngrams_for_score = overlapping_ngrams
                            score_wlcs = (
                                overlapping_ngrams_for_score / reference_count_for_score
                            ) ** (1.0 / self.weight_factor)

                            if (
                                best_current_score_wlcs is None
                                or score_wlcs > best_current_score_wlcs
                            ):
                                best_current_score = score
                                best_current_score_wlcs = score_wlcs
                        else:
                            if best_current_score is None or score["r"] > best_current_score["r"]:
                                best_current_score = score

                    for stat in Rouge.STATS:
                        scores[metric][stat] += best_current_score[stat]
                # Keep all
                else:
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                            self.alpha,
                            self.weight_factor,
                        )

                        for stat in Rouge.STATS:
                            scores[metric][sample_id][stat].append(score[stat])

        # Compute final score with the average or the the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for stat in Rouge.STATS:
                scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _preprocess_summary_as_a_whole(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            # By words
            if self.length_limit_type == "words":
                summary = " ".join(sentences)
                all_tokens = summary.split()  # Counting as in the perls script
                summary = " ".join(all_tokens[: self.length_limit])

            # By bytes
            elif self.length_limit_type == "bytes":
                summary = ""
                current_len = 0
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)

                    if current_len + sentence_len < self.length_limit:
                        if current_len != 0:
                            summary += " "
                        summary += sentence
                        current_len += sentence_len
                    else:
                        if current_len > 0:
                            summary += " "
                        summary += sentence[: self.length_limit - current_len]
                        break
        else:
            summary = " ".join(sentences)

        summary = Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary.lower()).strip()

        tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary))
        preprocessed_summary = [" ".join(tokens)]

        return preprocessed_summary

    def _preprocess_summary_per_sentence(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            final_sentences = []
            current_len = 0
            # By words
            if self.length_limit_type == "words":
                for sentence in sentences:
                    tokens = sentence.strip().split()
                    tokens_len = len(tokens)
                    if current_len + tokens_len < self.length_limit:
                        sentence = " ".join(tokens)
                        final_sentences.append(sentence)
                        current_len += tokens_len
                    else:
                        sentence = " ".join(tokens[: self.length_limit - current_len])
                        final_sentences.append(sentence)
                        break
            # By bytes
            elif self.length_limit_type == "bytes":
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)
                    if current_len + sentence_len < self.length_limit:
                        final_sentences.append(sentence)
                        current_len += sentence_len
                    else:
                        sentence = sentence[: self.length_limit - current_len]
                        final_sentences.append(sentence)
                        break
            sentences = final_sentences

        final_sentences = []
        for sentence in sentences:
            sentence = Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence.lower()).strip()

            tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence))

            sentence = " ".join(tokens)

            final_sentences.append(sentence)

        return final_sentences

In [77]:
'''
rouge score 계산을 위해 필요한 함수입니다.
데이터셋에 따라 compute rouge 함수를 조금씩 수정해주어야합니다.
아래의 코드는 설명을 진행합니다.
'''
import os
from glob import glob
from tqdm import tqdm

class RougeScorer:
    def __init__(self):

        self.rouge_evaluator = Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=1000,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
        )

    def compute_rouge(self, ref_list, hyp_list):
        '''
        ref_path : 정답 문장
        hyp_path : 예측 문장
        '''

        print("-" * 50)
        print("# of Testset :", len(hyp_list))
        print("-" * 50)

        self.reference_summaries = []
        self.generated_summaries = []

        for ref_doc, hyp_doc in tqdm(zip(ref_list, hyp_list), total=len(ref_list)):

            ref_doc = " ".join(ref_doc)
            hyp_doc = " ".join(hyp_doc)

            self.reference_summaries.append(ref_doc)
            self.generated_summaries.append(hyp_doc)

        scores = self.rouge_evaluator.get_scores(self.generated_summaries, self.reference_summaries) # score를 계산하는 함수

        str_scores = self.format_rouge_scores(scores)
        print(str_scores)
        self.save_rouge_scores(str_scores)
        return str_scores

    def save_rouge_scores(self, str_scores):
        with open("rouge_scores.txt", "w") as output:
            output.write(str_scores)

    def format_rouge_scores(self, scores):
        return """\n
    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}
    ** ROUGE 2
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}
    ** ROUGE L
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}""".format(
            scores["rouge-1"]["f"],
            scores["rouge-1"]["p"],
            scores["rouge-1"]["r"],
            scores["rouge-2"]["f"],
            scores["rouge-2"]["p"],
            scores["rouge-2"]["r"],
            scores["rouge-l"]["f"],
            scores["rouge-l"]["p"],
            scores["rouge-l"]["r"],
        )

In [78]:
rouge_eval = RougeScorer()
result = rouge_eval.compute_rouge(all_label, all_pred)

--------------------------------------------------
# of Testset : 1217
--------------------------------------------------


100%|██████████| 1217/1217 [00:00<00:00, 240708.67it/s]


In [None]:
# SummarRunner의 성능은 아래와 같이 도출됩니다.
# 참고 : TextRank보다 높음

# ****** ROUGE SCORES ******
# ** ROUGE 1
# F1        >> 0.519
# Precision >> 0.576
# Recall    >> 0.498
# ** ROUGE 2
# F1        >> 0.384
# Precision >> 0.421
# Recall    >> 0.370
# ** ROUGE L
# F1        >> 0.424
# Precision >> 0.470
# Recall    >> 0.407