## [LG 전자] 자연어 처리 # 4 : 문서요약 - (1)

* TextRank를 이용한 추출 요약
* 예상 난이도 ⭐️⭐️

## 강의 복습

강의자료 : 자연어처리 4, AGENDA 02 - 그래프 기반의 추출요약

## 실습 요약

1. 본 실습에서는 TextRank를 활용하여 추출요약 모델을 구축합니다.
2. TextRank는 별도의 학습 과정을 진행하지 않는 비지도학습 기반의 모델입니다.



------

### STEP 0. 환경 구축하기
* 필요한 library들을 import 합니다

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
!pip install gensim==3.8.3

In [4]:
import sys
import json
import random
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False
#%matplotlib inline #생성한 figure를 notebook에서 볼 수있게 해주는 코드

import gensim
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#check torch version & device
print ("Python version:[%s]."%(sys.version))
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device)) # device에 cuda:0가 프린트 된다면 GPU를 사용하는 상태입니다

Python version:[3.8.0 (default, Nov  6 2019, 15:49:01) 
[Clang 4.0.1 (tags/RELEASE_401/final)]].
PyTorch version:[1.9.0].
device:[cpu].


In [None]:
# konlpy, Mecab 형태소 분석기 설치 스크립트 실행
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [5]:
# set random seed 

def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
random_seed = 42
set_seed(random_seed)

### STEP 1. 데이터 준비하기
금일 실습에서는 **AIHUB**에서 제공하는 **한글 뉴스기사 요약 데이터**를 활용합니다.
* 데이터셋 출처
  * https://aihub.or.kr/aidata/8054


In [5]:
# github에서 데이터 불러오기
!git clone https://github.com/KU-DIC/LG_natural_language_processing_day23

In [6]:
# 데이터셋 읽기
with open('./LG_natural_language_processing_day23/data/sum_data.json','r',encoding='utf-8') as f:
  data = json.load(f)

In [28]:
# 분석에 사용할 형태로 가공하기
label = [] # extractive
document = [] # text

for cur_news in tqdm(data['documents']):
  # 리뷰 문장
  document.append(cur_news['text'])
  label.append(cur_news['extractive'])

100%|██████████| 12000/12000 [00:00<00:00, 459381.26it/s]


In [29]:
# 데이터 프레임 형태로 변환하기
df = {
    "label" : label,
    "document" : document
}
df = pd.DataFrame(df)

### STEP 2. 전처리 진행 (Preprocessing)

In [9]:
news_sentences = df['document'].to_list()

In [10]:
import re
def preprocess(text):
  text = re.sub('[-=+,#/\?:^$~@*\"※~▲△&%ㆍ·!』\\‘|\(\)\[\]\<\>`\'…》]','', text)
  text = re.sub('[ㅠㅎㅋ]','', text)
  return text

In [11]:
fixed_document = []
for cur_sentences in tqdm(news_sentences):
  fix_sent = []
  for sent in cur_sentences:
    fix_sent.extend(sent)
  fixed_document.append(fix_sent)

normalize_document = []
for cur_sentences in tqdm(fixed_document):
  norm_sent = []
  for sent in cur_sentences:
    sentence = preprocess(sent['sentence'])
    norm_sent.append(sentence)
  normalize_document.append(norm_sent)

df['normalize_document'] = normalize_document

100%|██████████| 12199/12199 [00:00<00:00, 256394.36it/s]
100%|██████████| 12199/12199 [00:00<00:00, 26911.00it/s]


In [12]:
label_sentence = []
for id, cur_doc in enumerate(df['normalize_document']):
  sent = []
  for idx in df['label'].iloc[id]:
    sent.append(cur_doc[idx])
  label_sentence.append(sent)

df['label_sentence'] = label_sentence

### STEP 3. 토큰화 진행 (Tokenization)

* 문서 요약 실습에서는 단순 띄어쓰기 단위로 토큰화를 진행합니다
* 이유 : 문서요약 데이터는 하나의 데이터당 굉장히 많은 문장을 가지므로, 토큰화 과정에만 약 30분의 시간이 소요됨

In [14]:
# Okt(Open Korea Text)
# from konlpy.tag import Okt  
# okt = Okt() 

# tokenized_document = []
# for cur_doc in tqdm(normalize_document):
#   tokenized_sentence = []
#   for cur_sent in cur_doc:
#     sent = okt.morphs(cur_sent)
#     tokenized_sentence.append(sent)
#   tokenized_document.append(tokenized_sentence)

# df['tokenized_document'] = tokenized_document

100%|██████████| 12199/12199 [28:14<00:00,  7.20it/s]


In [None]:
tokenized_document = []
for cur_doc in tqdm(normalize_document):
  tokenized_sentence = []
  for cur_sent in cur_doc:
    sent = cur_sent.split(' ')
    tokenized_sentence.append(sent)
  tokenized_document.append(tokenized_sentence)

In [15]:
df['tokenized_document'] = tokenized_document

In [16]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size =0.1, random_state= 42)

drop_list = [train_df.iloc[i].name for i in[845, 3035, 3936, 5246, 5417, 8988, 9710]]
train_df = train_df.drop(drop_list)

drop_list = [test_df.iloc[i].name for i in [408,583,1000]]
test_df = test_df.drop(drop_list)
len(train_df) , len(test_df)

(10972, 1217)

In [17]:
aggregate_document =[]
for cur_doc in tqdm(train_df['tokenized_document']):
    _aggregate_document = [i for i in cur_doc]
    aggregate_document.append(_aggregate_document[0])

train_df['aggregate_document'] = aggregate_document

100%|██████████| 10972/10972 [00:00<00:00, 861812.08it/s]


### STEP 4. 벡터화 진행 (Vectorization)
* 해당 실습에서는 가장 성능이 좋았던 FastText를 활용하여 벡터화를 수행함
* 사전학습 모델 설명
* 실습에서 사용하지 않는 이유
  * 다운로드, 용량 (4GB)
  

In [20]:
%%time
from gensim.models import FastText

model = FastText(
    sentences = aggregate_document,
    size=100,
    window=5,
    min_count=1,
    workers=4)

CPU times: user 9.29 s, sys: 629 ms, total: 9.92 s
Wall time: 7.88 s


In [95]:
# vocab

In [21]:
vocab = list(model.wv.vocab)
train_sentence = train_df['tokenized_document'].to_list()
test_sentence = test_df['tokenized_document'].to_list()

In [22]:
word2index = {'<PAD>':0, '<UNK>':1}

for v in vocab: # v는 vocab 객체 하나를 의미함
  if word2index.get(v) is None:
    word2index[v] = len(word2index) # 단어별 index 부여

index2word = {}
for idx, vo in word2index.items():
  index2word[vo] = idx

# 한줄로 구현하기
# index2word = {v:idx for idx, v in word2index.items()}

In [23]:
fasttext_vector = []
for key in word2index.keys():
  if key in '<PAD>' or '<UNK>': # 두가지 단어는 vocab에 속하지 않음 
    fasttext_vector.append(np.random.randn(100,)) # random한 값으로 초기화 하여 제공
  else:
    fasttext_vector.append(model.wv[key])
  
fasttext_matrix = np.vstack(fasttext_vector)

In [24]:
print('vocab 개수 : ',len(vocab))
print('word matrix shape : ',fasttext_matrix.shape) # '<PAD>''<UNK>'를 추가해주었기 때문에 vocab보다 2개 많은 상태

vocab 개수 :  22667
word matrix shape :  (22669, 100)


### STEP 5. 모델 구축하기 (Modeling)
* TextRank를 활용하여 문서 요약 모델 구축하기


In [25]:
import re
import itertools
import networkx as nx
import numpy as np
from numpy import dot
from numpy.linalg import norm

class TextRank:
    def __init__(self, df, word2index, word_embedding, min_sim, top_k):
        self.df = df
        self.word2index = word2index
        self.word_embedding = word_embedding
        self.min_sim = min_sim
        self.top_k = top_k

    
    def make_sentence_graph(self, sentence):
        '''
        sentence graph를 생성하는 함수
        '''
        sentence_graph = nx.Graph()  # initialize an undirected graph
        sentence_graph.add_nodes_from(sentence)

        nodePairs = list(itertools.combinations(sentence, 2))

        # add edges to the graph (weighted by Levenshtein distance)
        for pair in nodePairs:
            node1 = pair[0]
            node2 = pair[1]

            cos_sim = dot(sentence[pair[0]][1], sentence[pair[1]][1]) / (
                norm(sentence[pair[0]][1]) * norm(sentence[pair[1]][1])
            )
            if cos_sim > self.min_sim: # 각 노드 pair간 유사도가 최소 유사도를 넘는 경우 graph에 추가함
                sentence_graph.add_edge(node1, node2, weight=cos_sim)

        return sentence_graph

    def extract_sentence(self, sentence_graph, sentence):
        '''
        하나의 document에서 중요한 sentence를 추출하는 함수
        '''
        calculated_page_rank = nx.pagerank(
            sentence_graph, alpha=0.85, max_iter=100, weight="weight"
        )

        sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

        modified_sentence = sentences[: -len(sentences) + self.top_k]
        result_sentence = [(sentence[sent][0], sent) for sent in modified_sentence]

        return result_sentence


    def sentence_summary(self):
        """
        sentence 요약을 진행하는 main 함수 
        """
        sentence_sum_result = []
        for doc_idx , (cur_data, cur_doc)  in enumerate(tqdm(zip(self.df['tokenized_document'], self.df['normalize_document']))):
            # cur_data == one article
            sentence = {}
            # article 자체로 그래프 생성해야함 - 문장단
            for idx, (tokenized_sent, sent) in enumerate(zip(cur_data, cur_doc)):
                sentence_vector = []
                for tok in tokenized_sent:
                    try:
                        sentence_vector.append(self.word_embedding[word2index[tok]])
                    except:
                        sentence_vector.append(self.word_embedding[word2index['<UNK>']])
                sentence[sent] = [idx, np.mean(sentence_vector, axis=0)]
                sentence_graph = self.make_sentence_graph(sentence)

            extracted_sentence = self.extract_sentence(sentence_graph, sentence)
            sentence_sum_result.append(extracted_sentence)

        return sentence_sum_result

In [26]:
MIN_SIM = 0.5
TOP_K = 3

model = TextRank(
    df=test_df, # 학습을 필요로 하지 않는 비지도 학습 방법론이므로, 다른 방법론과의 비교를 위해 test_df로 진행
    word2index=word2index, 
    word_embedding=fasttext_matrix, 
    min_sim=MIN_SIM, 
    top_k=TOP_K)

In [27]:
%%time
# [time] train_df : 1m24s / test_df :9s
output = model.sentence_summary()

1217it [00:09, 127.98it/s]

CPU times: user 9.52 s, sys: 32 ms, total: 9.55 s
Wall time: 9.51 s





In [29]:
all_pred = []
for cur_output in output:
    pred = []
    for index, sentence in cur_output:
        pred.append(sentence)
    # pred.sort() # index에서만
    all_pred.append(pred)

In [34]:
all_label = test_df['label_sentence'].to_list()
print(len(all_label),len(all_pred))

1217 1217


In [35]:
'''
rouge score 계산을 위한 코드이며, 
해당 실습에서는 score 계산에 필요한 전체적인 흐름을 따라갈 목적으로서 해당 코드를 사용합니다.
즉, rouge score의 계산원리를 모두 익히지 않아도 됩니다.
'''

import os
import re
import platform
import itertools
import collections
import pkg_resources  # pip install py-rouge
from io import open

if platform.system() == "Windows":
    try:
        from konlpy.tag import Mecab
        # from eunjeon import Mecab
    except:
        print("please install eunjeon module")
else:  # Ubuntu일 경우
    from konlpy.tag import Mecab


class Rouge:
    DEFAULT_METRICS = {"rouge-n"}
    DEFAULT_N = 1
    STATS = ["f", "p", "r"]
    AVAILABLE_METRICS = {"rouge-n", "rouge-l", "rouge-w"}
    AVAILABLE_LENGTH_LIMIT_TYPES = {"words", "bytes"}
    REMOVE_CHAR_PATTERN = re.compile("[^A-Za-z0-9가-힣]")

    def __init__(
        self,
        metrics=None,
        max_n=None,
        limit_length=True,
        length_limit=1000,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        use_tokenizer=True,
        alpha=0.5,
        weight_factor=1.0,
    ):
        self.metrics = metrics[:] if metrics is not None else Rouge.DEFAULT_METRICS
        for m in self.metrics:
            if m not in Rouge.AVAILABLE_METRICS:
                raise ValueError("Unknown metric '{}'".format(m))

        self.max_n = max_n if "rouge-n" in self.metrics else None
        # Add all rouge-n metrics
        if self.max_n is not None:
            index_rouge_n = self.metrics.index("rouge-n")
            del self.metrics[index_rouge_n]
            self.metrics += ["rouge-{}".format(n) for n in range(1, self.max_n + 1)]
        self.metrics = set(self.metrics)

        self.limit_length = limit_length
        if self.limit_length:
            if length_limit_type not in Rouge.AVAILABLE_LENGTH_LIMIT_TYPES:
                raise ValueError("Unknown length_limit_type '{}'".format(length_limit_type))

        self.length_limit = length_limit
        if self.length_limit == 0:
            self.limit_length = False
        self.length_limit_type = length_limit_type

        self.use_tokenizer = use_tokenizer
        if use_tokenizer:
            self.tokenizer = Mecab()

        self.apply_avg = apply_avg
        self.apply_best = apply_best
        self.alpha = alpha
        self.weight_factor = weight_factor
        if self.weight_factor <= 0:
            raise ValueError("ROUGE-W weight factor must greater than 0.")

    def tokenize_text(self, text):
        return self.tokenizer.morphs(text)

    @staticmethod
    def split_into_sentences(text):
        return text.split("\n")

    @staticmethod
    def _get_ngrams(n, text):
        ngram_set = collections.defaultdict(int)
        max_index_ngram_start = len(text) - n
        for i in range(max_index_ngram_start + 1):
            ngram_set[tuple(text[i : i + n])] += 1
        return ngram_set

    @staticmethod
    def _split_into_words(sentences):
        return list(itertools.chain(*[_.split() for _ in sentences]))

    @staticmethod
    def _get_word_ngrams_and_length(n, sentences):
        assert len(sentences) > 0
        assert n > 0

        tokens = Rouge._split_into_words(sentences)
        return Rouge._get_ngrams(n, tokens), tokens, len(tokens) - (n - 1)

    @staticmethod
    def _get_unigrams(sentences):
        assert len(sentences) > 0

        tokens = Rouge._split_into_words(sentences)
        unigram_set = collections.defaultdict(int)
        for token in tokens:
            unigram_set[token] += 1
        return unigram_set, len(tokens)

    @staticmethod
    def _compute_p_r_f_score(
        evaluated_count,
        reference_count,
        overlapping_count,
        alpha=0.5,
        weight_factor=1.0,
    ):
        precision = 0.0 if evaluated_count == 0 else overlapping_count / float(evaluated_count)
        if weight_factor != 1.0:
            precision = precision ** (1.0 / weight_factor)
        recall = 0.0 if reference_count == 0 else overlapping_count / float(reference_count)
        if weight_factor != 1.0:
            recall = recall ** (1.0 / weight_factor)
        f1_score = Rouge._compute_f_score(precision, recall, alpha)
        return {"f": f1_score, "p": precision, "r": recall}

    @staticmethod
    def _compute_f_score(precision, recall, alpha=0.5):
        return (
            0.0
            if (recall == 0.0 or precision == 0.0)
            else precision * recall / ((1 - alpha) * precision + alpha * recall)
        )

    @staticmethod
    def _compute_ngrams(evaluated_sentences, reference_sentences, n):
        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_ngrams, _, evaluated_count = Rouge._get_word_ngrams_and_length(
            n, evaluated_sentences
        )
        reference_ngrams, _, reference_count = Rouge._get_word_ngrams_and_length(
            n, reference_sentences
        )

        # Gets the overlapping ngrams between evaluated and reference
        overlapping_ngrams = set(evaluated_ngrams.keys()).intersection(set(reference_ngrams.keys()))
        overlapping_count = 0
        for ngram in overlapping_ngrams:
            overlapping_count += min(evaluated_ngrams[ngram], reference_ngrams[ngram])

        return evaluated_count, reference_count, overlapping_count

    @staticmethod
    def _compute_ngrams_lcs(evaluated_sentences, reference_sentences, weight_factor=1.0):
        def _lcs(x, y):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(int)
            dirs = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        vals[i, j] = vals[i - 1, j - 1] + 1
                        dirs[i, j] = "|"
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"

            return vals, dirs

        def _wlcs(x, y, weight_factor):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(float)
            dirs = collections.defaultdict(int)
            lengths = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        length_tmp = lengths[i - 1, j - 1]
                        vals[i, j] = (
                            vals[i - 1, j - 1]
                            + (length_tmp + 1) ** weight_factor
                            - length_tmp ** weight_factor
                        )
                        dirs[i, j] = "|"
                        lengths[i, j] = length_tmp + 1
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                        lengths[i, j] = 0
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"
                        lengths[i, j] = 0

            return vals, dirs

        def _mark_lcs(mask, dirs, m, n):
            while m != 0 and n != 0:
                if dirs[m, n] == "|":
                    m -= 1
                    n -= 1
                    mask[m] = 1
                elif dirs[m, n] == "^":
                    m -= 1
                elif dirs[m, n] == "<":
                    n -= 1
                else:
                    raise UnboundLocalError("Illegal move")

            return mask

        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_unigrams_dict, evaluated_count = Rouge._get_unigrams(evaluated_sentences)
        reference_unigrams_dict, reference_count = Rouge._get_unigrams(reference_sentences)

        # Has to use weight factor for WLCS
        use_WLCS = weight_factor != 1.0
        if use_WLCS:
            evaluated_count = evaluated_count ** weight_factor
            reference_count = 0

        overlapping_count = 0.0
        for reference_sentence in reference_sentences:
            reference_sentence_tokens = reference_sentence.split()
            if use_WLCS:
                reference_count += len(reference_sentence_tokens) ** weight_factor
            hit_mask = [0 for _ in range(len(reference_sentence_tokens))]

            for evaluated_sentence in evaluated_sentences:
                evaluated_sentence_tokens = evaluated_sentence.split()

                if use_WLCS:
                    _, lcs_dirs = _wlcs(
                        reference_sentence_tokens,
                        evaluated_sentence_tokens,
                        weight_factor,
                    )
                else:
                    _, lcs_dirs = _lcs(reference_sentence_tokens, evaluated_sentence_tokens)
                _mark_lcs(
                    hit_mask,
                    lcs_dirs,
                    len(reference_sentence_tokens),
                    len(evaluated_sentence_tokens),
                )

            overlapping_count_length = 0
            for ref_token_id, val in enumerate(hit_mask):
                if val == 1:
                    token = reference_sentence_tokens[ref_token_id]
                    if evaluated_unigrams_dict[token] > 0 and reference_unigrams_dict[token] > 0:
                        evaluated_unigrams_dict[token] -= 1
                        reference_unigrams_dict[ref_token_id] -= 1

                        if use_WLCS:
                            overlapping_count_length += 1
                            if (
                                ref_token_id + 1 < len(hit_mask) and hit_mask[ref_token_id + 1] == 0
                            ) or ref_token_id + 1 == len(hit_mask):
                                overlapping_count += overlapping_count_length ** weight_factor
                                overlapping_count_length = 0
                        else:
                            overlapping_count += 1

        if use_WLCS:
            reference_count = reference_count ** weight_factor

        return evaluated_count, reference_count, overlapping_count

    def get_scores(self, hypothesis, references):
        if isinstance(hypothesis, str):
            hypothesis, references = [hypothesis], [references]

        if type(hypothesis) != type(references):
            raise ValueError("'hyps' and 'refs' are not of the same type")

        if len(hypothesis) != len(references):
            raise ValueError("'hyps' and 'refs' do not have the same length")
        scores = {}
        has_rouge_n_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].isdigit()]) > 0
        )
        if has_rouge_n_metric:
            scores.update(self._get_scores_rouge_n(hypothesis, references))
            # scores = {**scores, **self._get_scores_rouge_n(hypothesis, references)}

        has_rouge_l_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].lower() == "l"]) > 0
        )
        if has_rouge_l_metric:
            scores.update(self._get_scores_rouge_l_or_w(hypothesis, references, False))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, False)}

        has_rouge_w_metric = (
            len([metric for metric in self.metrics if metric.split("-")[-1].lower() == "w"]) > 0
        )
        if has_rouge_w_metric:
            scores.update(self._get_scores_rouge_l_or_w(hypothesis, references, True))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, True)}

        return scores

    def _get_scores_rouge_n(self, all_hypothesis, all_references):
        metrics = [metric for metric in self.metrics if metric.split("-")[-1].isdigit()]

        if self.apply_avg or self.apply_best:
            scores = {metric: {stat: 0.0 for stat in Rouge.STATS} for metric in metrics}
        else:
            scores = {
                metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))]
                for metric in metrics
            }

        for sample_id, (hypothesis, references) in enumerate(zip(all_hypothesis, all_references)):
            assert isinstance(hypothesis, str)
            has_multiple_references = False
            if isinstance(references, list):
                has_multiple_references = len(references) > 1
                if not has_multiple_references:
                    references = references[0]

            # Prepare hypothesis and reference(s)
            hypothesis = self._preprocess_summary_as_a_whole(hypothesis)
            references = (
                [self._preprocess_summary_as_a_whole(reference) for reference in references]
                if has_multiple_references
                else [self._preprocess_summary_as_a_whole(references)]
            )

            # Compute scores
            for metric in metrics:
                suffix = metric.split("-")[-1]
                n = int(suffix)

                # Aggregate
                if self.apply_avg:
                    # average model
                    total_hypothesis_ngrams_count = 0
                    total_reference_ngrams_count = 0
                    total_ngrams_overlapping_count = 0

                    for reference in references:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams(hypothesis, reference, n)
                        total_hypothesis_ngrams_count += hypothesis_count
                        total_reference_ngrams_count += reference_count
                        total_ngrams_overlapping_count += overlapping_ngrams

                    score = Rouge._compute_p_r_f_score(
                        total_hypothesis_ngrams_count,
                        total_reference_ngrams_count,
                        total_ngrams_overlapping_count,
                        self.alpha,
                    )

                    for stat in Rouge.STATS:
                        scores[metric][stat] += score[stat]
                else:
                    # Best model
                    if self.apply_best:
                        best_current_score = None
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            if best_current_score is None or score["r"] > best_current_score["r"]:
                                best_current_score = score

                        for stat in Rouge.STATS:
                            scores[metric][stat] += best_current_score[stat]
                    # Keep all
                    else:
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            for stat in Rouge.STATS:
                                scores[metric][sample_id][stat].append(score[stat])

        # Compute final score with the average or the the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for metric in metrics:
                for stat in Rouge.STATS:
                    scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _get_scores_rouge_l_or_w(self, all_hypothesis, all_references, use_w=False):
        metric = "rouge-w" if use_w else "rouge-l"
        if self.apply_avg or self.apply_best:
            scores = {metric: {stat: 0.0 for stat in Rouge.STATS}}
        else:
            scores = {
                metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))]
            }

        for sample_id, (hypothesis_sentences, references_sentences) in enumerate(
            zip(all_hypothesis, all_references)
        ):
            assert isinstance(hypothesis_sentences, str)
            has_multiple_references = False
            if isinstance(references_sentences, list):
                has_multiple_references = len(references_sentences) > 1
                if not has_multiple_references:
                    references_sentences = references_sentences[0]

            # Prepare hypothesis and reference(s)
            hypothesis_sentences = self._preprocess_summary_per_sentence(hypothesis_sentences)
            references_sentences = (
                [
                    self._preprocess_summary_per_sentence(reference)
                    for reference in references_sentences
                ]
                if has_multiple_references
                else [self._preprocess_summary_per_sentence(references_sentences)]
            )

            # Compute scores
            # Aggregate
            if self.apply_avg:
                # average model
                total_hypothesis_ngrams_count = 0
                total_reference_ngrams_count = 0
                total_ngrams_overlapping_count = 0

                for reference_sentences in references_sentences:
                    (
                        hypothesis_count,
                        reference_count,
                        overlapping_ngrams,
                    ) = Rouge._compute_ngrams_lcs(
                        hypothesis_sentences,
                        reference_sentences,
                        self.weight_factor if use_w else 1.0,
                    )
                    total_hypothesis_ngrams_count += hypothesis_count
                    total_reference_ngrams_count += reference_count
                    total_ngrams_overlapping_count += overlapping_ngrams

                score = Rouge._compute_p_r_f_score(
                    total_hypothesis_ngrams_count,
                    total_reference_ngrams_count,
                    total_ngrams_overlapping_count,
                    self.alpha,
                    self.weight_factor if use_w else 1.0,
                )
                for stat in Rouge.STATS:
                    scores[metric][stat] += score[stat]
            else:
                # Best model
                if self.apply_best:
                    best_current_score = None
                    best_current_score_wlcs = None
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            total_hypothesis_ngrams_count,
                            total_reference_ngrams_count,
                            total_ngrams_overlapping_count,
                            self.alpha,
                            self.weight_factor if use_w else 1.0,
                        )

                        if use_w:
                            reference_count_for_score = reference_count ** (
                                1.0 / self.weight_factor
                            )
                            overlapping_ngrams_for_score = overlapping_ngrams
                            score_wlcs = (
                                overlapping_ngrams_for_score / reference_count_for_score
                            ) ** (1.0 / self.weight_factor)

                            if (
                                best_current_score_wlcs is None
                                or score_wlcs > best_current_score_wlcs
                            ):
                                best_current_score = score
                                best_current_score_wlcs = score_wlcs
                        else:
                            if best_current_score is None or score["r"] > best_current_score["r"]:
                                best_current_score = score

                    for stat in Rouge.STATS:
                        scores[metric][stat] += best_current_score[stat]
                # Keep all
                else:
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                            self.alpha,
                            self.weight_factor,
                        )

                        for stat in Rouge.STATS:
                            scores[metric][sample_id][stat].append(score[stat])

        # Compute final score with the average or the the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for stat in Rouge.STATS:
                scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _preprocess_summary_as_a_whole(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            # By words
            if self.length_limit_type == "words":
                summary = " ".join(sentences)
                all_tokens = summary.split()  # Counting as in the perls script
                summary = " ".join(all_tokens[: self.length_limit])

            # By bytes
            elif self.length_limit_type == "bytes":
                summary = ""
                current_len = 0
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)

                    if current_len + sentence_len < self.length_limit:
                        if current_len != 0:
                            summary += " "
                        summary += sentence
                        current_len += sentence_len
                    else:
                        if current_len > 0:
                            summary += " "
                        summary += sentence[: self.length_limit - current_len]
                        break
        else:
            summary = " ".join(sentences)

        summary = Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary.lower()).strip()

        tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary))
        preprocessed_summary = [" ".join(tokens)]

        return preprocessed_summary

    def _preprocess_summary_per_sentence(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            final_sentences = []
            current_len = 0
            # By words
            if self.length_limit_type == "words":
                for sentence in sentences:
                    tokens = sentence.strip().split()
                    tokens_len = len(tokens)
                    if current_len + tokens_len < self.length_limit:
                        sentence = " ".join(tokens)
                        final_sentences.append(sentence)
                        current_len += tokens_len
                    else:
                        sentence = " ".join(tokens[: self.length_limit - current_len])
                        final_sentences.append(sentence)
                        break
            # By bytes
            elif self.length_limit_type == "bytes":
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)
                    if current_len + sentence_len < self.length_limit:
                        final_sentences.append(sentence)
                        current_len += sentence_len
                    else:
                        sentence = sentence[: self.length_limit - current_len]
                        final_sentences.append(sentence)
                        break
            sentences = final_sentences

        final_sentences = []
        for sentence in sentences:
            sentence = Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence.lower()).strip()

            tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence))

            sentence = " ".join(tokens)

            final_sentences.append(sentence)

        return final_sentences

In [41]:
'''
rouge score 계산을 위해 필요한 함수입니다.
데이터셋에 따라 compute rouge 함수를 조금씩 수정해주어야합니다.
아래의 코드는 설명을 진행합니다.
'''
import os
from glob import glob
from tqdm import tqdm

class RougeScorer:
    def __init__(self):

        self.rouge_evaluator = Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=1000,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
        )

    def compute_rouge(self, ref_list, hyp_list):
        '''
        ref_path : 정답 문장
        hyp_path : 예측 문장
        '''

        print("-" * 50)
        print("# of Testset :", len(hyp_list))
        print("-" * 50)

        self.reference_summaries = []
        self.generated_summaries = []

        for ref_doc, hyp_doc in tqdm(zip(ref_list, hyp_list), total=len(ref_list)):

            ref_doc = " ".join(ref_doc)
            hyp_doc = " ".join(hyp_doc)

            self.reference_summaries.append(ref_doc)
            self.generated_summaries.append(hyp_doc)

        scores = self.rouge_evaluator.get_scores(self.generated_summaries, self.reference_summaries) # score를 계산하는 함수

        str_scores = self.format_rouge_scores(scores)
        print(str_scores)
        self.save_rouge_scores(str_scores)
        return str_scores

    def save_rouge_scores(self, str_scores):
        with open("rouge_scores.txt", "w") as output:
            output.write(str_scores)

    def format_rouge_scores(self, scores):
        return """\n
    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}
    ** ROUGE 2
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}
    ** ROUGE L
    F1        >> {:.3f}
    Precision >> {:.3f}
    Recall    >> {:.3f}""".format(
            scores["rouge-1"]["f"],
            scores["rouge-1"]["p"],
            scores["rouge-1"]["r"],
            scores["rouge-2"]["f"],
            scores["rouge-2"]["p"],
            scores["rouge-2"]["r"],
            scores["rouge-l"]["f"],
            scores["rouge-l"]["p"],
            scores["rouge-l"]["r"],
        )

In [None]:
rouge_eval = RougeScorer()
result = rouge_eval.compute_rouge(all_label, all_pred)

In [None]:
# TextRank의 성능은 아래와 같이 도출됩니다.

# ****** ROUGE SCORES ******
# ** ROUGE 1
# F1        >> 0.449
# Precision >> 0.441
# Recall    >> 0.483
# ** ROUGE 2
# F1        >> 0.274
# Precision >> 0.267
# Recall    >> 0.293
# ** ROUGE L
# F1        >> 0.313
# Precision >> 0.308
# Recall    >> 0.334