<a href="https://colab.research.google.com/github/Jisang-hwang93/NLP_Class/blob/master/09%20Document%20Summarization%20Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Document Summarization**

## **1. konlpy 다운로드**

In [45]:
# !sudo apt-get install g++ openjdk-7-jdk # Install Java 1.7+
# !sudo apt-get install python-dev; pip install konlpy     # Python 2.x
# !sudo apt-get install python3-dev; pip3 install konlpy   # Python 3.x
# !sudo apt-get install curl
# !bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

## **2. Text Rank : 문서 요약 구현**

In [46]:
# Example documents: each entry is one sentence of whitespace-separated words
docs = [
    "딸기 바나나 사과 파인애플 수박",
    "바나나 사과 딸기 포도",
    "복숭아 수박",
    "파인애플 사과 딸기 바나나",
]

In [47]:
from konlpy.tag import Mecab

import numpy as np
import math
import networkx as nx

# Tagger instance shared by the cells below
mecab = Mecab()

# Tokenization: one list of the items returned by mecab.pos() per sentence
tokens = [list(mecab.pos(sentence)) for sentence in docs]

tokens

[[('딸기', 'NNG'),
  ('바나나', 'NNG'),
  ('사과', 'NNG'),
  ('파인애플', 'NNG'),
  ('수박', 'NNG')],
 [('바나나', 'NNG'), ('사과', 'NNG'), ('딸기', 'NNG'), ('포도', 'NNG')],
 [('복숭아', 'NNG'), ('수박', 'NNG')],
 [('파인애플', 'NNG'), ('사과', 'NNG'), ('딸기', 'NNG'), ('바나나', 'NNG')]]

In [48]:
# Initial similarity matrix: |A ∩ B| / |A ∪ B| over the token sets of every
# sentence pair; the diagonal keeps its initial integer 0.
sentence_count = len(docs)
token_sets = [set(sentence_tokens) for sentence_tokens in tokens]
first_matrix = [[0] * sentence_count for _ in range(sentence_count)]

for i in range(sentence_count - 1):
    for j in range(i + 1, sentence_count):
        shared = token_sets[i] & token_sets[j]
        combined = token_sets[i] | token_sets[j]
        similarity = len(shared) / len(combined)
        # The measure is symmetric, so fill both triangles at once
        first_matrix[i][j] = similarity
        first_matrix[j][i] = similarity

first_matrix

[[0, 0.5, 0.16666666666666666, 0.8],
 [0.5, 0, 0.0, 0.6],
 [0.16666666666666666, 0.0, 0, 0.0],
 [0.8, 0.6, 0.0, 0]]

In [49]:
# Initial score of each sentence: the sum of its row in the similarity matrix
first_score = [sum(row) for row in first_matrix]

first_score

[1.4666666666666668, 1.1, 0.16666666666666666, 1.4]

In [50]:
# Weight matrix: every similarity divided by the initial score of its row
weight_matrix = [
    [similarity / row_total for similarity in row]
    for row, row_total in zip(first_matrix, first_score)
]

weight_matrix

[[0.0, 0.3409090909090909, 0.11363636363636362, 0.5454545454545454],
 [0.45454545454545453, 0.0, 0.0, 0.5454545454545454],
 [1.0, 0.0, 0.0, 0.0],
 [0.5714285714285715, 0.4285714285714286, 0.0, 0.0]]

In [51]:
# Score update function
def cal_score(matrix):
    """Compute one score per column of a square matrix.

    For column ``i`` the score is the sum of ``matrix[j][i] * 0.85`` over all
    rows ``j``, plus a constant ``0.15``.

    The result is sized from ``matrix`` itself rather than from a
    module-level variable, so the function can be called on any square
    matrix without other notebook state.

    Args:
        matrix: Square list of lists of numbers.

    Returns:
        A list with ``len(matrix)`` scores (empty for an empty matrix).
    """
    size = len(matrix)
    score_sum = [0] * size
    for i in range(size):
        for j in range(size):
            score_sum[i] += matrix[j][i] * 0.85  # Damping Factor = 0.85
        score_sum[i] = score_sum[i] + 0.15
    return score_sum

In [52]:
# Matrix update function
def make_new(matrix):
    """Build the next matrix from the current one.

    Each entry ``[i][j]`` of the result is ``weight_matrix[i][j]`` multiplied
    by the score that ``cal_score(matrix)`` yields for index ``i``.

    The scores depend only on ``matrix``, so they are computed once up front
    instead of once per cell. The result takes its shape from the
    module-level ``weight_matrix``.

    Args:
        matrix: Square list of lists with the same size as ``weight_matrix``.

    Returns:
        A new list of lists; ``matrix`` is not modified.
    """
    scores = cal_score(matrix)
    return [
        [weight * scores[i] for weight in weight_row]
        for i, weight_row in enumerate(weight_matrix)
    ]

In [53]:
# Check the iteration: apply the update seven times, keeping every step
history = [first_matrix]
for _ in range(7):
    history.append(make_new(history[-1]))

# Individual names for each step, used by later cells
matrix1, matrix2, matrix3, matrix4, matrix5, matrix6, matrix7 = history[1:]

tuple(np.array(step) for step in history)

(array([[0.        , 0.5       , 0.16666667, 0.8       ],
        [0.5       , 0.        , 0.        , 0.6       ],
        [0.16666667, 0.        , 0.        , 0.        ],
        [0.8       , 0.6       , 0.        , 0.        ]]),
 array([[0.        , 0.47613636, 0.15871212, 0.76181818],
        [0.49318182, 0.        , 0.        , 0.59181818],
        [0.29166667, 0.        , 0.        , 0.        ],
        [0.76571429, 0.57428571, 0.        , 0.        ]]),
 array([[0.        , 0.50044717, 0.16681572, 0.80071547],
        [0.47402671, 0.        , 0.        , 0.56883205],
        [0.2849053 , 0.        , 0.        , 0.        ],
        [0.74319481, 0.5573961 , 0.        , 0.        ]]),
 array([[0.        , 0.48641175, 0.16213725, 0.7782588 ],
        [0.47689399, 0.        , 0.        , 0.57227279],
        [0.29179336, 0.        , 0.        , 0.        ],
        [0.75092308, 0.56319231, 0.        , 0.        ]]),
 array([[0.        , 0.49147802, 0.16382601, 0.78636484],
      

In [54]:
# Show the scores computed from the sixth matrix, one row per sentence
import pandas as pd

sentence_labels = ["문장1", "문장2", "문장3", "문장4"]
sixth_scores = cal_score(matrix6)
pd.DataFrame({"score": sixth_scores}, index=sentence_labels)

Unnamed: 0,score
문장1,1.424424
문장2,1.035938
문장3,0.288136
문장4,1.294246


## **3. Class로 구현**

In [55]:
from konlpy.tag import Mecab

import numpy as np
import pandas as pd
import networkx as nx
import math

# Module-level tagger; summarization.make_token() calls mecab.pos() on each sentence
mecab = Mecab()

class summarization():
    """Iterative sentence scorer built on pairwise token overlap.

    The pipeline is: tokenize every sentence, build a symmetric similarity
    matrix (shared tokens divided by combined tokens), normalise each row by
    its sum to get a weight matrix, then repeatedly rescale the weight matrix
    by the current per-sentence scores.

    Args:
        docs: List of sentences (strings).
        tokenizer: Optional object exposing ``pos(text)`` that returns a
            sequence of hashable tokens. When omitted, the module-level
            ``mecab`` tagger is used.
    """

    # Constants of the score update: score = DAMPING * column_sum + BASE_SCORE
    DAMPING = 0.85
    BASE_SCORE = 0.15

    def __init__(self, docs, tokenizer=None):
        self.tokens = []
        self.first_matrix = []
        self.first_score = []
        self.weight_matrix = []
        self.docs = docs
        self.score_sum = 0
        self.tokenizer = tokenizer

    # Tokenization
    def make_token(self):
        """Tokenize every sentence and store the result in ``self.tokens``.

        The token list is rebuilt from scratch on each call, so calling
        ``run`` more than once does not accumulate duplicate entries.
        """
        tagger = self.tokenizer if self.tokenizer is not None else mecab
        self.tokens = [list(tagger.pos(line)) for line in self.docs]
        return self.tokens

    # Initial similarity matrix
    def make_first(self):
        """Build the symmetric similarity matrix of all sentence pairs.

        Entry ``[i][j]`` is the number of tokens shared by sentences ``i`` and
        ``j`` divided by the number of distinct tokens in either. The diagonal
        stays 0. Two sentences that both have no tokens get similarity 0.0
        instead of raising ZeroDivisionError.
        """
        count = len(self.docs)
        token_sets = [set(sentence_tokens) for sentence_tokens in self.tokens]
        self.first_matrix = [[0] * count for _ in range(count)]
        for i in range(count - 1):
            for j in range(i + 1, count):
                combined = token_sets[i] | token_sets[j]
                shared = token_sets[i] & token_sets[j]
                similarity = len(shared) / len(combined) if combined else 0.0
                self.first_matrix[i][j] = similarity
                self.first_matrix[j][i] = similarity
        return self.first_matrix

    # Initial score calculation
    def First_Score(self):
        """Return the row sums of the similarity matrix (one per sentence).

        Uses only instance state; no module-level ``docs`` is required.
        """
        self.first_score = [sum(row) for row in self.first_matrix]
        return self.first_score

    # Weight matrix
    def make_weight(self):
        """Divide each similarity row by its initial score.

        A sentence that shares no token with any other sentence has an initial
        score of 0; its whole weight row is set to 0.0 rather than dividing by
        zero.
        """
        self.weight_matrix = [
            [value / total if total else 0.0 for value in row]
            for row, total in zip(self.first_matrix, self.first_score)
        ]
        return self.weight_matrix

    # Score calculation
    def cal_score(self, matrix):
        """Compute per-sentence scores from the columns of ``matrix``.

        Score ``i`` is ``DAMPING`` times each entry of column ``i``, summed,
        plus ``BASE_SCORE``. The result is also stored in ``self.score_sum``.
        """
        size = len(matrix)
        self.score_sum = [
            sum(matrix[j][i] * self.DAMPING for j in range(size)) + self.BASE_SCORE
            for i in range(size)
        ]
        return self.score_sum

    # Apply the scores to a new matrix
    def make_new(self, matrix):
        """Return the weight matrix with row ``i`` scaled by score ``i``.

        The scores depend only on ``matrix``, so they are computed once per
        call rather than once per cell.
        """
        scores = self.cal_score(matrix)
        return [
            [weight * scores[i] for weight in weight_row]
            for i, weight_row in enumerate(self.weight_matrix)
        ]

    # Run the whole pipeline
    def run(self, num):
        """Run the full pipeline and return the final scores.

        Prints the matrix after every iteration, followed by a separator line.

        Args:
            num: Number of update iterations.

        Returns:
            A DataFrame with one ``score`` column and one row per sentence,
            labelled "문장1", "문장2", ... for however many sentences
            ``docs`` contains.
        """
        self.make_token()
        matrix = self.make_first()  # initial matrix
        score = self.First_Score()  # initial score
        self.make_weight()
        # Iterate the update
        for _ in range(num):
            matrix = self.make_new(matrix)
            score = self.cal_score(matrix)
            print(np.array(matrix))
            print("=" * 50)

        # One label per sentence instead of a fixed list of four
        labels = ["문장{}".format(k) for k in range(1, len(self.docs) + 1)]
        return pd.DataFrame(score, index=labels, columns=["score"])

In [56]:
# Example documents: each entry is one sentence of whitespace-separated words
docs = [
    "딸기 바나나 사과 파인애플 수박",
    "바나나 사과 딸기 포도",
    "복숭아 수박",
    "파인애플 사과 딸기 바나나",
]

In [57]:
# Build the scorer for the example sentences; nothing is computed until run() is called
ds = summarization(docs)

In [58]:
# Run 20 update iterations; prints each intermediate matrix and displays the returned score table
ds.run(20)

[[0.         0.47613636 0.15871212 0.76181818]
 [0.49318182 0.         0.         0.59181818]
 [0.29166667 0.         0.         0.        ]
 [0.76571429 0.57428571 0.         0.        ]]
[[0.         0.50044717 0.16681572 0.80071547]
 [0.47402671 0.         0.         0.56883205]
 [0.2849053  0.         0.         0.        ]
 [0.74319481 0.5573961  0.         0.        ]]
[[0.         0.48641175 0.16213725 0.7782588 ]
 [0.47689399 0.         0.         0.57227279]
 [0.29179336 0.         0.         0.        ]
 [0.75092308 0.56319231 0.         0.        ]]
[[0.         0.49147802 0.16382601 0.78636484]
 [0.47371066 0.         0.         0.56845279]
 [0.28781666 0.         0.         0.        ]
 [0.74168677 0.55626508 0.         0.        ]]
[[0.         0.48672681 0.16224227 0.7787629 ]
 [0.47299165 0.         0.         0.56758998]
 [0.28925211 0.         0.         0.        ]
 [0.74376856 0.55782642 0.         0.        ]]
[[0.         0.48753766 0.16251255 0.78006026]
 [0.4717

Unnamed: 0,score
문장1,1.41188
문장2,1.025726
문장3,0.286401
문장4,1.280386
