In [1]:
import json
import pandas as pd
import re
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.preprocessing import normalize
import math
from math import log
from google.colab import drive

In [2]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git 
%cd Mecab-ko-for-Google-Colab 
!bash install_mecab-ko_on_colab190912.sh 

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 72 (delta 31), reused 20 (delta 5), pack-reused 0[K
Unpacking objects: 100% (72/72), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 10.3MB/s 
[?25hCollecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b1

In [3]:
# 키워드 추출을 위해 단어 그래프 생성하는 함수
def scan_vocabulary(sents, tokenize, min_count=2):
    counter = Counter(w for sent in sents for w in tokenize(sent))
    counter = {w:c for w,c in counter.items() if c >= min_count}
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
    vocab_to_idx = {vocab:idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

In [4]:
# 핵심 문장 선택을 위한 PageRank(Graph ranking 알고리즘) 함수
def pagerank(x, df=0.85, max_iter=30):
    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    R = np.ones(A.shape[0]).reshape(-1,1)
    bias = (1 - df) * np.ones(A.shape[0]).reshape(-1,1)

    # iteration
    for _ in range(max_iter):
        R = df * (A * R) + bias

    return R

In [5]:
# 그래프 각 파라미터를 함수들에 적용
def sent_graph(sents, tokenize, similarity, min_count=2, min_sim=0.1):
    _, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    tokens = [[w for w in tokenize(sent) if w in vocab_to_idx] for sent in sents]
    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        for j, tokens_j in enumerate(tokens):
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

In [6]:
# 문서 간 혹은 문장 간 유사도를 측정하기 위한 함수
def cosine_sent_sim(s1, s2):
    if (not s1) or (not s2):
        return 0
    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)

In [7]:
# 문서 간 혹은 문장 간 유사도를 측정하기 위한 함수
def textrank_sent_sim(s1, s2):
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base

In [29]:
# 전처리 함수
def CleanText(article):
    article = re.sub('”', ' ', article)
    bracket = re.findall(r'\([^)]*\)', article )
    for i in bracket:
        word = i.strip('()')
        if word.isupper():
            end_index = article.find(i)
            word_len = article[end_index:0:-1].find(' ')
            start_index = end_index - word_len +1
            origin = article[start_index : end_index]
            article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        else:
            if '이하' in word:
                word = word[3:]
                n_space = word.count(' ')
                end_index = article.find(word)-4
                range_candidate = article[end_index-30:end_index].split(' ')[::-1]
                origin = ' '.join(range_candidate[:n_space+1][::-1])
                article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        article = article.replace(i,'')

    article = ''.join(re.findall('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', article)) # email 제거

    article = ''.join(re.findall('[가-힣a-zA-Z ]',  article) ) # 가-힣a-zA-Z0-9

    return article.strip(' ')

In [11]:
# 토크나이저
from konlpy.tag import Komoran, Okt, Kkma, Mecab

kkma = Kkma()
komoran = Komoran()
okt = Okt()
mecab = Mecab()

def komoran_tokenize(sent):
    words = mecab.pos(sent, join=True) # 품사 같이 나온 
    words = [w for w in words if ('/Foreign' not in w and '/Alpha' not in w and '/Unknown' not in w and '/Email' not in w and '/URL' not in w)] #okt
    # words = [w for w in words if ('/SL' not in w and '/XS' not in w and '/IC' not in w and '/NNBC' not in w and '/VA' not in w and '/XR' not in w)] #mecab
    return words

In [12]:
# 핵심 문장 인덱스 추출을 위한 함수
def textrank_keysentence(sents, tokenize, min_count, min_sim, similarity, df=0.1, max_iter=30, topk=3): # df deafult 0.85
    g = sent_graph(sents, tokenize,  similarity ,min_count, min_sim )
    R = pagerank(g, df, max_iter).reshape(-1)
    idxs = R.argsort()[-topk:]
    keysents = [(idx) for idx  in reversed(idxs)]
    return keysents

In [13]:
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
data = []
for line in open('/content/drive/MyDrive/빅데이터/문서 추출요약/train.jsonl', 'r', encoding='utf-8'):
    data.append(json.loads(line)) #extractive_test_v2, train

In [30]:
# 사용할 데이터 정리
clean_article = []
ori_article = []

id_list = []
for i in range(len(data)):
    s1 = []
    s2 = []
    id_list.append(data[i]['id'])
    for j in range(len(data[i]['article_original'])):
        s1.append(data[i]['article_original'][j])
        s2.append(CleanText(data[i]['article_original'][j]))
    ori_article.append(s1)
    clean_article.append(s2)

In [31]:
# 핵심 문장의 인덱스 추출
idx_list = []
for sents in clean_article:
    keysents = textrank_keysentence(sents, 
                                    komoran_tokenize,
                                    min_count= 6,
                                    min_sim = 0.2, 
                                    similarity = textrank_sent_sim,
                                    topk=3
                                   )
    idx = [sent_idx for sent_idx in keysents]
    idx_list.append(idx)

In [32]:
ori_idx_list = []
for i in range(len(data)):
    ori_idx_list.append(data[i]['extractive'])
    
result = pd.DataFrame(
                {'ori_index' : ori_idx_list,
                 're_index' : idx_list},
                )

compare_index_list = []
for i in range(len(result)):
    result_sum = 0
    for j in range(len(result['ori_index'][i])):
        if result['ori_index'][i][j] == result['re_index'][i][j] :
            result_sum += 1
        else :
            result_sum += 0
    compare_index_list.append(result_sum)

In [33]:
result = pd.DataFrame(
                {'ori_index' : ori_idx_list,
                 're_index' : idx_list,
                 'compare_index' : compare_index_list}
                )
result['compare_index'].sum()

4866

In [None]:
# min count 2 : 40966
# min count 4 : 41488
# min count 5 : 42076 
# min count 7 : 42454
# min count 8 : 40806
# min count 10 / min_sim 0.2 / textrank_sent_sim : 34850
# min count 6 / min_sim 0.1 / textrank_sent_sim : 42611
# min count 6 / min_sim 0.2 / textrank_sent_sim : 42614
# min count 6 / min_sim 0.3 / textrank_sent_sim : 42541
# min count 6 / min_sim 0.4 / textrank_sent_sim : 42614
# min count 6 / min_sim 0.5 / textrank_sent_sim : 42175
# min count 7 / min_sim 0.2 / textrank_sent_sim : 42454
# min count 10 / min_sim 0.2 / textrank_sent_sim : 37790
# min count 6 / min_sim 0.2 / cosine_sent_sim : 40841
# min count 2 / min_sim 0.2 / cosine_sent_sim : 40677
# min count 6 / min_sim 0.4 / textrank_sent_sim / max iter 10 : 42614
# df 0.95 : 42594
# df 0.75 : 42692
# df 0.8 : 42665
# df 0.65 : 42743
# df 0.55 : 42869
# df 0.45 : 43093
# df 0.35 : 43278
# df 0.2 : 43259
# df 0.15 : 43295
# df 0,125 : 43301
# df 0.1 : 43308
# df 0.075 : 43272
# df 0.05 :43227
# okt Foreign, Alpha, Unknown, Email, URL : 40458
# without A-Za-z : 43220
# without 0-9 : 43330

# 결과 저장


In [None]:
# 추출한 인덱스를 원문에 적용하여 핵심 문장 정리
result = []
for i in range(len(idx_list)) :
    ori_str = []
    for j in range(3) :
        ori_str.append(ori_article[i][idx_list[i][j]])
    result.append('\n'.join(ori_str))

In [None]:
result = pd.DataFrame(
                {'id' : id_list,
                 'summary' : result},
                )

In [None]:
# 결과 저장
result.to_csv('/content/drive/MyDrive/빅데이터/문서 추출요약/extractive_submission_19.csv', encoding="utf-8", index=False)
# result.to_csv('D:/BigData/document_extraction_data/extractive_submission_8.csv', sep=',', index=False)

# result.to_csv('C:/Users/MoonJu/Documents/Data_analysis/dacon_extract_summary/extractive_submission_13.csv', encoding="utf-8", index=False)
# # result.to_csv('D:/BigData/document_extraction_data/extractive_submission_8.csv', sep=',', index=False)

In [None]:
# csv_8 : komoran 한글자제거,한글처리 가-힣ㄱ-ㅎㅏ-ㅣ 
# csv_9 : 한글자 제대로 처리, 한글 영어 숫자 
# csv_10 : 한글자 처리, 한글 영어 숫자 사용
# csv_11 : okt
# csv_12 : kkma
# csv_13 : okt 전체
# csv_14 : komoran 전체
# csv_15 : mecab 전체
# csv_16 : mecab 태그 제거 후
# csv_18 : mecab df 0.35
# csv_19 : mecab df 0.1