In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re
import datetime 
import networkx
import json
from konlpy.tag import Komoran
from kss import split_sentences
import math

In [2]:
def tokenize(content):
    """Split text into a list of sentences.

    Args:
        content: A string, or an iterable of strings. The pieces are joined
            with no separator before being split.

    Returns:
        list: The items produced by ``split_sentences`` for the joined text,
        with any ``None`` entries removed.
    """
    text = ''.join(content)
    sentences = []
    for sentence in split_sentences(text):
        # Keep everything except explicit None placeholders.
        if sentence is not None:
            sentences.append(sentence)
    return sentences

In [3]:
# (morpheme, tag) pairs that are always dropped, even though their tag is kept.
_POS_STOPWORDS = frozenset([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
# Only morphemes carrying one of these tags are kept (in the Komoran tag set:
# common noun, proper noun, verb, adjective).
_POS_KEEP_TAGS = frozenset(['NNG', 'NNP', 'VV', 'VA'])


def _get_komoran():
    """Return a shared Komoran tagger, creating it on first use."""
    tagger = getattr(_get_komoran, '_instance', None)
    if tagger is None:
        # Constructing the tagger is costly, so build it once and reuse it
        # instead of once per sentence.
        tagger = _get_komoran._instance = Komoran()
    return tagger


def posTagging(sentence):
    """POS-tag a sentence and keep only its content-bearing morphemes.

    Args:
        sentence: Text to analyse.

    Returns:
        list: ``(morpheme, tag)`` pairs, in tagger order, whose tag is one of
        NNG / NNP / VV / VA and which are not in the stopword set.
    """
    return [
        pair for pair in _get_komoran().pos(sentence)
        if pair not in _POS_STOPWORDS and pair[1] in _POS_KEEP_TAGS
    ]

In [4]:
class RawSentence:
    """Iterable that yields the sentences found in a sequence of text lines.

    Args:
        content: Iterable of strings; each item is split independently.
    """

    def __init__(self, content):
        self.content = content
        # A sentence ends at one of . ! ? : -- the terminator also swallows a
        # directly following quote character, and is ignored when a digit
        # follows (so "3.5" is not split).
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        """Yield each non-empty sentence, terminator included, in order."""
        for line in self.content:
            # split() with a capture group returns text, terminator, text, ...
            # and always ends with a text piece, so the list has odd length.
            parts = self.rgxSplitter.split(line)
            # Pad with an empty terminator so the trailing text (a final
            # sentence with no closing punctuation) is paired and kept rather
            # than dropped.
            parts.append('')
            for text, terminator in zip(parts[::2], parts[1::2]):
                sentence = text + terminator
                if sentence:
                    yield sentence

In [29]:
def similarity(a, b):
    """Length-normalised overlap score between two token sets.

    The score is the Jaccard ratio ``|a & b| / |a | b|`` divided by
    ``log(|a| + 1) * log(|b| + 1)``.

    Args:
        a: A set of tokens.
        b: A set of tokens.

    Returns:
        float: The score; ``0.0`` when either set is empty.
    """
    # With an empty set the log term is log(1) == 0 (and the union may be
    # empty too), which would divide by zero; nothing can overlap anyway.
    if not a or not b:
        return 0.0
    shared = len(a.intersection(b))
    jaccard = shared / float(len(a) + len(b) - shared)
    return jaccard / (math.log(len(a) + 1) * math.log(len(b) + 1))

In [36]:
class TextRank:
    """Extractive summariser: PageRank over a sentence-similarity graph.

    Usage: ``loadSentence(...)``, then ``build()``, then ``rank()`` or
    ``summarize()``.
    """

    def __init__(self, window = 5, coef = 1.0, threshold=0.005):
        """
        Args:
            window: Stored on the instance; no method of this class reads it.
            coef: Blend factor for edge weights,
                ``weight = similarity * coef + (1 - coef)``.
            threshold: Sentence pairs scoring below this get no edge.
        """
        self.graph = None
        self.window = window
        self.coef = coef
        self.threshold = threshold
        self.dictCount = 0
        self.dictSentence = {}
        self.dictSimilarity = {}
        self.posTagger = posTagging
        # Token set of every loaded sentence, indexed by sentence id. Kept on
        # the instance so a later loadSentence() call can still compare new
        # sentences against the ones loaded earlier.
        self._tokenSets = []

    def loadSentence(self, sentenceIter):
        """Register sentences and record the similarity of each new pair.

        Args:
            sentenceIter: Iterable whose items are either strings (tokenised
                with ``self.posTagger``) or ready-made iterables of tokens.

        May be called more than once; ids continue from the previous call.
        """
        firstNew = self.dictCount
        for sent in sentenceIter:
            if isinstance(sent, str):
                tokens = set(filter(None, self.posTagger(sent)))
            else:
                tokens = set(sent)
            self._tokenSets.append(tokens)
            self.dictSentence[self.dictCount] = sent
            self.dictCount += 1

        for i in range(self.dictCount):
            tokensI = self._tokenSets[i]
            if not tokensI:
                # A sentence with no tokens cannot be similar to anything.
                continue
            # Only pairs that involve a newly added sentence are missing;
            # on the first call this is every pair with i < j.
            for j in range(max(i + 1, firstNew), self.dictCount):
                tokensJ = self._tokenSets[j]
                if not tokensJ:
                    continue
                s = similarity(tokensI, tokensJ)
                if s < self.threshold:
                    continue
                self.dictSimilarity[i, j] = s

    def build(self):
        """(Re)create the undirected graph: one node per sentence id, one
        weighted edge per recorded similarity."""
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictSentence.keys())
        for (a, b), n in self.dictSimilarity.items():
            self.graph.add_edge(a, b, weight=n*self.coef + (1-self.coef))

    def rank(self):
        """Return ``{sentence id: PageRank score}`` for the current graph.

        Builds the graph first if ``build()`` has not been called yet.
        """
        if self.graph is None:
            self.build()
        return networkx.pagerank(self.graph, weight='weight')

    def summarize(self, ratio = 0.333):
        """Join the top-ranked sentences, in their original order.

        Args:
            ratio: Fraction of the sentences to keep.

        Returns:
            str: The selected sentences separated by single spaces. For a
            positive ``ratio`` at least one sentence is kept when any exist.
        """
        r = self.rank()
        count = int(len(r) * ratio)
        if count == 0 and ratio > 0:
            # Short inputs would otherwise truncate to zero sentences.
            count = 1
        ks = sorted(r, key=r.get, reverse=True)[:count]
        return ' '.join(self.dictSentence[k] for k in sorted(ks))


In [37]:
def summarise_contents(x, ratio=0.2):
    """Summarise a text by keeping its highest-ranked sentences.

    Args:
        x: The text to summarise (passed to ``tokenize``).
        ratio: Fraction of the sentences to keep; defaults to 0.2.

    Returns:
        str: The selected sentences joined by spaces, as returned by
        ``TextRank.summarize``.
    """
    tr = TextRank(5, 1.0, 0.005)  # window, coef, threshold
    tr.loadSentence(tokenize(x))
    tr.build()
    # summarize() ranks the graph itself, so no separate rank() call is made.
    return tr.summarize(ratio)

In [38]:
# NOTE(review): `content` is only assigned in a later cell of this notebook
# (content = data[1]["content"]), so on a fresh top-to-bottom run this call
# raises NameError unless that cell has been executed first.
summarise_contents(content)

'나경원 한국당 원내대표는 "오늘 가장 강하게 요청한 것은 특검법이었다"고 말했다. 반면 홍영표 민주당 원내대표는 말을 아꼈다.'

In [8]:
# Read the JSON file explicitly as UTF-8: without `encoding`, open() falls back
# to the platform's locale encoding, which can fail on or mis-decode non-ASCII
# text.
with open("../data.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [9]:
# Use the "content" field of the record at index 1 as the text to summarise.
content = data[1]["content"]

In [None]:
# Show the selected text as this cell's output.
content

In [None]:
# Summarise the text held in `content`; the result is this cell's output.
summarise_contents(content)