In [1]:
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import pickle

In [2]:
class SentenceTokenizer:
    """Split text into sentences and reduce each sentence to its nouns.

    Sentence splitting is delegated to ``Kkma.sentences`` and noun extraction
    to ``Twitter.nouns`` (both imported from ``konlpy.tag``).
    """

    # Sentences with at most this many characters are treated as fragments
    # and glued onto the sentence that precedes them.
    SHORT_SENTENCE_MAX_LEN = 10

    def __init__(self, stopwords_path="stopword.txt"):
        """Create the taggers and load the stop-word collection.

        Args:
            stopwords_path: Path of a pickle file holding the collection of
                words that ``get_nouns`` must discard. The default,
                ``"stopword.txt"``, is resolved against the current working
                directory.
        """
        self.twitter = Twitter()
        self.kkma = Kkma()
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserializing; only point this at a file you trust.
        with open(stopwords_path, "rb") as f:
            self.stopwords = pickle.load(f)

    def text2sentences(self, text):
        """Split ``text`` into sentences, merging short fragments backwards.

        A sentence of ``SHORT_SENTENCE_MAX_LEN`` characters or fewer is
        appended (separated by one space) to the sentence kept before it.
        A short sentence at the very start has no predecessor and is kept
        as its own entry instead of wrapping around to the last sentence.

        Args:
            text: The raw document.

        Returns:
            list[str]: The merged sentences. The list contains no empty
            strings, so position ``i`` here matches position ``i`` of the
            list returned by ``get_nouns`` for the same sentences.
        """
        merged = []
        for sentence in self.kkma.sentences(text):
            if not sentence:
                # Nothing to keep or to merge.
                continue
            if len(sentence) <= self.SHORT_SENTENCE_MAX_LEN and merged:
                merged[-1] += " " + sentence
            else:
                merged.append(sentence)
        return merged

    def get_nouns(self, sentences):
        """Return one space-joined noun string per non-empty sentence.

        Nouns that appear in ``self.stopwords`` or that are a single
        character long are dropped. Empty sentences are skipped entirely
        (they produce no entry in the result).

        Args:
            sentences: Iterable of sentences.

        Returns:
            list[str]: For every non-empty sentence, its remaining nouns
            joined by single spaces (possibly an empty string when no noun
            survives the filter).
        """
        nouns = []
        for sentence in sentences:
            # Compare by value; identity comparison against a literal is
            # implementation-dependent.
            if sentence == "":
                continue
            kept = [noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1]
            nouns.append(" ".join(kept))
        return nouns

class GraphMatrix:
    """Build the sentence-level and word-level graphs used for ranking.

    Holds one ``TfidfVectorizer`` and one ``CountVectorizer``; both are
    refitted on every call to the corresponding ``build_*`` method.
    """

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        # Last sentence graph produced by build_sent_graph ([] until then).
        self.graph_sentence = []

    def build_sent_graph(self, sentence):
        """Return the sentence-by-sentence graph for the given documents.

        Args:
            sentence: Sequence of strings, one per sentence.

        Returns:
            numpy.ndarray: Square matrix whose entry (i, j) is the dot
            product of the TF-IDF rows of sentences i and j. The same array
            is also stored on ``self.graph_sentence``.
        """
        weights = self.tfidf.fit_transform(sentence).toarray()
        similarity = np.dot(weights, weights.T)
        self.graph_sentence = similarity
        return similarity

    def build_words_graph(self, sentence):
        """Return the word-by-word graph and the index-to-word lookup.

        Args:
            sentence: Sequence of strings, one per sentence.

        Returns:
            tuple: ``(graph, idx2word)`` where ``graph`` is the dot product
            of the column-normalized count matrix with itself (one row and
            column per vocabulary entry) and ``idx2word`` maps each column
            index to its word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        unit_columns = normalize(counts, axis=0)
        word_to_index = self.cnt_vec.vocabulary_
        graph = np.dot(unit_columns.T, unit_columns)
        index_to_word = {index: word for word, index in word_to_index.items()}
        return graph, index_to_word


class Rank:
    """Compute rank scores for the nodes of a weighted graph."""

    def get_ranks(self, graph, d=0.85):
        """Solve the ranking system for ``graph`` and return one score per node.

        The graph's diagonal is zeroed and every column with a non-zero sum
        is divided by that sum, giving a matrix ``M``. The scores ``r`` are
        the solution of ``(I - d * M) r = (1 - d) * 1``.

        The computation runs on a floating-point copy, so the caller's
        ``graph`` is left unmodified and integer-valued input is accepted.

        Args:
            graph: Square 2-D array-like of edge weights.
            d: Damping factor.

        Returns:
            dict: ``{node_index: score}`` for every row of ``graph``.

        Raises:
            numpy.linalg.LinAlgError: If the resulting system is singular.
        """
        # Work on a private float copy: the loop below overwrites columns in
        # place and must not destroy the caller's similarity matrix.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]
        for col in range(matrix_size):
            A[col, col] = 0  # ignore self-links when normalizing
            link_sum = np.sum(A[:, col])
            if link_sum != 0:
                A[:, col] /= link_sum
            A[:, col] *= -d
            A[col, col] = 1  # identity part of (I - d * M)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}

class TextRank:
    """Rank the sentences and words of one document.

    All the work happens in the constructor; ``summarize`` and ``keywords``
    only read the precomputed rankings.
    """

    def __init__(self, text):
        """Tokenize ``text``, build both graphs and rank their nodes.

        Args:
            text: The raw document.
        """
        self.sent_tokenize = SentenceTokenizer()

        # get_nouns() produces no entry for an empty sentence, so empty
        # entries are removed here as well. This keeps row i of the sentence
        # graph pointing at self.sentences[i].
        self.sentences = [sentence
                          for sentence in self.sent_tokenize.text2sentences(text)
                          if sentence != ""]

        self.nouns = self.sent_tokenize.get_nouns(self.sentences)

        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)

        self.rank = Rank()
        # Sentence indices ordered from highest to lowest score.
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx,
                                           key=lambda k: self.sent_rank_idx[k],
                                           reverse=True)

        # Word indices ordered from highest to lowest score.
        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx,
                                           key=lambda k: self.word_rank_idx[k],
                                           reverse=True)

    def summarize(self, sent_num=5):
        """Return the top-ranked sentences in their original document order.

        Args:
            sent_num: Maximum number of sentences to return.

        Returns:
            list[str]: Up to ``sent_num`` sentences.
        """
        chosen = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in chosen]

    def keywords(self, word_num=10):
        """Return the top-ranked words, best first.

        Uses the word ranking computed in the constructor instead of
        solving the ranking system again.

        Args:
            word_num: Maximum number of words to return.

        Returns:
            list[str]: Up to ``word_num`` words.
        """
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]


In [3]:
import pandas as pd
import os
import warnings

# Set the working directory.
# Relative paths opened afterwards ("origin_data.pickle" below, and also
# "stopword.txt" and "keyword.pickle" elsewhere in this notebook) resolve
# against this directory. Set the TEXTRANK_DATA_DIR environment variable to
# run on another machine; without it the original location is used.
DATA_DIR = os.environ.get("TEXTRANK_DATA_DIR", 'C:\\Users\\user')
os.chdir(DATA_DIR)

# Load the data.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files from a trusted source.
with open("origin_data.pickle", "rb") as f:
    origin = pickle.load(f)

In [4]:
# Hide warning messages (this silences every warning for the whole kernel).
warnings.filterwarnings(action='ignore')

# Extract noun keywords.
# One TextRank instance is built per row, and element [1] of the row is used
# as the input text — presumably the document body; verify against the
# structure of the loaded pickle.
# `keyword` collects one keyword list per row, in row order. If the loop is
# interrupted it holds only the rows processed so far.
keyword=[]
for row in range(len(origin)):
    text=origin[row][1]
    tr=TextRank(text)
    # keywords() is called with its default word count.
    keyword.append(tr.keywords())

KeyboardInterrupt: 

In [6]:
# Save the data.
# NOTE(review): the keyword-extraction cell's recorded output is a
# KeyboardInterrupt; if `keyword` comes from that run it holds partial
# results only — confirm before relying on this file.
with open("keyword.pickle", "wb") as out_file:
    pickle.dump(keyword, out_file, protocol=pickle.HIGHEST_PROTOCOL)