In [2]:
## normalizing
import re
  
kor_begin     = 44032
kor_end       = 55203
chosung_base  = 588
jungsung_base = 28
jaum_begin = 12593
jaum_end = 12622
moum_begin = 12623
moum_end = 12643

chosung_list = [ 'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 
        'ㅅ', 'ㅆ', 'ㅇ' , 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

jungsung_list = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 
        'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 
        'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 
        'ㅡ', 'ㅢ', 'ㅣ']

jongsung_list = [
    ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ',
        'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 
        'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 
        'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

jaum_list = ['ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄸ', 'ㄹ', 
              'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 
              'ㅃ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

moum_list = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 
              'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']

doublespace_pattern = re.compile('\s+')
repeatchars_pattern = re.compile('(\w)\\1{3,}')


In [3]:
def normalize(doc, english=True, number=True, punctuation=False, remove_repeat = 0, remains={}):
    if remove_repeat > 0:
        doc = repeatchars_pattern.sub('\\1' * remove_repeat, doc)

    f = ''    
    for c in doc:
        i = ord(c)
        
        if (c == ' ') or (is_korean(i)) or (english and is_english(i)) or (number and is_number(i)) or (punctuation and is_punctuation(i)):
            f += c            
        elif c in remains:
            f += c        
        else:
            f += ' '
            
    return doublespace_pattern.sub(' ', f).strip()

In [4]:
def split_jamo(c):    
    i = ord(c)
    
    if not is_korean(i):
        return None
    elif is_jaum(i):
        return [c, ' ', ' ']
    elif is_moum(i):
        return [' ', c, ' ']
    
    i -= kor_begin
    
    cho  = i // chosung_base
    jung = ( i - cho * chosung_base ) // jungsung_base 
    jong = ( i - cho * chosung_base - jung * jungsung_base )
    
    return [chosung_list[cho], jungsung_list[jung], jongsung_list[jong]]

In [5]:
def is_korean(i):
    i = to_base(i)
    return (kor_begin <= i <= kor_end) or (jaum_begin <= i <= jaum_end) or (moum_begin <= i <= moum_end)

def is_number(i):
    i = to_base(i)
    return (i >= 48 and i <= 57)

def is_english(i):
    i = to_base(i)
    return (i >= 97 and i <= 122) or (i >= 65 and i <= 90)

def is_punctuation(i):
    i = to_base(i)
    return (i == 33 or i == 34 or i == 39 or i == 44 or i == 46 or i == 63 or i == 96)

def is_jaum(i):
    i = to_base(i)
    return (jaum_begin <= i <= jaum_end)

def is_moum(i):
    i = to_base(i)
    return (moum_begin <= i <= moum_end)

def to_base(c):
    if type(c) == str:
        return ord(c)
    elif type(c) == int:
        return c
    else:
        raise TypeError


In [6]:
def combine_jamo(chosung, jungsung, jongsung):
    return chr(kor_begin + chosung_base * chosung_list.index(chosung) + jungsung_base * jungsung_list.index(jungsung) + jongsung_list.index(jongsung))


In [7]:
class ConvolutionalNN_Encoder:
        
    def __init__(self, vocabs={}):
        self.vocabs = vocabs
        
        self.jungsung_hot_begin = 31
        self.jongsung_hot_begin = 52
        self.symbol_hot_begin = 83

        self.cvocabs_ = [' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄸ', 'ㄹ', 
                   'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅃ', 
                   'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 
                   'ㅎ', 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ',
                   'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 
                   'ㅢ', 'ㅣ']
        self.cvocabs = {}
        self.cvocabs = {c:len(cvocabs) + 1 for c in cvocabs_}

        # svocabs_ = ['.',  ',',  '?',  '!',  '-', ':', 
        #            '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9']
        # svocabs = {}
        # svocabs = {s:len(svocabs) + symbol_hot_begin for s in svocabs_}


    def encode_vocab(self, words, unknown=-1, blank=0, input_length=64):
        if len(words) > input_length:
            words = words[:input_length]
        return [self.vocabs[w] if w in self.vocabs else unknown for w in words] + [blank] * (input_length - len(words))

    def encode_jamo_onehot(self, chars, input_length=64, as_ndarray=False):
        ints = []
        return ints

    def encode_jamo_threehot(self, chars, input_length=64, as_ndarray=False):
        raise NotImplemented

In [8]:
## model
import tensorflow as tf
from tensorflow.contrib import rnn

class Model(object):
    def __init__(self, reversed_dict, article_max_len, summary_max_len, embedding_size, num_hidden, num_layers, learning_rate, beam_width, keep_prob, glove, forward_only=False):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.beam_width = beam_width
        if not forward_only:
            self.keep_prob = keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings2 = tf.get_variable("embeddings", initializer=init_embeddings)
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings2, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings2, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings2,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

In [9]:
# utils
import re
import numpy as np
import pickle

from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

import collections
from collections import defaultdict
import csv
from konlpy.tag import Twitter#t=Twitter()#tokens_ko=t.morphs(doc_ko)

In [10]:
import operator
import collections
from collections import defaultdict
def build_dict(step, toy=False):
    if step == "train":
      with open ('/home/lab11/0_project/dataset/news_sum2.csv', "r", encoding="utf-8") as f:
        head_li = []
        topic_li = []
        for x in f.readlines():
          x = x.split('|')
          if x[0]=='':
            continue
          if len(x)>2:
            head_li.append(x[1])
            topic_li.append(x[2])

      words = list()
      count = 0
      dic = defaultdict(lambda:[])

      for sentence in topic_li + head_li:
          sentence = normalize(sentence, punctuation=True)
          for idx,word in enumerate(sentence.split()):
              if len(word) > 0:
                  normalizedword=word[:3]
                  tmp=[]
                  for char in normalizedword:
                      if ord(char) < 12593 or ord(char) > 12643:
                          tmp.append(char)
                  normalizedword = ''.join(char for char in tmp)
                  if word not in dic:
                      dic[normalizedword].append(word)

      dic = sorted(dic.items(), key=operator.itemgetter(0))[1:]
      words=[]
      for i in range(len(dic)):
          word=[]
          word.append(dic[i][0])
          for w in dic[i][1]:
              if w not in word:
                  word.append(w)
          words.append(word)
      words.append(['<padding>'])
      words.append(['<unk>'])
      words.append(['<s>'])
      words.append(['</s>'])
      
      reversed_dict = {i:ch[0] for i,ch in enumerate(words)}
      word_dict={}
      for idx,words in enumerate(words):
          for word in words:
              word_dict[word]=idx
              
      with open("word_dict.pickle", "wb") as f:
          pickle.dump(word_dict, f)
      with open("ix_to_dict.pickle", "wb") as t:
          pickle.dump(word_dict, t)

    elif step == "valid":
      with open("word_dict.pickle", "rb") as f:
          word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 120
    summary_max_len = 18
    print("reversed dict:",len(reversed_dict),"word dict:",len(word_dict))
    return word_dict, reversed_dict, article_max_len, summary_max_len

In [11]:
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
      with open ('/home/lab11/0_project/dataset/news_sum2.csv', "r", encoding="utf-8") as f:
        head_li = []
        topic_li = []
        for x in f.readlines():
          x = x.split('|')
          if x[0]=='':
            continue
          if len(x)>2:
            head_li.append(x[1])
            topic_li.append(x[2])
    elif step == "valid":
      with open ('/home/lab11/0_project/dataset/news_sum2.csv', "r", encoding="utf-8") as f:
        topic_li = []
        for x in f.readlines():
          x = x.split('|')
          if x[0]=='':
            continue
          if len(x)>2:
            topic_li.append(x[2])
    else:
        raise NotImplementedError

    x = [normalize(d) for d in topic_li]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
    
    if step == "valid":
        return x
    else:        
        y = [normalize(d) for d in head_li]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y

In [12]:
def get_init_embedding(reversed_dict, embedding_size):
    glove_file = "/home/lab11/0_project/glove/glove.42B.300d.txt"
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)

        word_vec_list.append(word_vec)

    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list)

In [50]:
# def batch_iter(x, y, batch_size=32, num_epoch=10):
#         x = np.array(x)
#         y = np.array(y)
#         for _ in range(num_epoch):
#             for batch_num in range(0, len(x), batch_size):
#                 yield x[batch_num:(batch_num+batch_size)], y[batch_num:(batch_num+batch_size)]
def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)
    print(inputs)
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for _ in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            # start_index = batch_num * batch_size
            # end_index = min((batch_num + 1) * batch_size, len(inputs))
            # yield inputs[start_index:end_index], outputs[start_index:end_index]
            yield inputs[batch_num:(batch_num+batch_size)], outputs[batch_num:(batch_num+batch_size)]

In [14]:
# train
import time
start = time.perf_counter()
import tensorflow as tf
import pickle
import os

In [15]:
embedding_size=300
num_hidden = 300
num_layers = 3
learning_rate = 0.001
beam_width = 10
keep_prob = 0.8
glove = True
batch_size=256
num_epochs=10

if not os.path.exists("saved_model"):
    os.mkdir("saved_model")
else:
    old_model_checkpoint_path = open('saved_model/checkpoint', 'r')
    old_model_checkpoint_path = "".join(["saved_model/",old_model_checkpoint_path.read().splitlines()[0].split('"')[1]])

print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", toy=True)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, toy=True)

Building dictionary...
reversed dict: 67802 word dict: 124368
Loading training dataset...


In [16]:
with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, embedding_size, num_hidden, num_layers, learning_rate, beam_width, keep_prob, glove)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if 'old_model_checkpoint_path' in globals():
        print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
        saver.restore(sess, old_model_checkpoint_path )

    batches = batch_iter(train_x, train_y, batch_size, num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
        batch_decoder_input = list(map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
        batch_decoder_len = list(map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
        batch_decoder_output = list(map(lambda x: list(x) + [word_dict["</s>"]], batch_y))

        batch_decoder_input = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
        batch_decoder_output = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))

        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, "./saved_model/model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")

Loading Glove vectors...
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use `tf.cast` instead.

Iteration starts.
Number of batches per epoch : 18
 Epoch 1: Model is saved. Elapsed: 00:19:52.39 

 Epoch 2: Model is saved. Elapsed: 00:28:27.21 

 Epoch 3: Model is saved. Elapsed: 00:37:02.69 

 Epoch 4: Model is saved. Elapsed: 00:45:40.67 

In [63]:
# main
def headline(data):
    
    embedding_size=300
    num_hidden = 300
    num_layers = 3
    learning_rate = 0.001
    beam_width = 10
    keep_prob = 0.8
    glove = True
    batch_size=128
    num_epochs=15
    while True:
        print("Loading dictionary...")
        word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", False)
        print('word dict : ',word_dict)
        content=normalize(data)
        print('content : ', content)
        x = [normalize(content)]
        x = [word_dict.get(w, word_dict["<unk>"]) for w in x]
        # x = x[:120]
        print('x : ',x)
        # valid_x = x + (article_max_len - len(x)) * [word_dict["<padding>"]]
        valid_x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
        # valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
        print('valid_x : ',valid_x)
        with tf.Session() as sess:
          print("Loading saved model...")
          model = Model(reversed_dict, article_max_len, summary_max_len, embedding_size, num_hidden, num_layers, learning_rate, beam_width, keep_prob, glove, forward_only=True)
          saver = tf.train.Saver(tf.global_variables())
          ckpt = tf.train.get_checkpoint_state("./saved_model/")
          saver.restore(sess, ckpt.model_checkpoint_path)

          batches = batch_iter(valid_x, [0] * len(valid_x), batch_size, 1)

          for batch_x, _ in batches:
              print(batch_x)
              batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
            #   batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

              valid_feed_dict = {
                  model.batch_size: len(batch_x),
                  model.X: batch_x,
                  model.X_len: batch_x_len,
              }

              prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
              prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]
              summary=list()
              predict=""
              for word in prediction_output:
                  if word == "</s>":
                      break
                  if word not in summary:
                      summary.append(word)
                  predict=" ".join(summary)
              print(predict)

In [46]:
from lexrankr import LexRank

def summary(data):
    print("smry start")
    multi_summary = data
    
    lexrank = LexRank()
    lexrank.summarize(multi_summary)  # data (본문)가져와서 요약
    summaries = lexrank.probe(3)  # 3줄요약, summaries 타입은 list
    summaries = '. '.join(summaries)+'.'
    print("multi-summary= ",summaries)
    return summaries
    


In [64]:
txt = '''
미국 제약회사 길리어드사이언스의 항바이러스제 '렘데시비르'가 신종 코로나바이러스 감염증(코로나19) 환자에게 미치는 효과가 거의 없다는 세계보건기구(WHO)의 연구 결과가 나왔다.

WHO가 입원 환자 1만1천266명을 상대로 진행하고 있는 '연대 실험'에서 렘데시비르가 환자의 입원 기간을 줄이거나 사망률을 낮추지 못했다고 로이터통신이 15일 보도했다.

WHO의 연대 실험은 코로나19 치료제 후보군의 효능과 안전성을 검증하기 위한 다국적 임상시험이다. 렘데시비르 외에 말라리아 치료제 하이드록시클로로퀸, 인간면역결핍바이러스(HIV) 치료제인 로피나비르/리토나비르, 항바이러스제 인터페론 등을 대상으로 진행되고 있다.

시험 결과 이 약물 중 어떤 것도 실질적으로 사망률에 영향을 주거나 인공호흡기 사용 필요성을 줄여주지 못한 것으로 나타났다. 시험에 사용된 약물들은 렘데시비르와 하이드록시클로로퀸, 로피나비르, 인터페론 등이다. 또 이들 약물은 환자들의 병원 입원 기간에도 거의 영향을 주지 않았다.

길리어드사이언스는 이달 초 코로나19 입원 환자 1천62명을 대상으로 임상시험을 진행한 결과 렘데시비르가 회복 기간을 5일 단축해줬다고 밝힌 바 있다.

최근 코로나에 걸렸다가 완치 판정을 받은 도널드 트럼프 미국 대통령도 이 약을 투약했다.

한편, 한국에서는 지난 13일까지 62개 병원에서 600명의 환자에게 렘데시비르를 투여했다고 방역 당국이 밝힌 바 있다.

'''
tf.reset_default_graph()
headline(summary(txt))

고했': 67014, '회고했다.': 67014, '회관에': 67015, '회관에서는': 67015, '회귀는': 67016, '회귀에': 67017, '회귀한': 67018, '회기에': 67019, '회기에서': 67019, '회담': 67020, '회담과': 67021, '회담에': 67022, '회담에서': 67022, '회담에서도': 67022, '회담에선': 67022, '회담은': 67023, '회담을': 67024, '회담의': 67025, '회담이': 67026, '회답해': 67027, '회답해주시기': 67027, '회동': 67028, '회동도': 67029, '회동에': 67030, '회동에서': 67030, '회동에도': 67030, '회동은': 67031, '회동을': 67032, '회동하': 67033, '회동하면서': 67033, '회동해': 67034, '회동해,': 67034, '회람': 67035, '회랑': 67036, '회령시': 67037, '회령시에': 67037, '회룡천': 67038, '회룡천에서': 67038, '회룡천과': 67038, '회복': 67039, '회복되': 67040, '회복되면서': 67040, '회복되지는': 67040, '회복되고': 67040, '회복되지': 67040, '회복되긴': 67040, '회복때': 67041, '회복세': 67042, '회복세로': 67042, '회복세를': 67042, '회복세임에도': 67042, '회복세가': 67042, '회복시': 67043, '회복시키는': 67043, '회복에': 67044, '회복은': 67045, '회복을': 67046, '회복의': 67047, '회복이': 67048, '회복하': 67049, '회복하면서': 67049, '회복하는': 67049, '회복하면': 67049, '회복하고': 67049, '회복하기': 67049, '회복하고,': 67049, '회복하겠다': 67049, '회복하지': 67049, '회복할': 6

TypeError: object of type 'int' has no len()