In [114]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import numpy as np
import pandas as pd
import string

import re

## texts_to_matrix

In [4]:
texts = ['자연어 처리 알고리즘', '자연어 처리 방법', '자연어 NLP 알고리즘 알고리즘', '자연어 처리 전문가']

tkt = Tokenizer()

tkt.fit_on_texts(texts)

tkt.index_word

{1: '자연어', 2: '처리', 3: '알고리즘', 4: '방법', 5: 'nlp', 6: '전문가'}

In [9]:
tkt.texts_to_matrix(texts)

array([[0., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1.]])

In [10]:
tkt.texts_to_matrix(texts, mode='count')

array([[0., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 0.],
       [0., 1., 0., 2., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1.]])

In [11]:
tkt.texts_to_matrix(texts, mode='tfidf')

array([[0.        , 0.58778666, 0.69314718, 0.84729786, 0.        ,
        0.        , 0.        ],
       [0.        , 0.58778666, 0.69314718, 0.        , 1.09861229,
        0.        , 0.        ],
       [0.        , 0.58778666, 0.        , 1.43459998, 0.        ,
        1.09861229, 0.        ],
       [0.        , 0.58778666, 0.69314718, 0.        , 0.        ,
        0.        , 1.09861229]])

In [12]:
tkt.texts_to_matrix(texts, mode='freq') # 빈도를 비율로.

array([[0.        , 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        , 0.        ],
       [0.        , 0.33333333, 0.33333333, 0.        , 0.33333333,
        0.        , 0.        ],
       [0.        , 0.25      , 0.        , 0.5       , 0.        ,
        0.25      , 0.        ],
       [0.        , 0.33333333, 0.33333333, 0.        , 0.        ,
        0.        , 0.33333333]])

## NYT

In [15]:
df = pd.read_csv('c:/reposit/data/bigleader/NYT_2018.csv')
df.shape

(1324, 15)

In [16]:
df.head(3)

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...


In [18]:
df.headline.values # array로 변경

array(['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
       'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
       'The New Noma, Explained', ...,
       'Gen. Michael Hayden Has One Regret: Russia',
       'There Is Nothin’ Like a Tune', 'Unknown'], dtype=object)

In [20]:
list(df.headline.values)[:5] # list로 변경

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [23]:
headline = []
headline.extend(list(df.headline.values))
headline[:3]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained']

In [24]:
len(headline)

1324

In [30]:
%%timeit

df['headline'].value_counts() # 110

616 µs ± 21.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [32]:
%%timeit

headline.count('Unknown')

18.2 µs ± 1.26 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


+ headline에 저장된 뉴스 기사 제목으로 다음 단어를 생성하는 LSTM 기반 모델 설계


+ 동작 예
    - 입력: 생성하고자 하는 단어의 갯수
    - 출력: I was ... 완성

In [72]:
sents = [
    ['programmer', 'person'],
    ['programmer', 'word', 'secret']]

tkt = Tokenizer()
tkt.fit_on_texts(sents)
print(tkt.word_index)

# 문자 그대로 텍스트를 시퀀스형 자료로 바꿔줌.
tkt.texts_to_sequences(sents)

{'programmer': 1, 'person': 2, 'word': 3, 'secret': 4}


[[1, 2], [1, 3, 4]]

### 전처리

In [35]:
headline[0].lower()

'former n.f.l. cheerleaders’ settlement offer: $1 and a meeting with goodell'

In [None]:
# [pre_func(x) for x in headline] # 기사 제목을 함수로 한번에 처리

res = []

def pre_func(title):
    
    # 소문자 변환
    res.append(title.lower())
    
    return res

In [38]:
# [x.lower() for x in headline]

In [57]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [53]:
headline[0]

'Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell'

In [61]:
def pre_func(title):
    #소문자 변환
    res=''.join(w.lower() for w in title if w not in punctuation)
    return res

In [75]:
pre_headline = [pre_func(x) for x in headline if x != 'Unknown'] # 기사 제목을 함수로 한번에 처리

tkt = Tokenizer()
tkt.fit_on_texts(pre_headline)

sequences = []
for s in pre_headline:
    #print(tok.texts_to_sequences([s])[0]) #각 문장별 인코딩
    enc = tkt.texts_to_sequences([s])[0]
    for i in range(1, len(enc)):
        seq = enc[:i+1]
        sequences.append(seq)

sequences[:15]

[[95, 263],
 [95, 263, 1100],
 [95, 263, 1100, 1101],
 [95, 263, 1100, 1101, 572],
 [95, 263, 1100, 1101, 572, 50],
 [95, 263, 1100, 1101, 572, 50, 7],
 [95, 263, 1100, 1101, 572, 50, 7, 2],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365, 10],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365, 10, 1102],
 [96, 3],
 [96, 3, 1103],
 [96, 3, 1103, 2],
 [96, 3, 1103, 2, 14],
 [96, 3, 1103, 2, 14, 573]]

In [77]:
len(tkt.word_index)

3619

In [79]:
idx2word = {}

for k, v in tkt.word_index.items():
    idx2word[v] = k

idx2word[1]

'the'

In [84]:
# 다른 방법
dict(zip(tkt.word_index.values(), tkt.word_index.keys()))[1]

'the'

In [86]:
max(len(i) for i in sequences)

24

In [88]:
seq1 = pad_sequences(sequences, maxlen=24)
seq1

array([[   0,    0,    0, ...,    0,   95,  263],
       [   0,    0,    0, ...,   95,  263, 1100],
       [   0,    0,    0, ...,  263, 1100, 1101],
       ...,
       [   0,    0,    0, ...,    9, 3619,  110],
       [   0,    0,    0, ..., 3619,  110,    2],
       [   0,    0,    0, ...,  110,    2, 1014]])

In [91]:
X = []
y = []

for s in seq1:
    X.append(s[:-1])
    y.append(s[-1])

X[:3], y[:3]

([array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0, 95]),
  array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,  95, 263]),
  array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,   95,  263,
         1100])],
 [263, 1100, 1101])

In [101]:
seq1.shape

(7809, 24)

In [135]:
X = np.array([s[:-1] for s in seq1])
y = np.array([s[-1] for s in seq1])

print(X.shape, y.shape)

X[:3], y[:3]

(7809, 23) (7809,)


(array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           95],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   95,
          263],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,   95,  263,
         1100]]),
 array([ 263, 1100, 1101]))

In [136]:
vocab_size = len(tkt.index_word) + 1
vocab_size

3620

In [137]:
y = to_categorical(y, vocab_size)
X.shape, y.shape

((7809, 23), (7809, 3620))

In [138]:
model = Sequential()

model.add(Embedding(vocab_size, 10))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))

In [139]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 10)          36200     
                                                                 
 lstm_2 (LSTM)               (None, 128)               71168     
                                                                 
 dense_2 (Dense)             (None, 3620)              466980    
                                                                 
Total params: 574,348
Trainable params: 574,348
Non-trainable params: 0
_________________________________________________________________


In [140]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2675d61d2e0>

In [143]:
def gen_sent(model, tkt, c_word, n):
    
    # 구현
    enc = tkt.texts_to_sequences([c_word])[0]
    pad_sequences([enc], maxlen=24)
    
    res = model.predict(enc)
    res = np.argmax(res)
    print(res)
    
    for w, i in tkt.word_index.items():
        if i == res:
            break
    print(w)
    
    # return pred_sent

In [147]:
def gen_sent(model, tok, c_word, n):
    pred_sent=''
    # 구현
    
    for _ in range(n):
        enc = tok.texts_to_sequences([c_word])[0]
        enc = pad_sequences([enc], maxlen=24, padding='pre')
        #print(enc)
        res = model.predict(enc)
        res = np.argmax(res)
        for w, i in tok.word_index.items():
            if i == res:
                break
        print('예측단어: ',w)
        c_word=c_word+ ' ' + w    #  The=>  The new
    print(c_word)
    #pred_sent = 
    return pred_sent

In [148]:
gen_sent(model, tkt, 'the', 10)

예측단어:  new
예측단어:  glass
예측단어:  is
예측단어:  a
예측단어:  lawman
예측단어:  glass
예측단어:  a
예측단어:  lawman
예측단어:  warriors
예측단어:  2018
the new glass is a lawman glass a lawman warriors 2018


''