In [37]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences  #패딩 하는거(자리수 채우는거)
from tensorflow.keras.utils import to_categorical #원핫 인코딩 함수

In [38]:
# Toy corpus of three Korean sentences. The first two sentences are each
# followed by two newline characters, the last one by a single newline.
text = (
    "경마장에 있는 말이 뛰고 있다\n\n"
    "그의 말이 법이다\n\n"
    "가는 말이 고와야 오는 말이 곱다\n"
)

In [39]:
text

'경마장에 있는 말이 뛰고 있다\n\n그의 말이 법이다\n\n가는 말이 고와야 오는 말이 곱다\n'

In [40]:
print(text)

경마장에 있는 말이 뛰고 있다

그의 말이 법이다

가는 말이 고와야 오는 말이 곱다



In [41]:
tokenizer=Tokenizer()

In [42]:
tokenizer.fit_on_texts([text])

In [43]:
tokenizer.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [44]:
# The tokenizer numbers words starting at 1 and padding uses 0, so one extra
# slot is added to cover every index from 0 up to len(word_index).
vocab_size = 1 + len(tokenizer.word_index)
print(f'단어 집합의 크기 : {vocab_size}')

단어 집합의 크기 : 12


In [45]:
tokenizer.texts_to_sequences(['경마장에 있는 말이 뛰고 있다'])[0]  # converts each word of the sentence into its integer index

[2, 3, 1, 4, 5]

In [46]:
# Build the training samples: for every line of the corpus, emit each prefix
# of its integer encoding that holds at least two tokens.
sequences = []
for line in text.split('\n'):  # one sentence per line
    encoded = tokenizer.texts_to_sequences([line])[0]  # e.g. [2, 3, 1, 4, 5]
    # Prefix lengths 2..len(encoded): [2, 3], [2, 3, 1], [2, 3, 1, 4], ...
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print(f'학습에 사용할 샘플의 개수: {len(sequences)}')

학습에 사용할 샘플의 개수: 11


In [47]:
# Training samples, listed as: sample number | input context (X) | target word (y)
#  1 | 경마장에                  | 있는
#  2 | 경마장에 있는             | 말이
#  3 | 경마장에 있는 말이        | 뛰고
#  4 | 경마장에 있는 말이 뛰고   | 있다
#  5 | 그의                      | 말이
#  6 | 그의 말이                 | 법이다
#  7 | 가는                      | 말이
#  8 | 가는 말이                 | 고와야
#  9 | 가는 말이 고와야          | 오는
# 10 | 가는 말이 고와야 오는     | 말이
# 11 | 가는 말이 고와야 오는 말이 | 곱다
sequences

# Feeding index 2 must yield 3, and feeding (2, 3) must yield 1, so the last number of each sample is used as the target.
# Only the single final word is to be predicted, so return_sequences is left as False.

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 1],
 [6, 1, 7],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

In [48]:
# Length of the longest sample; used as the common length when padding.
max_len = max(map(len, sequences))
print(f'샘플의 최대 길이 : {max_len}')

샘플의 최대 길이 : 6


In [51]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre') # 'pre' puts the zero padding in front; 'post' would put it at the back

In [52]:
sequences

array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]])

In [53]:
# Split every padded sample into its context (all tokens but the last) and
# its target (the last token).
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [55]:
X
y

array([ 3,  1,  4,  5,  1,  7,  1,  9, 10,  1, 11])

In [56]:
y = to_categorical(y, num_classes=vocab_size)

In [57]:
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [59]:
embedding_dim = 10  # each token id is represented as a 10-dimensional vector
hidden_units = 32   # width of the SimpleRNN output

# Next-word classifier: token ids -> embeddings -> recurrent layer -> softmax.
model = Sequential([
    # Looks up an embedding_dim-sized vector for each of the vocab_size token ids.
    Embedding(vocab_size, embedding_dim),
    # return_sequences is not set, so only the output of the final step is passed on.
    SimpleRNN(hidden_units),
    # One probability per vocabulary index (vocab_size outputs, index 0 included).
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 1s - loss: 2.4657 - accuracy: 0.0000e+00 - 944ms/epoch - 944ms/step
Epoch 2/200
1/1 - 0s - loss: 2.4512 - accuracy: 0.0000e+00 - 4ms/epoch - 4ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4365 - accuracy: 0.0000e+00 - 3ms/epoch - 3ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4216 - accuracy: 0.2727 - 4ms/epoch - 4ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4064 - accuracy: 0.2727 - 3ms/epoch - 3ms/step
Epoch 6/200
1/1 - 0s - loss: 2.3907 - accuracy: 0.2727 - 3ms/epoch - 3ms/step
Epoch 7/200
1/1 - 0s - loss: 2.3746 - accuracy: 0.3636 - 3ms/epoch - 3ms/step
Epoch 8/200
1/1 - 0s - loss: 2.3579 - accuracy: 0.3636 - 3ms/epoch - 3ms/step
Epoch 9/200
1/1 - 0s - loss: 2.3406 - accuracy: 0.3636 - 4ms/epoch - 4ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3226 - accuracy: 0.3636 - 3ms/epoch - 3ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3038 - accuracy: 0.3636 - 3ms/epoch - 3ms/step
Epoch 12/200
1/1 - 0s - loss: 2.2842 - accuracy: 0.3636 - 3ms/epoch - 3ms/step
Epoch 13/200
1/1 - 0s - loss: 2.2638 - accura

Epoch 105/200
1/1 - 0s - loss: 0.6584 - accuracy: 0.8182 - 3ms/epoch - 3ms/step
Epoch 106/200
1/1 - 0s - loss: 0.6472 - accuracy: 0.8182 - 3ms/epoch - 3ms/step
Epoch 107/200
1/1 - 0s - loss: 0.6361 - accuracy: 0.8182 - 4ms/epoch - 4ms/step
Epoch 108/200
1/1 - 0s - loss: 0.6251 - accuracy: 0.8182 - 3ms/epoch - 3ms/step
Epoch 109/200
1/1 - 0s - loss: 0.6143 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 110/200
1/1 - 0s - loss: 0.6037 - accuracy: 0.9091 - 4ms/epoch - 4ms/step
Epoch 111/200
1/1 - 0s - loss: 0.5933 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 112/200
1/1 - 0s - loss: 0.5830 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 113/200
1/1 - 0s - loss: 0.5729 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 114/200
1/1 - 0s - loss: 0.5629 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 115/200
1/1 - 0s - loss: 0.5532 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 116/200
1/1 - 0s - loss: 0.5436 - accuracy: 0.9091 - 3ms/epoch - 3ms/step
Epoch 117/200
1/1 - 0s - loss: 0.5341 - 

<keras.src.callbacks.History at 0x11774de7a50>

In [82]:
tokenizer.texts_to_sequences(['가는'])[0]

[8]

In [84]:
def sentence_generation(model, tokenizer, current_word, n, maxlen=5):
    """Greedily extend a seed text by up to ``n`` predicted words.

    Args:
        model: Trained model; ``model.predict`` is called on a padded batch
            of one integer sequence and must return per-index scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text (one or more space-separated words).
        n: Maximum number of words to generate.
        maxlen: Length the encoded context is padded/truncated to before
            prediction. Defaults to 5, the value this function previously
            hard-coded.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in the vocabulary (for example the padding index 0).
    """
    init_word = current_word

    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    for _ in range(n):
        # Integer-encode the text so far and left-pad it, e.g. [2] -> [[0 0 0 0 2]]
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=maxlen, padding='pre')

        # Most probable next index for the single sample in the batch.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # Previously an unmatched index silently reused the last word of
            # the vocabulary; stop instead of emitting a wrong word.
            break

        # The predicted word becomes part of the context for the next step.
        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)

In [81]:
print(sentence_generation(model, tokenizer, '고와야',7 )) # an RNN's prediction is influenced by which words come before it.

고와야 말이 법이다 말이 말이 오는 말이 곱다


In [86]:
import pandas as pd
import numpy as np
from string import punctuation  #특수문자 사용가능 라이브러리

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [100]:
# Load the article data; the CSV path is relative to the current working directory.
df = pd.read_csv('ArticlesApril2018.csv')
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [101]:
# Show the number of columns, then the column names.
print('열의 개수: ',len(df.columns))
print(df.columns)

열의 개수:  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [102]:
print(df['headline'].isnull().values.any())

False


In [103]:
headline=[]

In [104]:
headline.extend(list(df.headline.values))

In [105]:
headline

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?',
 'Commuter Reprogramming',
 'Unknown',
 'Unknown',
 'Ford Changed Leaders, Looking for a Lift. It’s Still Looking.',
 'Romney Failed to Win at Utah Convention, But Few Believe He’s Doomed',
 'Chain Reaction',
 'He Forced the Vatican to Investigate Sex Abuse. Now He’s Meeting With Pope Francis.',
 'In Berlin, artists find a home',
 'Unknown',
 'The Right Stuff',
 'Jimmy Carter Knows What North Korea Wants',
 'The Truth Is Out There',
 'New Jersey Ruling Could Reignite Battle Over Church-State Separation',
 'Procrastinating',
 'Word + Quiz: dilatory',
 'My Life-Threatening Bout With E. Coli Food Poisoning',
 'Choosing Brexit, a Town Yearned for Its Seafa

In [106]:
print('총 샘플의 개수 : {}'.format(len(headline)))

총 샘플의 개수 : 1324


In [107]:
# Entries whose headline is the literal "Unknown" are treated as noise and dropped.
headline = [title for title in headline if title != "Unknown"]
print(f'노이즈값 제거 후 샘플의 개수 : {len(headline)}')

노이즈값 제거 후 샘플의 개수 : 1214


In [108]:
headline

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?',
 'Commuter Reprogramming',
 'Ford Changed Leaders, Looking for a Lift. It’s Still Looking.',
 'Romney Failed to Win at Utah Convention, But Few Believe He’s Doomed',
 'Chain Reaction',
 'He Forced the Vatican to Investigate Sex Abuse. Now He’s Meeting With Pope Francis.',
 'In Berlin, artists find a home',
 'The Right Stuff',
 'Jimmy Carter Knows What North Korea Wants',
 'The Truth Is Out There',
 'New Jersey Ruling Could Reignite Battle Over Church-State Separation',
 'Procrastinating',
 'Word + Quiz: dilatory',
 'My Life-Threatening Bout With E. Coli Food Poisoning',
 'Choosing Brexit, a Town Yearned for Its Seafaring Past, and Muddied Its Future',
 'A Quote Disproved',
 'Hot Stuff Turns Cold',
 'At the Top,

In [109]:
def repreprocessing(raw_sentence):
    """Normalize a headline: drop non-ASCII characters and punctuation, then lowercase.

    Whitespace is left untouched, so runs of spaces in the input survive.
    """
    # Round-trip through ASCII, ignoring errors, to discard characters such as curly quotes.
    ascii_only = raw_sentence.encode("utf8").decode("ascii", "ignore")
    # A deletion table removes every string.punctuation character in one pass.
    strip_punctuation = str.maketrans("", "", punctuation)
    return ascii_only.translate(strip_punctuation).lower()

# Normalize every headline and show the first five results.
preprocessed_headline = list(map(repreprocessing, headline))
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [110]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [111]:
# Fit a fresh tokenizer on the cleaned headlines. One extra slot is added to
# the vocabulary size so that index 0 is covered as well.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = 1 + len(tokenizer.word_index)
print(f'단어 집합의 크기 : {vocab_size}')

단어 집합의 크기 : 3494


In [112]:
tokenizer.word_index

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'in': 5,
 'for': 6,
 'and': 7,
 'is': 8,
 'on': 9,
 'with': 10,
 'trump': 11,
 'as': 12,
 'at': 13,
 'new': 14,
 'how': 15,
 'from': 16,
 'it': 17,
 'an': 18,
 'that': 19,
 'be': 20,
 'season': 21,
 'us': 22,
 'you': 23,
 'its': 24,
 'what': 25,
 'episode': 26,
 'can': 27,
 'your': 28,
 'not': 29,
 'he': 30,
 'now': 31,
 'his': 32,
 'are': 33,
 'teaching': 34,
 'war': 35,
 'out': 36,
 'no': 37,
 'was': 38,
 'by': 39,
 'trumps': 40,
 'has': 41,
 'over': 42,
 'may': 43,
 'into': 44,
 'why': 45,
 'more': 46,
 'we': 47,
 'who': 48,
 'about': 49,
 'recap': 50,
 'activities': 51,
 '1': 52,
 'just': 53,
 'do': 54,
 'women': 55,
 'when': 56,
 'syria': 57,
 'trade': 58,
 'i': 59,
 '2': 60,
 'or': 61,
 'will': 62,
 'this': 63,
 'have': 64,
 'president': 65,
 'but': 66,
 'home': 67,
 'up': 68,
 'long': 69,
 'one': 70,
 'off': 71,
 'facebook': 72,
 'house': 73,
 'gop': 74,
 'our': 75,
 'case': 76,
 'they': 77,
 'life': 78,
 'end': 79,
 'right': 80,
 'some':

In [114]:
# Expand each headline into next-word training samples: every prefix of its
# integer encoding that holds at least two tokens.
sequences = []
for sentence in preprocessed_headline:
    encoded = tokenizer.texts_to_sequences([sentence])[0]  # integer-encode one headline
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [115]:
# Inverse of tokenizer.word_index: maps each integer index back to its word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [116]:
index_to_word # entries are listed in descending order of word frequency

{1: 'the',
 2: 'a',
 3: 'to',
 4: 'of',
 5: 'in',
 6: 'for',
 7: 'and',
 8: 'is',
 9: 'on',
 10: 'with',
 11: 'trump',
 12: 'as',
 13: 'at',
 14: 'new',
 15: 'how',
 16: 'from',
 17: 'it',
 18: 'an',
 19: 'that',
 20: 'be',
 21: 'season',
 22: 'us',
 23: 'you',
 24: 'its',
 25: 'what',
 26: 'episode',
 27: 'can',
 28: 'your',
 29: 'not',
 30: 'he',
 31: 'now',
 32: 'his',
 33: 'are',
 34: 'teaching',
 35: 'war',
 36: 'out',
 37: 'no',
 38: 'was',
 39: 'by',
 40: 'trumps',
 41: 'has',
 42: 'over',
 43: 'may',
 44: 'into',
 45: 'why',
 46: 'more',
 47: 'we',
 48: 'who',
 49: 'about',
 50: 'recap',
 51: 'activities',
 52: '1',
 53: 'just',
 54: 'do',
 55: 'women',
 56: 'when',
 57: 'syria',
 58: 'trade',
 59: 'i',
 60: '2',
 61: 'or',
 62: 'will',
 63: 'this',
 64: 'have',
 65: 'president',
 66: 'but',
 67: 'home',
 68: 'up',
 69: 'long',
 70: 'one',
 71: 'off',
 72: 'facebook',
 73: 'house',
 74: 'gop',
 75: 'our',
 76: 'case',
 77: 'they',
 78: 'life',
 79: 'end',
 80: 'right',
 81: 'so

In [117]:
sequences

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3],
 [100, 3, 1117],
 [100, 3, 1117, 2],
 [100, 3, 1117, 2, 14],
 [100, 3, 1117, 2, 14, 583],
 [100, 3, 1117, 2, 14, 583, 24],
 [100, 3, 1117, 2, 14, 583, 24, 1118],
 [100, 3, 1117, 2, 14, 583, 24, 1118, 373],
 [100, 3, 1117, 2, 14, 583, 24, 1118, 373, 374],
 [100, 3, 1117, 2, 14, 583, 24, 1118, 373, 374, 5],
 [100, 3, 1117, 2, 14, 583, 24, 1118, 373, 374, 5, 1119],
 [1, 14],
 [1, 14, 1120],
 [1, 14, 1120, 1121],
 [15, 2],
 [15, 2, 584],
 [15, 2, 584, 4],
 [15, 2, 584, 4, 215],
 [15, 2, 584, 4, 215, 375],
 [15, 2, 584, 4, 215, 375, 1122],
 [15, 2, 584, 4, 215, 375, 1122, 2],
 [15, 2, 584, 4, 215, 375, 1122, 2, 376],
 [15, 2, 584, 4, 215, 375, 1122, 2, 376, 

In [118]:
len(sequences)

7803

In [119]:
# Length of the longest sample; used as the common length when padding.
max_len = max(map(len, sequences))
print(f'샘플의 최대 길이 : {max_len}')

샘플의 최대 길이 : 24


In [120]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [121]:
sequences

array([[   0,    0,    0, ...,    0,   99,  269],
       [   0,    0,    0, ...,   99,  269,  371],
       [   0,    0,    0, ...,  269,  371, 1115],
       ...,
       [   0,    0,    0, ...,    8, 3493,  115],
       [   0,    0,    0, ..., 3493,  115,    2],
       [   0,    0,    0, ...,  115,    2, 1025]])

In [122]:
# Split every padded sample into its context (all tokens but the last) and
# its target (the last token).
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [129]:
X.shape

(7803, 23)

In [124]:
y

array([ 269,  371, 1115, ...,  115,    2, 1025])

In [125]:
vocab_size # the model predicts over 3494 dimensions

3494

In [126]:
y = to_categorical(y, num_classes=vocab_size)

In [127]:
y.shape # 7803 target words, one for each of the 7803 training samples
# each target word is one-hot encoded as a 3494-dimensional vector

(7803, 3494)

In [128]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [130]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [131]:
embedding_dim = 10   # each token id is represented as a 10-dimensional vector
hidden_units = 128   # width of the LSTM output

# Next-word classifier: token ids -> embeddings -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, embedding_dim),      # vocab_size token ids -> 10-dim vectors
    LSTM(hidden_units),                        # emits a 128-dim vector
    Dense(vocab_size, activation='softmax'),   # one probability per vocabulary index
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
244/244 - 5s - loss: 7.6373 - accuracy: 0.0263 - 5s/epoch - 22ms/step
Epoch 2/200
244/244 - 3s - loss: 7.1121 - accuracy: 0.0301 - 3s/epoch - 12ms/step
Epoch 3/200
244/244 - 3s - loss: 6.9759 - accuracy: 0.0313 - 3s/epoch - 11ms/step
Epoch 4/200
244/244 - 3s - loss: 6.8497 - accuracy: 0.0404 - 3s/epoch - 11ms/step
Epoch 5/200
244/244 - 3s - loss: 6.7004 - accuracy: 0.0436 - 3s/epoch - 11ms/step
Epoch 6/200
244/244 - 3s - loss: 6.5233 - accuracy: 0.0463 - 3s/epoch - 11ms/step
Epoch 7/200
244/244 - 3s - loss: 6.3278 - accuracy: 0.0540 - 3s/epoch - 11ms/step
Epoch 8/200
244/244 - 3s - loss: 6.1220 - accuracy: 0.0566 - 3s/epoch - 11ms/step
Epoch 9/200
244/244 - 3s - loss: 5.9248 - accuracy: 0.0602 - 3s/epoch - 11ms/step
Epoch 10/200
244/244 - 3s - loss: 5.7316 - accuracy: 0.0664 - 3s/epoch - 11ms/step
Epoch 11/200
244/244 - 3s - loss: 5.5569 - accuracy: 0.0716 - 3s/epoch - 11ms/step
Epoch 12/200
244/244 - 3s - loss: 5.3913 - accuracy: 0.0759 - 3s/epoch - 11ms/step
Epoch 13/200


Epoch 100/200
244/244 - 3s - loss: 0.7968 - accuracy: 0.8436 - 3s/epoch - 12ms/step
Epoch 101/200
244/244 - 3s - loss: 0.7635 - accuracy: 0.8524 - 3s/epoch - 12ms/step
Epoch 102/200
244/244 - 3s - loss: 0.7446 - accuracy: 0.8570 - 3s/epoch - 12ms/step
Epoch 103/200
244/244 - 3s - loss: 0.7226 - accuracy: 0.8595 - 3s/epoch - 12ms/step
Epoch 104/200
244/244 - 3s - loss: 0.7067 - accuracy: 0.8624 - 3s/epoch - 12ms/step
Epoch 105/200
244/244 - 3s - loss: 0.6890 - accuracy: 0.8651 - 3s/epoch - 12ms/step
Epoch 106/200
244/244 - 3s - loss: 0.6738 - accuracy: 0.8698 - 3s/epoch - 12ms/step
Epoch 107/200
244/244 - 3s - loss: 0.6589 - accuracy: 0.8715 - 3s/epoch - 12ms/step
Epoch 108/200
244/244 - 3s - loss: 0.6455 - accuracy: 0.8736 - 3s/epoch - 11ms/step
Epoch 109/200
244/244 - 3s - loss: 0.6300 - accuracy: 0.8793 - 3s/epoch - 12ms/step
Epoch 110/200
244/244 - 3s - loss: 0.6172 - accuracy: 0.8788 - 3s/epoch - 12ms/step
Epoch 111/200
244/244 - 3s - loss: 0.6062 - accuracy: 0.8800 - 3s/epoch - 12

Epoch 198/200
244/244 - 3s - loss: 0.2660 - accuracy: 0.9168 - 3s/epoch - 11ms/step
Epoch 199/200
244/244 - 3s - loss: 0.2668 - accuracy: 0.9162 - 3s/epoch - 11ms/step
Epoch 200/200
244/244 - 3s - loss: 0.2647 - accuracy: 0.9173 - 3s/epoch - 11ms/step


<keras.src.callbacks.History at 0x1177c117d10>

In [132]:
def sentence_generation(model, tokenizer, current_word, n, maxlen=None):
    """Greedily extend a seed text by up to ``n`` predicted words.

    Args:
        model: Trained model; ``model.predict`` is called on a padded batch
            of one integer sequence and must return per-index scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text (one or more space-separated words).
        n: Maximum number of words to generate.
        maxlen: Length the encoded context is padded/truncated to before
            prediction. When omitted, the module-level ``max_len - 1`` is
            used, exactly as this function did before.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in the vocabulary (for example the padding index 0).
    """
    if maxlen is None:
        # The model input is one token shorter than the padded samples,
        # because the last token of each sample is the target.
        maxlen = max_len - 1

    init_word = current_word

    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    for _ in range(n):
        # Integer-encode the text so far and left-pad it to the model input length.
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=maxlen, padding='pre')

        # Most probable next index for the single sample in the batch.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # Previously an unmatched index silently reused the last word of
            # the vocabulary; stop instead of emitting a wrong word.
            break

        # The predicted word becomes part of the context for the next step.
        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)

In [133]:
print(sentence_generation(model, tokenizer, 'i', 10))

i disapprove of school vouchers can i still apply for them


In [134]:
print(sentence_generation(model, tokenizer, 'how', 10))

how to make facebook more accountable can live on ohio of
