## Data Preparation

In [108]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [109]:
# 문장데이터

with open('sentences_5.txt', 'r', encoding='utf-8') as file:
    sentences_5 = file.read()

sentences_5 = sentences_5.split("\n")  # 개행을 공백으로 변경
sentences_5

['How was your day',
 'I love you',
 'You seem to be tired',
 'How have you been',
 'I have to go now',
 'See you later',
 'Take your time',
 'Hear me out',
 'I am free today',
 'He opened the door slowly',
 'She looked up and smiled',
 'They walked down the street',
 'Birds were flying above us',
 'The sun was shining brightly',
 'The child ran towards home',
 'It was a beautiful day',
 'He quickly closed the book',
 'She found a hidden message',
 'We danced under the stars',
 'They laughed at the joke',
 'The rain began to fall',
 'He answered without any hesitation',
 'She whispered into his ear',
 'We ate dinner outside tonight',
 'They watched the movie together',
 'The dog barked at strangers',
 'She forgot to bring lunch',
 'He took a long breath',
 'They traveled across the country',
 'He painted the wall blue',
 'She counted the stars above',
 'We swam in the ocean yesterday',
 'They met at the cafe',
 'He discovered a secret passage',
 'I like apple',
 'The sun rises every mo

In [110]:
# vocabulary 만들기
all_words_str = ",".join(sentences_5).replace(",", " ")
print(all_words_str)
all_words_lst = all_words_str.split(" ")

vocab = all_words_lst
print(vocab)

How was your day I love you You seem to be tired How have you been I have to go now See you later Take your time Hear me out I am free today He opened the door slowly She looked up and smiled They walked down the street Birds were flying above us The sun was shining brightly The child ran towards home It was a beautiful day He quickly closed the book She found a hidden message We danced under the stars They laughed at the joke The rain began to fall He answered without any hesitation She whispered into his ear We ate dinner outside tonight They watched the movie together The dog barked at strangers She forgot to bring lunch He took a long breath They traveled across the country He painted the wall blue She counted the stars above We swam in the ocean yesterday They met at the cafe He discovered a secret passage I like apple The sun rises every morning She reads books every night They play soccer after school We eat lunch at noon He enjoys walking in parks The cat sleeps all day Birds s

In [111]:
# vector space embedding하기 위해 단어와 integer(정수)사이의 양방향 mapping
vocab = list(set(vocab))
word_int_dict = {w: i for i, w in enumerate(vocab)}
print(word_int_dict)
int_word_dict = {i: w for i, w in enumerate(vocab)}
print(int_word_dict)

{'traveled': 0, 'hike': 1, 'whispered': 2, 'am': 3, 'treehouses': 4, 'songs': 5, 'grass': 6, 'swim': 7, 'stars': 8, 'for': 9, 'roof': 10, 'sing': 11, 'shade': 12, 'letters': 13, 'shines': 14, 'sleep': 15, 'toys': 16, 'poems': 17, 'time': 18, 'winter': 19, 'snow': 20, 'on': 21, 'movies': 22, 'paints': 23, 'quietly': 24, 'cookies': 25, 'Take': 26, 'Stars': 27, 'sets': 28, 'Flowers': 29, 'houses': 30, 'clay': 31, 'slowly': 32, 'life': 33, 'every': 34, 'peaceful': 35, 'creates': 36, 'walls': 37, 'soul': 38, 'forests': 39, 'loves': 40, 'sings': 41, 'summer': 42, 'message': 43, 'at': 44, 'horse': 45, 'read': 46, 'cool': 47, 'found': 48, 'evening': 49, 'plants': 50, 'its': 51, 'outside': 52, 'rocks': 53, 'swims': 54, 'Rain': 55, 'calm': 56, 'guitar': 57, 'kids': 58, 'door': 59, 'carves': 60, 'They': 61, 'art': 62, 'It': 63, 'day': 64, 'love': 65, 'plan': 66, 'runs': 67, 'animal': 68, 'write': 69, 'tired': 70, 'I': 71, 'walk': 72, 'rises': 73, 'discovered': 74, 'during': 75, 'sleeps': 76, 'fre

In [112]:
print(len(vocab), len(set(vocab)))


346 346


In [113]:
id_mat = np.eye(len(vocab))
id_mat

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [114]:
# training data

input_batch = []
target_batch = []

# input_batch = [[she loves], [i like], [they are]]
# target_batch = [[me], [cat], [happy]]
"she loves me"
for sentence in sentences_5:
    words = sentence.split(" ")
    input = [word_int_dict[w] for w in words[:-1]]
    target = [word_int_dict[w] for w in words[-1:]]

    # print(f"input : {input}")
    # print(f"target : {target}")

    input_batch.append(id_mat[input])
    target_batch.append(target)


print(f'input batch : {input_batch}')
print(f'target batch : {target_batch}')

input batch : [array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
     

In [115]:
for i, array in enumerate(input_batch):
    print(f"Index {i}: Shape {array.shape}")

import numpy as np

# 최대 크기 찾기
max_shape = tuple(max(sizes) for sizes in zip(*(arr.shape for arr in input_batch)))

# 패딩 적용
padded_batch = []
for array in input_batch:
    padded_array = np.zeros(max_shape, dtype=np.float32)  # 최대 크기로 초기화
    slices = tuple(slice(0, dim) for dim in array.shape)
    padded_array[slices] = array  # 기존 배열 값 복사
    padded_batch.append(padded_array)

input_batch = np.stack(padded_batch)  # 리스트를 numpy 배열로 변환


Index 0: Shape (3, 346)
Index 1: Shape (2, 346)
Index 2: Shape (4, 346)
Index 3: Shape (3, 346)
Index 4: Shape (4, 346)
Index 5: Shape (2, 346)
Index 6: Shape (2, 346)
Index 7: Shape (2, 346)
Index 8: Shape (3, 346)
Index 9: Shape (4, 346)
Index 10: Shape (4, 346)
Index 11: Shape (4, 346)
Index 12: Shape (4, 346)
Index 13: Shape (4, 346)
Index 14: Shape (4, 346)
Index 15: Shape (4, 346)
Index 16: Shape (4, 346)
Index 17: Shape (4, 346)
Index 18: Shape (4, 346)
Index 19: Shape (4, 346)
Index 20: Shape (4, 346)
Index 21: Shape (4, 346)
Index 22: Shape (4, 346)
Index 23: Shape (4, 346)
Index 24: Shape (4, 346)
Index 25: Shape (4, 346)
Index 26: Shape (4, 346)
Index 27: Shape (4, 346)
Index 28: Shape (4, 346)
Index 29: Shape (4, 346)
Index 30: Shape (4, 346)
Index 31: Shape (5, 346)
Index 32: Shape (4, 346)
Index 33: Shape (4, 346)
Index 34: Shape (2, 346)
Index 35: Shape (4, 346)
Index 36: Shape (4, 346)
Index 37: Shape (4, 346)
Index 38: Shape (4, 346)
Index 39: Shape (4, 346)
Index 40: 

In [116]:
import numpy as np
import torch

# input_batch 처리
input_batch = np.stack(input_batch)  # 리스트를 numpy 배열로 변환
input_batch = torch.tensor(input_batch, dtype=torch.float32)  # torch 텐서로 변환

# target_batch 처리
target_batch = np.array(target_batch).flatten()  # 2D 리스트를 1D numpy 배열로 변환
target_batch = torch.tensor(target_batch, dtype=torch.int64)  # torch 텐서로 변환

# 최종 확인
print(f"Input batch shape: {input_batch.shape}")  # (배치 크기, 시퀀스 길이, 입력 차원)
print(f"Target batch shape: {target_batch.shape}")  # (배치 크기,)


Input batch shape: torch.Size([130, 5, 346])
Target batch shape: torch.Size([130])


In [117]:
print(input_batch)
print([len(seq) if isinstance(seq, list) else "Not a list" for seq in input_batch])


tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0.,

## RNN

In [118]:
INPUT_SIZE = len(vocab)
NUM_LAYERS = 1
HIDDEN_DIM = 4

In [119]:
class SimpleRNN(nn.Module):
    def __init__(self) -> None:
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size=INPUT_SIZE, num_layers=1, hidden_size=HIDDEN_DIM)
        self.fc = nn.Linear(HIDDEN_DIM, len(vocab))


    def forward(self, initial_hidden, X):
        X = X.transpose(0, 1)
        outs, hiddens  =  self.rnn(X, initial_hidden)
        out = outs[-1]
        final_out = self.fc(out)
        return final_out

model = SimpleRNN()

In [120]:
test_input_lst = torch.zeros(1, 2, INPUT_SIZE, dtype=torch.float32)  # 크기 (1, 2, 638)로 선언
test_input_tensor = torch.tensor(test_input_lst, dtype=torch.float32, requires_grad=False)
print(test_input_tensor)

test_initial_hidden = torch.zeros(NUM_LAYERS, 1, HIDDEN_DIM, requires_grad=False) # [[[0, 0, 0, 0]]]
print(test_initial_hidden)

result = model(test_initial_hidden, test_input_tensor)
print(result)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

  test_input_tensor = torch.tensor(test_input_lst, dtype=torch.float32, requires_grad=False)


In [121]:
# Training

LEARNING_RATE = 0.01
BATCH_SIZE = 130
TOTAL_EPOCH = 2000

model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

print(f'input_batch: {input_batch}')

initial_hidden = torch.zeros(NUM_LAYERS, BATCH_SIZE, HIDDEN_DIM, requires_grad=False)

for epoch in range(TOTAL_EPOCH):

    output = model(initial_hidden, input_batch)
    loss = criterion(output, target_batch.flatten())

    print(f'EPOCH : {epoch + 1}, cost : {loss}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


input_batch: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [122]:
# prediction

model.eval()

trained_output = model(initial_hidden, input_batch)
print(f'trained_output : {trained_output}')


trained_output : tensor([[-3.3179, -2.5374, -2.7522,  ..., -3.5866, -2.5464, -3.0870],
        [-3.6675, -2.7953, -3.1420,  ..., -4.0012, -2.9744, -3.3709],
        [-0.8010, -0.2074, -0.7093,  ..., -0.7600, -0.0591, -0.1116],
        ...,
        [-2.9299, -1.9319, -2.3096,  ..., -3.2101, -2.0779, -2.6532],
        [-1.2861, -0.2889, -0.6257,  ..., -1.1209,  0.0789, -0.8334],
        [-3.5746, -2.6290, -3.0225,  ..., -3.9151, -2.8520, -3.2708]],
       grad_fn=<AddmmBackward0>)


In [123]:
# output을 prediction data로 쓸 수 있는 형태로 가져오는 과정
print(trained_output.data)
print(trained_output.data.max())
print(trained_output.data.max(1))
print(trained_output.data.max(1)[1])

tensor([[-3.3179, -2.5374, -2.7522,  ..., -3.5866, -2.5464, -3.0870],
        [-3.6675, -2.7953, -3.1420,  ..., -4.0012, -2.9744, -3.3709],
        [-0.8010, -0.2074, -0.7093,  ..., -0.7600, -0.0591, -0.1116],
        ...,
        [-2.9299, -1.9319, -2.3096,  ..., -3.2101, -2.0779, -2.6532],
        [-1.2861, -0.2889, -0.6257,  ..., -1.1209,  0.0789, -0.8334],
        [-3.5746, -2.6290, -3.0225,  ..., -3.9151, -2.8520, -3.2708]])
tensor(30.3007)
torch.return_types.max(
values=tensor([25.9704, 29.5894, 25.1381, 24.9239, 25.0114, 24.8682, 20.7059, 19.1543,
        26.6839, 25.8107, 25.7069, 27.8873, 16.0924, 28.6362, 20.3423, 25.8661,
        20.1340, 20.2549, 19.1727, 25.9332, 26.0977, 17.2891, 29.6841, 15.6558,
        19.7527, 29.7965, 24.6345, 24.5312, 29.5710, 24.0478, 24.7034, 28.3299,
        28.2892, 25.9849, 29.9121, 26.0133, 23.5934, 26.3058, 21.1224, 27.9307,
        25.9555, 25.5887, 24.2711, 24.3303, 24.9296, 24.8281, 23.5194, 25.2767,
        17.4938, 23.3379, 18.9284, 19.9

In [124]:
# input에 대해 테스트
predictions = trained_output.data.max(1)[1]

for i in range(0, 3):
    print(int_word_dict[predictions[i].item()])

day
ear
tired


In [125]:
predictions

tensor([ 64, 272,  70, 194, 300, 179,  18, 275, 204,  32, 238, 210, 124, 315,
        232,  64, 290,  43,   8, 178,  90, 292, 272, 250, 138, 257, 155, 109,
        159, 313, 171, 283, 306, 287,  87, 321, 153, 263, 312, 269,  64, 321,
        153,  62, 166, 342, 240,  38,  33, 320, 108, 274,  53, 282, 330, 181,
        208, 158,  99,  42, 306, 309, 160,  19,  49, 298, 306, 240, 254,  81,
          6, 166, 288, 280, 119, 170, 288,  24, 243,  91, 274,  10,  22, 291,
         31, 328, 259, 321, 108, 222, 257,  49, 233, 166,  35, 189,  42, 321,
         85, 166, 337, 261,   8,  16, 203, 117, 147, 105, 330, 107, 164, 288,
        277, 166, 243, 153,  49, 278,  42,  15,  12, 111,  37, 268, 291,  56,
         95, 321, 192, 148])

# PR1 마지막 단어 예측

In [126]:
# prediction이 얼마나 맞는지 정확도 체크

correct_cnt = 0

for i in range(0, len(sentences_5)):
    pred = int_word_dict[predictions[i].item()]
    sentences_splitted = sentences_5[i].split(" ")
    sentence_size = len(sentences_splitted)
    print(f'input: {sentences_splitted[0:sentence_size-1]}  , true : {sentences_splitted[sentence_size-1]} , pred : {pred}')
    if sentences_splitted[sentence_size-1] == pred:
        correct_cnt += 1

correct_ratio = correct_cnt / len(sentences_5)
print(f'correct ratio : {correct_ratio}')

input: ['How', 'was', 'your']  , true : day , pred : day
input: ['I', 'love']  , true : you , pred : ear
input: ['You', 'seem', 'to', 'be']  , true : tired , pred : tired
input: ['How', 'have', 'you']  , true : been , pred : been
input: ['I', 'have', 'to', 'go']  , true : now , pred : now
input: ['See', 'you']  , true : later , pred : later
input: ['Take', 'your']  , true : time , pred : time
input: ['Hear', 'me']  , true : out , pred : out
input: ['I', 'am', 'free']  , true : today , pred : today
input: ['He', 'opened', 'the', 'door']  , true : slowly , pred : slowly
input: ['She', 'looked', 'up', 'and']  , true : smiled , pred : smiled
input: ['They', 'walked', 'down', 'the']  , true : street , pred : street
input: ['Birds', 'were', 'flying', 'above']  , true : us , pred : us
input: ['The', 'sun', 'was', 'shining']  , true : brightly , pred : brightly
input: ['The', 'child', 'ran', 'towards']  , true : home , pred : home
input: ['It', 'was', 'a', 'beautiful']  , true : day , pred : d