In [6]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' tokenizer data; presumably needed by word_tokenize,
# which get_max_len calls.
# NOTE(review): newer NLTK releases may additionally require the 'punkt_tab'
# resource -- confirm against the installed version.
nltk.download('punkt')
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Reserved vocabulary indices; they match entries 0 and 1 of Lang.index2word.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one side of the data set.

    Maintains a word -> index table, the inverse index -> word table and a
    per-word occurrence count. Indices 0 and 1 are pre-assigned to the
    "SOS" and "EOS" markers, so the first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # already counts SOS and EOS

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces.

        Consecutive or trailing spaces therefore yield empty-string tokens,
        which are registered like any other word.
        """
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is NFD-decomposed and every character whose Unicode category
    is 'Mn' is dropped. All other characters are kept as they are, including
    non-ASCII ones that have no such decomposition.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lower-case, trim and simplify ``s`` for vocabulary building.

    After lower-casing, stripping surrounding whitespace and removing
    combining marks (via unicodeToAscii), a space is inserted before each
    '.', '!' or '?', and every run of characters other than ASCII letters
    and those three punctuation marks is replaced by a single space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation so it becomes a token of its own.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse everything that is neither a letter nor . ! ? into one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

#/content/Data.xlsx
def get_max_len(pairs):
  """Print and return the longest token counts found in ``pairs``.

  ``pairs`` is converted to a NumPy array; column 0 is treated as the text
  and column 1 as the keyword string. Each entry is tokenized with
  ``word_tokenize`` and the maximum token count per column is taken.

  Returns:
    (word_max_len, text_max_len) -- note the order: keywords first.

  NOTE(review): these counts come from ``word_tokenize``, whereas
  tensorFromSentence splits on single spaces and appends EOS, so the two
  lengths are not guaranteed to agree -- confirm the value is large enough
  wherever it is used as ``max_length``.
  """
  table = np.array(pairs)
  text_lengths = [len(word_tokenize(sentence)) for sentence in table[:, 0]]
  word_lengths = [len(word_tokenize(keywords)) for keywords in table[:, 1]]
  text_max_len = max(text_lengths)
  word_max_len = max(word_lengths)
  print(f'text_max_len = {text_max_len}\nword_max_len = {word_max_len}')
  return word_max_len, text_max_len

def readLangs(lang1, lang2, reverse=False, data = list):
    """Turn a table of string rows into normalized pairs and two empty vocabularies.

    Args:
        lang1, lang2: names given to the two Lang instances.
        reverse: when true, each normalized pair is reversed and the two
            names are swapped between the input and output vocabulary.
        data: object whose ``.values`` attribute yields the rows (the
            notebook passes a pandas DataFrame). The default ``list`` is
            only a placeholder and does not work if left in place.

    Returns:
        (input_lang, output_lang, pairs, Non_norm_pairs) where ``pairs``
        holds the normalized (and possibly reversed) rows and
        ``Non_norm_pairs`` the untouched rows in their original order;
        the raw rows are never reversed.
    """
    print("Reading lines...")
    rows = data.values

    raw_pairs = [list(row) for row in rows]
    # Normalize every cell of every row.
    normalized = [[normalizeString(cell) for cell in row] for row in rows]

    # Decide which name goes to which vocabulary, flipping the pairs if asked.
    if reverse:
        normalized = [pair[::-1] for pair in normalized]
        input_name, output_name = lang2, lang1
    else:
        input_name, output_name = lang1, lang2

    input_lang = Lang(input_name)
    output_lang = Lang(output_name)
    return input_lang, output_lang, normalized, raw_pairs


# Prefixes accepted by filterPair, which tests them with str.startswith on
# the second element of a pair. Some entries end in a space and some do not.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Tell whether pair ``p`` passes the length and prefix filter.

    Both elements must split (on single spaces) into fewer than MAX_LENGTH
    tokens, and the second element must start with one of ``eng_prefixes``.

    NOTE(review): MAX_LENGTH is not defined in the visible part of this
    notebook -- confirm it exists before enabling the filter.
    """
    if not (len(p[0].split(' ')) < MAX_LENGTH):
        return False
    if not (len(p[1].split(' ')) < MAX_LENGTH):
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return, in their original order, the pairs accepted by filterPair."""
    return list(filter(filterPair, pairs))


def prepareData(lang1, lang2, reverse=False,data = list):
    """Read the pairs and fill both vocabularies.

    Delegates to readLangs, then adds the second element of every pair to
    the input vocabulary and the first element to the output vocabulary.
    Progress, the full list of pairs and the vocabulary sizes are printed.
    The pair filter is currently not applied, so the "Trimmed" count equals
    the "Read" count.

    Returns:
        (input_lang, output_lang, pairs, non_pairs) as produced by
        readLangs, with the two Lang objects populated.
    """
    input_lang, output_lang, pairs, non_pairs = readLangs(lang1, lang2, reverse, data)
    pair_count = len(pairs)
    print("++++++++++readLANGS++++++++",pairs)
    print("Read %s sentence pairs" % pair_count)
    print("Trimmed to %s sentence pairs" % pair_count)
    print("Counting words...")
    for pair in pairs:
        # Element 1 feeds the input side, element 0 the output side.
        input_lang.addSentence(pair[1])
        output_lang.addSentence(pair[0])
    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, pairs, non_pairs


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Same device selection as in the first cell (this cell repeats its setup).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the two sizes noted below are not referenced by any code in
# this cell -- confirm whether they are still accurate.
#input size == 8
#output size = 32
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows of the embedding table.
        hidden_size: width of both the embedding and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, run one GRU step and return (output, hidden)."""
        # The embedding is reshaped to (1, 1, -1) before entering the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(step_input, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level device."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    """GRU decoder without attention that emits log-probabilities per step.

    Args:
        hidden_size: width of the embedding and of the GRU state.
        output_size: size of the embedding table and of the output layer.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step and return (log_probs, hidden)."""
        # Embedding reshaped to (1, 1, -1), then passed through ReLU.
        step_input = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(step_input, hidden)
        # Project the first (only) time step and apply log-softmax over dim 1.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level device."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embedding, the GRU state and each encoder
            output row.
        output_size: size of the embedding table and of the output layer.
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of attention weights produced per step; the
            ``encoder_outputs`` passed to forward must have that many rows.
            The default ``int`` is only a placeholder -- a real integer
            must be supplied.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=int):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores one weight per encoder position from [embedded ; hidden].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Merges [embedded ; attended context] back to hidden_size.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            (log_probs, hidden, attn_weights): log-softmax over the output
            vocabulary, the new GRU state, and the softmax-normalized
            attention weights used for this step.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))
        token_vec = embedded[0]

        # Attention weights from the current token and the previous state.
        attn_scores = self.attn(torch.cat((token_vec, hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = self.attn_combine(torch.cat((token_vec, context[0]), 1))
        gru_in = F.relu(combined.unsqueeze(0))
        gru_out, hidden = self.gru(gru_in, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level device."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Probability with which train() feeds the ground-truth token, rather than
# the decoder's own prediction, as the next decoder input.
teacher_forcing_ratio = 0.5

def indexesFromSentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its index in ``lang``.

    Tokens are obtained with ``sentence.split(' ')``, so doubled or trailing
    spaces produce empty-string tokens. A token missing from
    ``lang.word2index`` raises KeyError.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns a ``torch.long`` tensor on the module-level device, reshaped
    with ``view(-1, 1)`` so that each row holds one token index.
    """
    indexes = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(indexes, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Build the (input_tensor, target_tensor) tuple for one pair.

    The second element of ``pair`` is encoded with the global ``input_lang``
    and the first element with the global ``output_lang``.
    """
    source_tensor = tensorFromSentence(input_lang, pair[1])
    goal_tensor = tensorFromSentence(output_lang, pair[0])
    return source_tensor, goal_tensor
# text_max_len = 31
# word_max_len = 6
# TODO (homework, due tomorrow!): write code here so that max_length can be computed.
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=int):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded one row at a time, the target is decoded either
    with teacher forcing (probability ``teacher_forcing_ratio``) or from the
    decoder's own predictions, the per-step losses are summed and
    back-propagated, and both optimizers take one step.

    Args:
        input_tensor, target_tensor: tensors indexed along dimension 0, one
            token per row (as produced by tensorFromSentence).
        encoder: module providing ``initHidden()``, ``hidden_size`` and a
            ``(token, hidden) -> (output, hidden)`` call.
        decoder: module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(output, hidden, attention)``.
        encoder_optimizer, decoder_optimizer: optimizers to zero and step.
        criterion: loss called as ``criterion(decoder_output, target_row)``.
        max_length: number of rows of the encoder-output buffer; it must be
            at least the input length. When omitted, ``decoder.max_length``
            is used.

    Returns:
        The summed loss divided by the full target length (also when
        decoding stopped early on a predicted EOS).

    Raises:
        TypeError: if ``max_length`` is omitted and the decoder does not
            provide a usable ``max_length`` attribute.
    """
    # The signature default ``int`` is the type object, not a number, and
    # cannot size a tensor; fall back to the length the decoder was built with.
    if max_length is int or max_length is None:
        max_length = getattr(decoder, 'max_length', None)
        if max_length is int or max_length is None:
            raise TypeError(
                "max_length was not given and the decoder has no usable max_length")

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows beyond the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # The decoder starts from the encoder's final state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the ground-truth token as the next input.
            decoder_input = target_tensor[di]
        else:
            # No teacher forcing: feed the decoder's own best guess,
            # detached so it is not part of the autograd history.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work done so far; the elapsed time is
            divided by it, so it must be non-zero.

    Returns:
        '<elapsed> (- <remaining>)' with both parts formatted by asMinutes.
    """
    elapsed = time.time() - since
    # Projected total duration minus what has already passed.
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01,max_length = int):
    """Train the encoder/decoder for ``n_iters`` single-pair steps with SGD.

    For every iteration a pair is drawn at random from the global ``pairs``
    (all draws happen up front), converted with tensorsFromPair and passed
    to train(). Average losses are printed every ``print_every`` iterations
    and collected every ``plot_every`` iterations; the collected averages
    are finally handed to showPlot.

    Args:
        encoder, decoder: modules to optimise.
        n_iters: number of training steps.
        print_every: interval, in iterations, between progress lines.
        plot_every: interval, in iterations, between recorded loss averages.
        learning_rate: learning rate of both SGD optimizers.
        max_length: forwarded to train(); when omitted,
            ``decoder.max_length`` is used.

    Raises:
        TypeError: if ``max_length`` is omitted and the decoder does not
            provide a usable ``max_length`` attribute.
    """
    # The signature default ``int`` is the type object, not a number;
    # forwarding it to train() would fail when the encoder buffer is built.
    if max_length is int or max_length is None:
        max_length = getattr(decoder, 'max_length', None)
        if max_length is int or max_length is None:
            raise TypeError(
                "max_length was not given and the decoder has no usable max_length")

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every iterations
    plot_loss_total = 0  # reset every plot_every iterations

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # ``step`` is 1-based so the modulo checks fire on multiples of the intervals.
    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion,
                     max_length=max_length)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
# Select matplotlib's 'agg' backend before any figure is created.
# NOTE(review): with this backend figures may not be displayed inline in the
# notebook -- confirm this is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line with y-axis ticks every 0.2.

    Exactly one figure is created per call. The previous version called
    ``plt.figure()`` before ``plt.subplots()``, leaving an extra empty
    figure behind each time.

    Args:
        points: sequence of values to plot against their position.
    """
    _, ax = plt.subplots()
    # Place the major y ticks at regular intervals of 0.2.
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


def evaluate(encoder, decoder, sentence, max_length=int):
    """Greedily decode ``sentence`` and return the words with their attentions.

    Args:
        encoder: module providing ``initHidden()``, ``hidden_size`` and a
            ``(token, hidden) -> (output, hidden)`` call.
        decoder: module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(output, hidden, attention)``.
        sentence: space-separated tokens; each must be present in the global
            ``input_lang`` vocabulary (indexesFromSentence raises KeyError
            otherwise).
        max_length: number of rows of the encoder-output buffer and upper
            bound on the number of decoding steps. When omitted,
            ``decoder.max_length`` is used.

    Returns:
        (decoded_words, attentions): the predicted words, ending with
        '<EOS>' if the decoder produced EOS within ``max_length`` steps,
        and one row of attention weights per decoding step performed.

    Raises:
        TypeError: if ``max_length`` is omitted and the decoder does not
            provide a usable ``max_length`` attribute.
    """
    # The signature default ``int`` is the type object, not a number, and
    # cannot size a tensor; fall back to the length the decoder was built with.
    if max_length is int or max_length is None:
        max_length = getattr(decoder, 'max_length', None)
        if max_length is int or max_length is None:
            raise TypeError(
                "max_length was not given and the decoder has no usable max_length")

    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Rows beyond the input length stay zero.
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        # The decoder starts from the encoder's final state.
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            _, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            # Feed the predicted token back in as the next input.
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

def evaluateRandomly(encoder, decoder, n=10):
    """Print the keywords, reference sentence and prediction for ``n`` random pairs.

    Pairs are drawn from the global ``pairs``; element 1 of a pair is the
    model input and element 0 the reference sentence.

    Args:
        encoder, decoder: trained modules passed on to evaluate(); the
            decoder must expose ``max_length``.
        n: number of pairs to sample.
    """
    for _ in range(n):
        pair = random.choice(pairs)

        print("입력 단어",pair[1])
        print("정답 문장",pair[0])
        # Pass the decoder's own length explicitly: evaluate()'s default for
        # max_length is a placeholder and cannot size the encoder buffer.
        output_words, _attentions = evaluate(
            encoder, decoder, pair[1], max_length=decoder.max_length)
        output_sentence = ' '.join(output_words)
        print("예측 문장", output_sentence)

def evaluateUsers(encoder, decoder):
  """Read keywords from standard input and print the predicted sentence.

  The text is echoed as typed, then normalized with normalizeString (the
  vocabulary was built from normalized text) before being passed to
  evaluate(). Words that are still unknown after normalization raise
  KeyError from the vocabulary lookup.

  Args:
    encoder, decoder: trained modules passed on to evaluate(); the decoder
      must expose ``max_length``.
  """
  words = input()
  print("입력 단어",words)
  # Apply the same normalization as the training data, and trim the spaces
  # it may leave at the ends so no empty token is looked up.
  query = normalizeString(words).strip()
  # Pass the decoder's own length explicitly: evaluate()'s default for
  # max_length is a placeholder and cannot size the encoder buffer.
  output_words, _attentions = evaluate(
      encoder, decoder, query, max_length=decoder.max_length)
  output_sentence = ' '.join(output_words)
  print("예측 문장", output_sentence)
  




In [7]:
# Load the spreadsheet from its absolute path.
# NOTE(review): the hard-coded location only exists in the original
# environment -- adjust when running elsewhere.
data = pd.read_excel('/content/Data2.xlsx')
# Keep the first two columns and drop the final row.
data = data[data.columns[:2]][:-1]
# Preview the first three rows.
data[:3]

Unnamed: 0,Sentence,Word
0,A queen sat at a window.,"queen, window, sit"
1,A queen had a wonderful looking-glass.,"queen, looking-glass"
2,When the queen stood in front of looking-glass...,"queen, looking-glass, say, fairest"


In [8]:
# Build both vocabularies plus the normalized and raw pairs from the table.
input_lang, output_lang, pairs,non_norm_pairs = prepareData('word', 'text', False, data)
# get_max_len returns (word_max_len, text_max_len), measured on the raw pairs.
input_max_len, output_max_len = get_max_len(non_norm_pairs)
# Show one normalized pair as a sanity check.
print(random.choice(pairs))

Reading lines...
++++++++++readLANGS++++++++ [['a queen sat at a window .', 'queen window sit'], ['a queen had a wonderful looking glass .', 'queen looking glass'], ['when the queen stood in front of looking glass and looked at herself in it and said looking glass looking glass who in this land is the fairest of all . ', 'queen looking glass say fairest'], ['the looking glass answered queen you the fairest of all . ', 'looking glass answer fairest'], ['snow white was talking with animal friends .', 'snow white talk animal'], ['but now the poor snow white was all alone in the great forest .', 'snow white forest'], ['show white ran as long as her feet would go until it was almost evening then she saw a little cottage and went into it to rest herself .', 'snow white run cottage rest'], ['little snow white was so hungry and thirsty that she ate some vegetables and bread from plate and drank water out of mug .', 'snow white hungry thirsty ate vegetables bread plate mug water'], ['as snow wh

In [14]:
# Width shared by the embeddings and GRU states of both models.
hidden_size = 256
# The encoder's vocabulary is input_lang; the decoder emits output_lang words.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# The attention span is set to output_max_len, the same value passed to trainIters below.
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1,max_length=output_max_len).to(device)

# Earlier, longer run kept for reference:
#trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
trainIters(encoder1, attn_decoder1, n_iters = 2000, print_every=100,max_length = output_max_len)
print('Train End')

0m 4s (- 1m 26s) (100 5%) 3.4254
0m 8s (- 1m 16s) (200 10%) 2.9585
0m 12s (- 1m 12s) (300 15%) 3.0051
0m 17s (- 1m 8s) (400 20%) 2.7876
0m 21s (- 1m 3s) (500 25%) 2.4224
0m 25s (- 1m 0s) (600 30%) 2.2322
0m 30s (- 0m 57s) (700 35%) 1.7656
0m 35s (- 0m 53s) (800 40%) 1.2436
0m 40s (- 0m 49s) (900 45%) 1.0045
0m 45s (- 0m 45s) (1000 50%) 1.0458
0m 50s (- 0m 41s) (1100 55%) 0.4488
0m 56s (- 0m 37s) (1200 60%) 0.2951
1m 1s (- 0m 33s) (1300 65%) 0.1560
1m 6s (- 0m 28s) (1400 70%) 0.0621
1m 12s (- 0m 24s) (1500 75%) 0.0426
1m 17s (- 0m 19s) (1600 80%) 0.0373
1m 23s (- 0m 14s) (1700 85%) 0.0295
1m 28s (- 0m 9s) (1800 90%) 0.0251
1m 34s (- 0m 4s) (1900 95%) 0.0220
1m 39s (- 0m 0s) (2000 100%) 0.0190
Train End


In [99]:
# Read keywords from standard input and print the sentence the trained model predicts.
evaluateUsers(encoder1, attn_decoder1)

red queen riding hood bed stone
입력 단어 red queen riding hood bed stone
예측 문장 the little red riding hood was entertaining by gathering nuts running after butterflies and gathering bouquets of little flowers . <EOS>
