In [0]:
!pip3 install http://download.pytorch.org/whl/cu90/torch-1.0.0-cp36-cp36m-linux_x86_64.whl
!pip3 install torchvision
!pip3 install tqdm

In [0]:
emb = nn.Embedding(10000, 20, padding_idx=0)
# Embedding 계층의 입력은 int64 Tensor
inp = torch.tensor([1, 2, 5, 2, 10], dtype=torch.int64)
# 출력은 float32 Tensor
out = emb(inp)


In [0]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar xf aclImdb_v1.tar.gz


--2018-12-14 09:30:27--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2018-12-14 09:30:34 (12.4 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
import glob
import pathlib
import re

remove_marks_regex = re.compile("[,\.\(\)\[\]\*:;]|<.*?>")
shift_marks_regex = re.compile("([?!])")

def text2ids(text, vocab_dict):
    # !? 이외의 기호 삭제
    text = remove_marks_regex.sub("", text)
    # !?와 단어 사이에 공백 삽입
    text = shift_marks_regex.sub(r" \1 ", text)
    tokens = text.split()
    return [vocab_dict.get(token, 0) for token in tokens]

def list2tensor(token_idxes, max_len=100, padding=True):
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes \
            + [0] * (max_len - len(token_idxes))
    return torch.tensor(token_idxes, dtype=torch.int64), n_tokens



In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, 
                              DataLoader,
                              TensorDataset)
import tqdm


In [0]:
class IMDBDataset(Dataset):
    def __init__(self, dir_path, train=True,
                 max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding
        
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath("imdb.vocab")
        
        # 용어집 파일을 읽어서 행 단위로 분할
        self.vocab_array = vocab_path.open() \
                            .read().strip().splitlines()
        # 단어가 키이고 값이 ID인 dict 만들기
        self.vocab_dict = dict((w, i+1) \
            for (i, w) in enumerate(self.vocab_array))
        
        if train:
            target_path = path.joinpath("train")
        else:
            target_path = path.joinpath("test")
        pos_files = sorted(glob.glob(
            str(target_path.joinpath("pos/*.txt"))))
        neg_files = sorted(glob.glob(
            str(target_path.joinpath("neg/*.txt"))))
        # pos는 1, neg는 0인 label을 붙여서
        # (file_path, label)의 튜플 리스트 작성
        self.labeled_files = \
            list(zip([0]*len(neg_files), neg_files )) + \
            list(zip([1]*len(pos_files), pos_files))
    
    @property
    def vocab_size(self):
        return len(self.vocab_array)
    
    def __len__(self):
        return len(self.labeled_files)
    
    def __getitem__(self, idx):
        label, f = self.labeled_files[idx]
        # 파일의 텍스트 데이터를 읽어서 소문자로 변환
        data = open(f).read().lower()
        # 텍스트 데이터를 ID 리스트로 변환
        data = text2ids(data, self.vocab_dict)
        # ID 리스트를 Tensor로 변환
        data, n_tokens = list2tensor(data, self.max_len, self.padding)
        return data, label, n_tokens


In [0]:
train_data = IMDBDataset("/content/aclImdb/")
test_data = IMDBDataset("/content/aclImdb/", train=False)
train_loader = DataLoader(train_data, batch_size=32,
                          shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32,
                        shuffle=False, num_workers=4)


In [0]:
class SequenceTaggingNet(nn.Module):
    def __init__(self, num_embeddings,
                 embedding_dim=50, 
                 hidden_size=50,
                 num_layers=1,
                 dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim,
                            padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size, num_layers,
                            batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, 1)


        
        
    def forward(self, x, h0=None, l=None):
        # ID를 Embedding으로 다차원 벡터로 변환
        # x는 (batch_size, step_size) 
        # -> (batch_size, step_size, embedding_dim)
        x = self.emb(x)
        # 초기 상태 h0와 함께 RNN에 x를 전달
        # x는(batch_size, step_size, embedding_dim)
        # -> (batch_size, step_size, hidden_dim)
        x, h = self.lstm(x, h0)
        # 마지막 단계만 추출
        # xは(batch_size, step_size, hidden_dim)
        # -> (batch_size, 1)
        if l is not None:
            # 입력의 원래 길이가 있으면 그것을 이용
            x = x[list(range(len(x))), l-1, :]
        else:
            # 없으면 단순히 마지막 것을 이용
            x = x[:, -1, :]
        # 추출한 마지막 단계를 선형 계층에 넣는다
        x = self.linear(x)
        # 불필요한 차원을 삭제
        # (batch_size, 1) -> (batch_size, )
        x = x.squeeze()
        return x


In [0]:
def eval_net(net, data_loader, device="cpu"):
    net.eval()
    ys = []
    ypreds = []
    for x, y, l in data_loader:
        x = x.to(device)
        y = y.to(device)
        l = l.to(device)
        with torch.no_grad():
            y_pred = net(x, l=l)
            y_pred = (y_pred > 0).long()
            ys.append(y)
            ypreds.append(y_pred)
    ys = torch.cat(ys)
    ypreds = torch.cat(ypreds)
    acc = (ys == ypreds).float().sum() / len(ys)
    return acc.item()


In [0]:
from statistics import mean

# num_embeddings에는 0을 포함해서 train_data.vocab_size+1를 넣는다
net = SequenceTaggingNet(train_data.vocab_size+1, 
num_layers=2)
net.to("cuda:0")
opt = optim.Adam(net.parameters())
loss_f = nn.BCEWithLogitsLoss()

for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        l = l.to("cuda:0")
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y.float())
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())
    train_acc = eval_net(net, train_loader, "cuda:0")
    val_acc = eval_net(net, test_loader, "cuda:0")
    print(epoch, mean(losses), train_acc, val_acc)


In [0]:
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression

train_X, train_y = load_svmlight_file(
    "/content/aclImdb/train/labeledBow.feat")
test_X, test_y = load_svmlight_file(
    "/content/aclImdb/test/labeledBow.feat",
    n_features=train_X.shape[1])

model = LogisticRegression(C=0.1, max_iter=1000)
model.fit(train_X, train_y)
model.score(train_X, train_y), model.score(test_X, test_y)




(0.89876, 0.39608)

In [0]:
class SequenceTaggingNet2(SequenceTaggingNet):
    def forward(self, x, h0=None, l=None):
        # ID를 Embedding으로 다차원 벡터로 변환
        x = self.emb(x)
        
        # 길이가 주어진 경우 PckedSequence 만들기
        if l is not None:
            x = nn.utils.rnn.pack_padded_sequence(
                x, l, batch_first=True)
        
        # RNN에 입력
        x, h = self.lstm(x, h0)
        
        # 마지막 단계를 추출해서 선형 계층에 넣는다
        if l is not None:
            # 길이 정보가 있으면 마지막 계층의
            # 내부 상태 벡터를 직접 이용할 수 있다
            # LSTM는 보통 내부 상태 외에 블럭 셀 상태도
            # 가지고 있으므로 내부 상태만 사용한다
            hidden_state, cell_state = h
            x = hidden_state[-1]
        else:
            x = x[:, -1, :]
        
        # 선형 계층에 넣는다
        x = self.linear(x).squeeze()
        return x


In [0]:
for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in tqdm.tqdm(train_loader):
        # 길이 배열을 길이 순으로 정렬
        l, sort_idx = torch.sort(l, descending=True)
        # 얻은 인덱스를 사용해서 x,y도 정렬
        x = x[sort_idx]
        y = y[sort_idx]
        
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y.float())
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())
    train_acc = eval_net(net, train_loader, "cuda:0")
    val_acc = eval_net(net, test_loader, "cuda:0")
    print(epoch, mean(losses), train_acc, val_acc)


In [0]:
# 모든 ascii 문자로 사전 만들기
import string
all_chars = string.printable

vocab_size = len(all_chars)
vocab_dict = dict((c, i) for (i, c) in enumerate(all_chars))

# 문자열을 수치 리스트로 변환하는 함수
def str2ints(s, vocab_dict):
    return [vocab_dict[c] for c in s]
  
# 수치 리스트를 문자열로 변환하는 함수
def ints2str(x, vocab_array):
    return "".join([vocab_array[i] for i in x])


In [0]:
from google.colab import files
# 창이 뜨면 파일을 선택해서 업로드한다
uploaded = files.upload()


Saving tinyshakespeare.txt to tinyshakespeare.txt


In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, 
                           DataLoader,
                           TensorDataset)
import tqdm

In [0]:
class ShakespeareDataset(Dataset):
    def __init__(self, path, chunk_size=200):
        # 파일을 읽어서 수치 리스트로 변환
        data = str2ints(open(path).read().strip(), vocab_dict)
        
        # Tensor로 변환해서 split 한다
        data = torch.tensor(data, dtype=torch.int64).split(chunk_size)
        
        # 마지막 덩어리(chunk)의 길이를 확인해서 부족한 경우 버린다
        if len(data[-1]) < chunk_size:
            data = data[:-1]
        
        self.data = data
        self.n_chunks = len(self.data)
    
    def __len__(self):
        return self.n_chunks

    def __getitem__(self, idx):
        return self.data[idx]


In [0]:
ds = ShakespeareDataset("/content/tinyshakespeare.txt",  chunk_size=200)
loader = DataLoader(ds, batch_size=32, shuffle=True, num_workers=4)


In [0]:
class SequenceGenerationNet(nn.Module):
    def __init__(self, num_embeddings, 
                 embedding_dim=50, 
                 hidden_size=50,
                 num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_size,
                            num_layers,
                            batch_first=True,
                            dropout=dropout)
        # Linear의 output 크기는 첫 Embedding의 
        # input 크기와 같은 num_embeddings
        self.linear = nn.Linear(hidden_size, num_embeddings)
        
    def forward(self, x, h0=None):
        x = self.emb(x)
        x, h = self.lstm(x, h0)
        x = self.linear(x)
        return x, h


In [0]:
def generate_seq(net, start_phrase="The King said ",
                 length=200, temperature=0.8, device="cpu"):
    # 모델을 평가 모드로 설정
    net.eval()
    # 출력 수치를 저장할 리스트
    result = []
    
    # 시작 문자열을 Tensor로 변환
    start_tensor = torch.tensor(
        str2ints(start_phrase, vocab_dict),
        dtype=torch.int64
    ).to(device)
    # 선두에 batch 차원을 붙인다
    x0 = start_tensor.unsqueeze(0) 
    # RNN을 통해서 출력과 새로운 내부 상태를 얻는다
    o, h = net(x0)
    # 출력을 정규화돼있지 않은 확률로 변환
    out_dist = o[:, -1].view(-1).exp()
    # 확률로부터 실제 문자의 인덱스를 샘플링グ
    top_i = torch.multinomial(out_dist, 1)[0]
    # 결과 저장
    result.append(top_i)
    
    # 생성된 결과를 차례로 RNN에 넣는다
    for i in range(length):
        inp = torch.tensor([[top_i]], dtype=torch.int64)
        inp = inp.to(device)
        o, h = net(inp, h)
        out_dist = o.view(-1).exp()
        top_i = torch.multinomial(out_dist, 1)[0]
        result.append(top_i)
    
    # 시작 문자열과 생성된 문자열을 모아서 반환
    return start_phrase + ints2str(result, all_chars)


In [0]:
from statistics import mean
net = SequenceGenerationNet(vocab_size, 20, 50,
                            num_layers=2, dropout=0.1)
net.to("cuda:0")
opt = optim.Adam(net.parameters())
# 다중 식별 문제이므로 SoftmaxCrossEntropyLoss가 손실 함수가 된다
loss_f = nn.CrossEntropyLoss()

for epoch in range(50):
    net.train()
    losses = []
    for data in tqdm.tqdm(loader):
        # x는 처음부터 마지막의 하나 앞 문자까지
        x = data[:, :-1]
        # y는 두 번째부터 마지막 문자까지
        y = data[:, 1:]
        
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        
        y_pred, _ = net(x)
        # batch와 step 축을 통합해서 CrossEntropyLoss에 전달
        loss = loss_f(y_pred.view(-1, vocab_size), y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())
    # 현재 손실 함수와 생성된 문장 예 표시
    print(epoch, mean(losses))
    with torch.no_grad():
        print(generate_seq(net, device="cuda:0"))


100%|██████████| 175/175 [00:08<00:00, 19.89it/s]


0 3.4874473898751397


  0%|          | 0/175 [00:00<?, ?it/s]

The King said yireehpeade x:Cun aesn  no
h 
Thbe
p :oi ioghEecb dat;tsnoiesG I
wnwr'o e br 'tRtfmsWen:eEdgceolf nsO.wn
ovr toiiwb u ;hy] F
h headTroGcfu y  awA
hti
 Tbmh,eB owaEg anla   cN aa
sef uasE'eeieuoto taswA


100%|██████████| 175/175 [00:08<00:00, 20.73it/s]


1 3.17616738319397


  0%|          | 0/175 [00:00<?, ?it/s]

The King said aOmsoulsuiSautt wiboklscs .uk,ogeonhlan
a so i
yo pe ar wM et akonnil ]tiutI tiayer  r irro  yir ret;EBP!dene 
o, il nys tores d e, youptuZ wPokih 
fosnjteda -roisnha snrt ih lhaatnlk lutscadloycon?
F.


100%|██████████| 175/175 [00:08<00:00, 20.24it/s]


2 2.7361675017220635


  0%|          | 0/175 [00:00<?, ?it/s]

The King said t uwe fineey the sod wos ir af:me,
Weameo tpursuYdpem se vo uud minollyrer gif mel's onut, me
l, o tel yot lhfe ogh fe os whi'n om casug paek,ensr fooqn hh eyoni whboid toet mhe;

I rhage ranl
AOS yo s


100%|██████████| 175/175 [00:08<00:00, 20.17it/s]


3 2.510513159888131


  0%|          | 0/175 [00:00<?, ?it/s]

The King said soun woth
Ghivet hhuld targele, ak suny ofe
qd: nols af bae rahet kongiTham kelennw nmo kitiy, t I fted,o monk tost kovas karte bu tiur eselt she mtos hiad coocekres fine of we can
ma'bth,
Comilt:
Acem


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


4 2.387717330115182


  0%|          | 0/175 [00:00<?, ?it/s]

The King said yot a Tormowres ying scrash dwikee iv sor gone afund at I d't meakte iit, wors sorgimourt bed so pinr
dreolt to dop irreo fort in my-lime gauln seos ok;

RIIO IRCIID:
Mid heracled tiul, houch woot ce-n


100%|██████████| 175/175 [00:08<00:00, 20.78it/s]


5 2.303103140422276


  0%|          | 0/175 [00:00<?, ?it/s]

The King said boxerle Thanl gitt mler und, sirkare ntarleadgo noucoas I o tave, to on I poung
BreWlererpimail.
Yur; a thaun thag eelid aws me hiusith go has
yonpe coot sou-thte ferast.
Wo'd hakcanth
Boy nilled
The t


100%|██████████| 175/175 [00:08<00:00, 20.04it/s]


6 2.2370858464922225


  0%|          | 0/175 [00:00<?, ?it/s]

The King said coim fortuntamfen houod
Honsim, lufe, oust, me harsen, to o swele srortfend fok hpabe:
Yithave mmorse rorxsens, thins, as nevy rempaom, Cobgibdy dopes, eswen thee? I tamen bysrim,
I tteed to Soband sta


100%|██████████| 175/175 [00:08<00:00, 20.21it/s]


7 2.1819877665383474


  0%|          | 0/175 [00:00<?, ?it/s]

The King said me she hive aw-ndest foum, an mele peasnode, whe ou thou ssind, haesslesst ceniptur-the the thented mross
Ilr deaughy,
btoo buam focude the beighour bent is nnrelithee the: lord meeld
Sorg; tale,
As ch


100%|██████████| 175/175 [00:08<00:00, 20.67it/s]


8 2.13589848109654


  0%|          | 0/175 [00:00<?, ?it/s]

The King said you sins bomest she wo edonce of shoup rithn
Thued, wuller
Tiunshornortintes, till deand'r laar wha shingw ther,
Theid wes and thabr?

HoTRour sing. Thet of herellet enre dungendiot ar mith: apts,'-
It


100%|██████████| 175/175 [00:08<00:00, 20.12it/s]


9 2.0947112124306817


  0%|          | 0/175 [00:00<?, ?it/s]

The King said your whil
Kum ilks cisingnrert.
Jor thou
To bame march for sall at of and demer sud or Gawere nor be brwasu, hould,
Noocitmestirpan care rear am and sordlince dord for theot lo-dan thhand aforatess,
In


100%|██████████| 175/175 [00:08<00:00, 20.10it/s]


10 2.058685073852539


  0%|          | 0/175 [00:00<?, ?it/s]

The King said brotwer mellpers I el? I 's's sore ther hin, Hew allio; with Crettulont the kake fhard is homg.
Bloors of heaven in our hid am aports, n thet the blound demer all.
Thlam but roJ.
Bruntoeee grome meby t


100%|██████████| 175/175 [00:08<00:00, 20.10it/s]


11 2.026507339477539


  0%|          | 0/175 [00:00<?, ?it/s]

The King said bookt at you hoo sert anture you farvest, a mernkan you reeserch mone the do gromlbing the to thurd sumot notisihed
Kjop. Tloure is if thay youd you and the nos youd worcture, as hate
And in my, fallfy


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


12 1.9982477705819266


  0%|          | 0/175 [00:00<?, ?it/s]

The King said thee:
his janse,
Wtenster, jestanlly it my fopen,
But he' hadania, frible to? thas foare is abkand arl?

Thigle:
en plath hoRitor to Ginks snise, lake to now I lalle ? Clovend, the so frarlperg vach no


100%|██████████| 175/175 [00:08<00:00, 20.66it/s]


13 1.97252817767007


  0%|          | 0/175 [00:00<?, ?it/s]

The King said nimet hath bpealitiof it at a colmst viok now die bast hear;
Sitne to for in mance in mows;
Where
gorther beckwise, but thy sude
Thell mattire with me gears o-ricer the shas tre ray you not am; he Teti


100%|██████████| 175/175 [00:08<00:00, 20.12it/s]


14 1.9506914063862393


  0%|          | 0/175 [00:00<?, ?it/s]

The King said you main to me with mose cark.

FirRCICUAL IT:
Your for
the pusters covere,
To thou mish fintes hone,
To, us eil heed wief wowtl?
Core and to see dikdy in'd: all, and Cunteekiin hertent.

DUCKIEAS:
Sin


100%|██████████| 175/175 [00:08<00:00, 20.27it/s]


15 1.9313139390945435


  0%|          | 0/175 [00:00<?, ?it/s]

The King said me Dack have fantan,
Notrou wimn queunk.
Thus alt m dethe usint one m gould:
Fake in his fail heak and bloice:
Heann pent rwell, fe thou me doncons.
Lerwed, it marbe divefu now,
And Cicerpcouk not ip; 


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


16 1.9133997317722866


  0%|          | 0/175 [00:00<?, ?it/s]

The King said asail,--

Y:
Thou forlereft,
Weash erther you link fake as Rowhnout usserse bedter! tither hach my, praster? and sues an og I were cungeries fitk I lind the at but:
I wilt yOull with nowh a no planvere


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


17 1.89730605670384


  0%|          | 0/175 [00:00<?, ?it/s]

The King said if windnoud.

HENALT:
Leck her can natreak and here
Rown torsee compituiu be thou plose be?
My thou we koothing. O haud heoll muts, catune seet or horep are thou
nides?
I Vabers deau-ster and his we mo


100%|██████████| 175/175 [00:08<00:00, 20.81it/s]


18 1.8828251021248954


  0%|          | 0/175 [00:00<?, ?it/s]

The King said the dowmireding fairtrove.

AUGINY:
No hel meseicided juretic the wres-main, me but froble and ridunce and be not fick?
O lies bests but suat, trantlt frow.

KING PAURENT:
The breet ears of a of the po


100%|██████████| 175/175 [00:08<00:00, 20.21it/s]


19 1.868360216958182


  0%|          | 0/175 [00:00<?, ?it/s]

The King said tendevea.

ISAPARD:
Spe what atty spriinght on may licly strestert?

First:
That hout at Gost best he dood at well:
Nakes peent's rate, her have him'nes; Angaip.

QUEEN O
GETERET:
The bome!

ISABELNO:



100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


20 1.8567959458487375


  0%|          | 0/175 [00:00<?, ?it/s]

The King said more erching hiw and hivole; whon thach with from more the awiness Plongue?
'Tat,
I imave,
Are buw.

MERNILA:
I lost fnoms, the wisigh?

LERY RORS:
For as Juck is may lemilent they dewach, me caretw ma


100%|██████████| 175/175 [00:08<00:00, 20.54it/s]


21 1.8450466857637677


  0%|          | 0/175 [00:00<?, ?it/s]

The King said shousl; marlio sold, neach tills, tare diritour we geate afiils mory
Of and, be, those, wBat arriore to I his as bray. You not a aral turses,
Myt nothald. And be yourselfs, plekasters,
Agely hear of bi


100%|██████████| 175/175 [00:08<00:00, 20.17it/s]


22 1.8326169320515224


  0%|          | 0/175 [00:00<?, ?it/s]

The King said befure no nor prainttius unto hivers mallus your leaswer hvear, Datue, in evee dea nore us
I hatter? Cloings and roven;:
Fo coun's
dould draunds, I go, hele
Soarts,
My quil art on, with I witt to man,



100%|██████████| 175/175 [00:08<00:00, 20.09it/s]


23 1.8224037892477853


  0%|          | 0/175 [00:00<?, ?it/s]

The King said cesm there of him
Yang enorring besuncost fawiy with you, a eice tempse
Sune, bethy to esplest:
My lovemeal
No nose, worefike sheed the rovate tuoder cemund of sint is we;
She wear be horecige begnentl


100%|██████████| 175/175 [00:08<00:00, 20.19it/s]


24 1.8127262919289726


  0%|          | 0/175 [00:00<?, ?it/s]

The King said in of saalfalty.

HENRON MORGHBOMFOR:
No, May to bemorcand me; Gefure, with there it for your in here,
And mardaten him thoo thy dicsing or the gome.

SICINIUS:
Well.
Or thing as spring.

Nur!
Ceas! If


100%|██████████| 175/175 [00:08<00:00, 20.12it/s]


25 1.8027762760434831


  0%|          | 0/175 [00:00<?, ?it/s]

The King said that be song,
Hy kake that:
Are that senty; hath wcut thou stiste,
Nopgoumbs the proin.

Turd:
Now,
Hose, and wake that a deaw menred is's hath gain;
And not since!

CORIOLANUS:
It, if this that yost b


100%|██████████| 175/175 [00:08<00:00, 20.22it/s]


26 1.7947967842647008


  0%|          | 0/175 [00:00<?, ?it/s]

The King said krath
word?
Hot.

Prundon:
Is were, where
if untee dogmost me Pryaory;

MIRLANIUS:
I our love aromoury Micting.

EDA TRANV:
Ald my ast will a bobker.

NORNIOLANA:
Yes pluck, so compinbit:
Econd let nos


100%|██████████| 175/175 [00:08<00:00, 20.18it/s]


27 1.7864916706085205


  0%|          | 0/175 [00:00<?, ?it/s]

The King said the browred Warloon to I emer'd rometopius out that itsrom?

Sis:
Sill deton wo; in I sire,
The repelt;
To thou thee, in the dods, not so lirds growich
The fince of whis that mister!

KING RICHARD IIGI


100%|██████████| 175/175 [00:08<00:00, 20.12it/s]


28 1.7788366671970912


  0%|          | 0/175 [00:00<?, ?it/s]

The King said shangs that thou not Sack best satake to'st not mine doth,
Where, siresing? let did, if no mebrow as hearly to sail you not plook the myald and forgen:
And?

DUCENTIO:

LADY GLOUCESTER:
What be cboom s


100%|██████████| 175/175 [00:08<00:00, 20.65it/s]


29 1.771244032042367


  0%|          | 0/175 [00:00<?, ?it/s]

The King said adf? If his dreak exer, his plastuly coldand'd you upow will perceact.

CAPULET:

TRENHES:
Thou geact!
Beatel's ut know, then stace the ucender and st-to day you;
Crrown.

Pedax:
That your aster hered,


100%|██████████| 175/175 [00:08<00:00, 19.95it/s]


30 1.7643335594449725


  0%|          | 0/175 [00:00<?, ?it/s]

The King said bere?

GLOUCESTER:
But advere.

HASTINGS:
Nast a of you, this excungle of my coursed I aremed expare;
No sen?
Go thar yet stoud: shall prueder, what thy mading thou alours:
Go that beap,
On the copen-t


100%|██████████| 175/175 [00:08<00:00, 20.78it/s]


31 1.7582889366149903


  0%|          | 0/175 [00:00<?, ?it/s]

The King said I not dear,
And Rach heor
To are wish likes our most amry'd I over?

FRIAR:
Nor your lidion; of ere have
At Your's uses;
Aur bant one your fuets Ebpath,
I vence, your day?

BRAPUL:
The pirt wats, Anlie


100%|██████████| 175/175 [00:08<00:00, 20.15it/s]


32 1.7516407121930804


  0%|          | 0/175 [00:00<?, ?it/s]

The King said wed his poyile not thou have one to and is ted endone?

QUEEN:
What, he stand thou erts Rosm, and his that the woo?

GLOUCESTER:
How on ofias blood notonore? this bestieg leight to dunglen that us with


100%|██████████| 175/175 [00:08<00:00, 20.48it/s]


33 1.7456249598094395


  0%|          | 0/175 [00:00<?, ?it/s]

The King said now that lectactine: with word, son of I misting free?

ENTOLIUS:
Abhave have best ween till Tlabes Lave now feet and lrocens,
My fabe; welliss conceagtland, foul in you arings nogrhool of retiend,
And


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


34 1.740707768712725


  0%|          | 0/175 [00:00<?, ?it/s]

The King said sbald his warrop,
Hath this fabtond, way swears not them is'll of and me lie a gence ip;
What, you weer, but that the word; more make her my insting;
For prispry fancost
Soeint at our houd,
Have us, me


100%|██████████| 175/175 [00:08<00:00, 20.20it/s]


35 1.734825713975089


  0%|          | 0/175 [00:00<?, ?it/s]

The King said a or;
Fingike:
Fervel cinesner sea.

Mess wor:
What the ke'll, and more!

ESCELUS:
Thear they is cand thee you the condise is him meory,
We rosungifts follawaky,
Say putight, re your wouth his mees mot


100%|██████████| 175/175 [00:08<00:00, 20.07it/s]


36 1.7297313942228045


  0%|          | 0/175 [00:00<?, ?it/s]

The King said out
and made:;
And liejos.
Here seep, I my ofly did, and shear Cannsely do beoves it sest, and where shall dadfore.
'This ruturous.

BUKARG EDWARD:
Piy clomy.

KING RICHARD III:
Sroth,
Hould my shall M


100%|██████████| 175/175 [00:08<00:00, 20.14it/s]


37 1.7241147375106811


  0%|          | 0/175 [00:00<?, ?it/s]

The King said to now feance.

FEALUS:
Nay, ig hight? for omes his fair. His me and
both periss: Thy malights
and is's desistaus Rophinf exine, segreeling,
Of the chortur'd to, are
Will lives
And jare; oot upinour ai


100%|██████████| 175/175 [00:08<00:00, 20.15it/s]


38 1.719525955745152


  0%|          | 0/175 [00:00<?, ?it/s]

The King said d'll:
Bock both?

DUKE VINCENTIO:
Gress, let; moon of 'cit,
Shath bele lord you revishtdy, my matterings,
And takem's tell hath; saven, reslen-swan, Broke--for as anf will was than this have spee one a


100%|██████████| 175/175 [00:08<00:00, 20.24it/s]


39 1.715180173601423


  0%|          | 0/175 [00:00<?, ?it/s]

The King said men heak with genpen ey, To my whose wourds wonly this prace
Toocwers yournal recather Stirn of thing
And we deoven'd of you me not must Evence.

ANGELO:
Why, my gave.

CORIOLANUS:
I spew no propsting



100%|██████████| 175/175 [00:08<00:00, 20.17it/s]


40 1.7106239713941302


  0%|          | 0/175 [00:00<?, ?it/s]

The King said grine
And word, I very kour umbrong stould their
of Fead, in uple, ertenlesch'd so by
I to my proizent he beod are?

Nurse servish I doose renase hiok thou not ewers the hack, the
come at foent,
Mercio


100%|██████████| 175/175 [00:08<00:00, 20.22it/s]


41 1.7058634001868112


  0%|          | 0/175 [00:00<?, ?it/s]

The King said is be a thand.

IRGANG:
And all her honior.
Sorar feast, Kushon I manmy;
Youghters. No the lainss dicio my full us!

JULILT:
Serobly pity.

GREMIO:
I'll im love;
Fray, wholy would denet of Fheace worse


100%|██████████| 175/175 [00:08<00:00, 20.76it/s]


42 1.7017618438175746


  0%|          | 0/175 [00:00<?, ?it/s]

The King said and will this red, and felses
Aincmen. Thou will spead'sb that not on: and you thy wire will valleat one of the bear out.
Coo king; thy will enter.

Plernt:
My not! Mower'nes, take now have I have do n


100%|██████████| 175/175 [00:08<00:00, 20.08it/s]


43 1.697121993473598


  0%|          | 0/175 [00:00<?, ?it/s]

The King said is advingely hove not to fesure another: I soart
shall me show what will caming yweise!
Is I sharble my Tomet brean erest!
Where the beance: God be end agemonison warily her and lies right in ill
Then 


100%|██████████| 175/175 [00:08<00:00, 20.62it/s]


44 1.693954348564148


  0%|          | 0/175 [00:00<?, ?it/s]

The King said of thy his weet on untart
We my lord?

GREMIO:
Gentle not my. Mainss
With nrows they blood beenount we him joy,
Your restert.
Both in more I same't his virmoness.

LADTA:
O, he barkings of contel?  car


100%|██████████| 175/175 [00:08<00:00, 20.17it/s]


45 1.6896929018838065


  0%|          | 0/175 [00:00<?, ?it/s]

The King said dead with our; And, me, wife sen
If your stings, mises.

CLEOMRIO:
I'll speek in a'wemp.

Rower:
Command unqouest.
To lendt me we from you welpect your well goast's
The other wary, my the brie; to serl


100%|██████████| 175/175 [00:08<00:00, 20.17it/s]


46 1.6848550408227103


  0%|          | 0/175 [00:00<?, ?it/s]

The King said respomen as his fay.
And till heavens; and no gentlesbent.

DUKE VINCENTIO:
What, so not my know Riseip
Come,
Thy lord is a brode this both rike mes and he'll, whou! 'Thas not hear to for the bear of b


100%|██████████| 175/175 [00:08<00:00, 20.16it/s]


47 1.6820763717378888


  0%|          | 0/175 [00:00<?, ?it/s]

The King said I this:
Go Ameects' hine me there untruen,
Of ecken not we price--O,
Denise the felf this king:
Dud not up nomere the one again:
A ttright, seace, these death,
You, my purising privones!

LUCIO:
Who
Th


100%|██████████| 175/175 [00:08<00:00, 19.97it/s]


48 1.6790780305862427


  0%|          | 0/175 [00:00<?, ?it/s]

The King said to the will.

LUCIO:
This paul, where Cloece and marrades shales
To tapeck; bay,
Gods heart; spuck; and say, then hourming it of pleaf it,
And at speak priy.

QUEEN MARGARET:
Amonds ffacence evoy that 


100%|██████████| 175/175 [00:08<00:00, 20.79it/s]


49 1.675383790561131
The King said father: the course a speak generion of thingt wan agay
Blawirg your asporen, Would on me is broughts of forgoor spaicen by
Fair Bamends appeaseth worth;
To unthee one intay to call by I bring, his shou


In [0]:
!wget http://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip


In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, 
                              DataLoader,
                              TensorDataset)
import tqdm


In [0]:
import re
import collections
import itertools

remove_marks_regex = re.compile(
    "[\,\(\)\[\]\*:;¿¡]|<.*?>")
shift_marks_regex = re.compile("([?!\.])")

unk = 0
sos = 1
eos = 2

def normalize(text):
    text = text.lower()
    # 불필요한 문자 제거
    text = remove_marks_regex.sub("", text)
    # ?!. 와 단어 사이에 공백 삽입
    text = shift_marks_regex.sub(r" \1", text)
    return text
  
def parse_line(line):
    line = normalize(line.strip())
    # 번역 대상(src)과 번역 결과(trg) 각각의 토큰을 리스트로 만든다
    src, trg = line.split("\t")
    src_tokens = src.strip().split()
    trg_tokens = trg.strip().split()
    return src_tokens, trg_tokens
  
def build_vocab(tokens):
    # 파일 안의 모든 문장에서 토큰의 등장 횟수를 확인
    counts = collections.Counter(tokens)
    # 토큰의 등장 횟수를 많은 순으로 나열
    sorted_counts = sorted(counts.items(), 
                           key=lambda c: c[1], reverse=True)
    # 세 개의 태그를 추가해서 정방향 리스트와 역방향 용어집 만들기
    word_list = ["<UNK>", "<SOS>", "<EOS>"] \
        + [x[0] for x in sorted_counts]
    word_dict = dict((w, i) for i, w in enumerate(word_list))
    return word_list, word_dict
  
def words2tensor(words, word_dict, max_len, padding=0):
    # 끝에 종료 태그를 붙임
    words = words + ["<EOS>"]
    # 사전을 이용해서 수치 리스트로 변환
    words = [word_dict.get(w, 0) for w in words]
    seq_len = len(words)
    # 길이가 max_len이하이면 패딩한다
    if seq_len < max_len + 1:
        words = words + [padding] * (max_len + 1 - seq_len)
    # Tensor로 변환해서 반환
    return torch.tensor(words, dtype=torch.int64), seq_len



In [0]:
class TranslationPairDataset(Dataset):
    def __init__(self, path, max_len=15):
        # 단어 수사 많은 문장을 걸러내는 함수
        def filter_pair(p):
            return not (len(p[0]) > max_len 
                        or len(p[1]) > max_len)
        # 파일을 열어서, 파스 및 필터링       
        with open(path) as fp:
            pairs = map(parse_line, fp)
            pairs = filter(filter_pair, pairs)
            pairs = list(pairs)
        # 문장의 소스와 타켓으로 나눔
        src = [p[0] for p in pairs]
        trg = [p[1] for p in pairs]
        #각각의 어휘집 작성
        self.src_word_list, self.src_word_dict = \
            build_vocab(itertools.chain.from_iterable(src))
        self.trg_word_list, self.trg_word_dict = \
            build_vocab(itertools.chain.from_iterable(trg))
        # 어휘집을 사용해서 Tensor로 변환
        self.src_data = [words2tensor(
            words, self.src_word_dict, max_len)
                for words in src]
        self.trg_data = [words2tensor(
            words, self.trg_word_dict, max_len, -100)
                         for words in trg]
    def __len__(self):
        return len(self.src_data)
      
    def __getitem__(self, idx):
        src, lsrc = self.src_data[idx]
        trg, ltrg = self.trg_data[idx]
        return src, lsrc, trg, ltrg



In [0]:
batch_size = 64
max_len = 10
path = "/content/spa.txt"
ds = TranslationPairDataset(path, max_len=max_len)
loader = DataLoader(ds, batch_size=batch_size, shuffle=True,
                    num_workers=4)


In [0]:
class Encoder(nn.Module):
    def __init__(self, num_embeddings,
                 embedding_dim=50, 
                  hidden_size=50,
                 num_layers=1,
                 dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, 
          embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size, num_layers,
                            batch_first=True,
dropout=dropout)
        
    def forward(self, x, h0=None, l=None):
        x = self.emb(x)
        if l is not None:
            x = nn.utils.rnn.pack_padded_sequence(
                x, l, batch_first=True)
        _, h = self.lstm(x, h0)
        return h


In [0]:
class Decoder(nn.Module):
    def __init__(self, num_embeddings,
                 embedding_dim=50, 
                 hidden_size=50,
                 num_layers=1,
                 dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size,
                            num_layers, batch_first=True,
                            dropout=dropout)
        self.linear = nn.Linear(hidden_size, num_embeddings)
    
    def forward(self, x, h, l=None):
        x = self.emb(x)
        if l is not None:
            x = nn.utils.rnn.pack_padded_sequence(
                x, l, batch_first=True)
        x, h = self.lstm(x, h)
        if l is not None:
            x = nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0)[0]
        x = self.linear(x)
        return x, h


In [0]:
def translate(input_str, enc, dec, max_len=15, device="cpu"):
    # 입력 문자열을 수치화해서 Tensor로 변환
    words = normalize(input_str).split()
    input_tensor, seq_len = words2tensor(words, 
        ds.src_word_dict, max_len=max_len)
    input_tensor = input_tensor.unsqueeze(0)
    # 엔코더에서 사용하므로 입력값의 길이도 리스트로 만들어둔다
    seq_len = [seq_len]
    # 시작 토큰 준비
    sos_inputs = torch.tensor(sos, dtype=torch.int64)
    input_tensor = input_tensor.to(device)
    sos_inputs = sos_inputs.to(device)
    # 입력 문자열을 엔코더에 넣어서 컨텍스트 얻기
    ctx = enc(input_tensor, l=seq_len)
    # 시작 토큰과 컨텍스트를 디코더의 초깃값으로 설정
    z = sos_inputs
    h = ctx
    results = []
    for i in range(max_len):
        # Decoder로 다음 단어 예측
        o, h = dec(z.view(1, 1), h)
        # 선형 계층의 출력이 가장 큰 위치가 다음 단어의 ID
        wi = o.detach().view(-1).max(0)[1]
        if wi.item() == eos:
            break
        results.append(wi.item())
        # 다음 입력값으로 현재 출력 ID를 사용
        z = wi
    # 기록해둔 출력 ID를 문자열로 변환
    return " ".join(ds.trg_word_list[i] for i in results)


In [0]:
enc = Encoder(len(ds.src_word_list), 100, 100, 2)
dec = Decoder(len(ds.trg_word_list), 100, 100, 2)
translate("I am a student.", enc, dec)


'susurro liberaron salida trasladé trasladé moscú moscú memorizando moscú moscú moscú moscú memorizando moscú moscú'

In [0]:
enc = Encoder(len(ds.src_word_list), 100, 100, 2)
dec = Decoder(len(ds.trg_word_list), 100, 100, 2)
enc.to("cuda:0")
dec.to("cuda:0")
opt_enc = optim.Adam(enc.parameters(), 0.002)
opt_dec = optim.Adam(dec.parameters(), 0.002)
loss_f = nn.CrossEntropyLoss()


In [0]:
from statistics import mean

def to2D(x):
    shapes = x.shape
    return x.reshape(shapes[0] * shapes[1], -1)
  
for epoc in range(30):
    # 신경망을 훈련 모드로 설정
    enc.train(), dec.train()
    losses = []
    for x, lx, y, ly in tqdm.tqdm(loader):
        # x의 PackedSequence를 만들기 위해 번역 소스의 길이로 내림차순 정렬한다
        lx, sort_idx = lx.sort(descending=True)
        x, y, ly = x[sort_idx], y[sort_idx], ly[sort_idx]
        x, y = x.to("cuda:0"), y.to("cuda:0")
        # 번역 소스를 엔코더에 넣어서 컨텍스트를 얻는다
        ctx = enc(x, l=lx)
        # y의 PackedSequence를 만들기 위해 번역 소스의 길이로 내림차순 정렬
        ly, sort_idx = ly.sort(descending=True)
        y = y[sort_idx]
        # Decoder의 초깃값 설정
        h0 = (ctx[0][:, sort_idx, :], ctx[1][:, sort_idx, :])
        z = y[:, :-1].detach()
        # -100인 상태에선 Embedding 계산에서 오류가 발생하므로 0으로 변경
        z[z==-100] = 0
        # 디코더에 넣어서 손실 함수 계산
        o, _ = dec(z, h0, l=ly-1)
        loss = loss_f(to2D(o[:]), to2D(y[:, 1:max(ly)]).squeeze())
        # Backpropagation(오차 역전파 실행)
        enc.zero_grad(), dec.zero_grad()
        loss.backward()
        opt_enc.step(), opt_dec.step()
        losses.append(loss.item())
    # 전체 데이터의 계산이 끝나면 현재의
    # 손실 함수 값이나 번역 결과를 표시
    enc.eval(), dec.eval()
    print(epoc, mean(losses))
    with torch.no_grad():
        print(translate("I am a student.",
                         enc, dec, max_len=max_len, 
device="cuda:0"))
        print(translate("He likes to eat pizza.",
                         enc, dec, max_len=max_len, 
device="cuda:0"))
        print(translate("She is my mother.",
                         enc, dec, max_len=max_len, 
device="cuda:0"))


100%|██████████| 1623/1623 [01:21<00:00, 19.96it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

0 5.343641384545477
un poco .
que mary estaba .
es mi padre .


100%|██████████| 1623/1623 [01:21<00:00, 19.60it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

1 3.357381745759749
un niño .
que él estaba equivocado .
es mi madre .


100%|██████████| 1623/1623 [01:23<00:00, 19.34it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

2 2.1432380262011743
una mujer .
que él estaba equivocado .
es mi madre .


100%|██████████| 1623/1623 [01:24<00:00, 19.21it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

3 1.6379780181748442
una novia .
que era difícil .
es mi madre .


100%|██████████| 1623/1623 [01:25<00:00, 19.44it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

4 1.402317685105517
una estudiante .
que era difícil .
es mi madre .


100%|██████████| 1623/1623 [01:27<00:00, 18.62it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

5 1.2394604528677442
una estudiante .
que era difícil .
es mi madre .


100%|██████████| 1623/1623 [01:26<00:00, 19.09it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

6 1.1156018136169024
una estudiante .
que nos gusta aprender .
mi madre .


100%|██████████| 1623/1623 [01:27<00:00, 18.58it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

7 1.0169713380743675
una estudiante .
que era difícil de comer .
mi madre .


100%|██████████| 1623/1623 [01:28<00:00, 17.30it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

8 0.9360715021928445
una estudiante .
que nos gusta aprender .
es mi madre .


100%|██████████| 1623/1623 [01:28<00:00, 18.39it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

9 0.8705784938839285
una estudiante .
que nos gusta aprender .
es mi madre .


100%|██████████| 1623/1623 [01:26<00:00, 18.66it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

10 0.8169924354230043
una estudiante .
que nos gusta aprender .
es mi madre .


100%|██████████| 1623/1623 [01:26<00:00, 18.71it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

11 0.7711671313531444
una estudiante .
que nos gusta aprender .
mi madre .


100%|██████████| 1623/1623 [01:25<00:00, 19.24it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

12 0.7310275454177433
soy estudiante .
que era difícil de aprender .
es mi madre .


100%|██████████| 1623/1623 [01:21<00:00, 19.81it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

13 0.6965028822642377
un estudiante .
que ella estaba asando especialmente .
.


100%|██████████| 1623/1623 [01:22<00:00, 19.64it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

14 0.6662255061568849
un estudiante .
que ella se come a las críticas .
mío .


100%|██████████| 1623/1623 [01:22<00:00, 19.66it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

15 0.6392701635996795
un estudiante .
que él se come a las 230 .
mi madre .


100%|██████████| 1623/1623 [01:21<00:00, 20.21it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

16 0.6144772928508798
soy estudiante .
que nos gusta aprender .
mi madre .


100%|██████████| 1623/1623 [01:21<00:00, 20.23it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

17 0.5946919174195805
un estudiante .
que él estaba deliciosa .
mi madre .


100%|██████████| 1623/1623 [01:21<00:00, 19.92it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

18 0.5737439397302441
un estudiante .
que él se estudia con facilidad .
mi madre .


100%|██████████| 1623/1623 [01:21<00:00, 19.94it/s]
  0%|          | 0/1623 [00:00<?, ?it/s]

19 0.556303559228071
un estudiante .
que nos gusta aprender .
mi madre .


 74%|███████▍  | 1208/1623 [01:00<00:20, 20.08it/s]