<a href="https://colab.research.google.com/github/KamonohashiPerry/PyTorch/blob/master/PyTorch_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorchのインストール
+ Colabの編集→ノートブックの設定でGPUモードにすることをお忘れなく。

In [0]:
# pip install http://download.pytorch.org/whl/cu80/torch-0.4.0-cp36-cp36m-linux_x86_64.whl

In [0]:
# pip install torchvision

In [0]:
# pip install tqdm

インストールできたかの確認

In [0]:
import torch

In [2]:
torch.tensor([1, 2, 3]).to("cuda:0")

tensor([1, 2, 3], device='cuda:0')

In [0]:
# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
# !apt-get update -qq 2>&1 > /dev/null
# !apt-get -y install -qq google-drive-ocamlfuse fuse

# from google.colab import auth
# auth.authenticate_user()
# from oauth2client.client import GoogleCredentials
# creds = GoogleCredentials.get_application_default()
# import getpass
# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL

# vcode = getpass.getpass()
# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# # drive mean root directory of  google drive
# !mkdir -p drive
# !google-drive-ocamlfuse drive
# !ls drive/"Colab Notebooks"/PyTorch

In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)
import tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
!wget http://www.manythings.org/anki/spa-eng.zip

--2019-12-22 13:16:02--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4752884 (4.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2019-12-22 13:16:05 (1.49 MB/s) - ‘spa-eng.zip’ saved [4752884/4752884]



In [4]:
!unzip spa-eng.zip

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [5]:
ls

_about.txt  [0m[01;34msample_data[0m/  spa-eng.zip  spa.txt


In [0]:
# from google.colab import files
# files.download("spa.txt")

In [0]:
import re
import collections
import itertools

remove_marks_regex = re.compile("[\,\(\)\[\]\*:;¿¡]|<.*?>")
shift_marks_regex = re.compile("([?!\.])")

unk = 0
sos = 1
eos = 2

def normalize(text):
  # 小文字にする
  text = text.lower()
  # 不要な文字を除去
  text = remove_marks_regex.sub("", text)
  # ?!.と単語の間に空白を挿入
  text = shift_marks_regex.sub(r" \1", text)
  return text

def parse_line(line):
  line = normalize(line.strip())
  # 翻訳元(src)と翻訳先(trg)それぞれのトークンのリストを作る
  src, trg, _ = line.split("\t")
  src_tokens = src.strip().split()
  trg_tokens = trg.strip().split()
  return src_tokens, trg_tokens

In [0]:
def build_vocab(tokens):
  # ファイル中の全ての文章でのトークンの出現数を数える
  counts = collections.Counter(tokens)
  # トークンの出現数の多い順に並べる
  sorted_counts = sorted(counts.items(),
                            key=lambda c: c[1], reverse=True)
  # 3つのタグを追加して正引きリストと逆引き用辞書を作る
  word_list = ["<UNK>", "<SOS>", "<EOS>"] + [x[0] for x in sorted_counts]
  word_dict = dict((w, i) for i, w in enumerate(word_list))
  return word_list, word_dict

def words2tensor(words, word_dict, max_len, padding=0):
  # 末尾に終了タグをつける
  words = words + ["<EOS>"]
  # 辞書を利用して数値のリストに変換する
  words = [word_dict.get(w, 0) for w in words]
  seq_len = len(words)
  # 長さがmax_len以下の場合はパディングする
  if seq_len < max_len + 1:
    words = words + [padding] * (max_len + 1 - seq_len)
    # Tensorに変換して返す
  return torch.tensor(words, dtype=torch.int64), seq_len


In [0]:
class TranslationPairDataset(Dataset):
  def __init__(self, path, max_len=15):
    # 単語数が多い文章をフィルタリングする関数
    def filter_pair(p):
      return not (len(p[0]) > max_len or len(p[1]) > max_len )

    # ファイルを開き、パース/フィルタリングをする
    with open(path) as fp:
      pairs = map(parse_line, fp)
      pairs = filter(filter_pair, pairs)
      pairs = list(pairs)

    # 文章のペアをソースとターゲットに分ける
    src = [p[0] for p in pairs]
    trg = [p[1] for p in pairs]
    # それぞれの語彙集を作成する
    self.src_word_list, self.src_word_dict = build_vocab(itertools.chain.from_iterable(src))
    self.trg_word_list, self.trg_word_dict = build_vocab(itertools.chain.from_iterable(trg))

    # 語彙集を使用してTensorに変換する
    self.src_data = [words2tensor(
        words, self.src_word_dict, max_len)
          for words in src]

    # -100でパディングすることでPyTorchの損失関数の計算に含めないようにするらしい。
    # 可変長の系列の扱いが容易になるらしい。
    self.trg_data = [words2tensor(
        words, self.trg_word_dict, max_len, -100)
          for words in trg]

  def __len__(self):
    return len(self.src_data)

  def __getitem__(self, idx):
    src, lsrc = self.src_data[idx]
    trg, ltrg = self.trg_data[idx]
    return src, lsrc, trg, ltrg


In [9]:
pwd

'/content'

In [0]:
# DatasetとDataLoaderの作成
batch_size = 64
max_len = 10

path = "/content/spa.txt"
ds = TranslationPairDataset(path, max_len=max_len)
loader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=4)

In [0]:
# Encoderの作成
class Encoder(nn.Module):
  def __init__(self, num_embeddings,
               embedding_dim=50,
               hidden_size=50,
               num_layers=1,
               dropout=0.2):
    super().__init__()
    self.emb = nn.Embedding(num_embeddings, embedding_dim,
                            padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim,
                        hidden_size,
                        num_layers,
                        batch_first=True,
                        dropout=dropout)
    
  def forward(self, x, h0=None, l=None):
    x = self.emb(x)
    if l is not None:
      x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
    # 内部状態のみをDecoderにわたすので、出力は破棄している。
    _, h = self.lstm(x, h0)
    return h

In [0]:
# Decoderの作成
class Decoder(nn.Module):
  def __init__(self, num_embeddings,
               embedding_dim=50,
               hidden_size=50,
               num_layers=1,
               dropout=0.2):
    super().__init__()
    # 畳み込み層
    self.emb = nn.Embedding(num_embeddings, embedding_dim,padding_idx=0)
    # LSTM
    self.lstm = nn.LSTM(embedding_dim, hidden_size,
                        num_layers,batch_first=True,
                        dropout=dropout)
    # 線形
    self.linear = nn.Linear(hidden_size, num_embeddings)

  def forward(self, x, h, l=None):
    x = self.emb(x)
    if l is not None:
      x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
    x, h = self.lstm(x, h)
    if l is not None:
      x = nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0)[0]
    x = self.linear(x)
    return x, h

In [0]:
# 翻訳する関数の作成
def translate(input_str, enc, dec, max_len=15,
              device="cpu"):
  # 入力文字列を数値化してTensorに変換
  words = normalize(input_str).split()
  input_tensor, seq_len = words2tensor(words, ds.src_word_dict, max_len=max_len)
  input_tensor = input_tensor.unsqueeze(0)
  
  # Encoderで使用するので入力の長さもリストにしておく
  seq_len = [seq_len]

  # 開始トークンを準備
  sos_inputs = torch.tensor(sos, dtype=torch.int64)
  input_tensor = input_tensor.to(device)
  sos_inputs = sos_inputs.to(device)

  # 入力文字列をEncoderに入れてコンテキストを得る
  ctx = enc(input_tensor, l=seq_len)

  # 開始トークンとコンテキストをDecoderの初期値にセット
  z = sos_inputs
  h = ctx
  results = []
  for i in range(max_len):
    # Decoderで次の単語を予測
    o, h = dec(z.view(1,1), h)

    # 線形層の出力が最も大きい場所が次の単語のID
    wi = o.detach().view(-1).max(0)[1]
    if wi.item() == eos:
      break
    results.append(wi.item())
    # 次の入力は今回の出力のIDを使用する
    z = wi
  # 記憶しておいた出力のIDを文字列に変換
  return " ".join(ds.trg_word_list[i] for i in results)


In [14]:
# 関数の動作の確認
enc = Encoder(len(ds.src_word_list), 100, 100, 2)
dec = Decoder(len(ds.trg_word_list), 100, 100, 2)
translate("I am a student.", enc, dec)

'quédatelo revueltos porciento gustaba desafortunadamente usara apóyame apóyame arde admitirá h2o propio sarcástico sarcástico indefensión'

In [0]:
# モデルの学習
enc = Encoder(len(ds.src_word_list), 100, 100, 2)
dec = Decoder(len(ds.trg_word_list), 10, 100, 2)
enc.to("cuda:0")
dec.to("cuda:0")
# パラメータはAdam
opt_enc = optim.Adam(enc.parameters(), 0.002)
opt_dec = optim.Adam(dec.parameters(), 0.002)
# 損失関数
loss_f = nn.CrossEntropyLoss()

In [0]:
# 学習
from statistics import mean

def to2D(x):
  shapes = x.shape
  return x.reshape(shapes[0] * shapes[1], -1)

for epoc in range(30):
  # ネットワークを訓練モードにする
  enc.train(), dec.train()
  losses = []
  for x, lx, y, ly in tqdm.tqdm(loader):
    # xのPackedSequenceを作るために翻訳元の長さで降順にソート
    lx, sort_idx = lx.sort(descending=True)
    x, y, ly = x[sort_idx], y[sort_idx], ly[sort_idx]
    x, y = x.to("cuda:0"), y.to("cuda:0")
    # 翻訳元をEncoderに入れてコンテキストを得る
    ctx = enc(x, l=lx)

    # yのPackedSequenceを作るために翻訳先の長さで降順にソート
    ly, sort_idx = ly.sort(descending=True)
    y = y[sort_idx]
    # Decoderの初期値をセット
    h0 = (ctx[0][:, sort_idx, :], ctx[1][:, sort_idx, :])
    z = y[:, :-1].detach()

    # -100のままだとEmbeddingの計算でエラーが出てしまうので値を0に変更しておく
    z[z==-100] = 0
    # Decoderに通して損失関数を計算
    o, _ = dec(z, h0, l = ly-1)
    loss = loss_f(to2D(o[:]), to2D(y[:, 1:max(ly)]).squeeze())

    # Backpropagationを実行
    # 勾配をゼロにする
    enc.zero_grad(), dec.zero_grad()
    # 誤差逆伝播法
    loss.backward()
    # 勾配を更新する
    opt_enc.step(), opt_dec.step()
    # 損失関数を更新する
    losses.append(loss.item())

  # データセットに対して一通り計算したら現在の損失関数の値や翻訳結果を表示
  enc.eval(), dec.eval()
  print(epoc, mean(losses))
  with torch.no_grad():
    print(translate('I am a student.', enc, dec, max_len, device="cuda:0"))
    print(translate("He likes to eat pizza.",enc, dec, max_len, device="cuda:0"))
    print(translate("She is my mother.",enc, dec, max_len, device="cuda:0"))
     

100%|██████████| 1655/1655 [00:26<00:00, 61.39it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

0 5.488104742747419
un poco .
a tom a tiempo .
mi mi padre .


100%|██████████| 1655/1655 [00:26<00:00, 61.74it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

1 3.4714398600183585
un error .
a nadie .
a mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 60.67it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

2 2.218498650634397
un estudiante .
a nadie .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.05it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

3 1.696375139746421
un estudiante .
a comer .
es mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.07it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

4 1.4431026473866488
un estudiante .
a comer como comer .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.20it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

5 1.2708912077267005
un estudiante .
a comer a comer .
es mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 64.53it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

6 1.1405899618687587
un estudiante .
a comer pizza .
es mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 61.50it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

7 1.0382784509946932
un estudiante .
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 61.42it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

8 0.9568050959319147
un estudiante .
a comer pizza .
es mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.04it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

9 0.8905366038988004
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.25it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

10 0.8369986615152157
un estudiante .
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.07it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

11 0.7913984891151014
un estudiante .
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 63.02it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

12 0.751652119346976
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 60.81it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

13 0.718876376000776
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.06it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

14 0.6892945402338425
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.13it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

15 0.6640722540750245
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 62.02it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

16 0.6412687392033118
a estudiante .
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 62.00it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

17 0.6203101562589317
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 63.66it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

18 0.6030126099680125
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 61.35it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

19 0.5849999731040794
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 61.30it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

20 0.5699995981422435
un estudiante .
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.24it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

21 0.5562672111202943
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:26<00:00, 61.50it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

22 0.5426408112409252
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.06it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

23 0.530776603632463
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 61.05it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

24 0.5190069630246869
un estudiante .
a comer pizza .
a mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 60.67it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

25 0.5093481370329497
.
a comer pizza .
mi madre .


100%|██████████| 1655/1655 [00:27<00:00, 60.94it/s]
  0%|          | 0/1655 [00:00<?, ?it/s]

26 0.49907839807859
soy estudiante .
a comer pizza .
mi madre .


 21%|██        | 351/1655 [00:05<00:21, 60.15it/s]