<a href="https://colab.research.google.com/github/KamonohashiPerry/PyTorch/blob/master/PyTorch_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorchのインストール
+ Colabの編集→ノートブックの設定でGPUモードにすることをお忘れなく。

In [0]:
# pip install http://download.pytorch.org/whl/cu80/torch-0.4.0-cp36-cp36m-linux_x86_64.whl

In [0]:
# pip install torchvision

In [0]:
# pip install tqdm

インストールできたかの確認

In [0]:
import torch

In [5]:
torch.tensor([1, 2, 3]).to("cuda:0")

tensor([1, 2, 3], device='cuda:0')

In [0]:
# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
# !apt-get update -qq 2>&1 > /dev/null
# !apt-get -y install -qq google-drive-ocamlfuse fuse

# from google.colab import auth
# auth.authenticate_user()
# from oauth2client.client import GoogleCredentials
# creds = GoogleCredentials.get_application_default()
# import getpass
# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL

# vcode = getpass.getpass()
# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# # "drive" is the mount point for the root directory of Google Drive
# !mkdir -p drive
# !google-drive-ocamlfuse drive
# !ls drive/"Colab Notebooks"/PyTorch

In [0]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)
import tqdm

import warnings
warnings.filterwarnings('ignore')

In [8]:
!wget http://www.manythings.org/anki/spa-eng.zip

--2019-12-17 11:57:10--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4752884 (4.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2019-12-17 11:57:17 (2.91 MB/s) - ‘spa-eng.zip’ saved [4752884/4752884]



In [9]:
!unzip spa-eng.zip

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [10]:
ls

_about.txt  [0m[01;34msample_data[0m/  spa-eng.zip  spa.txt


In [0]:
# from google.colab import files
# files.download("spa.txt")

In [0]:
import re
import collections
import itertools

# Characters and markup dropped entirely during normalization: , ( ) [ ] * : ; ¿ ¡
# and anything of the form <...>.
# Raw strings are used so that the regex backslashes are not parsed as
# (invalid) Python string escapes; the compiled patterns are unchanged.
remove_marks_regex = re.compile(r"[\,\(\)\[\]\*:;¿¡]|<.*?>")
# Sentence-final marks (? ! .) that get a space inserted in front of them
# so they become separate tokens.
shift_marks_regex = re.compile(r"([?!\.])")

# Vocabulary indices reserved for the special tags.
unk = 0  # unknown word
sos = 1  # start of sentence
eos = 2  # end of sentence

def normalize(text):
  """Lower-case `text`, drop unwanted marks, and detach ?/!/. from words.

  The characters matched by `remove_marks_regex` are deleted, then a space
  is inserted in front of every match of `shift_marks_regex` so that the
  punctuation becomes its own whitespace-separated token.
  """
  lowered = text.lower()
  cleaned = remove_marks_regex.sub("", lowered)
  return shift_marks_regex.sub(r" \1", cleaned)

def parse_line(line):
  """Turn one tab-separated corpus line into (source tokens, target tokens).

  The first column is the source sentence and the second the target
  sentence; any further columns (e.g. an attribution column) are ignored.

  The line is split on tabs BEFORE normalization: the `<.*?>` removal in
  `normalize` can otherwise match from a `<` in one column to a `>` in
  another and delete the tab separator along with it.

  Raises:
    ValueError: if the line has fewer than two tab-separated columns.
  """
  fields = line.strip().split("\t")
  if len(fields) < 2:
    raise ValueError(
        "expected at least 2 tab-separated columns, got %d: %r"
        % (len(fields), line))
  # str.split() with no argument also discards leading/trailing whitespace
  src_tokens = normalize(fields[0]).split()
  trg_tokens = normalize(fields[1]).split()
  return src_tokens, trg_tokens

In [0]:
def build_vocab(tokens):
  """Build an index->word list and a word->index dict from a token stream.

  Args:
    tokens: iterable of every token occurrence in the corpus.

  Returns:
    (word_list, word_dict): `word_list` starts with the three special tags
    "<UNK>", "<SOS>", "<EOS>" followed by the distinct tokens ordered from
    most to least frequent; `word_dict` maps each word to its position in
    `word_list`.
  """
  # most_common() orders by descending count; equal counts keep the order
  # in which the tokens were first seen.
  ranked = collections.Counter(tokens).most_common()
  word_list = ["<UNK>", "<SOS>", "<EOS>"]
  word_list.extend(token for token, _ in ranked)
  word_dict = {word: index for index, word in enumerate(word_list)}
  return word_list, word_dict

def words2tensor(words, word_dict, max_len, padding=0):
  """Convert a list of tokens into a padded int64 tensor of word indices.

  Args:
    words: list of tokens (without the end tag).
    word_dict: mapping from word to index; unknown words map to 0.
    max_len: maximum number of tokens expected in `words`.
    padding: value used to fill the positions after the end tag.

  Returns:
    (tensor, seq_len): the index tensor, padded up to `max_len + 1`
    entries when shorter, and the unpadded length including the end tag.
  """
  # Append the end tag, then look every token up in the dictionary
  ids = [word_dict.get(token, 0) for token in words + ["<EOS>"]]
  seq_len = len(ids)
  # Multiplying a list by a non-positive count yields [], so sequences
  # that already have max_len + 1 or more entries are left untouched.
  ids.extend([padding] * (max_len + 1 - seq_len))
  return torch.tensor(ids, dtype=torch.int64), seq_len


In [0]:
class TranslationPairDataset(Dataset):
  """Dataset of (source, target) sentence pairs read from a tab-separated file.

  Each item is `(src, src_len, trg, trg_len)` where `src`/`trg` are int64
  index tensors produced by `words2tensor` and the lengths are the unpadded
  lengths including the end tag.

  Args:
    path: path to the UTF-8 encoded corpus file, one sentence pair per line.
    max_len: pairs in which either side has more than `max_len` tokens
      are dropped.
  """

  def __init__(self, path, max_len=15):
    # Keep only pairs in which both sentences have at most max_len tokens
    def filter_pair(p):
      return not (len(p[0]) > max_len or len(p[1]) > max_len )

    # Open the file, then parse and filter it.
    # The encoding is given explicitly: the corpus contains non-ASCII
    # characters and the platform default encoding is not always UTF-8.
    with open(path, encoding="utf-8") as fp:
      # Blank lines carry no sentence pair and cannot be parsed; skip them
      lines = (line for line in fp if line.strip())
      pairs = map(parse_line, lines)
      pairs = filter(filter_pair, pairs)
      pairs = list(pairs)

    # Separate the sentence pairs into source and target
    src = [p[0] for p in pairs]
    trg = [p[1] for p in pairs]
    # Build a vocabulary for each side
    self.src_word_list, self.src_word_dict = build_vocab(itertools.chain.from_iterable(src))
    self.trg_word_list, self.trg_word_dict = build_vocab(itertools.chain.from_iterable(trg))

    # Convert the sentences to tensors using the vocabularies
    self.src_data = [words2tensor(
        words, self.src_word_dict, max_len)
          for words in src]

    # Targets are padded with -100, the default `ignore_index` of
    # nn.CrossEntropyLoss, so padded positions can be left out of the loss.
    # NOTE(review): the loss itself is defined outside this block - confirm
    # it keeps that default.
    self.trg_data = [words2tensor(
        words, self.trg_word_dict, max_len, -100)
          for words in trg]

  def __len__(self):
    """Return the number of sentence pairs kept after filtering."""
    return len(self.src_data)

  def __getitem__(self, idx):
    """Return `(src, src_len, trg, trg_len)` for the pair at `idx`."""
    src, lsrc = self.src_data[idx]
    trg, ltrg = self.trg_data[idx]
    return src, lsrc, trg, ltrg


In [0]:
pwd

'/content'

In [0]:
# Build the Dataset and the DataLoader
batch_size = 64
max_len = 10  # pairs with more than 10 tokens on either side are dropped by the dataset

path = "/content/spa.txt"  # spa.txt extracted from spa-eng.zip into the working directory
ds = TranslationPairDataset(path, max_len=max_len)
loader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=4)

In [0]:
# Encoderの作成
class Encoder(nn.Module):
  """Embedding + LSTM encoder that returns only the final LSTM state.

  Args:
    num_embeddings: size of the source vocabulary.
    embedding_dim: size of each embedding vector.
    hidden_size: number of features in the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
    dropout: dropout passed to nn.LSTM (PyTorch applies it only between
      stacked layers, so it has no effect when `num_layers` is 1).
  """

  def __init__(self, num_embeddings,
               embedding_dim=50,
               hidden_size=50,
               num_layers=1,
               dropout=0.2):
    super().__init__()
    self.emb = nn.Embedding(num_embeddings, embedding_dim,
                            padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim,
                        hidden_size,
                        num_layers,
                        batch_first=True,
                        dropout=dropout)

  # forward must be defined at class level: nested inside __init__ it would
  # be an unused local function and nn.Module would have no forward to call.
  def forward(self, x, h0=None, l=None):
    """Encode a batch of index sequences.

    Args:
      x: integer tensor of word indices, batch first.
      h0: optional initial LSTM state, passed straight to nn.LSTM.
      l: optional sequence lengths; when given, the embedded batch is
        packed with `pack_padded_sequence` before entering the LSTM.

    Returns:
      The final LSTM state as returned by nn.LSTM, i.e. `(h_n, c_n)`.
    """
    x = self.emb(x)
    if l is not None:
      x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
    # Only the internal state is handed to the Decoder, so the per-step
    # outputs are discarded.
    _, h = self.lstm(x, h0)
    return h

In [0]:
# Decoderの作成
class Decoder(nn.Module):
  """Embedding + LSTM + linear decoder producing per-step vocabulary scores.

  Args:
    num_embeddings: size of the target vocabulary (also the output size of
      the final linear layer).
    embedding_dim: size of each embedding vector.
    hidden_size: number of features in the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
    dropout: dropout passed to nn.LSTM (PyTorch applies it only between
      stacked layers, so it has no effect when `num_layers` is 1).
  """

  def __init__(self, num_embeddings,
               embedding_dim=50,
               hidden_size=50,
               num_layers=1,
               dropout=0.2):
    super().__init__()
    self.emb = nn.Embedding(num_embeddings, embedding_dim,padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim, hidden_size,
                        num_layers,batch_first=True,
                        dropout=dropout)
    self.linear = nn.Linear(hidden_size, num_embeddings)

  def forward(self, x, h, l=None):
    """Run the decoder over a batch of index sequences.

    Args:
      x: integer tensor of word indices, batch first.
      h: initial LSTM state (e.g. the state returned by the encoder).
      l: optional sequence lengths; when given, the input is packed before
        the LSTM and the output is padded back with zeros afterwards.

    Returns:
      (scores, state): the linear-layer output for every time step and the
      final LSTM state.
    """
    x = self.emb(x)
    if l is not None:
      x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
    # The LSTM, the projection and the return must run whether or not
    # lengths were supplied; only packing/unpacking is conditional.
    x, h = self.lstm(x, h)
    if l is not None:
      x = nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=0)[0]
    x = self.linear(x)
    return x, h