## Translation using tensor2tensor on Cloud ML Engine

This notebook illustrates using the <a href="https://github.com/tensorflow/tensor2tensor">tensor2tensor</a> library to do from-scratch, distributed training of an English-German translator. Then, the trained model is deployed to Cloud ML Engine and used to translate new pieces of text.
<p/>
### Install tensor2tensor, and specify Google Cloud Platform project and bucket

In [None]:
%%bash
# Install the tensor2tensor library into the current environment.
# NOTE(review): the version is not pinned; ende/problem.py below calls
# wmt._compile_data (a private helper), so a newer release may not be
# compatible -- consider pinning once the working version is confirmed.
pip install tensor2tensor

In [None]:
import os
# Google Cloud settings -- edit these three values for your own environment.
PROJECT = 'cloud-training-demos'  # REPLACE WITH YOUR PROJECT ID
BUCKET = 'cloud-training-demos-ml'  # REPLACE WITH YOUR BUCKET NAME
REGION = 'us-central1'  # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# Export the same values as environment variables so the bash cells
# (which reference e.g. ${BUCKET}) can read them.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

### Download data


In [None]:
%%bash
# Download the two WMT17 archives into the current working directory:
# the news-commentary v12 parallel training data and the dev sets.
wget http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
wget http://data.statmt.org/wmt17/translation-task/dev.tgz

In [None]:
%%bash
# Copy both downloaded archives to the bucket.
# -m is a top-level gsutil option (parallel operations), so it has to come
# before the "cp" sub-command rather than after it.
gsutil -m cp training-parallel-nc-v12.tgz dev.tgz "gs://${BUCKET}/translate_ende/"

### Set up a Problem
The Problem in tensor2tensor is where you specify parameters like the size of your vocabulary and where to get the training data from.

In [None]:
%%bash
# Start from an empty ./ende directory; the following cells write the
# package files (__init__.py, problem.py) into it.
rm -rf ende
mkdir ende

In [None]:
%%writefile ende/__init__.py
# Importing the sibling module executes its @registry.register_problem
# decorator when the ende package itself is imported.
from . import problem

In [None]:
# Print the notebook's working directory, i.e. where the cells above
# downloaded the archives and created ./ende.
!pwd

In [None]:
%writefile ende/problem.py
import tensorflow as tf
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import wsj_parsing
import tensor2tensor.data_generators.wmt as wmt
from tensor2tensor.utils import registry

# Base location that the archive URLs below are built from.
# Alternative (bucket-hosted copy of the archives):
#TOPDIR="gs://{}/translate_ende/".format("BUCKET_NAME")
# NOTE(review): this is an absolute local path; confirm it matches the
# directory the archives were actually downloaded to.
TOPDIR = "file:///content/training-data-analyst/blogs/t2t"

# Each entry is [archive location, (English file, German file)]; the file
# names are presumably paths inside the archive -- TODO confirm.
_ENDE_TRAIN_DATASETS = [[
    "{}/training-parallel-nc-v12.tgz".format(TOPDIR),
    (
        "training/news-commentary-v12.de-en.en",
        "training/news-commentary-v12.de-en.de",
    ),
]]

# Same layout as the training list, pointing at the newstest2013 dev files.
_ENDE_TEST_DATASETS = [[
    "{}/dev.tgz".format(TOPDIR),
    ("dev/newstest2013.en", "dev/newstest2013.de"),
]]

@registry.register_problem
class MyTranslateProblem(wmt.TranslateProblem):
  """Custom translation problem built on wmt.TranslateProblem.

  Uses the dataset lists defined in this module and declares EN_TOK as the
  input space and DE_TOK as the target space.
  """

  @property
  def targeted_vocab_size(self):
    """Vocabulary size requested from the vocab builder (8192)."""
    return 2 ** 13

  def generator(self, data_dir, tmp_dir, train):
    """Return the token generator for the training or dev split.

    Args:
      data_dir: passed through to generator_utils.get_or_generate_vocab.
      tmp_dir: passed to both the vocab helper and wmt._compile_data.
      train: truthy selects the training datasets (tag "train"); falsy
        selects the test datasets (tag "dev").

    Returns:
      Whatever wmt.token_generator returns for the compiled ".lang1" /
      ".lang2" files, the vocabulary and text_encoder.EOS_ID.
    """
    # The vocabulary always comes from the training datasets, regardless
    # of which split is being generated.
    vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.vocab_file,
        self.targeted_vocab_size,
        sources=_ENDE_TRAIN_DATASETS)

    if train:
      split_datasets, split_tag = _ENDE_TRAIN_DATASETS, "train"
    else:
      split_datasets, split_tag = _ENDE_TEST_DATASETS, "dev"

    compiled_prefix = wmt._compile_data(
        tmp_dir, split_datasets, "wmt_ende_tok_%s" % split_tag)
    source_file = compiled_prefix + ".lang1"
    target_file = compiled_prefix + ".lang2"
    return wmt.token_generator(
        source_file, target_file, vocab, text_encoder.EOS_ID)

  @property
  def input_space_id(self):
    """Space id of the inputs: tokenized English."""
    return problem.SpaceID.EN_TOK

  @property
  def target_space_id(self):
    """Space id of the targets: tokenized German."""
    return problem.SpaceID.DE_TOK

In [None]:
%%bash
# Name under which the problem is selected; the alternative below is a
# different problem name kept for reference.
PROBLEM=my_translate_problem
#PROBLEM=translate_ende_wmt8k
DATA_DIR=./t2t_data
TMP_DIR=$DATA_DIR/tmp

# Start from clean output directories (TMP_DIR lives inside DATA_DIR).
rm -rf "$DATA_DIR" "$TMP_DIR"
mkdir -p "$DATA_DIR" "$TMP_DIR"

# Generate data, loading the user-defined problem from the ./ende package.
t2t-datagen \
  --t2t_usr_dir=./ende \
  --problem="$PROBLEM" \
  --data_dir="$DATA_DIR" \
  --tmp_dir="$TMP_DIR"