### Initialization
* Check whether the runtime is host or local.
* Mount Google Drive when using the host runtime.

In [0]:
# Decide which runtime the notebook is running on.
# Only a failed import of `google.colab` is treated as "local"; any other
# problem (for example an error raised while mounting Google Drive, or a
# keyboard interrupt) is allowed to surface instead of being silently
# reported as a local runtime.
try:
    from google.colab import drive
except ImportError:
    runtime = "local"
else:
    # The import succeeded, so mount Google Drive at /gdrive.
    drive.mount('/gdrive')
    runtime = "host"

### Parameters

In [0]:
#@markdown |Name  |Description|
#@markdown |:---  |:---|
#@markdown |`seed`|The random seed|
seed = 20367 #@param {type: "number"}

#@markdown ### `nlprog` Repositories
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`repository_url`|The URL of `nlprog` git repository (enabled only in the host runtime)|
#@markdown |`branch_name`   |The branch name (enabled only in the host runtime)|
repository_url = "https://github.com/HiroakiMikami/NL2Prog" #@param {type: "string"}
branch_name = "master" #@param {type: "string"}

#@markdown ### Dataset Settings
#@markdown |Name               |Description|
#@markdown |:---               |:---|
#@markdown |`word_threshold`   |The threshold given to the query word encoder (`LabelEncoder`)|
#@markdown |`token_threshold`  |The threshold given to the `ActionSequenceEncoder`|
word_threshold = 5 #@param {type: "number"}
token_threshold = 5 #@param {type: "number"}

#@markdown ### Model Parameters
#@markdown |Name                     |Description|
#@markdown |:---                     |:---|
#@markdown |`embedding_dim`          |The dimension of word, token, and rule embeddings|
#@markdown |`node_type_embedding_dim`|The dimension of node type embeddings|
#@markdown |`lstm_state_size`        |The size of LSTM state|
#@markdown |`hidden_state_size`      |The size of attention hidden state|
embedding_dim = 128 #@param {type: "number"}
node_type_embedding_dim = 64 #@param {type: "number"}
lstm_state_size = 256 #@param {type: "number"}
hidden_state_size = 50 #@param {type: "number"}

#@markdown ### Training Settings
#@markdown |Name        |Description|
#@markdown |:---        |:---|
#@markdown |`batch_size`|The minibatch size|
#@markdown |`dropout`   |The probability of dropout|
#@markdown |`num_epochs`|The number of epochs|
#@markdown |`num_train` |The number of entries used for training (`0` means that all entries are used)|
#@markdown |`num_save_models`|The number of models to be saved.|
batch_size = 10 #@param {type: "number"}
dropout = 0.2 #@param {type: "number"}
num_epochs = 50 #@param {type: "number"}
num_train = 0 #@param {type: "number"}
num_save_models = 3 #@param {type: "number"}

#@markdown ### Other Settings
#@markdown |Name    |Description|
#@markdown |:---    |:---|
#@markdown |`device`|The id of GPU. `-1` means that CPU is used.|
device = 0 #@param {type: "number"}

#@markdown ### File Paths
#@markdown |Name                 |Description|
#@markdown |:---                 |:---|
#@markdown |`output_dir_path`    |The path of the directory that will contain the training results.|
output_dir_path = "/gdrive/My Drive/NL2Prog/django/nl2code" #@param {type: "string"}

### Setup
* Download the codebase (when using the host runtime)
  1. Clone git repository and move to the specified branch
  2. Install modules
* Use GPU
* Fix the random seed

In [0]:
# On the host runtime, fetch a fresh copy of the codebase and install it.
if runtime == "host":
    %cd /content
    # Remove any previous checkout so that the clone below starts from scratch.
    !rm -rf NL2Prog
    !git clone $repository_url NL2Prog
    %cd NL2Prog
    !git checkout $branch_name
    # Install the checked-out package from the repository root.
    !pip install .
# load tqdm
# This install is outside the `if`, so it runs on both host and local runtimes.
# NOTE(review): the archive URL points at a branch of a tqdm fork and is not
# pinned to a commit, so the installed code may change over time - confirm
# whether the fork is still required.
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip

In [0]:
import torch
# `device == -1` is the CPU sentinel; any other value is handed to CUDA as the
# device to make current.
cuda_requested = device != -1
if cuda_requested:
    torch.cuda.set_device(device)

In [0]:
import numpy as np
import random
import torch

# Exclusive upper bound used when drawing the derived per-library seeds.
SEED_MAX = 2**32 - 1

# A dedicated RandomState seeded with `seed` hands out one derived seed per
# library. The draws happen in the order python, numpy, torch; `root_rng` is
# separate from the global states that are seeded below.
root_rng = np.random.RandomState(seed)
python_seed, numpy_seed, torch_seed = [
    root_rng.randint(SEED_MAX) for _ in range(3)
]
random.seed(python_seed)
np.random.seed(numpy_seed)
torch.manual_seed(torch_seed)

### Setup training
* Load the dataset
* Split the dataset into train, test, valid
* Create and save encoder
* Prepare dataset
* Create model
* Create optimizer

In [0]:
from nl2prog.dataset.django import download
# Obtain the dataset through the `nl2prog.dataset.django` download helper.
# The result is indexed by split name below (e.g. `dataset["train"]`).
dataset = download()

In [0]:
from nl2prog.utils.data import ListDataset, Entry
# `num_train == 0` keeps the whole training split; any other value keeps only
# the first `num_train` entries, wrapped in a ListDataset.
full_train_split = dataset["train"]
train_raw_dataset = (
    full_train_split
    if num_train == 0
    else ListDataset(list(full_train_split)[:num_train])
)

In [0]:
from torchnlp.encoders import LabelEncoder
from nl2prog.encoders import ActionSequenceEncoder
from nl2prog.utils.data import get_samples, get_words
from nl2prog.utils.python import tokenize_token
from nl2prog.dataset.django import parse, tokenize_query
from nl2prog.language.action import code_to_action_sequence as to_seq
import pickle
import os

def to_action_sequence(x):
    """Convert `x` with `to_seq`, using `parse` and `tokenize_token`."""
    return to_seq(x, parse, tokenize=tokenize_token)


# Both encoders are built from the (possibly truncated) training split only.
words = get_words(train_raw_dataset, tokenize_query)
samples = get_samples(train_raw_dataset, tokenize_token, to_action_sequence)
qencoder = LabelEncoder(words, word_threshold)
aencoder = ActionSequenceEncoder(samples, token_threshold)

# Store both encoders in a single pickle file inside the output directory.
os.makedirs(output_dir_path, exist_ok=True)
encoder_path = os.path.join(output_dir_path, "encoder.pickle")
encoders = {
    "query_encoder": qencoder,
    "action_sequence_encoder": aencoder,
}
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(encoders, encoder_file)

In [0]:
from nl2prog.utils.transform \
    import TransformDataset, TransformCode, TransformGroundTruth
from nl2prog.utils.transform.nl2code import TransformQuery, TransformEvaluator

# Build the individual transforms:
#   tquery - from the query tokenizer and the query encoder
#   tcode  - from the code-to-action-sequence converter
#   teval  - from the action sequence encoder
#   tgt    - from the action sequence encoder
# NOTE(review): what each transform produces is defined in nl2prog and is not
# visible here - confirm against the library before relying on it.
tquery = TransformQuery(tokenize_query, qencoder)
tcode = TransformCode(to_action_sequence)
teval = TransformEvaluator(aencoder)
tgt = TransformGroundTruth(aencoder)
# Combine the four transforms and apply them to the raw training split.
transform = TransformDataset(tquery, tcode, teval, tgt)
train_dataset = transform(train_raw_dataset)

In [0]:
from nl2prog.nn.nl2code import TrainModel
# The model is configured from the two encoders and the sizes chosen in the
# parameter cell.
model = TrainModel(
    qencoder,
    aencoder,
    embedding_dim,
    node_type_embedding_dim,
    lstm_state_size,
    hidden_state_size,
    dropout,
)
# Move the model to the GPU unless the CPU sentinel (-1) was configured.
if device != -1:
    model = model.cuda()

In [0]:
import torch.optim as optim
# Adam over all model parameters; no hyperparameters are passed, so the
# optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

### Training Loop
* Run training

In [0]:
from tqdm import tqdm_notebook as tqdm
import torch
from torch.utils.data import DataLoader
from nl2prog.utils import TopKModel
from nl2prog.nn import Loss, Accuracy
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils.data.nl2code import collate_train_dataset


# Checkpoints are written below <output_dir_path>/models. TopKModel receives
# the number of models to keep and the directory to write to.
model_dir_path = os.path.join(output_dir_path, "models")
os.makedirs(model_dir_path, exist_ok=True)
models = TopKModel(num_save_models, model_dir_path)
loss_function = Loss()
acc_function = Accuracy()

# The loader's arguments never change between epochs, so it is built once.
# With shuffle=True a new order is still drawn every time it is iterated.
loader = DataLoader(train_dataset, batch_size=batch_size,
                    shuffle=True, num_workers=4,
                    collate_fn=collate_train_dataset)
num_batches = len(loader)

for epoch in tqdm(range(num_epochs)):
    # Running means over the batches of this epoch.
    avg_loss = 0.0
    avg_acc = 0.0
    model.train()
    for (query, action, prev_action), ground_truth in loader:
        # Pad every group of sequences with -1 as the padding value.
        query, action, prev_action, ground_truth = [
            nrnn.pad_sequence(sequences, padding_value=-1)
            for sequences in (query, action, prev_action, ground_truth)
        ]
        if device != -1:
            # Move both the values and the mask of each padded batch to the GPU.
            for padded in (query, action, prev_action, ground_truth):
                padded.data = padded.data.cuda()
                padded.mask = padded.mask.cuda()

        rule_prob, token_prob, copy_prob, _, _ = model(query, action, prev_action)
        loss = loss_function(rule_prob, token_prob, copy_prob, ground_truth)
        # Accuracy is only reported, so it is computed without gradients.
        with torch.no_grad():
            acc = acc_function(rule_prob, token_prob, copy_prob, ground_truth)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Each batch contributes value / num_batches to the epoch mean.
        avg_loss += loss.item() / num_batches
        avg_acc += acc.item() / num_batches
    print(epoch, avg_loss, avg_acc)
    # The epoch's mean training accuracy is the score handed to TopKModel.
    models.save(avg_acc, str(epoch), model)