### Initialization
* Check whether the runtime is host or local.
* Mount Google Drive when using the host runtime.

In [0]:
# Detect where the notebook runs. On Colab ("host") the user is authenticated
# and Google Drive is mounted at /gdrive; anywhere else we fall back to "local".
try:
  from google.colab import auth
  auth.authenticate_user()
  from google.colab import drive
  drive.mount('/gdrive')
  runtime = "host"
except Exception:
  # Best-effort fallback: a missing `google.colab` package or a failed
  # authentication/mount both mean the host runtime is unavailable.
  # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit through.
  runtime = "local"

### Parameters

In [0]:
#@title Parameters
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`seed`|The random seed|
seed = 20367 #@param {type: "number"}

#@markdown ### `nl2prog` Repositories
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`repository_url`|The URL of `nl2prog` git repository (enabled only in the host runtime)|
#@markdown |`branch_name`   |The branch name (enabled only in the host runtime)|
repository_url = "https://github.com/HiroakiMikami/NL2Prog" #@param {type: "string"}
branch_name = "master" #@param {type: "string"}

#@markdown ### Dataset Settings
#@markdown |Name               |Description|
#@markdown |:---               |:---|
#@markdown |`max_action_length`|The maximum action length|
#@markdown |`word_threshold`   ||
#@markdown |`token_threshold`  ||
max_action_length = 350 #@param {type: "number"}
word_threshold = 3 #@param {type: "number"}
token_threshold = 0 #@param {type: "number"}

#@markdown ### Model Parameters
#@markdown |Name                     |Description|
#@markdown |:---                     |:---|
#@markdown |`embedding_dim`          |The dimension of word, token, and rule embeddings|
#@markdown |`node_type_embedding_dim`|The dimension of node type embeddings|
#@markdown |`lstm_state_size`        |The size of LSTM state|
#@markdown |`hidden_state_size`      |The size of attention hidden state|
embedding_dim = 128 #@param {type: "number"}
node_type_embedding_dim = 64 #@param {type: "number"}
lstm_state_size = 256 #@param {type: "number"}
hidden_state_size = 50 #@param {type: "number"}

#@markdown ### Training Settings
#@markdown |Name        |Description|
#@markdown |:---        |:---|
#@markdown |`batch_size`|The minibatch size|
#@markdown |`dropout`   |The probability of dropout|
#@markdown |`num_epochs`|The number of epochs|
#@markdown |`num_train` |The number of entries used for training (`0` means that the dataset is not truncated)|
batch_size = 10 #@param {type: "number"}
dropout = 0.2 #@param {type: "number"}
num_epochs = 50 #@param {type: "number"}
num_train =  0#@param {type: "number"}


#@markdown ### Evaluation Settings
#@markdown |Name                 |Description|
#@markdown |:---                 |:---|
#@markdown |`beam_size`          |The beam size|
#@markdown |`evaluation_interval`|The number of epochs between two evaluations|
beam_size = 15 #@param {type: "number"}
evaluation_interval = 5 #@param {type: "number"}

#@markdown ### Other Settings
#@markdown |Name    |Description|
#@markdown |:---    |:---|
#@markdown |`device`|The id of GPU. `-1` means that CPU is used.|
device = 0 #@param {type: "number"}

#@markdown ### File paths
#@markdown |Name                 |Description|
#@markdown |:---                 |:---|
#@markdown |`output_dir_path`    |The path of the directory that will contain the training results.|
#@markdown |`checkpoint_dir_path`|The path of the directory that will contain the checkpoints.|
output_dir_path = "/gdrive/My Drive/NL2Code/nl2bash/result/" #@param {type: "string"}
checkpoint_dir_path = "/gdrive/My Drive/NL2Code/nl2bash/checkpoint/" #@param {type: "string"}

### Setup
* Download the codebase (when using the host runtime)
  1. Clone git repository and move to the specified branch
  2. Install modules
* Use GPU
* Fix the random seed

In [0]:
if runtime == "host":
    %cd /content
    !rm -rf NL2Prog
    #![ ! -e NL2Code ] && git clone $repository_url NL2Prog
    !gcloud source repos clone NL2Prog --project development-environment-192405
    !mv NL2Prog NL2Prog
    %cd NL2Prog
    #!git checkout $branch_name
    !git checkout origin/feature/treegen
    !pip install -e .
    !pip install -e . ".[examples]"
# load tqdm
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip

In [0]:
import torch
# A device id of -1 means "run on CPU"; any other value is treated as the
# index of the GPU that should become the current CUDA device.
use_gpu = device != -1
if use_gpu:
    torch.cuda.set_device(device)

In [0]:
import numpy as np
import random
import torch

# Exclusive upper bound passed to `randint` when deriving per-library seeds.
SEED_MAX = 2**32 - 1

# A single root generator, seeded from the `seed` parameter, hands out one
# derived seed per library so the three random streams differ from each other
# while the whole run stays reproducible.
root_rng = np.random.RandomState(seed)
python_seed = root_rng.randint(SEED_MAX)
numpy_seed = root_rng.randint(SEED_MAX)
torch_seed = root_rng.randint(SEED_MAX)

random.seed(python_seed)
np.random.seed(numpy_seed)
torch.manual_seed(torch_seed)

### Setup training
* Load the dataset
* Split the dataset into train, test, valid
* Create and save encoder
* Prepare dataset
* Create model
* Create optimizer
* Prepare evaluation
* Load checkpoint

In [0]:
from nl2prog.dataset.nl2bash import download
# Load the NL2Bash dataset through `nl2prog`'s `download` helper. The result is
# indexed by split name ("train", "test", "valid") in the next cell.
# NOTE(review): the argument is presumably a cache/working directory; it is a
# hard-coded /content path (and says NL2Code, not NL2Prog) -- confirm that it
# also works on the local runtime.
dataset = download("/content/NL2Code/bin/")

In [0]:
from nl2prog.utils.data import ListDataset
# Pick the three splits out of the downloaded dataset.
train_raw_dataset = dataset["train"]
test_raw_dataset = dataset["test"]
val_raw_dataset = dataset["valid"]
if num_train != 0:
    # Small-scale mode: keep only the first `num_train` entries of EACH split.
    # The test/validation subsets are taken from their own split (not from the
    # training split), so evaluation never runs on examples seen in training.
    train_raw_dataset = ListDataset(list(train_raw_dataset)[:num_train])
    test_raw_dataset = ListDataset(list(test_raw_dataset)[:num_train])
    val_raw_dataset = ListDataset(list(val_raw_dataset)[:num_train])

In [0]:
from nl2prog.utils.data.nl2code import Encoder, get_samples
from nl2prog.utils.nl2code import to_action_sequence as to_seq
from nl2prog.dataset.nl2bash.nl2code import tokenize_query, tokenize_token
from nl2prog.language.bash import parse
import pickle
import os

def to_action_sequence(x):
    """Convert `x` to an action sequence using the bash `parse` function and
    `tokenize_token` (thin wrapper around `nl2prog.utils.nl2code.to_action_sequence`).
    """
    return to_seq(x, parse, tokenize_token)


# Create the output directory in Python rather than through the shell, so that
# paths with spaces or shell metacharacters are handled verbatim and a failure
# raises here instead of surfacing later in `open`.
os.makedirs(output_dir_path, exist_ok=True)

# Build the vocabulary/encoder from the training split only, then persist it
# next to the training results.
samples = get_samples(train_raw_dataset, tokenize_query, tokenize_token,
                      to_action_sequence)
encoder = Encoder(samples, word_threshold, token_threshold)
with open(os.path.join(output_dir_path, "encoder.pickle"), "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)


In [0]:
from nl2prog.utils.data.nl2code import to_train_dataset, to_eval_dataset
# Every split is converted with the same tokenizers, action-sequence converter
# and encoder; only the raw split and the conversion function differ.
conversion_args = (tokenize_query, tokenize_token, to_action_sequence, encoder)

train_dataset = to_train_dataset(train_raw_dataset, *conversion_args)
test_dataset = to_eval_dataset(test_raw_dataset, *conversion_args)
valid_dataset = to_eval_dataset(val_raw_dataset, *conversion_args)

In [0]:
from nl2prog.nn.nl2code import TrainModel
# Instantiate the training model from the fitted encoder and the model
# hyper-parameters defined in the parameter cell.
model = TrainModel(
    encoder,
    embedding_dim,
    node_type_embedding_dim,
    lstm_state_size,
    hidden_state_size,
    dropout,
)

# `device == -1` means CPU; for any other value move the model to CUDA.
run_on_gpu = device != -1
if run_on_gpu:
    model = model.cuda()

In [0]:
import torch.optim as optim
# Adam over all model parameters; no hyper-parameters are passed, so the
# optimizer uses PyTorch's defaults.
optimizer = optim.Adam(model.parameters())

In [0]:
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils.nl2code import BeamSearchSynthesizer, synthesize as _synthesize
from nl2prog.language.bash import is_subtype, parse, unparse
from nl2prog.metrics import Accuracy, Bleu

from typing import List

def encode_query(query: List[str]):
    """Encode a tokenized query with the model's encoder (inference only).

    Args:
        query: The query as a list of string tokens.

    Returns:
        The `.data` of the encoder output, reshaped with
        `view(len(query), -1)` so that the first dimension indexes the
        query tokens.
    """
    x = encoder.annotation_encoder.batch_encode(query)
    if device != -1:
        x = x.cuda()
    # This function is only used by the synthesizer at evaluation time, so no
    # autograd graph is needed; disabling it avoids keeping the encoder
    # activations alive while beam search runs.
    with torch.no_grad():
        embedding = model.encoder(nrnn.pad_sequence([x]))
    # NOTE(review): `.data` is kept as-is -- presumably the field of the padded
    # sequence object returned by the encoder rather than `Tensor.data`; confirm.
    embedding = embedding.data
    return embedding.view(len(query), -1)


# Beam-search synthesizer driven by the model's predictor. `max_steps` is set
# from the `max_action_length` parameter.
synthesizer = BeamSearchSynthesizer(
    beam_size,
    model.predictor,
    encoder.action_sequence_encoder,
    is_subtype,
    max_steps=max_action_length,
)

def synthesize(query: str):
    """Synthesize programs for `query`.

    Delegates to `nl2prog`'s `synthesize`, supplying this notebook's
    `encode_query` function and beam-search `synthesizer`.
    """
    synthesized = _synthesize(query, encode_query, synthesizer)
    return synthesized

# Evaluation metrics, keyed by the names later used to read the results
# (e.g. `result.metrics[3]["bleu"]` in the training loop).
accuracy, bleu = Accuracy(parse, unparse), Bleu(parse, unparse)
metrics = dict(accuracy=accuracy, bleu=bleu)


In [0]:
# Create the checkpoint directory in Python rather than through the shell, so
# the path is used verbatim and a failure raises immediately.
os.makedirs(checkpoint_dir_path, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir_path, "checkpoint.pickle")
if os.path.exists(checkpoint_path):
    # Resume: restore the model/optimizer state and continue from the epoch
    # stored in the checkpoint (the training loop stores `epoch + 1`).
    print("Load checkpoint: {}".format(checkpoint_path))
    # Load onto the CPU first; `load_state_dict` copies the values into the
    # already-constructed model/optimizer.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    start_epoch = checkpoint["epoch"]
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
else:
    # No checkpoint yet: train from scratch.
    start_epoch = 0

### Training Loop
* Run training

In [0]:
import torch
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import DataLoader
import torch.nn.utils.rnn as rnn
from nl2prog.utils import evaluate
from nl2prog.nn.nl2code import Loss, Accuracy
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils.data.nl2code import collate_train_dataset


loss_function = Loss()
acc_function = Accuracy()
best_model_path = os.path.join(output_dir_path, "best_model.pickle")
# NOTE(review): `best_score` is not stored in the checkpoint, so after resuming
# the first evaluation always replaces the previously saved best model.
best_score = -1
for epoch in range(start_epoch, num_epochs):
    loader = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True,
                        num_workers=4,
                        collate_fn=collate_train_dataset)
    avg_loss = 0.0
    model.train()
    for query, action, prev_action in loader:
        # Teacher forcing: the model sees prev_action[:-1] and is trained to
        # predict prev_action[1:].
        prev_action_train = [x[:-1] for x in prev_action]
        action_ground_truth = [x[1:] for x in prev_action]

        # Pack the variable-length sequences so they can be moved to the GPU
        # in one call, then pad them with -1.
        query = rnn.pack_sequence(query, enforce_sorted=False)
        action = rnn.pack_sequence(action, enforce_sorted=False)
        prev_action_train = rnn.pack_sequence(prev_action_train, enforce_sorted=False)
        action_ground_truth = rnn.pack_sequence(action_ground_truth, enforce_sorted=False)
        if device != -1:
            query = query.cuda()
            action = action.cuda()
            prev_action_train = prev_action_train.cuda()
            action_ground_truth = action_ground_truth.cuda()
        query = nrnn.pad_packed_sequence(query, padding_value=-1)
        action = nrnn.pad_packed_sequence(action, padding_value=-1)
        prev_action_train = nrnn.pad_packed_sequence(prev_action_train, padding_value=-1)
        action_ground_truth = nrnn.pad_packed_sequence(action_ground_truth, padding_value=-1)

        rule_prob, token_prob, copy_prob, _, _ = model(query, action, prev_action_train)
        loss = loss_function(rule_prob, token_prob, copy_prob, action_ground_truth)
        model.zero_grad()
        loss.backward()
        optimizer.step()

        # Running mean of the per-batch loss, accumulated as a plain Python float.
        avg_loss += loss.item() / len(loader)
    print(epoch, avg_loss)

    # Save a resumable checkpoint after every epoch; "epoch" is the next epoch
    # to run.
    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "epoch": epoch + 1
    }
    torch.save(checkpoint, checkpoint_path)

    # Evaluate test dataset
    if (epoch + 1) % evaluation_interval == 0:
        model.eval()
        # Inference only: no autograd graph is needed during beam search.
        with torch.no_grad():
            result = evaluate(tqdm(test_dataset), synthesize, top_n=[1, 3], metrics=metrics)
        print(epoch, result.metrics)
        # Model selection uses the top-3 BLEU score.
        score = result.metrics[3]["bleu"]
        if score > best_score:
            best_score = score
            torch.save(model.state_dict(), best_model_path)

# Fallback when no evaluation ran in this session: store the current model as
# the best one. If the loop did not run at all (a finished run was re-executed)
# and a best model already exists, keep it instead of overwriting it with the
# final checkpoint's weights.
trained_in_this_session = start_epoch < num_epochs
if best_score < 0 and (trained_in_this_session or not os.path.exists(best_model_path)):
    torch.save(model.state_dict(), best_model_path)


### Run Validation

In [0]:
from tqdm import tqdm_notebook as tqdm
import os
import pickle
import torch
import torch.nn.utils.rnn as rnn
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils import evaluate

# Validate the model: restore the best weights saved during training and
# evaluate them on the validation split.
best_model_path = os.path.join(output_dir_path, "best_model.pickle")
# Load onto the CPU first (as the checkpoint loader does); `load_state_dict`
# copies the values into the model wherever it currently lives.
model.load_state_dict(torch.load(best_model_path, map_location="cpu"))
model.eval()
# Inference only: no autograd graph is needed during beam search.
with torch.no_grad():
    result = evaluate(tqdm(valid_dataset), synthesize, top_n=[1, 3], metrics=metrics)
print(result.metrics)
# Persist the evaluation results next to the other training outputs.
with open(os.path.join(output_dir_path, "validation_results.pickle"), "wb") as results_file:
    pickle.dump(result.results, results_file)
