### Initialization
* Check whether the runtime is host or local.
* Mount Google Drive when using the host runtime.

In [0]:
# Detect the runtime. `google.colab` is only importable on a hosted Colab
# runtime, so a failed import means the notebook is running locally.
try:
    from google.colab import drive
except ImportError:
    runtime = "local"
else:
    # Hosted runtime: mount Google Drive. A mount failure is deliberately not
    # swallowed here, because it would otherwise be misreported as "local".
    drive.mount('/gdrive')
    runtime = "host"

### Parameters

In [0]:
#@title Parameters
#@markdown |Name  |Description|
#@markdown |:---  |:---|
#@markdown |`seed`|The random seed|
seed = 20367 #@param {type: "number"}

#@markdown ### `nlprog` Repositories
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`repository_url`|The URL of `nlprog` git repository (enabled only in the host runtime)|
#@markdown |`branch_name`   |The branch name (enabled only in the host runtime)|
repository_url = "https://github.com/HiroakiMikami/NL2Prog" #@param {type: "string"}
branch_name = "master" #@param {type: "string"}

#@markdown ### Dataset Settings
#@markdown |Name               |Description|
#@markdown |:---               |:---|
#@markdown |`max_action_length`|The maximum action length|
max_action_length = 100 #@param {type: "number"}

#@markdown ### Model Parameters
#@markdown |Name                     |Description|
#@markdown |:---                     |:---|
#@markdown |`embedding_dim`          |The dimension of word, token, and rule embeddings|
#@markdown |`node_type_embedding_dim`|The dimension of node type embeddings|
#@markdown |`lstm_state_size`        |The size of LSTM state|
#@markdown |`hidden_state_size`      |The size of attention hidden state|
embedding_dim = 128 #@param {type: "number"}
node_type_embedding_dim = 64 #@param {type: "number"}
lstm_state_size = 256 #@param {type: "number"}
hidden_state_size = 50 #@param {type: "number"}

#@markdown ### Settings
#@markdown |Name        |Description|
#@markdown |:---        |:---|
#@markdown |`beam_size` |The beam size|
#@markdown |`dropout`   |The probability of dropout|
beam_size = 15 #@param {type: "number"}
dropout = 0.2 #@param {type: "number"}

#@markdown ### Other Settings
#@markdown |Name    |Description|
#@markdown |:---    |:---|
#@markdown |`device`|The id of GPU. `-1` means that CPU is used.|
device = 0 #@param {type: "number"}

#@markdown ### File Paths
#@markdown |Name                 |Description|
#@markdown |:---                 |:---|
#@markdown |`output_dir_path`    |The directory that holds the saved encoder and model checkpoints, and that will receive the validation results.|
output_dir_path = "/gdrive/My Drive/NL2Prog/django/nl2code" #@param {type: "string"}


### Setup
* Download the codebase (when using the host runtime)
  1. Clone git repository and move to the specified branch
  2. Install modules
* Use GPU
* Fix the random seed

In [0]:
# On the hosted runtime, fetch a fresh copy of the codebase and install it.
if runtime == "host":
    %cd /content
    # Remove any earlier checkout so the clone below starts from scratch.
    !rm -rf NL2Prog
    # `$repository_url` and `$branch_name` are expanded by IPython from the
    # Python variables set in the parameters cell.
    !git clone $repository_url NL2Prog
    %cd NL2Prog
    !git checkout $branch_name
    !pip install .
# Install tqdm from the `colab` archive of the chengs/tqdm fork.
# This runs on both the host and the local runtime.
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip

In [0]:
import torch
# A device id of -1 selects the CPU; any other id is made the current CUDA device.
if device != -1:
    torch.cuda.set_device(device)

In [0]:
import numpy as np
import random
import torch

# Exclusive upper bound passed to `randint` when deriving the per-library seeds.
SEED_MAX = 2**32 - 1

# A dedicated generator seeded from the `seed` parameter. Each library below
# gets its own seed drawn from it, in this fixed order (random, numpy, torch),
# so changing the order of these lines changes every derived seed.
root_rng = np.random.RandomState(seed)
random.seed(root_rng.randint(SEED_MAX))
np.random.seed(root_rng.randint(SEED_MAX))
torch.manual_seed(root_rng.randint(SEED_MAX))

### Setup training
* Load the dataset
* Split the dataset into train, test, valid
* Load the saved encoder
* Prepare dataset
* Create model
* Create optimizer
* Prepare evaluation
* Load checkpoint

In [0]:
from nl2prog.dataset.django import download
# Fetch the Django dataset; the following cell reads its "test" and "valid" entries.
dataset = download()

In [0]:
from nl2prog.utils.data import ListDataset, Entry
# Keep only the two splits this notebook evaluates on.
test_raw_dataset = dataset["test"]
val_raw_dataset = dataset["valid"]

In [0]:
import pickle
import os


# Restore the pickled encoders stored under `output_dir_path`.
# NOTE: `pickle.load` can execute arbitrary code, so only load an
# `encoder.pickle` that you produced yourself.
with open(os.path.join(output_dir_path, "encoder.pickle"), "rb") as file:
    encoder = pickle.load(file)

# Pull out the two encoders used by the model and the synthesizer.
qencoder = encoder["query_encoder"]
aencoder = encoder["action_sequence_encoder"]

In [0]:
from nl2prog.utils.data import to_eval_dataset


# Convert both raw splits with `to_eval_dataset` for use by `evaluate` below.
test_dataset = to_eval_dataset(test_raw_dataset)
valid_dataset = to_eval_dataset(val_raw_dataset)

In [0]:
from nl2prog.nn.nl2code import TrainModel
# Instantiate the model from the loaded encoders and the hyperparameters
# chosen in the parameters cell.
model = TrainModel(
    qencoder,
    aencoder,
    embedding_dim,
    node_type_embedding_dim,
    lstm_state_size,
    hidden_state_size,
    dropout,
)

# Any device id other than -1 means a GPU was requested.
if device != -1:
    model = model.cuda()

In [0]:
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils import synthesize as _synthesize
from nl2prog.utils.nl2code import BeamSearchSynthesizer
from nl2prog.dataset.django import tokenize_query, parse
from nl2prog.language.python import is_subtype, unparse
from nl2prog.metrics import Accuracy
from nl2prog.metrics.python import Bleu


# Beam-search synthesizer built from the model's encoder and predictor;
# `max_action_length` is passed as its step limit.
synthesizer = BeamSearchSynthesizer(
    beam_size,
    tokenize_query,
    model.encoder,
    model.predictor,
    qencoder,
    aencoder,
    is_subtype,
    max_steps=max_action_length,
)


def synthesize(query: str):
    """Run the module-level `synthesizer` on `query` and return the result of
    `nl2prog.utils.synthesize`."""
    return _synthesize(query, synthesizer)


# Metrics reported by `evaluate`, keyed by the names looked up later.
accuracy = Accuracy(parse, unparse)
bleu = Bleu(parse, unparse)
metrics = dict(accuracy=accuracy, bleu=bleu)

### Run Validation

In [0]:
from tqdm import tqdm_notebook as tqdm
import os
import torch
import torch.nn.utils.rnn as rnn
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils import evaluate

# Test the model: evaluate every checkpoint on the test split and remember the
# one with the highest top-1 BLEU score.
best_score = -1
best_score_path = None
model_dir_path = os.path.join(output_dir_path, "models")
# `os.listdir` returns entries in arbitrary order. Sorting makes the log and
# the tie-breaking (first checkpoint reaching the best score wins) reproducible.
for m in sorted(os.listdir(model_dir_path)):
    path = os.path.join(model_dir_path, m)
    # Deserialize onto the CPU so that checkpoints written on a GPU also load
    # when `device == -1`; `load_state_dict` then copies the weights onto
    # whatever device the model lives on.
    model.load_state_dict(torch.load(path, map_location="cpu")["model"])
    model.eval()
    result = evaluate(tqdm(test_dataset), synthesize, top_n=[1], metrics=metrics)
    print(m, result.metrics)
    # Metrics are indexed by top-n first; 1 matches `top_n=[1]` above.
    score = result.metrics[1]["bleu"]
    if score > best_score:
        best_score = score
        best_score_path = path
# Fail here with a clear message rather than later when the "best" checkpoint
# is loaded from a `None` path.
if best_score_path is None:
    raise RuntimeError(
        "No checkpoint with a usable BLEU score was found in {}".format(model_dir_path))
print("Best Model: {}".format(best_score_path))

In [0]:
from tqdm import tqdm_notebook as tqdm
import os
import pickle
import torch
import torch.nn.utils.rnn as rnn
import nl2prog.nn.utils.rnn as nrnn
from nl2prog.utils import evaluate

# Validate the model: reload the checkpoint selected above and evaluate it on
# the validation split.
# Deserialize onto the CPU so that a GPU-saved checkpoint also loads when
# `device == -1`; `load_state_dict` copies the weights to the model's device.
model.load_state_dict(torch.load(best_score_path, map_location="cpu")["model"])
model.eval()
result = evaluate(tqdm(valid_dataset), synthesize, top_n=[1], metrics=metrics)
print(result.metrics)
# Persist the per-example results next to the encoder and checkpoints.
with open(os.path.join(output_dir_path, "validation_results.pickle"), "wb") as file:
    pickle.dump(result.results, file)