### Initialization
* Check whether the notebook is running on the hosted (Google Colab) runtime or on a local runtime.
* Mount Google Drive when using the host runtime.

In [0]:
# Detect the runtime: when the Colab modules can be imported and both the
# authentication and the Drive mount succeed, the notebook is on the hosted
# runtime; any failure falls back to "local".
try:
  from google.colab import auth
  auth.authenticate_user()
  from google.colab import drive
  drive.mount('/gdrive')
  runtime = "host"
except Exception:
  # `Exception` (not a bare `except`) so that KeyboardInterrupt / SystemExit
  # still stop the cell instead of being mistaken for a local runtime.
  runtime = "local"

### Parameters

In [0]:
#@title Parameters
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`seed`|The random seed|
seed = 20367 #@param {type: "number"}

#@markdown ### `nl2code` Repositories
#@markdown |Name            |Description|
#@markdown |:---            |:---|
#@markdown |`repository_url`|The URL of `nl2code` git repository (enabled only in the host runtime)|
#@markdown |`branch_name`   |The branch name (enabled only in the host runtime)|
repository_url = "https://github.com/HiroakiMikami/NL2Code-reimplementation" #@param {type: "string"}
branch_name = "master" #@param {type: "string"}

#@markdown ### Dataset Settings
#@markdown |Name               |Description|
#@markdown |:---               |:---|
#@markdown |`max_action_length`|The maximum action length|
#@markdown |`word_threshold`   |The word threshold given to `DatasetEncoder`|
#@markdown |`token_threshold`  |The token threshold given to `DatasetEncoder`|
max_action_length = 100 #@param {type: "number"}
word_threshold = 5 #@param {type: "number"}
token_threshold = 5 #@param {type: "number"}

#@markdown ### Model Parameters
#@markdown |Name                     |Description|
#@markdown |:---                     |:---|
#@markdown |`embedding_dim`          |The dimension of word, token, and rule embeddings|
#@markdown |`node_type_embedding_dim`|The dimension of node type embeddings|
#@markdown |`lstm_state_size`        |The size of LSTM state|
#@markdown |`hidden_state_size`      |The size of attention hidden state|
embedding_dim = 128 #@param {type: "number"}
node_type_embedding_dim = 64 #@param {type: "number"}
lstm_state_size = 256 #@param {type: "number"}
hidden_state_size = 50 #@param {type: "number"}

#@markdown ### Training Settings
#@markdown |Name                |Description|
#@markdown |:---                |:---|
#@markdown |`batch_size`        |The minibatch size|
#@markdown |`dropout`           |The probability of dropout|
#@markdown |`num_epochs`        |The number of epochs|
#@markdown |`num_train`         |The number of entries used for training (`0` means all entries)|
batch_size = 10 #@param {type: "number"}
dropout = 0.2 #@param {type: "number"}
num_epochs = 50 #@param {type: "number"}
num_train = 0 #@param {type: "number"}


#@markdown ### Evaluation Settings
#@markdown |Name                 |Description|
#@markdown |:---                 |:---|
#@markdown |`beam_size`          |The beam size|
#@markdown |`evaluation_interval`|The number of epochs between two evaluations|
#@markdown |`evaluation_metrics` |The metrics used for evaluation|
beam_size = 15 #@param {type: "number"}
evaluation_interval = 5 #@param {type: "number"}
evaluation_metrics = "bleu4" #@param ["bleu4", "accuracy"]

#@markdown ### Other Settings
#@markdown |Name    |Description|
#@markdown |:---    |:---|
#@markdown |`device`|The id of GPU. `-1` means that CPU is used.|
device = 0 #@param {type: "number"}

#@markdown ### File Paths
#@markdown |Name             |Description|
#@markdown |:---             |:---|
#@markdown |`dataset_path`   |The path of the dataset.|
#@markdown |`output_dir_path`|The path of the directory that will contain the training results.|
dataset_path = "/gdrive/My Drive/NL2Code/django/dataset.pickle" #@param {type: "string"}
output_dir_path = "/gdrive/My Drive/NL2Code/django/result/" #@param {type: "string"}



### Setup
* Download the codebase (when using the host runtime)
  1. Clone git repository and move to the specified branch
  2. Install modules
* Select the GPU to use (skipped when `device` is `-1`)
* Fix the random seed

In [0]:
# Only the hosted runtime downloads and installs the codebase.
if runtime == "host":
    %cd /content
    # Remove any previous checkout so that the clone below starts from scratch.
    !rm -rf NL2Code
    #![ ! -e NL2Code ] && git clone $repository_url NL2Code
    # NOTE(review): the clone uses a fixed Cloud Source repository, so the
    # `repository_url` parameter is currently not used.
    !gcloud source repos clone NL2Code-reimplementation --project development-environment-192405
    !mv NL2Code-reimplementation NL2Code
    %cd NL2Code
    #!git checkout $branch_name
    # NOTE(review): a fixed branch is checked out, so the `branch_name`
    # parameter is currently not used.
    !git checkout origin/feature/pytorch
    # Editable install of the package, then again together with its `examples` extra.
    !pip install -e .
    !pip install -e . ".[examples]"
# Install tqdm from the `colab` archive of chengs/tqdm (runs on both runtimes).
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip

In [0]:
import torch
# A `device` of -1 means that the CPU is used, so there is no CUDA device to select.
use_cuda = device != -1
if use_cuda:
    torch.cuda.set_device(device)

In [0]:
import numpy as np
import random
import torch

# Exclusive upper bound for the seeds drawn below.
SEED_MAX = 2**32 - 1

# One root generator, seeded with the `seed` parameter, hands out a separate
# seed for each library. The draws happen in the order: random, numpy, torch.
root_rng = np.random.RandomState(seed)
python_seed = root_rng.randint(SEED_MAX)
numpy_seed = root_rng.randint(SEED_MAX)
torch_seed = root_rng.randint(SEED_MAX)

random.seed(python_seed)
np.random.seed(numpy_seed)
torch.manual_seed(torch_seed)

### Setup training
* Load the dataset
* Split the dataset into train, test, valid
* Create and save encoder
* Prepare dataset
* Create model
* Create optimizer
* Use GPU (if needed)

In [0]:
import pickle
from nl2code_examples.django import RawDataset
# Read the pickled dataset from `dataset_path` and wrap it in a `RawDataset`.
# NOTE(review): pickle can execute arbitrary code on load -- only open trusted files.
with open(dataset_path, "rb") as dataset_file:
    pickled_dataset = pickle.load(dataset_file)
dataset = RawDataset(pickled_dataset)

In [0]:
from nl2code_examples.django import RawDataset, Entry
# Wrap each split of the loaded dataset in its own `RawDataset`.
train_raw_dataset = RawDataset(dataset["train"])
test_raw_dataset = RawDataset(dataset["test"])
val_raw_dataset = RawDataset(dataset["valid"])
if num_train != 0:
    # The original code sliced an undefined name `train`, which raised a
    # NameError as soon as `num_train` was non-zero; bind it to the train split.
    train = dataset["train"]
    # NOTE(review): the test and validation sets are also replaced by the first
    # `num_train` training entries -- this looks like a deliberate small-scale
    # sanity-check mode; confirm.
    train_raw_dataset = RawDataset(list(train)[:num_train])
    test_raw_dataset = RawDataset(list(train)[:num_train])
    val_raw_dataset = RawDataset(list(train)[:num_train])

In [0]:
from nl2code_examples.django import DatasetEncoder
import pickle
import os

# Create the output directory (and any missing parents) in pure Python, so the
# path is never interpreted by a shell and the cell also works where `mkdir -p`
# is unavailable.
os.makedirs(output_dir_path, exist_ok=True)

# Build the encoder from the training samples and save it next to the other results.
encoder = DatasetEncoder(train_raw_dataset.samples, word_threshold, token_threshold)
with open(os.path.join(output_dir_path, "encoder.pickle"), "wb") as file:
    pickle.dump(encoder, file)


In [0]:
from nl2code_examples.django import RawDataset, TrainDataset, EvalDataset
# The training split gets a `TrainDataset`; the test and validation splits get
# `EvalDataset`s built with identical options (test first, then validation).
train_dataset = TrainDataset(train_raw_dataset, encoder)
test_dataset, valid_dataset = (
    EvalDataset(raw_split, encoder, max_action_length, skip_impossible_entry=False)
    for raw_split in (test_raw_dataset, val_raw_dataset)
)

In [0]:
from nl2code_examples.django import TrainingModel
# Build the model from the encoder and the sizes chosen in the parameter form.
model = TrainingModel(
    encoder,
    embedding_dim, node_type_embedding_dim,
    lstm_state_size, hidden_state_size,
    dropout,
)
# Move the model to the GPU unless the CPU (`device == -1`) was requested.
if device != -1:
    model = model.cuda()

In [0]:
import torch.optim as optim
# Adam over all model parameters, with the optimizer's default hyperparameters.
optimizer = optim.Adam(params=model.parameters())

### Training Loop
* Launch TensorBoard
* Run training

In [12]:
%load_ext tensorboard
tensorboard_path = "tensorboard"
if runtime == "host":
    %tensorboard --logdir $tensorboard_path
else:
    !pkill tensorboard.
    import subprocess
    import os
    subprocess.Popen(["tensorboard", "--logdir", tensorboard_path])


In [0]:
from tqdm import tqdm_notebook as tqdm
import torch
from torch.utils.data import DataLoader
import torch.nn.utils.rnn as rnn
from typing import List
from nl2code_examples.django import TrainDataset, validate
from nl2code.nn import Loss, Accuracy
import nl2code.nn.utils.rnn as nrnn
from nl2code.language.python import is_subtype
from nl2code import BeamSearchSynthesizer
from torch.utils.tensorboard import SummaryWriter

def query_embedding(query: List[str]):
    """Run a query, given as a list of strings, through the model's encoder.

    The query is encoded with `encoder.annotation_encoder`, moved to the GPU
    unless `device` is -1, wrapped into a padded batch that holds this single
    sequence, and passed to `model.encoder`.

    Returns the `.data` of the encoder output, viewed with `len(query)` rows
    (presumably one embedding per query element -- confirm against the encoder).
    """
    encoded = encoder.annotation_encoder.batch_encode(query)
    if device != -1:
        encoded = encoded.cuda()
    padded_batch = nrnn.pad_sequence([encoded])
    return model.encoder(padded_batch).data.view(len(query), -1)

# Beam-search synthesizer used for the periodic evaluation on the test dataset.
synthesizer = BeamSearchSynthesizer(beam_size, model.predictor,
                                    encoder.action_sequence_encoder, is_subtype,
                                    max_steps=max_action_length)
writer = SummaryWriter(tensorboard_path)

loss_function = Loss()
acc_function = Accuracy()
# Stays negative until an evaluation has produced a score.
best_score = -1
for epoch in tqdm(range(num_epochs)):
    loader = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True,
                        num_workers=4,
                        collate_fn=TrainDataset.collate)
    avg_loss = 0.0
    model.train()
    for i, (query, action, prev_action) in enumerate(loader):
        query = rnn.pack_sequence(query, enforce_sorted=False)
        action = rnn.pack_sequence(action, enforce_sorted=False)
        # The model is fed every sequence without its last element and is
        # trained against the same sequence shifted by one position.
        prev_action_train = [x[:-1] for x in prev_action]
        action_ground_truth = [x[1:] for x in prev_action]
        prev_action_train = rnn.pack_sequence(prev_action_train, enforce_sorted=False)
        action_ground_truth = rnn.pack_sequence(action_ground_truth, enforce_sorted=False)
        if device != -1:
            query = query.cuda()
            action = action.cuda()
            prev_action_train = prev_action_train.cuda()
            action_ground_truth = action_ground_truth.cuda()
        # Padded positions are filled with -1.
        query = nrnn.pad_packed_sequence(query, padding_value=-1)
        action = nrnn.pad_packed_sequence(action, padding_value=-1)
        prev_action_train = nrnn.pad_packed_sequence(prev_action_train, padding_value=-1)
        action_ground_truth = nrnn.pad_packed_sequence(action_ground_truth, padding_value=-1)

        rule_prob, token_prob, copy_prob, _, _ = model(query, action, prev_action_train)
        loss = loss_function(rule_prob, token_prob, copy_prob, action_ground_truth)
        model.zero_grad()
        loss.backward()
        optimizer.step()

        # The TensorBoard step counts training entries rather than minibatches.
        step = epoch * len(train_dataset) + i * batch_size
        loss_value = loss.cpu().detach().numpy()
        avg_loss += loss_value / len(loader)
        writer.add_scalars('training', { "loss": loss_value }, step)
    print(epoch, avg_loss)

    # Evaluate test dataset
    if (epoch + 1) % evaluation_interval == 0:
        model.eval()
        accuracy = 0
        bleu4 = 0
        for query, query_with_placeholder, ground_truth in tqdm(test_dataset):
            result = validate(query, query_with_placeholder, ground_truth, query_embedding, synthesizer)
            accuracy += 1 if result.is_match else 0
            bleu4 += result.bleu4
        accuracy /= len(test_dataset)
        bleu4 /= len(test_dataset)
        writer.add_scalars('test', { "accuracy": accuracy, "bleu4": bleu4 }, epoch)
        print(epoch, bleu4, accuracy)
        # Select the best model with the metric chosen in the parameter form.
        # Previously bleu4 was always used and `evaluation_metrics` was ignored;
        # any value other than "accuracy" still falls back to bleu4.
        score = accuracy if evaluation_metrics == "accuracy" else bleu4
        if score > best_score:
            best_score = score
            torch.save(model.state_dict(), os.path.join(output_dir_path, "best_model.pickle"))

# `best_score` is still negative only when no checkpoint was saved inside the
# loop; save the final weights so that "best_model.pickle" always exists.
if best_score < 0:
    torch.save(model.state_dict(), os.path.join(output_dir_path, "best_model.pickle"))


### Run Validation

In [0]:
from tqdm import tqdm_notebook as tqdm
import os
import pickle
import torch
from torch.utils.data import DataLoader
import torch.nn.utils.rnn as rnn
from typing import List
from torch.utils.tensorboard import SummaryWriter
from nl2code.nn import Loss
import nl2code.nn.utils.rnn as nrnn
from nl2code.language.python import is_subtype
from nl2code import BeamSearchSynthesizer
from nl2code_examples.django import validate, unparse

def query_embedding(query: List[str]):
    """Run a query, given as a list of strings, through the model's encoder.

    The query is encoded with `encoder.annotation_encoder`, moved to the GPU
    unless `device` is -1, wrapped into a padded batch that holds this single
    sequence, and passed to `model.encoder`.

    Returns the `.data` of the encoder output, viewed with `len(query)` rows
    (presumably one embedding per query element -- confirm against the encoder).
    """
    encoded = encoder.annotation_encoder.batch_encode(query)
    if device != -1:
        encoded = encoded.cuda()
    padded_batch = nrnn.pad_sequence([encoded])
    return model.encoder(padded_batch).data.view(len(query), -1)

# Beam-search synthesizer used to generate the code for each validation query.
synthesizer = BeamSearchSynthesizer(beam_size, model.predictor,
                                    encoder.action_sequence_encoder, is_subtype,
                                    max_steps=max_action_length)


# Validate the model
# Restore the checkpoint written by the training cell.
model.load_state_dict(torch.load(os.path.join(output_dir_path, "best_model.pickle")))
accuracy = 0
bleu4 = 0
model.eval()
results = []
for query, query_with_placeholder, ground_truth in tqdm(valid_dataset):
    result = validate(query, query_with_placeholder, ground_truth, query_embedding, synthesizer)
    accuracy += 1 if result.is_match else 0
    bleu4 += result.bleu4
    results.append(result)
# Turn the accumulated sums into averages over the validation dataset.
accuracy /= len(valid_dataset)
bleu4 /= len(valid_dataset)
# Keep the per-entry results for later inspection.
with open(os.path.join(output_dir_path, "validation_results.pickle"), "wb") as file:
    pickle.dump(results, file)
print(accuracy, bleu4)
writer.add_scalars('validation', { "accuracy": accuracy, "bleu4": bleu4 })
# The writer buffers events; flush them so that an archive of the log directory
# made right after this cell also contains the validation scalars.
writer.flush()

### Post-Process

* Save tensorboard output

In [0]:
# Archive the TensorBoard log directory into `tensorboard.zip` under the output directory.
!zip -r "$output_dir_path/tensorboard.zip" $tensorboard_path