<a href="https://colab.research.google.com/github/Masum06/Text2App/blob/master/notebooks/RoBERTa_to_SAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code borrowed from: https://github.com/microsoft/CodeXGLUE

In [1]:
!nvidia-smi

Wed Jun 30 18:46:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers==4.5.0

Collecting transformers==4.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 11.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 55.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.9MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.5

In [3]:
!git clone https://github.com/Masum06/Text2App.git

Cloning into 'Text2App'...
remote: Enumerating objects: 525, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 525 (delta 79), reused 48 (delta 24), pack-reused 397[K
Receiving objects: 100% (525/525), 233.22 MiB | 35.97 MiB/s, done.
Resolving deltas: 100% (161/161), done.


# Text2App

Data

In [5]:
pwd

'/content'

In [6]:
cd Text2App/training_RoBERTa/

/content/Text2App/training_RoBERTa


In [4]:
# For saving checkpoint to Google Drive while working on Colab
# The `output_dir` configured in the Training section points under /gdrive,
# so checkpoints written by main() only reach Drive if this mount succeeded.
# NOTE(review): this cell's execution count (4) is lower than the cells shown
# above it (5, 6), i.e. the notebook was run out of order — confirm it still
# works under Restart & Run All.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [7]:
import pandas as pd
# Load the train / validation / test splits of the synthesized NL -> SAR data.
# Paths are relative to Text2App/training_RoBERTa/ (see the `cd` cell above).
# NOTE: `train` is read as a module-level global by MyTokenizer.__init__
# (it uses train['SAR']), so this cell must run before a tokenizer is built.
train = pd.read_csv('../synthesized_data/nl_sar_train.csv')
dev = pd.read_csv('../synthesized_data/nl_sar_valid.csv')
test = pd.read_csv('../synthesized_data/nl_sar_test.csv')

In [8]:
class MyTokenizer:
  """Whitespace tokenizer over the SAR (target side) vocabulary.

  Ids are assigned in a fixed order: the four special tokens
  ``<s>``=0, ``<pad>``=1, ``</s>``=2, ``<unk>``=3, then every distinct
  whitespace-separated token of the corpus in sorted order, then the
  literal token ``None``.

  Parameters:
  * `corpus` - optional iterable of strings to build the vocabulary from.
    When omitted, the module-level ``train['SAR']`` column is used, which
    is what the original zero-argument constructor did.

  All state (``vocab``, ``vocab_size``, ``id_to_token``, ``token_to_id``)
  is stored per instance, so two tokenizers never share a mapping.
  """

  def __init__(self, corpus=None):
    if corpus is None:
      # Backward-compatible default: vocabulary of the training targets.
      corpus = train['SAR']
    # Fresh containers for every instance; class-level dicts would be
    # shared (and mutated) by all tokenizers.
    self.id_to_token = {}
    self.token_to_id = {}
    self.vocab_size = 0
    self.vocab = sorted(set(" ".join(list(corpus)).split()))
    # Special tokens: <s><pad></s><unk> -- must keep ids 0..3 in this order.
    for special in ('<s>', '<pad>', '</s>', '<unk>'):
      self.add_token(special)
    for v in self.vocab:
      self.add_token(v)
    # Placeholder target used by the feature builder at test time.
    self.add_token('None')

  def tokenize(self, s):
    """Split `s` on whitespace and return the list of tokens."""
    return s.split()

  def add_token(self, s):
    """Register token `s` under the next free id; no-op if already known."""
    if s not in self.token_to_id:
      self.id_to_token[self.vocab_size] = s
      self.token_to_id[s] = self.vocab_size
      self.vocab_size+=1

  def convert_string_to_ids(self, s):
    """Map a whitespace-separated string to a list of token ids.

    Tokens that are not in the vocabulary are mapped to the id of
    ``<unk>`` instead of raising ``KeyError``.
    """
    unk_id = self.token_to_id['<unk>']
    return [self.token_to_id.get(token, unk_id) for token in s.split()]

  def decode(self, ids):
    """Turn an iterable of ids back into a single space-joined string.

    Raises ``KeyError`` for an id that was never assigned.
    """
    return " ".join(self.id_to_token[id] for id in ids)

Model

In [9]:
# Copyright (c) Microsoft Corporation. 
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch
from torch.autograd import Variable
import copy

class Seq2Seq(nn.Module):
    """
        Build Sequence-to-Sequence.
        
        Parameters:
        * `encoder`- encoder of seq2seq model. e.g. roberta
        * `decoder`- decoder of seq2seq model. e.g. transformer
        * `config`- configuration of encoder model. 
        * `beam_size`- beam size for beam search. 
        * `max_length`- max length of target for beam search. 
        * `sos_id`- start of symbol ids in target for beam search.
        * `eos_id`- end of symbol ids in target for beam search. 
        * `vocab_size`- size of the output vocabulary of `lm_head`. When
          omitted, `decoder_tokenizer.vocab_size` (a module-level global)
          is used, which is what the original constructor always did.
    """
    def __init__(self, encoder,decoder,config,beam_size=None,max_length=None,sos_id=None,eos_id=None,vocab_size=None):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder=decoder
        self.config=config
        # Lower-triangular matrix of ones; sliced in forward() to build the
        # causal attention mask (so targets longer than 2048 are unsupported).
        self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if vocab_size is None:
            # Backward-compatible default: read the global target tokenizer.
            vocab_size = decoder_tokenizer.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, vocab_size, bias=False) #config.vocab_size
        self.lsm = nn.LogSoftmax(dim=-1)
        # tie_weights() is deliberately not called here: lm_head is sized by
        # the target vocabulary, not by the encoder's embedding matrix.
        
        self.beam_size=beam_size
        self.max_length=max_length
        self.sos_id=sos_id
        self.eos_id=eos_id
        
    def _tie_or_clone_weights(self, first_module, second_module):
        """ Tie or clone module weights depending on whether we are using TorchScript or not
        """
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight
                  
    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.lm_head,
                                   self.encoder.embeddings.word_embeddings)        
        
    def forward(self, source_ids=None,source_mask=None,target_ids=None,target_mask=None,args=None):   
        """ Run the model in training mode (targets given) or decoding mode.

            With `target_ids`: returns the tuple
            `(loss, loss * n_active, n_active)` where `n_active` counts the
            target positions (after the first) whose `target_mask` is non-zero.

            Without `target_ids`: runs beam search per source row and returns
            a LongTensor of shape (batch, beam_size, max_length) padded with 0.

            `args` is accepted but not used.
        """
        outputs = self.encoder(source_ids, attention_mask=source_mask)
        # The decoder is fed (seq, batch, hidden), hence the permutes below.
        encoder_output = outputs[0].permute([1,0,2]).contiguous()
        if target_ids is not None:  
            # 0 where attention is allowed, -1e4 on future positions.
            attn_mask=-1e4 *(1-self.bias[:target_ids.shape[1],:target_ids.shape[1]])
            # NOTE(review): target ids are embedded with the *encoder's*
            # embedding table (flagged "problematic" by the notebook author).
            tgt_embeddings = self.encoder.embeddings(target_ids).permute([1,0,2]).contiguous() ## MH: Problmatic
            out = self.decoder(tgt_embeddings,encoder_output,tgt_mask=attn_mask,memory_key_padding_mask=(1-source_mask).bool())
            hidden_states = torch.tanh(self.dense(out)).permute([1,0,2]).contiguous()
            lm_logits = self.lm_head(hidden_states)
            # Shift so that tokens < n predict n
            active_loss = target_mask[..., 1:].ne(0).view(-1) == 1
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = target_ids[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
                            shift_labels.view(-1)[active_loss])

            outputs = loss, loss*active_loss.sum(), active_loss.sum()
            return outputs
        else:
            #Predict 
            preds=[]       
            # Padding id 0, created on the input's device instead of being
            # hard-wired to CUDA.
            zero=torch.zeros(1, dtype=torch.long, device=source_ids.device)
            for i in range(source_ids.shape[0]):
                context=encoder_output[:,i:i+1]
                context_mask=source_mask[i:i+1,:]
                beam = Beam(self.beam_size,self.sos_id,self.eos_id)
                input_ids=beam.getCurrentState()
                # Replicate this example's encoder states once per beam entry.
                context=context.repeat(1, self.beam_size,1)
                context_mask=context_mask.repeat(self.beam_size,1)
                for _ in range(self.max_length): 
                    if beam.done():
                        break
                    attn_mask=-1e4 *(1-self.bias[:input_ids.shape[1],:input_ids.shape[1]])
                    tgt_embeddings = self.encoder.embeddings(input_ids).permute([1,0,2]).contiguous()
                    out = self.decoder(tgt_embeddings,context,tgt_mask=attn_mask,memory_key_padding_mask=(1-context_mask).bool())
                    out = torch.tanh(self.dense(out))
                    # Only the last position is needed to pick the next token.
                    hidden_states=out.permute([1,0,2]).contiguous()[:,-1,:]
                    out = self.lsm(self.lm_head(hidden_states)).data
                    beam.advance(out)
                    # Reorder the prefixes to follow the surviving beam entries,
                    # then append the newly chosen tokens.
                    input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
                    input_ids=torch.cat((input_ids,beam.getCurrentState()),-1)
                hyp= beam.getHyp(beam.getFinal())
                pred=beam.buildTargetTokens(hyp)[:self.beam_size]
                # Right-pad every hypothesis with 0 up to max_length.
                pred=[torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1,-1) for p in pred]
                preds.append(torch.cat(pred,0).unsqueeze(0))
                
            preds=torch.cat(preds,0)
            return preds   
        
        

class Beam(object):
    """Beam-search state for a single source example.

    Parameters:
    * `size` - beam width (number of hypotheses kept per step).
    * `sos`  - id placed in the first beam slot at step 0.
    * `eos`  - id that terminates a hypothesis.
    * `device` - optional device for the beam tensors. Defaults to CUDA
      when it is available (the original always used CUDA) and to CPU
      otherwise, so the beam can also be built on a CPU-only host.
    """
    def __init__(self, size,sos,eos,device=None):
        self.size = size
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        # Kept for backward compatibility with code that reads `beam.tt`.
        self.tt = torch.cuda if self.device.type == "cuda" else torch
        # The score for each translation on the beam.
        self.scores = torch.zeros(size, dtype=torch.float, device=self.device)
        # The backpointers at each time-step.
        self.prevKs = []
        # The outputs at each time-step.
        self.nextYs = [torch.zeros(size, dtype=torch.long, device=self.device)]
        self.nextYs[0][0] = sos
        # Has EOS topped the beam yet.
        self._eos = eos
        self.eosTop = False
        # Time and k pair for finished.
        self.finished = []

    def getCurrentState(self):
        "Get the outputs for the current timestep as a (size, 1) column."
        # A view (not a copy) of the latest step, as in the original.
        batch = self.nextYs[-1].view(-1, 1)
        return batch

    def getCurrentOrigin(self):
        "Get the backpointers for the current timestep."
        return self.prevKs[-1]

    def advance(self, wordLk):
        """
        Given log-probs over words for every beam entry, compute and
        update the beam search by one step.
        Parameters:
        * `wordLk`- log-probs of advancing from the last step (K x words)
        Updates `scores`, `prevKs`, `nextYs`, `finished` and `eosTop`;
        returns nothing.
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)

            # Don't let EOS have children.
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] == self._eos:
                    beamLk[i] = -1e20
        else:
            # First step: every slot holds the same start state, so only
            # the first row is expanded.
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        prevK = bestScoresId // numWords
        self.prevKs.append(prevK)
        self.nextYs.append((bestScoresId - prevK * numWords))


        for i in range(self.nextYs[-1].size(0)):
            if self.nextYs[-1][i] == self._eos:
                s = self.scores[i]
                self.finished.append((s, len(self.nextYs) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if self.nextYs[-1][0] == self._eos:
            self.eosTop = True

    def done(self):
        "True once EOS tops the beam and at least `size` hypotheses finished."
        return self.eosTop and len(self.finished) >=self.size

    def getFinal(self):
        """Return up to `size` (score, timestep, k) triples, best first.

        Finished hypotheses come first; if there are fewer than `size`,
        the best still-open entries of the last step fill the rest.
        """
        if len(self.finished) == 0:
            self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
        self.finished.sort(key=lambda a: -a[0])
        if len(self.finished) != self.size:
            unfinished=[]
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] != self._eos:
                    s = self.scores[i]
                    unfinished.append((s, len(self.nextYs) - 1, i)) 
            unfinished.sort(key=lambda a: -a[0])
            self.finished+=unfinished[:self.size-len(self.finished)]
        return self.finished[:self.size]

    def getHyp(self, beam_res):
        """
        Walk back to construct the full hypothesis.
        `beam_res` is a list of (score, timestep, k) triples as returned by
        getFinal(); the result is one token list per triple, oldest first.
        """
        hyps=[]
        for _,timestep, k in beam_res:
            hyp = []
            for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
                hyp.append(self.nextYs[j+1][k])
                k = self.prevKs[j][k]
            hyps.append(hyp[::-1])
        return hyps
    
    def buildTargetTokens(self, preds):
        "Cut every hypothesis at (and excluding) its first EOS token."
        sentence=[]
        for pred in preds:
            tokens = []
            for tok in pred:
                if tok==self._eos:
                    break
                tokens.append(tok)
            sentence.append(tokens)
        return sentence
        


In [10]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

from __future__ import absolute_import
import os
import sys
import bleu
import pickle
import torch
import json
import random
import logging
# import argparse
import numpy as np
import pandas as pd
from io import open
from itertools import cycle
import torch.nn as nn
# from model import Seq2Seq
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

class Example(object):
    """One NL/SAR pair: an index plus its source and target text.

    The three constructor arguments are stored unchanged as the
    attributes ``idx``, ``source`` and ``target``.
    """
    def __init__(self, idx, source, target):
        self.idx, self.source, self.target = idx, source, target

def read_examples(filename):
    """Load a CSV file and wrap every row in an `Example`.

    The file is read with `pd.read_csv`. For each row, in file order, the
    `NL` column becomes the example's source, the `SAR` column its
    target, and the zero-based row position its `idx`.
    """
    frame = pd.read_csv(filename)
    rows = frame.to_dict(orient="records")
    return [
        Example(idx=position, source=row['NL'], target=row['SAR'])
        for position, row in enumerate(rows)
    ]


class InputFeatures(object):
    """Model-ready features of one example.

    Stores the five constructor arguments unchanged as attributes:
    ``example_id``, ``source_ids``, ``target_ids``, ``source_mask`` and
    ``target_mask``.
    """
    def __init__(self, example_id, source_ids, target_ids, source_mask, target_mask):
        self.example_id = example_id
        self.source_ids, self.source_mask = source_ids, source_mask
        self.target_ids, self.target_mask = target_ids, target_mask
        


def convert_examples_to_features(examples, tokenizer, args, stage=None): # MH: make it encoder_tokenizer, decoder_tokenizer
    """Turn `Example` objects into padded id/mask lists wrapped in `InputFeatures`.

    Source side: tokenized with `tokenizer`, cut to
    `args.max_source_length - 2` tokens, wrapped in cls/sep tokens and
    padded with `tokenizer.pad_token_id` up to `args.max_source_length`.

    Target side: whitespace-split (or the single token 'None' when
    `stage == "test"`), cut to `args.max_target_length - 2` tokens, wrapped
    in the same cls/sep tokens, and converted to ids by the module-level
    `decoder_tokenizer`. Padding again uses `tokenizer.pad_token_id`.
    NOTE(review): that assumes both tokenizers share the same pad id — confirm.

    The first five examples are printed when `stage == 'train'`.
    """
    features = []
    for position, example in enumerate(examples):
        # ---- source: encoder tokenizer, cls ... sep, right-padded
        src_body = tokenizer.tokenize(example.source)[:args.max_source_length-2]
        source_tokens = [tokenizer.cls_token] + src_body + [tokenizer.sep_token]
        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
        source_mask = [1] * len(source_tokens)
        src_pad = args.max_source_length - len(source_ids)
        source_ids += [tokenizer.pad_token_id] * src_pad
        source_mask += [0] * src_pad

        # ---- target: whitespace tokens mapped by the decoder tokenizer
        if stage == "test":
            tgt_body = ['None']
        else:
            tgt_body = example.target.split()[:args.max_target_length-2]
        target_tokens = [tokenizer.cls_token] + tgt_body + [tokenizer.sep_token]
        target_ids = decoder_tokenizer.convert_string_to_ids(' '.join(target_tokens))
        target_mask = [1] * len(target_ids)
        tgt_pad = args.max_target_length - len(target_ids)
        target_ids += [tokenizer.pad_token_id] * tgt_pad
        target_mask += [0] * tgt_pad

        # Show a handful of training examples for a sanity check.
        if position < 5 and stage == 'train':
            print("*** Example ***")
            print("idx: {}".format(example.idx))

            print("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens]))
            print("source_ids: {}".format(' '.join(map(str, source_ids))))
            print("source_mask: {}".format(' '.join(map(str, source_mask))))

            print("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens]))
            print("target_ids: {}".format(' '.join(map(str, target_ids))))
            print("target_mask: {}".format(' '.join(map(str, target_mask))))

        features.append(
            InputFeatures(position, source_ids, target_ids, source_mask, target_mask)
        )
    return features



def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Parameters:
    * `seed` - integer seed applied to every generator.

    Also exports `PYTHONHASHSEED` and asks cuDNN for deterministic
    kernels. The environment variable only influences Python processes
    started after this call, not the running interpreter.
    """
    random.seed(seed)
    # The original wrote the misspelled key 'PYHTONHASHSEED'.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    

def _save_checkpoint(subdir):
    """Write the global model's weights to <args.output_dir>/<subdir>/pytorch_model.bin."""
    checkpoint_dir = os.path.join(args.output_dir, subdir)
    os.makedirs(checkpoint_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    torch.save(model_to_save.state_dict(), os.path.join(checkpoint_dir, "pytorch_model.bin"))


def _features_to_dataset(features, with_targets=True):
    """Stack feature lists into a TensorDataset (source ids/mask, optionally target ids/mask)."""
    tensors = [torch.tensor([f.source_ids for f in features], dtype=torch.long),
               torch.tensor([f.source_mask for f in features], dtype=torch.long)]
    if with_targets:
        tensors.append(torch.tensor([f.target_ids for f in features], dtype=torch.long))
        tensors.append(torch.tensor([f.target_mask for f in features], dtype=torch.long))
    return TensorDataset(*tensors)


def _generate_predictions(eval_data, show_progress=False):
    """Beam-decode every example of `eval_data` and return the top hypothesis of each as text."""
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    batches = tqdm(eval_dataloader, total=len(eval_dataloader)) if show_progress else eval_dataloader

    model.eval()
    decoded = []
    for batch in batches:
        source_ids, source_mask = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            preds = model(source_ids=source_ids, source_mask=source_mask)
            for pred in preds:
                # Keep the best beam only and cut at the first padding id (0).
                ids = list(pred[0].cpu().numpy())
                if 0 in ids:
                    ids = ids[:ids.index(0)]
                decoded.append(decoder_tokenizer.decode(ids))  # MH: decoder
    model.train()
    return decoded


def _write_outputs_and_score(decoded, examples, output_path, gold_path):
    """Write prediction and gold files, then return BLEU-4 rounded to two decimals."""
    predictions = []
    with open(output_path, 'w') as f, open(gold_path, 'w') as f1:
        for hypothesis, gold in zip(decoded, examples):
            predictions.append(str(gold.idx)+'\t'+hypothesis)
            f.write(str(gold.idx)+'\t'+hypothesis+'\n')
            f1.write(str(gold.idx)+'\t'+gold.target+'\n')
    (goldMap, predictionMap) = bleu.computeMaps(predictions, gold_path)
    return round(bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)


def main():
    """Train, evaluate and/or test the global `model` as selected by `args`.

    Reads the module-level globals `args`, `model`, `tokenizer`,
    `decoder_tokenizer` and `device`.

    * `args.do_train`: trains for `args.num_train_epochs` epochs with AdamW
      and a linear schedule whose warmup is 10% of the total steps.
    * `args.do_eval` (checked after every training epoch): computes dev
      perplexity, saves `checkpoint-last`, saves `checkpoint-best-ppl` on
      improvement, then beam-decodes up to 1000 sampled dev examples and
      saves `checkpoint-best-bleu` when BLEU-4 improves.
    * `args.do_test`: beam-decodes `args.test_filename`, writes the
      `test_0.output` / `test_0.gold` files (prefixed by `args.output_name`)
      under `args.output_dir` and prints BLEU-4.
    """
    print(args)

    # Set seed
    set_seed(args.seed)
    # make dir if output_dir not exist
    os.makedirs(args.output_dir, exist_ok=True)

    if args.do_train:
        print("Inside TRAIN")
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train') # MH: 2 tokenizers
        train_data = _features_to_dataset(train_features)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size//args.gradient_accumulation_steps)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(t_total*0.1),
                                                    num_training_steps=t_total)

        #Start training
        print("***** Running training *****")
        # print() does not interpolate logger-style arguments, so format explicitly.
        print("  Num examples = %d" % len(train_examples))
        print("  Batch size = %d" % args.train_batch_size)
        print("  Num epoch = %d" % args.num_train_epochs)

        model.train()
        dev_dataset = {}  # caches the dev examples/tensors across epochs
        nb_tr_steps, tr_loss, global_step, best_bleu, best_loss = 0, 0, 0, 0, 1e6
        train_loss = 0.0  # defined even if the training loader yields no batch
        for epoch in range(args.num_train_epochs):
            bar = tqdm(train_dataloader, total=len(train_dataloader))
            for batch in bar:
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask, target_ids, target_mask = batch
                loss, _, _ = model(source_ids=source_ids, source_mask=source_mask,
                                   target_ids=target_ids, target_mask=target_mask)

                if args.n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                train_loss = round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1), 4)
                bar.set_description("epoch {} loss {}".format(epoch, train_loss))
                nb_tr_steps += 1
                loss.backward()

                # nb_tr_steps was already incremented above, so test it directly
                # (the original tested nb_tr_steps + 1, one micro-batch too early).
                if nb_tr_steps % args.gradient_accumulation_steps == 0:
                    #Update parameters
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    global_step += 1

            if args.do_eval:
                print("Inside EVAL")
                #Eval model with dev dataset
                tr_loss = 0
                nb_tr_steps = 0
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    eval_data = _features_to_dataset(eval_features)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                print("\n***** Running evaluation *****")
                print("  Num examples = %d" % len(eval_examples))
                print("  Batch size = %d" % args.eval_batch_size)

                #Start Evaling model
                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, target_ids, target_mask = batch

                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids, source_mask=source_mask,
                                             target_ids=target_ids, target_mask=target_mask)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()
                #Print loss of dev dataset
                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss),5),
                          'global_step': global_step+1,
                          'train_loss': round(train_loss,5)}
                for key in sorted(result.keys()):
                    print("  %s = %s" % (key, str(result[key])))
                print("  "+"*"*20)

                #save last checkpoint
                _save_checkpoint('checkpoint-last')
                if eval_loss < best_loss:
                    print("  Best ppl:%s" % round(np.exp(eval_loss),5))
                    print("  "+"*"*20)
                    best_loss = eval_loss
                    # Save best checkpoint for best ppl
                    _save_checkpoint('checkpoint-best-ppl')

                #Calculate bleu
                print("Calculating BLEU")
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_examples = random.sample(eval_examples, min(1000, len(eval_examples)))
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    eval_data = _features_to_dataset(eval_features, with_targets=False)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                decoded = _generate_predictions(eval_data)
                dev_bleu = _write_outputs_and_score(decoded, eval_examples,
                                                    os.path.join(args.output_dir, "dev.output"),
                                                    os.path.join(args.output_dir, "dev.gold"))
                print("  %s = %s "%("bleu-4",str(dev_bleu)))
                print("  "+"*"*20)
                if dev_bleu > best_bleu:
                    print("  Best bleu:%s" % dev_bleu)
                    print("  "+"*"*20)
                    best_bleu = dev_bleu
                    # Save best checkpoint for best bleu
                    _save_checkpoint('checkpoint-best-bleu')

    if args.do_test:
        # Only the test file is scored (index 0).
        idx = 0
        file = args.test_filename # Change it to test file later
        print("Test file: {}".format(file))
        eval_examples = read_examples(file)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
        eval_data = _features_to_dataset(eval_features, with_targets=False)

        # Calculate bleu
        decoded = _generate_predictions(eval_data, show_progress=True)
        # Use one prefix for the files written AND the gold file read back; the
        # original wrote "test_0.gold" but scored against output_name+"test_0.gold".
        output_path = os.path.join(args.output_dir, args.output_name+"test_{}.output".format(idx))
        gold_path = os.path.join(args.output_dir, args.output_name+"test_{}.gold".format(idx))
        dev_bleu = _write_outputs_and_score(decoded, eval_examples, output_path, gold_path)
        print("  %s = %s "%("bleu-4",str(dev_bleu)))
        print("  "+"*"*20)


**Training**

In [11]:
class Arguments:
    """Plain attribute container.

    The notebook creates one instance and attaches every run setting to it
    as an instance attribute; the class itself defines nothing.
    """

In [12]:
# Directory passed on as args.output_dir (Colab with Google Drive mounted).
# For a local run use instead: output_dir = "model/"
output_dir = "/gdrive/My Drive/text2app_models/RoBERTa/"

# Synthesized NL -> SAR dataset splits, all living under data_dir.
data_dir = "../synthesized_data/"
train_file, dev_file, test_file = (
    data_dir + split_csv
    for split_csv in ("nl_sar_train.csv", "nl_sar_valid.csv", "nl_sar_test.csv")
)

# Pretrained encoder checkpoint name; 'microsoft/codebert-base' is the
# alternative the author noted here.
pretrained_model = "roberta-base"

In [13]:
args = Arguments()

# All settings are attached in one call; keyword order matches the order the
# attributes were originally assigned in, so vars(args) is unchanged.
vars(args).update(
    ## Required parameters
    model_type='roberta',
    model_name_or_path=pretrained_model,
    output_dir=output_dir,
    # Set to a checkpoint path such as
    # output_dir+"/checkpoint-best-bleu/pytorch_model.bin" to reload weights.
    load_model_path=None,

    ## Other parameters
    train_filename=train_file,
    dev_filename=dev_file,
    test_filename=test_file,
    output_name="",
    config_name="",       # empty -> fall back to model_name_or_path
    tokenizer_name="",    # empty -> fall back to model_name_or_path
    gradient_accumulation_steps=1,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    max_steps=-1,
    eval_steps=-1,
    train_steps=-1,
    warmup_steps=0,
    local_rank=-1,        # -1 selects the non-distributed code path
    seed=42,

    ## Run-mode switches
    no_cuda=False,
    do_lower_case=True,
    do_train=True,
    do_eval=True,
    do_test=False,

    ## Training / decoding sizes
    num_train_epochs=10,
    train_batch_size=100,
    eval_batch_size=100,
    learning_rate=5e-5,
    max_source_length=80,
    max_target_length=50,
    beam_size=5,
)

In [14]:
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    # Single-process run: use CUDA only when it is available and not disabled.
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Report zero GPUs when CUDA is disabled, so the later `args.n_gpu > 1`
    # check cannot wrap a CPU-only model in DataParallel.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend='nccl')
    # Each distributed process drives exactly one GPU.
    args.n_gpu = 1
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
# Keep the chosen device on args so code that only receives args can use it.
args.device = device



In [15]:
# Look up the (config, model, tokenizer) classes registered for this model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# An explicit config/tokenizer name wins; an empty value falls back to the
# model checkpoint name.
config = config_class.from_pretrained(args.config_name or args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.model_name_or_path,
    do_lower_case=args.do_lower_case,
)

# Separate tokenizer used to encode/decode the target (SAR) side.
decoder_tokenizer = MyTokenizer()

# Build model: pretrained encoder plus a 6-layer Transformer decoder whose
# width and head count are taken from the encoder config.
encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
decoder_layer = nn.TransformerDecoderLayer(
    d_model=config.hidden_size,
    nhead=config.num_attention_heads,
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

06/30/2021 18:49:35 - INFO - filelock -   Lock 140237806276560 acquired on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…

06/30/2021 18:49:35 - INFO - filelock -   Lock 140237806276560 released on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock





06/30/2021 18:49:35 - INFO - filelock -   Lock 140235175927760 acquired on /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…

06/30/2021 18:49:36 - INFO - filelock -   Lock 140235175927760 released on /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock





06/30/2021 18:49:36 - INFO - filelock -   Lock 140237939571600 acquired on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

06/30/2021 18:49:37 - INFO - filelock -   Lock 140237939571600 released on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock





06/30/2021 18:49:38 - INFO - filelock -   Lock 140235176148752 acquired on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…

06/30/2021 18:49:38 - INFO - filelock -   Lock 140235176148752 released on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock





06/30/2021 18:49:39 - INFO - filelock -   Lock 140235014615952 acquired on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…

06/30/2021 18:49:49 - INFO - filelock -   Lock 140235014615952 released on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock





In [16]:
# Assemble the encoder-decoder model; the encoder tokenizer's CLS/SEP ids are
# passed as the start/end-of-sequence ids.
model=Seq2Seq(encoder=encoder,decoder=decoder,config=config,
              beam_size=args.beam_size,max_length=args.max_target_length,
              sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
if args.load_model_path is not None:
    print("reload model from {}".format(args.load_model_path))
    # map_location=device lets a checkpoint saved on GPU be restored on the
    # active device (e.g. a CPU-only runtime) instead of failing to deserialize.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    model.load_state_dict(torch.load(args.load_model_path, map_location=device))

model.to(device)
if args.local_rank != -1:
    # Distributed training
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
    model = DDP(model)
elif args.n_gpu > 1:
    # multi-gpu training
    model = torch.nn.DataParallel(model)

In [None]:
# Run the entry point defined in an earlier cell.
# NOTE(review): main() takes no arguments, so it presumably reads the
# module-level `args`, `model` and tokenizers configured above -- confirm.
if __name__ == "__main__":
    main()

<__main__.Arguments object at 0x7f8b4f4cce90>
Inside TRAIN
*** Example ***
idx: 0
source_tokens: ['<s>', 'make', '_software', '_consisting', '_of', '_a', '_ball', '_,', '_a', '_switch', '_with', '_text', '_string', '0', '_,', '_and', '_a', '_pas', 'word', '_text', '_box', '_.', '_when', '_the', '_ball', '_is', '_reaches', '_edge', ',', '_set', '_ball', '_in', '_motion', '_.', '</s>']
source_ids: 0 19746 2257 17402 9 10 1011 2156 10 5405 19 2788 6755 288 2156 8 10 6977 14742 2788 2233 479 77 5 1011 16 11541 3543 6 278 1011 11 4298 479 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
source_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
target_tokens: ['<s>', '<complist>', '<ball>', '<switch>', 'string0', '</switch>', '<passwordtextbox>', '</complist>', '<code>', '<ball1flung>', '<ball1>', '<motion>', '</ball1>', '</ball1flung>'

  0%|          | 0/400 [00:00<?, ?it/s]

***** Running training *****
  Num examples = %d 40000
  Batch size = %d 100
  Num epoch = %d 10


epoch 0 loss 1.9014: 100%|██████████| 400/400 [12:08<00:00,  1.82s/it]


Inside EVAL

***** Running evaluation *****
  Num examples = %d 5000
  Batch size = %d 100
  %s = %s eval_ppl 1.28222
  %s = %s global_step 401
  %s = %s train_loss 1.9014
  ********************
  Best ppl:%s 1.28222
  ********************
Calculating BLEU


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Total: 1000


  bleu-4 = 92.43 
  ********************
  Best bleu:%s 92.43
  ********************


epoch 1 loss 0.1162: 100%|██████████| 400/400 [12:21<00:00,  1.85s/it]


Inside EVAL

***** Running evaluation *****
  Num examples = %d 5000
  Batch size = %d 100
  %s = %s eval_ppl 1.0467
  %s = %s global_step 801
  %s = %s train_loss 0.1162
  ********************
  Best ppl:%s 1.0467
  ********************
Calculating BLEU


In [None]:
# If working on colab, save checkpoint to Google Drive.
# NOTE(review): the source path is the local "model/" directory; when output_dir
# already points at the Drive folder the checkpoint may not be under "model/" -- confirm.
!cp model/checkpoint-best-bleu/pytorch_model.bin '/gdrive/My Drive/text2app_models/RoBERTa/' # CodeBERT RoBERTa PointerNet

# Testing

In [None]:
# Switch the run from training to testing on the unseen-pair split,
# decoding with a beam of size 1.
args.do_train = False
args.do_test = True
args.beam_size = 1
args.output_name = "unseen_pair_"
args.test_filename = data_dir + 'nl_sar_test_unseen_pair.csv'

**Test Model**

In [None]:
# Re-create the Seq2Seq wrapper so it is built with the current args.beam_size.
model = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    config=config,
    beam_size=args.beam_size,
    max_length=args.max_target_length,
    sos_id=tokenizer.cls_token_id,
    eos_id=tokenizer.sep_token_id,
)

In [None]:
# Checkpoint to restore. Alternative location inside the output directory:
# args.load_model_path=output_dir+"/checkpoint-best-bleu/pytorch_model.bin"
args.load_model_path='/gdrive/My Drive/text2app_models/RoBERTa/pytorch_model.bin'

# map_location=device lets a checkpoint saved on GPU be restored on the active
# device (e.g. a CPU-only runtime) instead of failing to deserialize.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(args.load_model_path, map_location=device))
model.to(device)

Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [None]:
# Run the test pass on every split and report exact-match accuracy for each.
# The loop variable is deliberately NOT called `test_file`: that name holds the
# configured test-set path, and overwriting it would corrupt the configuration
# cell on a later re-run.
for test_name in ['nl_sar_test.csv', 'nl_sar_test_2%_mutation.csv', 'nl_sar_test_5%_mutation.csv', 'nl_sar_test_10%_mutation.csv', 'nl_sar_test_unseen_pair.csv']:
  args.test_filename=data_dir+test_name
  # Drops the first 10 characters and the ".csv" suffix of the file name.
  # NOTE(review): for 'nl_sar_test.csv' this yields 't', not '' -- confirm the
  # intended prefix for the output files.
  args.output_name=test_name[10:-4]
  print("#### testing", test_name)

  if __name__ == "__main__":
    main()

  # NOTE(review): the paths below hard-code "model/" rather than
  # args.output_dir -- confirm they match where main() writes its files.
  # Context managers close the files even if reading fails.
  with open("model/test_0.gold") as ref:
    ref_sar = ref.readlines()
  with open("model/"+args.output_name+"test_0.output") as pred:
    pred_sar = pred.readlines()

  # Compare line by line. zip() stops at the shorter file, so gold lines with
  # no matching prediction count as incorrect instead of raising IndexError.
  correct = sum(gold_line == pred_line for gold_line, pred_line in zip(ref_sar, pred_sar))
  print("Exact Match: ", 100*correct/len(ref_sar))

## Calculate BLEU-4

In [None]:
# Run evaluator.py with model/test_0.gold as the reference file and the
# predictions in model/test_0.output fed through stdin.
!python evaluator.py model/test_0.gold < model/test_0.output

Total: 5000
97.20048209170629


## Exact Match

In [None]:
# Exact-match accuracy (%): share of gold lines whose prediction line is
# identical, including the leading "<idx>\t" field and the trailing newline.
# Context managers close the files even if reading fails.
with open("model/test_0.gold") as ref:
  ref_sar = ref.readlines()

with open("model/test_0.output") as pred:
  pred_sar = pred.readlines()

# zip() stops at the shorter file, so gold lines with no matching prediction
# count as incorrect instead of raising IndexError.
correct = sum(gold_line == pred_line for gold_line, pred_line in zip(ref_sar, pred_sar))

100*correct/len(ref_sar)

77.8

## Single NL prediction

In [None]:
def single_example_to_feature(example, tokenizer):
    """Turn one natural-language description into model input features.

    Reads the module-level ``args`` (``max_source_length``,
    ``max_target_length``) and ``decoder_tokenizer``.

    Args:
        example: the source text to encode.
        tokenizer: encoder tokenizer; supplies tokenization, the CLS/SEP
            tokens and the pad id.

    Returns:
        A one-element list holding an ``InputFeatures`` built with example
        id 0, the padded source ids, the placeholder target ids, and the
        source and target masks.

    TODO (MH): take encoder_tokenizer and decoder_tokenizer as parameters.
    """
    # ---- source side: clip, wrap in CLS/SEP, then pad to max_source_length
    clipped = tokenizer.tokenize(example)[:args.max_source_length-2]
    wrapped = [tokenizer.cls_token, *clipped, tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(wrapped)
    source_pad = args.max_source_length - len(source_ids)
    source_mask = [1] * len(wrapped) + [0] * source_pad
    source_ids += [tokenizer.pad_token_id] * source_pad

    # ---- target side: a dummy 'None' target, since nothing is known at
    # prediction time. Its ids come from the decoder tokenizer.
    placeholder = [tokenizer.cls_token, 'None', tokenizer.sep_token]
    target_ids = decoder_tokenizer.convert_string_to_ids(' '.join(placeholder))
    target_pad = args.max_target_length - len(target_ids)
    target_mask = [1] * len(target_ids) + [0] * target_pad
    # NOTE(review): padding uses the encoder tokenizer's pad id although the
    # ids come from decoder_tokenizer -- confirm this is intended.
    target_ids += [tokenizer.pad_token_id] * target_pad

    return [
        InputFeatures(
            0,
            source_ids,
            target_ids,
            source_mask,
            target_mask,
        )
    ]

In [None]:
# Settings for single-example prediction: test mode, one example per batch,
# beam size 1, and the best-BLEU checkpoint path under output_dir.
args.do_train = False
args.do_test = True
args.beam_size = 1
args.eval_batch_size = 1
args.load_model_path = output_dir + "/checkpoint-best-bleu/pytorch_model.bin"

In [None]:
# Put the model in evaluation mode before running the single-example predictions.
model.eval() 

In [None]:
def get_sar(eval_example):
  """Predict the SAR string for one natural-language app description.

  Uses the module-level ``tokenizer``, ``decoder_tokenizer``, ``model`` and
  ``device``.

  Args:
    eval_example: the natural-language description to translate.

  Returns:
    The decoded text of the first prediction, cut at the first 0 id when
    one is present.
  """
  eval_features = single_example_to_feature(eval_example, tokenizer)
  all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long).to(device)
  all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long).to(device)
  # Inference only: no autograd bookkeeping, same as the batch test loop.
  with torch.no_grad():
    preds = model(source_ids=all_source_ids, source_mask=all_source_mask)
  pred_list = list(preds[0][0].cpu().numpy())
  # Cut at the first 0 id (presumably padding -- the batch test loop does the
  # same). A prediction may contain no 0 at all; list.index would then raise
  # ValueError, so only slice when it is present.
  if 0 in pred_list:
    pred_list = pred_list[:pred_list.index(0)]
  predicted_text = decoder_tokenizer.decode(pred_list)
  return predicted_text

In [None]:
# Sample description: a textbox, a button named "tweet" and a label, plus a
# rule that fires when the button is pressed.
eval_example = 'make an app with a textbox, a button named "tweet", and a label. When the button is pressed, set the label to textbox text..'

# Last expression of the cell, so the predicted SAR string is displayed.
get_sar(eval_example)

In [None]:
"string0": go
"string1": back
<complist> <textbox> <button> go </button> <button> back </button> </complist> <code> <button1clicked> <label1> <textboxtext1> </label1> </button1clicked> <button2clicked> <ball1> <color> <gray> </color> </ball1> </button2clicked> </code>

In [None]:
# Second sample: an accelerometer whose shake event should start music playback.
get_sar("make an app with an accelerometer and a music player . when accelerometer is shaken play music")