<a href="https://colab.research.google.com/github/masies/CRA/blob/main/pytorchConversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece==0.1.94
!pip install -q torch==1.4.0 -f https://download.pytorch.org/whl/cu101/torch_stable.html
!pip install -q transformers==3.5.0 fast-trees
!git clone -q https://github.com/microsoft/CodeXGLUE.git


import pandas as pd
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
import torch
from tqdm import tqdm
import os
import logging
import shutil
from torch.utils.data import Dataset, DataLoader
import json
import sys
import statistics


from google.colab import auth
auth.authenticate_user()
project_id = 'helical-loop-303918'
bucket_name = 'code_review_automation'

!gcloud config set project {project_id}

!mkdir core

!gsutil -m cp -r \
  "gs://code_review_automation/pyTorch_coversion/core/Logger.py" \
  ./core

from core.Logger import Logger

# script for conversion in pythorch
!gsutil cp gs://{bucket_name}/pyTorch_coversion/code/tf_2_pytorch_T5.py ./tf_2_pytorch_T5.py

# Download the configuration file
!gsutil cp gs://{bucket_name}/pyTorch_coversion/config/config.json ./current_model/config.json


Collecting sentencepiece==0.1.94
[?25l  Downloading https://files.pythonhosted.org/packages/6e/f0/7614029138ec9422f1a3ed3cd82c3bfc0821157e8032ca1828cee6b198bb/sentencepiece-0.1.94-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 16.9MB/s eta 0:00:01[K     |▋                               | 20kB 23.0MB/s eta 0:00:01[K     |▉                               | 30kB 28.1MB/s eta 0:00:01[K     |█▏                              | 40kB 23.6MB/s eta 0:00:01[K     |█▌                              | 51kB 18.0MB/s eta 0:00:01[K     |█▊                              | 61kB 17.3MB/s eta 0:00:01[K     |██                              | 71kB 13.1MB/s eta 0:00:01[K     |██▍                             | 81kB 13.9MB/s eta 0:00:01[K     |██▋                             | 92kB 15.1MB/s eta 0:00:01[K     |███                             | 102kB 14.0MB/s eta 0:00:01[K     |███▎                            | 112kB 14.0MB/s eta 0:00:01[K     |███▌

In [None]:
model_number = 395000
dataset_version = "v2"
model_type = "codeANDcomment_code"
trained_type ="pretrained"

# Download the selected best model
!mkdir dumps
!mkdir current_model
!gsutil -m cp \
  "gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_{dataset_version}/{model_type}/{trained_type}/model.ckpt-{model_number}.data-00000-of-00002" \
  "gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_{dataset_version}/{model_type}/{trained_type}/model.ckpt-{model_number}.data-00001-of-00002" \
  "gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_{dataset_version}/{model_type}/{trained_type}/model.ckpt-{model_number}.index" \
  "gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_{dataset_version}/{model_type}/{trained_type}/model.ckpt-{model_number}.meta" \
  ./current_model/

# Download the validation script
# !gsutil cp gs://{bucket_name}/pyTorch_coversion/code/validation.py ./validation.py

# Download the model and vocab
!gsutil cp gs://{bucket_name}/CodeReviewModel/TestModel.model ./current_model/TestModel.model
!gsutil cp gs://{bucket_name}/CodeReviewModel/TestModel.vocab ./current_model/TestModel.vocab

# download the test set
!gsutil cp gs://{bucket_name}/dataset/old/fineTuningDataset_{dataset_version}/codeANDcomment_code/test.tsv ./data/test.tsv

# prepare source and target files
df = pd.read_csv("./data/test.tsv", sep='\t', names=["source","target"])

# initialize source and target files
f = open("./data/test.source", "w")
f.close()
f = open("./data/test.target", "w")
f.close()

with open("./data/test.source", "a") as source:
  with open("./data/test.target", "a") as target:
    for index, row in df.iterrows():
      # if index < 80:
      source.write("code&comment2code: " + row.source + "\n")
      target.write(row.target + "\n")

# Convert the model
!python3 ./tf_2_pytorch_T5.py --tf_checkpoint_path ./current_model/model.ckpt-{model_number} --config_file ./current_model/config.json --pytorch_dump_path ./dumps

!head -n 1 ./data/test.source
!head -n 1 ./data/test.target

mkdir: cannot create directory ‘current_model’: File exists
Copying gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_v2/codeANDcomment_code/pretrained/model.ckpt-285000.data-00000-of-00002...
Copying gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_v2/codeANDcomment_code/pretrained/model.ckpt-285000.data-00001-of-00002...
Copying gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_v2/codeANDcomment_code/pretrained/model.ckpt-285000.index...
Copying gs://code_review_automation/fine_tuning/HP_tuning/final_version/comment_v2/codeANDcomment_code/pretrained/model.ckpt-285000.meta...
/ [4/4 files][126.0 MiB/126.0 MiB] 100% Done                                    
Operation completed over 4 objects/126.0 MiB.                                    
Copying gs://code_review_automation/CodeReviewModel/TestModel.model...
/ [1 files][762.0 KiB/762.0 KiB]                                                
Operation completed over 1 objects

In [None]:
class EvalDataset(torch.utils.data.Dataset):
  """In-memory dataset of (source, target) string pairs.

  Pairs are read line by line from `test.source` and `test.target` inside the
  given directory; line i of one file is paired with line i of the other.
  """

  def __init__(self,  data_dir_path):
    """Load all pairs from `data_dir_path`.

    Args:
      data_dir_path: directory containing `test.source` and `test.target`.

    Trailing whitespace (including the newline) is stripped from every line.
    If the files differ in length, the extra lines of the longer one are ignored.
    """
    source_path = os.path.join(data_dir_path, 'test.source')
    target_path = os.path.join(data_dir_path, 'test.target')

    # Context managers guarantee both handles are closed, even if reading fails.
    with open(source_path, 'r') as input_file, open(target_path, 'r') as output_file:
      self.samples = [
          (inp.rstrip(), out.rstrip())
          for inp, out in zip(input_file, output_file)
      ]

  def __getitem__(self, index):
    """Return the (source, target) pair stored at `index`."""
    return self.samples[index]

  def __len__(self):
    """Return the number of loaded pairs."""
    return len(self.samples)

In [None]:
# Evaluation settings: beam_size is used both as the number of beams and as the
# number of sequences returned per input by the generation loop below.
beam_size = 10
batch_size = 8
# Directory holding test.source / test.target
data_dir = "/content/data"
# SentencePiece model, converted PyTorch weights and model config
tokenizer_name = "./current_model/TestModel.model" 
model_name_or_path = "./dumps/pytorch_model.bin"
config_name = "./current_model/config.json"

dataset = EvalDataset(data_dir)
# No shuffle argument is passed, so batches follow the order of the files.
dloader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=0)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # falls back to CPU when no GPU is available

t5_tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

t5_config = T5Config.from_pretrained(config_name)
t5_mlm = T5ForConditionalGeneration.from_pretrained(model_name_or_path, config=t5_config).to(DEVICE)
# NOTE(review): from_pretrained presumably already returns the model in eval mode, which
# would make the explicit call below redundant — confirm before deleting it.
# t5_mlm.eval()

Calling T5Tokenizer.from_pretrained() with the path to a single file or url is deprecated


In [None]:
predictions = []
code_bleues = []

perfect = 0
almost_perfect = 0

# indexes for batches
old = 0
new = batch_size * beam_size

for batch in tqdm(dloader): 
  encoded = t5_tokenizer.batch_encode_plus(batch[0], add_special_tokens=False, return_tensors='pt', padding=True)
  
  input_ids = encoded['input_ids'].to(DEVICE)
  attention_mask = encoded['attention_mask'].to(DEVICE)

  outputs = t5_mlm.generate(
      input_ids=input_ids,
      max_length=512, #Change here 
      num_beams=beam_size,
      attention_mask=attention_mask,
      early_stopping=True,
      num_return_sequences=beam_size).to(DEVICE)

  predictions.extend(t5_tokenizer.batch_decode(outputs, skip_special_tokens=True))

  to_analyze = predictions[old:new]
  target_list = batch[1]
  input_list = batch[0]

  idx = 0
  for (input_item, target_item) in zip(input_list,target_list):

    flag_perfect = False
    flag_almost_perfect = False

    target_item = " ".join(target_item.split(' '))
    best_code_bleu = 0
    
    for i in range(beam_size):
      prediction_item = " ".join(to_analyze[idx].split(' '))

      if not flag_perfect and prediction_item == target_item:
        perfect += 1
        flag_perfect = True

      if not flag_almost_perfect and "".join(prediction_item.split(' ')) == "".join(target_item.split(' ')):
        almost_perfect += 1
        flag_almost_perfect = True

      idx += 1

      with open("code_bleu_target.txt", "w") as target_cb:
        target_cb.write(target_item + "\n")
      with open("code_bleu_prediction.txt", "w") as prediction_cb:
        prediction_cb.write(prediction_item + "\n")

      try:
        result = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs ./../../../../../code_bleu_target.txt --hyp ./../../../../../code_bleu_prediction.txt --lang java --params 0.25,0.25,0.25,0.25
        code_bleu = float(result[1][result[1].index("CodeBLEU score:  ")+17:])
        
      except: 
        result = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs ./../../../../../code_bleu_target.txt --hyp ./../../../../../code_bleu_prediction.txt --lang java --params 0.333,0.333,0.333,0
        code_bleu = float(result[2][result[2].index("CodeBLEU score:  ")+17:])

      best_code_bleu = code_bleu if code_bleu > best_code_bleu else best_code_bleu
        
    code_bleues.append(best_code_bleu)

  old = new
  new = new + (batch_size * beam_size)

perfect_percentage = (perfect/len(dataset))*100
almost_perfect_percentage = (almost_perfect/len(dataset))*100
mean_cb = statistics.mean(code_bleues)
median_cb = statistics.median(code_bleues)
stdev_cb = statistics.stdev(code_bleues)

print()
print('#perfect prediction: ', perfect)
print('Perfect prediction {}%'.format(round(perfect_percentage, 2)))

print('#almost perfect prediction: ', almost_perfect)
print('Almost Perfect prediction {}%'.format(round(almost_perfect_percentage, 2)))

print('Mean Bleu :', round(mean_cb, 2))
print('Median Bleu :', round(median_cb, 2))
print('stdev Bleu :', round(stdev_cb, 2))


100%|██████████| 214/214 [1:14:57<00:00, 21.02s/it]


#perfect prediction:  595
Perfect prediction 34.84%
#almost perfect prediction:  733
Almost Perfect prediction 42.92%
Mean Bleu : 0.85
Median Bleu : 0.89
stdev Bleu : 0.16





In [None]:
# run validation script
!python3 validation.py --beam_size 3 --batch_size 24 --data_dir /content/data/ --tokenizer_name ./current_model/TestModel.model --model_name_or_path ./dumps/pytorch_model.bin --config_name ./current_model/config.json

2021-04-12 16:16:42.434763: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Calling T5Tokenizer.from_pretrained() with the path to a single file or url is deprecated
100% 4/4 [00:19<00:00,  4.93s/it]
#perfect prediction:  29
Perfect prediction 36.25


In [None]:
# Score code_bleu_prediction.txt against code_bleu_target.txt (both under /content) with
# the CodeXGLUE CodeBLEU script; --params 0.25,0.25,0.25,0.25 presumably weights the four
# reported components equally.
!cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs ./../../../../../code_bleu_target.txt --hyp ./../../../../../code_bleu_prediction.txt --lang java --params 0.25,0.25,0.25,0.25

['ngram match: 0.6703420896351792, weighted ngram match: 0.7390293031678882, syntax_match: 1.0, dataflow_match: 1.0',
 'CodeBLEU score:  0.8523428482007669']

In [None]:
# Interactive single-example inference: edit `code` and `comment` through the Colab form.
code = "public static void main(String[] args) { \u003CSTART> Int count = 3; int lotteria = 3 \u003CEND>}"#@param {type:"string"}
comment = "lotteria = 4" #@param {type:"string"}

# Named `model_input` rather than `input` so the Python builtin input() is not
# shadowed for the rest of the kernel session.
model_input = "code&comment2code: <code>" + code + "</code><technical_language>" + comment + "</technical_language>"
encoded = t5_tokenizer.encode(model_input, add_special_tokens=False, return_tensors='pt', padding=True).to(DEVICE)


input_ids = encoded
# Single beam, single returned sequence
outputs = t5_mlm.generate(
      input_ids=input_ids,
      max_length=512, #Change here 
      num_beams=1,
      early_stopping=True,
      num_return_sequences=1).to(DEVICE)

print(code)
print(t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


public static void main(String[] args) { <START> Int count = 3; int lotteria = 3 <END>}
public static void main(String[] args) { Int count = 4; int lotteria = 4; }
