<a href="https://colab.research.google.com/github/masies/CRA/blob/main/replication_package/Replication_package_pytorchConversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Conversion of the model to PyTorch
In this Colab notebook we convert the fine-tuned model to PyTorch, generate predictions, and evaluate the results with CodeBLEU, perfect predictions, and almost-perfect predictions.

In [None]:
# Environment setup: fetch the CodeXGLUE repository, authenticate against
# Google Cloud, install the pinned requirements, import the libraries used by
# the rest of the notebook, and download the conversion script and model config.

# Clone CodeXGLUE and pin it to a fixed commit; its CodeBLEU evaluator
# (calc_code_bleu.py) is invoked from the evaluation cell.
!git clone -q https://github.com/microsoft/CodeXGLUE.git
!cd /content/CodeXGLUE && git checkout a0e2febebf20d551d479df40a51508b7797fea91

# Interactive Google authentication so gcloud/gsutil can read the bucket.
from google.colab import auth
auth.authenticate_user()
#@title ## Set Your GCS credential
project_id = 'helical-loop-303918'#@param {type:"string"}
bucket_name = 'code_review_automation'#@param {type:"string"}

# {project_id} / {bucket_name} are interpolated by IPython into the shell lines.
!gcloud config set project {project_id}

# Fetch the requirements file from the bucket into the working directory.
!gsutil cp gs://{bucket_name}/replication_package/requirements/requirements_conversion.txt  requirements_conversion.txt

# NOTE(review): the absolute path assumes the working directory is /content
# (the Colab default) — confirm if run elsewhere.
!pip install -r /content/requirements_conversion.txt

import pandas as pd
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
import torch
from tqdm import tqdm
import os
import logging
import shutil
from torch.utils.data import Dataset, DataLoader
import json
import sys
import statistics

# Script that converts the TensorFlow checkpoint to PyTorch
!gsutil cp gs://{bucket_name}/replication_package/utils/pythorch_conversion/tf_2_pytorch_T5.py ./tf_2_pytorch_T5.py

# Download the configuration file
!gsutil cp gs://{bucket_name}/replication_package/utils/pythorch_conversion/config.json ./current_model/config.json


Note: checking out 'a0e2febebf20d551d479df40a51508b7797fea91'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at a0e2feb small refinement of code-trans data
Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

Copying gs://code_review_automation/replication_package/requirements/requirements_conversion.txt...
/ [1 files][  6.9 KiB/  6.9 KiB]                                                
Operation completed over 1 objects/6.9 KiB.                                      
Collecting fast-trees==0.0.3
  Downloading https://files.pythonhosted.org/packages/05/b1/81a2fe1a98eae27bf5f2b4f70e443

You need to set the correct model path in your GCS bucket and the model number.
Remember to only select models you previously fine-tuned.

In [None]:
# Download the selected best model
model_number = "700" #@param {type:"string"}
model_folder_path = "/replication_package/fine_tuning_model_dumps/Large_dataset_code_code"#@param {type:"string"}
model_folder_path = bucket_name + model_folder_path

!mkdir dumps
!mkdir current_model
!gsutil -m cp \
  "gs://{model_folder_path}/model.ckpt-{model_number}.data-00000-of-00002" \
  "gs://{model_folder_path}/model.ckpt-{model_number}.data-00001-of-00002" \
  "gs://{model_folder_path}/model.ckpt-{model_number}.index" \
  "gs://{model_folder_path}/model.ckpt-{model_number}.meta" \
  ./current_model/

# Download the model and vocab
!gsutil cp gs://{bucket_name}/replication_package/code_review_model/TestModel.model ./current_model/TestModel.model
!gsutil cp gs://{bucket_name}/replication_package/code_review_model/TestModel.vocab ./current_model/TestModel.vocab

# download the test set
!gsutil cp gs://{bucket_name}/replication_package/dataset/fine-tuning/large/code_comment/test.tsv ./data/test.tsv

# prepare source and target files
df = pd.read_csv("./data/test.tsv", sep='\t', names=["source","target"])

# initialize source and target files
f = open("./data/test.source", "w")
f.close()
f = open("./data/test.target", "w")
f.close()

with open("./data/test.source", "a") as source:
  with open("./data/test.target", "a") as target:
    for index, row in df.iterrows():
      if index< 10:
        source.write("code&comment2code: " + row.source + "\n")
        target.write(row.target + "\n")

# Convert the model
!python3 ./tf_2_pytorch_T5.py --tf_checkpoint_path ./current_model/model.ckpt-{model_number} --config_file ./current_model/config.json --pytorch_dump_path ./dumps

!head -n 1 ./data/test.source
!head -n 1 ./data/test.target

mkdir: cannot create directory ‘current_model’: File exists
Copying gs://code_review_automation/replication_package/fine_tuning_model_dumps/Large_dataset_code_code/model.ckpt-700.data-00000-of-00002...
Copying gs://code_review_automation/replication_package/fine_tuning_model_dumps/Large_dataset_code_code/model.ckpt-700.data-00001-of-00002...
Copying gs://code_review_automation/replication_package/fine_tuning_model_dumps/Large_dataset_code_code/model.ckpt-700.index...
Copying gs://code_review_automation/replication_package/fine_tuning_model_dumps/Large_dataset_code_code/model.ckpt-700.meta...
\ [4/4 files][126.0 MiB/126.0 MiB] 100% Done                                    
Operation completed over 4 objects/126.0 MiB.                                    
Copying gs://code_review_automation/replication_package/code_review_model/TestModel.model...
/ [1 files][761.9 KiB/761.9 KiB]                                                
Operation completed over 1 objects/761.9 KiB.                   

In [None]:
class EvalDataset(torch.utils.data.Dataset):
  """Paired (source, target) text samples read from a data directory.

  The directory must contain ``test.source`` and ``test.target``. Line *i* of
  the first file is paired with line *i* of the second; trailing whitespace
  (including the newline) is stripped from both. If the files differ in
  length, the extra lines of the longer one are ignored.

  Args:
    data_dir_path: directory holding ``test.source`` and ``test.target``.

  Raises:
    OSError: if either file cannot be opened.
  """

  # Class-level default kept for backward compatibility; every instance
  # rebinds its own list in __init__, so this shared list is never mutated.
  samples = []

  def __init__(self,  data_dir_path):
    self.samples = []

    source_path = os.path.join(data_dir_path, 'test.source')
    target_path = os.path.join(data_dir_path, 'test.target')

    # Context managers guarantee both handles are closed, even on error.
    with open(source_path, 'r') as input_file, open(target_path, 'r') as output_file:
      for (inp, out) in zip(input_file, output_file):
        self.samples.append((inp.rstrip(), out.rstrip()))

  def __getitem__(self, index):
    """Return the ``(source, target)`` tuple stored at ``index``."""
    return self.samples[index]

  def __len__(self):
    """Return the number of loaded sample pairs."""
    return len(self.samples)

Set the beam size and the batch size for the model predictions

In [None]:
# Generation settings and model loading: build the evaluation DataLoader and
# load the tokenizer, the config and the converted PyTorch weights.

# beam_size is used both as the number of beams and as the number of
# sequences returned per input in the evaluation cell.
beam_size =  2#@param {type:"integer"}
batch_size = 8#@param {type:"integer"}
data_dir = "/content/data"
tokenizer_name = "./current_model/TestModel.model" 
model_name_or_path = "./dumps/pytorch_model.bin"
config_name = "./current_model/config.json"

# No shuffling is requested, so batches follow the order of the files on disk.
dataset = EvalDataset(data_dir)
dloader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=0)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # falls back to CPU when no GPU is available

# NOTE(review): this passes a single SentencePiece file, which the captured
# output reports as deprecated for T5Tokenizer.from_pretrained — confirm
# whether a directory-based load is possible with the installed version.
t5_tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

t5_config = T5Config.from_pretrained(config_name)
t5_mlm = T5ForConditionalGeneration.from_pretrained(model_name_or_path, config=t5_config).to(DEVICE)

Calling T5Tokenizer.from_pretrained() with the path to a single file or url is deprecated


# Generate and evaluate predictions

In [None]:
predictions = []
code_bleues = []

perfect = 0
almost_perfect = 0

# indexes for batches
old = 0
new = batch_size * beam_size

for batch in tqdm(dloader): 
  encoded = t5_tokenizer.batch_encode_plus(batch[0], add_special_tokens=False, return_tensors='pt', padding=True)
  
  input_ids = encoded['input_ids'].to(DEVICE)
  attention_mask = encoded['attention_mask'].to(DEVICE)

  outputs = t5_mlm.generate(
      input_ids=input_ids,
      max_length=512, #Change here 
      num_beams=beam_size,
      attention_mask=attention_mask,
      early_stopping=True,
      num_return_sequences=beam_size).to(DEVICE)

  predictions.extend(t5_tokenizer.batch_decode(outputs, skip_special_tokens=True))

  to_analyze = predictions[old:new]
  target_list = batch[1]
  input_list = batch[0]

  idx = 0
  for (input_item, target_item) in zip(input_list,target_list):

    flag_perfect = False
    flag_almost_perfect = False

    target_item = " ".join(target_item.split(' '))
    best_code_bleu = 0
    
    for i in range(beam_size):
      prediction_item = " ".join(to_analyze[idx].split(' '))

      if not flag_perfect and prediction_item == target_item:
        perfect += 1
        flag_perfect = True

      if not flag_almost_perfect and "".join(prediction_item.split(' ')) == "".join(target_item.split(' ')):
        almost_perfect += 1
        flag_almost_perfect = True

      idx += 1

      with open("code_bleu_target.txt", "w") as target_cb:
        target_cb.write(target_item + "\n")
      with open("code_bleu_prediction.txt", "w") as prediction_cb:
        prediction_cb.write(prediction_item + "\n")

      try:
        result = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs ./../../../../../code_bleu_target.txt --hyp ./../../../../../code_bleu_prediction.txt --lang java --params 0.25,0.25,0.25,0.25
        code_bleu = float(result[1][result[1].index("CodeBLEU score:  ")+17:])
        
      except: 
        result = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs ./../../../../../code_bleu_target.txt --hyp ./../../../../../code_bleu_prediction.txt --lang java --params 0.333,0.333,0.333,0
        code_bleu = float(result[2][result[2].index("CodeBLEU score:  ")+17:])

      best_code_bleu = code_bleu if code_bleu > best_code_bleu else best_code_bleu
        
    code_bleues.append(best_code_bleu)

  old = new
  new = new + (batch_size * beam_size)

perfect_percentage = (perfect/len(dataset))*100
almost_perfect_percentage = (almost_perfect/len(dataset))*100
mean_cb = statistics.mean(code_bleues)
median_cb = statistics.median(code_bleues)
stdev_cb = statistics.stdev(code_bleues)

print()
print('#perfect prediction: ', perfect)
print('Perfect prediction {}%'.format(round(perfect_percentage, 2)))

print('#almost perfect prediction: ', almost_perfect)
print('Almost Perfect prediction {}%'.format(round(almost_perfect_percentage, 2)))

print('Mean CodeBLEU :', round(mean_cb, 2))
print('Median CodeBLEU :', round(median_cb, 2))
print('stdev CodeBLEU :', round(stdev_cb, 2))


  0%|          | 0/2 [00:00<?, ?it/s]

# Custom prediction (remember to set the task prefix according to your model)

In [None]:
# Run the converted model on a single hand-written example. The prompt is
# assembled from the selected task prefix and the matching form fields.
code = ' public static void main(String[] args) { System.out.println(" hello world") }'#@param {type:"string"}
markedCode = ' public static void main(String[] args) { <START> System.out.println("hello world") <END>}'#@param {type:"string"}
comment = "print using a logger" #@param {type:"string"}
task = "code2comment:" #@param ["code2comment:", "code&comment2code:", "markedCode2code:", "code2code:"]

# Named `model_input` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
if task == "code&comment2code:":
  model_input = task + " <code>" + code + "</code><technical_language>" + comment + "</technical_language>"
elif task == "markedCode2code:":
  model_input = task + markedCode
else:
  model_input = task + code

print(model_input)
encoded = t5_tokenizer.encode(model_input, add_special_tokens=False, return_tensors='pt', padding=True).to(DEVICE)

input_ids = encoded
# Single beam, single returned sequence.
outputs = t5_mlm.generate(
      input_ids=input_ids,
      max_length=512, #Change here
      num_beams=1,
      early_stopping=True,
      num_return_sequences=1).to(DEVICE)

print(code)
print(t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
