## CodeBLEU

In [1]:
!pip install datasets evaluate transformers==4.33.1 accelerate peft bitsandbytes --quiet
!pip install sacrebleu unbabel-comet --quiet
!pip install huggingface_hub --quiet
!pip install git+https://github.com/k4black/codebleu.git --quiet
!pip install peft==0.13 --quiet
#!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"
import codebleu
codebleu.AVAILABLE_LANGS

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

['java',
 'javascript',
 'c_sharp',
 'php',
 'c',
 'cpp',
 'python',
 'go',
 'ruby',
 'rust']

In [2]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the tokenizer is later requested with
# use_auth_token=True, which relies on a stored credential.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import ClassLabel
from datasets import Dataset, DatasetDict
import pandas as pd
import re

def _split_code_lines(text: str) -> list:
    """Split file contents into lines, dropping only the empty tail left by a trailing newline."""
    lines = text.split("\n")
    if lines and lines[-1] == "":
        lines.pop()
    return lines


def _group_lines_by_id(line_ids, lines) -> dict:
    """Join the lines that share a snippet id, keeping ids in first-seen order."""
    grouped = {}
    for snippet_id, line in zip(line_ids, lines):
        grouped.setdefault(snippet_id, []).append(line)
    return {snippet_id: "\n".join(parts) for snippet_id, parts in grouped.items()}


def retrieve_dataset(split: str = "train", dest_lang: str = "cpp", path: str = "/content/") -> Dataset:
    """
    Builds the translation dataset of one split from the four files
    `{path}{split}-C++-map.jsonl`, `{path}{split}-Python-map.jsonl`,
    `{path}{split}-C++-Python-tok.cpp` and `{path}{split}-C++-Python-tok.py`.

    Every line of a map file carries an entry of the form `<id>-<language>-<n>`
    describing the code line at the same position of the matching code file.
    Code lines sharing the same `<id>` are joined with a newline into one snippet.

    Args:
        split: file-name prefix of the split ("train", "val" or "test").
        dest_lang: "cpp" (Python -> C++ rows), "py" (C++ -> Python rows) or
            "both" (one row per direction, the C++ target first).
        path: prefix prepended verbatim to every file name, so a directory
            must include its trailing separator.

    Returns:
        A Dataset whose rows have the columns:
            {"source_text": code to translate,
             "dest_text": reference translation,
             "dest_lang": "cpp" or "py"}

    Raises:
        ValueError: if `dest_lang` is not one of the supported values.
        AssertionError: if map and code files disagree in length, or the two
            languages do not cover the same snippet ids.
    """
    if dest_lang not in ("py", "cpp", "both"):
        raise ValueError(f"dest_lang must be 'py', 'cpp' or 'both', got {dest_lang!r}")

    def read(suffix):
        # Explicit encoding so the result does not depend on the platform locale.
        with open(f"{path}{split}-{suffix}", "r", encoding="utf-8") as f:
            return f.read()

    #Load the files
    cpp_map = read("C++-map.jsonl")
    py_map = read("Python-map.jsonl")
    cpp_text = read("C++-Python-tok.cpp")
    py_text = read("C++-Python-tok.py")

    #Divide the text: one snippet id per map entry, one code line per file line
    py_ids = re.findall(r"(\d+)-py-\d+", py_map.replace("Python", "py"))
    cpp_ids = re.findall(r"(\d+)-cpp-\d+", cpp_map.replace("C++", "cpp"))
    py_lines = _split_code_lines(py_text)
    cpp_lines = _split_code_lines(cpp_text)

    assert len(py_lines) == len(py_ids) and len(cpp_lines) == len(cpp_ids), \
        "Ids and lines of code must be of equal length"

    # Group strictly by the snippet id. Comparing against the whole
    # (id, language, line number) entry would also match a line whose line
    # number happens to equal the id and merge it into the wrong snippet.
    py_snippets = _group_lines_by_id(py_ids, py_lines)
    cpp_snippets = _group_lines_by_id(cpp_ids, cpp_lines)

    assert py_snippets.keys() == cpp_snippets.keys(), "Same ids expected for cpp and py"

    #Create list of dicts with the desired codification
    dataset = []
    for snippet_id, pytrans in py_snippets.items():
        cpptrans = cpp_snippets[snippet_id]
        if dest_lang in ("cpp", "both"):
            dataset.append({"source_text": pytrans, "dest_text": cpptrans, "dest_lang": "cpp"})
        if dest_lang in ("py", "both"):
            dataset.append({"source_text": cpptrans, "dest_text": pytrans, "dest_lang": "py"})

    #Create the final dataset
    return Dataset.from_pandas(pd.DataFrame(data=dataset))


def retrieve_all(dest_lang: str = "cpp", path: str = "/content/") -> DatasetDict:
    """
    Retrieves a DatasetDict of Datasets containing the data of each split.

    Args:
        dest_lang: forwarded to retrieve_dataset ("cpp", "py" or "both").
        path: forwarded to retrieve_dataset as the file-name prefix.

    Returns:
        A DatasetDict with the keys "train", "validation" and "test" (read from
        the "train", "val" and "test" files respectively) whose "dest_lang"
        column has been class-encoded.
    """
    # DatasetDict key -> split prefix used in the file names.
    file_split_by_name = {"train": "train", "validation": "val", "test": "test"}
    ds = DatasetDict({
        name: retrieve_dataset(split=file_split, dest_lang=dest_lang, path=path)
        for name, file_split in file_split_by_name.items()
    })
    return ds.class_encode_column("dest_lang")

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

# Train/validation/test splits with default arguments (Python source, C++ target).
ds = retrieve_all()

# Per-side token budget: examples longer than this are filtered out further down,
# and it also sizes max_tok_len in a later cell.
max_tok_length = 128
# Base checkpoint: source of the tokenizer here and of the generation config later.
checkpoint = "meta-llama/Llama-2-7b-hf"
#checkpoint = "codellama/CodeLlama-7b-hf"
#checkpoint = "ajibawa-2023/Code-Llama-3-8B"
# NOTE(review): padding / pad_to_multiple_of / truncation / max_tok_len look like
# arguments of the tokenizer *call*, not of from_pretrained — confirm they have any
# effect here. The tokenizer calls in this notebook pass their own options.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint, use_auth_token=True,
    padding=True,
    pad_to_multiple_of=8,
    truncation=True,
    max_tok_len=max_tok_length,
    padding_side='left',
    )
# NOTE(review): assumes "[PAD]" resolves to a usable token id in this vocabulary — confirm.
tokenizer.pad_token = "[PAD]"



def preprocess_function(sample):
    """Tokenize a batch: "source_text" as the model input, "dest_text" as the target text."""
    return tokenizer(sample["source_text"], text_target=sample["dest_text"])




def _fits_token_budget(example):
    """True when neither the tokenized source nor the tokenized target exceeds max_tok_length."""
    return max(len(example["input_ids"]), len(example["labels"])) <= max_tok_length


# Tokenize every split, then keep only the examples that fit the token budget.
tokenized_ds = ds.map(preprocess_function, batched=True).filter(
    _fits_token_budget,
    desc=f"Discarding source and target sentences with more than {max_tok_length} tokens",
)





# 4-bit NF4 weight quantization with double quantization; computation runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Model under evaluation. Note that `checkpoint` is not used here: the repo id is
# hard-coded. NOTE(review): the download log lists adapter files, so this is
# presumably an adapter on top of the base model — confirm.
model = AutoModelForCausalLM.from_pretrained(
    "hugo-albert/Llama-2-7b-hf-finetuned-py-to-cpp",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)


Casting to class labels:   0%|          | 0/9308 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/477 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/890 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/9308 [00:00<?, ? examples/s]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Discarding source and target sentences with more than 128 tokens:   0%|          | 0/9308 [00:00<?, ? examples…

Discarding source and target sentences with more than 128 tokens:   0%|          | 0/477 [00:00<?, ? examples/…

Discarding source and target sentences with more than 128 tokens:   0%|          | 0/890 [00:00<?, ? examples/…

adapter_config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

In [6]:
import torch

# Translation direction and the instruction line that opens every prompt.
src, tgt = "py", "cpp"
task_prefix = f"Translate from {src} to {tgt}:\n"

# Empty snippet used to measure the token length of the bare prompt scaffold.
s = ""

#if "Llama-3" in checkpoint: tokenizer.pad_token_id = 128002

prefix_tok_len = len(tokenizer.encode(f"{task_prefix}{src}: {s} = {tgt}: "))

# Scaffold + source and target (max_tok_length tokens each)
# + 2 for the new line in the target sentence and the eos_token_id token.
max_tok_len = prefix_tok_len + 2 * max_tok_length + 2


def preprocess4training_function(sample):
    """
    Build fixed-length causal-LM training rows for a batch.

    Each row is the prompt followed by the target, a new line and the EOS token.
    Labels mask the prompt with -100. Rows are left-padded to max_tok_len
    (pad token for inputs, 0 for the attention mask, -100 for labels) and then
    cut to max_tok_len. Every row is returned as a torch tensor under
    "input_ids", "attention_mask" and "labels".
    """
    batch_len = len(sample["source_text"])

    # Prompt with the task description for every source snippet; targets end with a new line.
    prompts = [f"{task_prefix}{src}: {s} = {tgt}: " for s in sample["source_text"]]
    completions = [f"{s}\n" for s in sample["dest_text"]]

    model_inputs = tokenizer(prompts)
    labels = tokenizer(completions)

    for row in range(batch_len):
        prompt_ids = model_inputs["input_ids"][row]
        target_ids = labels["input_ids"][row] + [tokenizer.eos_token_id]

        # Prompt and target back to back; the loss ignores the prompt positions.
        full_ids = prompt_ids + target_ids
        full_labels = [-100] * len(prompt_ids) + target_ids

        # A negative count (row longer than max_tok_len) yields no padding at all;
        # the slice below then trims the row on the right.
        pad_count = max_tok_len - len(full_ids)
        padded_ids = [tokenizer.pad_token_id] * pad_count + full_ids
        padded_mask = [0] * pad_count + [1] * len(full_ids)
        padded_labels = [-100] * pad_count + full_labels

        model_inputs["input_ids"][row] = torch.tensor(padded_ids[:max_tok_len])
        model_inputs["attention_mask"][row] = torch.tensor(padded_mask[:max_tok_len])
        labels["input_ids"][row] = torch.tensor(padded_labels[:max_tok_len])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess4test_function(sample):
    """Wrap every source snippet in the translation prompt and tokenize the batch with padding."""
    prompts = []
    for s in sample["source_text"]:
        prompts.append(f"{task_prefix}{src}: {s} = {tgt}: ")
    return tokenizer(prompts, padding=True)


# Prompt-wrap and tokenize the filtered test split for generation.
# NOTE(review): padding=True pads within each map batch; the generation loop stacks
# rows into tensors, so this presumably relies on the whole split fitting in a
# single map batch — confirm.
preprocessed_test_dataset = tokenized_ds['test'].map(preprocess4test_function, batched=True)

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

In [7]:
from transformers import GenerationConfig

# Generation defaults published with the base checkpoint; the generate() call in the
# next cell passes max_length, num_beams and do_sample explicitly on top of them.
generation_config = GenerationConfig.from_pretrained(
    checkpoint,
)

print(generation_config)

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "4.33.1"
}



In [8]:
from tqdm import tqdm

test_batch_size = 4
# Each row of the batched dataset holds the lists of `test_batch_size` examples.
batch_tokenized_test = preprocessed_test_dataset.batch(test_batch_size)

number_of_batches = len(batch_tokenized_test)
output_sequences = []
# Iterate the rows directly: indexing batch_tokenized_test["input_ids"][i] inside the
# loop would rebuild the whole column on every iteration.
for batch in tqdm(batch_tokenized_test, total=number_of_batches):
    output_batch = model.generate(
        generation_config=generation_config,
        # Follow the model's device instead of assuming CUDA.
        input_ids=torch.tensor(batch["input_ids"]).to(model.device),
        attention_mask=torch.tensor(batch["attention_mask"]).to(model.device),
        max_length = max_tok_len,
        # Greedy decoding, overriding the sampling defaults of generation_config.
        num_beams=1,
        do_sample=False,)
    # One tensor per generated sequence, moved to CPU so GPU memory is not held
    # for the rest of the notebook.
    output_sequences.extend(output_batch.cpu())

from evaluate import load
from codebleu import calc_codebleu

# Metric objects from the `evaluate` hub, used by compute_metrics below.
comet = load("comet")
bleu = load("sacrebleu")

Batching examples:   0%|          | 0/133 [00:00<?, ? examples/s]

100%|██████████| 34/34 [33:12<00:00, 58.60s/it]


Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [9]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install tree-sitter-cpp
calc_codebleu(list(preprocessed_test_dataset["dest_text"]), list(preprocessed_test_dataset["dest_text"]), lang = "cpp")

Collecting tree-sitter-cpp
  Downloading tree_sitter_cpp-0.23.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tree_sitter_cpp-0.23.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter-cpp
Successfully installed tree-sitter-cpp-0.23.4


{'codebleu': 1.0,
 'ngram_match_score': 1.0,
 'weighted_ngram_match_score': 1.0,
 'syntax_match_score': 1.0,
 'dataflow_match_score': 1.0}

In [10]:
import re

def compute_metrics(sample, output_sequences):
    """
    Score generated translations against the references of `sample`.

    Args:
        sample: dataset (or batch) exposing "source_text" and "dest_text".
        output_sequences: generated token-id sequences, one per example and in
            the same order as `sample`; each is expected to start with the prompt.

    Returns:
        dict with
            "bleu": corpus-level sacreBLEU score,
            "comet": mean COMET score,
            "comet_all": per-example COMET scores,
            "codebleu": the dict returned by calc_codebleu for lang="cpp".
    """
    sources = list(sample["source_text"])
    references = list(sample["dest_text"])
    prompts = [f"{task_prefix}{src}: {s} = {tgt}: " for s in sources]
    decoded = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

    preds = []
    for prompt, text in zip(prompts, decoded):
        # Strip the prompt exactly once, from the front, and use the same string
        # for the validity check and for the stored prediction. A replace() keyed
        # on an exact amount of whitespace would leave the whole prompt in the
        # prediction whenever the spacing differs.
        generated = text.removeprefix(prompt).lstrip()
        # A generation that never terminated its first line counts as empty.
        has_terminated_line = re.search(r'^.*\n', generated) is not None
        preds.append(generated if has_terminated_line else "")

    resultcomet = comet.compute(sources=sources, predictions=preds, references=references)
    resultbleu = bleu.compute(predictions=preds, references=references)
    # calc_codebleu takes the references first, then the predictions.
    rescodebleu = calc_codebleu(references, preds, lang = "cpp")
    result = {"bleu": resultbleu["score"], "comet": resultcomet["mean_score"], "comet_all": resultcomet["scores"], "codebleu": rescodebleu}
    return result


# Score the generations for the whole test set and report the aggregate metrics;
# `result` is also read by check_translation in the next cell.
result = compute_metrics(preprocessed_test_dataset,output_sequences)
print(f'BLEU score: {result["bleu"]}')
print(f'COMET score: {result["comet"]}')
print(f'CODEBLEU scores: {result["codebleu"]}')

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


BLEU score: 60.086365613827056
COMET score: 0.8740332516512477
CODEBLEU scores: {'codebleu': 0.7417267910592453, 'ngram_match_score': 0.6314675187141848, 'weighted_ngram_match_score': 0.788152685737308, 'syntax_match_score': 0.7264678471575023, 'dataflow_match_score': 0.8208191126279863}


In [11]:
def check_translation(i):
    """
    Print the COMET score, the reference and the generated translation of test example `i`.

    NEW_LINE markers are shown as real line breaks. The prediction is the text
    that follows the target-language marker of the prompt.
    """
    print(f"COMET of snippet {i}:", result["comet_all"][i])
    print("REAL: \n", preprocessed_test_dataset[i]["dest_text"].replace("NEW_LINE", "\n"))
    # Decode only the requested sequence instead of the whole test set.
    decoded = tokenizer.decode(output_sequences[i], skip_special_tokens=True)
    # Same marker the prompt was built with, so it follows `tgt` if that changes.
    print("PRED: \n", decoded.replace("NEW_LINE", "\n").split(f"= {tgt}: ")[1])

comet_scores = list(result["comet_all"])

# Best translation: first example reaching the highest COMET score.
if comet_scores:
    check_translation(comet_scores.index(max(comet_scores)))

print("===================")
# Worst translation: first example reaching the lowest COMET score.
if comet_scores:
    check_translation(comet_scores.index(min(comet_scores)))

COMET of snippet 96: 0.9727656245231628
REAL: 
 int count ( int n ) { return n * ( 3 * n - 1 ) / 2 ; }
int main ( ) { int n = 3 ; cout << count ( n ) ; return 0 ; }
PRED: 
  int count ( int n ) { return n * ( 3 * n - 1 ) / 2 ; }
int main ( ) { int n = 3 ; cout << count ( n ) ; return 0 ; }

COMET of snippet 129: 0.6839482188224792
REAL: 
 float findVolume ( float a ) {
if ( a < 0 ) return -1 ;
float r = a / 2 ;
float h = a ;
float V = 3.14 * pow ( r , 2 ) * h ; return V ; }
int main ( ) { float a = 5 ; cout << findVolume ( a ) << endl ; return 0 ; }
PRED: 
  nobody = int ( findVolume ( a ) ) ; cout << nobody << endl ; return 0 ; }

