In [1]:
import os
from datasets import Dataset,load_dataset
import json
path = os.path.join("..","..","..","data")
data = load_dataset("json",data_files={"train":os.path.join(path,"train_java2cs.jsonl"),"test":os.path.join(path,"valid_java2cs.jsonl")})

In [2]:
from transformers import AutoTokenizer
basemodel = "Salesforce/codet5-base"
tokenzier = AutoTokenizer.from_pretrained(basemodel)

In [3]:
sourcelg = "java"
tgtlg = "cs"
prefix = """#translate this java code to c-sharp code:
java:"""

def preprocess_function(examples):
    inputs = [prefix + exmaple[sourcelg] for exmaple in examples['translation']]
    tgts = [example[tgtlg] for example in examples['translation']]
    model_inputs = tokenzier(inputs,text_target=tgts,max_length = 256,truncation= True)
    return model_inputs

In [4]:
tokenzied_data = data.map(preprocess_function,batched=True)

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [5]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenzier,model = basemodel)

In [6]:
import bleu
import dataflow_match
import syntax_match
import weighted_ngram_match
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()for label in labels]]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]
        
    decoded_preds = tokenzier.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenzier.pad_token_id)
    decoded_labels = tokenzier.batch_decode(labels, skip_special_tokens=True)
    
    hypothesis, pre_references = postprocess_text(decoded_preds, decoded_labels)

    for i in range(len(pre_references)):
        assert len(hypothesis) == len(pre_references[i])

    references = []
    for i in range(len(hypothesis)):
        ref_for_instance = []
        for j in range(len(pre_references)):
            ref_for_instance.append(pre_references[j][i])
        references.append(ref_for_instance)
    assert len(references) == len(pre_references)*len(hypothesis)


    tokenized_hyps = [x.split() for x in hypothesis]
    tokenized_refs = [[x.split() for x in reference] for reference in references]

    # calculate weighted ngram match
    keywords = [x.strip() for x in open('keywords/'+"c_sharp"+'.txt', 'r', encoding='utf-8').readlines()]
    def make_weights(reference_tokens, key_word_list):
        return {token:1 if token in key_word_list else 0.2 \
                for token in reference_tokens}
    tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]\
                for reference_tokens in reference] for reference in tokenized_refs]
    
    result = {}
    result['BLEU'] = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)
    result['Weighted_BLEU'] = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights,tokenized_hyps)
    result['Syntax Match accuracy'] = syntax_match.corpus_syntax_match(references,hypothesis,"c_sharp")
    result['Dataflow_match accuracy'] = dataflow_match.corpus_dataflow_match(references,hypothesis,"c_sharp")
    result['CodeBLEU'] = 0.25*result['BLEU'] + 0.25 * result['Weighted_BLEU'] + 0.25 * result['Syntax Match accuracy'] + 0.25 * result['Dataflow_match accuracy']
    
    return result

In [7]:
from transformers import AutoModelForSeq2SeqLM,Seq2SeqTrainingArguments,Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(basemodel)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
epoch_nums = 20
training_args = Seq2SeqTrainingArguments(
    output_dir="CodeT5ForCodeTrans",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    num_train_epochs=epoch_nums,
    predict_with_generate=True,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * epoch_nums),
    fp16=True,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True
)

In [9]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    eval_dataset=tokenzied_data["test"],
    tokenizer=tokenzier,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingfac

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113292268580861, max=1.0…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Weighted Bleu,Syntax match accuracy,Dataflow Match accuracy,Codebleu
0,3.382,2.857626,0.000787,0.004617,0.106286,0.040971,0.038165
1,1.7884,0.973179,0.048843,0.085919,0.177985,0.085498,0.099561
2,0.6937,0.540023,0.129351,0.203648,0.19994,0.072047,0.151246
4,0.4638,0.381767,0.150047,0.234439,0.213774,0.069728,0.166997
5,0.4111,0.342289,0.156073,0.243483,0.210947,0.068646,0.169787
6,0.3865,0.314565,0.161879,0.251916,0.212752,0.06679,0.173334
8,0.3328,0.2757,0.165758,0.259106,0.214195,0.066481,0.176385
9,0.2913,0.259337,0.16621,0.260426,0.214075,0.06679,0.176875
10,0.2862,0.246577,0.169831,0.265374,0.21582,0.066636,0.179415
12,0.2532,0.226237,0.169315,0.265898,0.216241,0.067254,0.179677


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=6420, training_loss=0.5954580588504161, metrics={'train_runtime': 2713.5579, 'train_samples_per_second': 75.878, 'train_steps_per_second': 2.366, 'total_flos': 3.330567206111232e+16, 'train_loss': 0.5954580588504161, 'epoch': 19.95})

In [10]:
model.save_pretrained("CodeT5ForCodeTrans")

In [11]:
tokenzier.save_pretrained("CodeT5ForCodeTrans")

('CodeT5ForCodeTrans/tokenizer_config.json',
 'CodeT5ForCodeTrans/special_tokens_map.json',
 'CodeT5ForCodeTrans/vocab.json',
 'CodeT5ForCodeTrans/merges.txt',
 'CodeT5ForCodeTrans/added_tokens.json',
 'CodeT5ForCodeTrans/tokenizer.json')