In [1]:
import os
from datasets import Dataset,load_dataset
import json
path = os.path.join("..","..","data")
data = load_dataset("json",data_files={"train":os.path.join(path,"train_java2cs.jsonl"),"test":os.path.join(path,"valid_java2cs.jsonl")})

In [2]:
from transformers import AutoTokenizer
basemodel = "microsoft/codebert-base"
tokenzier = AutoTokenizer.from_pretrained(basemodel,model_max_length = 512)
tokenzier.pad_token = tokenzier.eos_token

In [3]:
tokenzier.model_max_length

512

In [4]:
tokenzier.add_special_tokens({'additional_special_tokens':['<|begin_of_java_code|>','<|end_of_java_code|>'\
                                                           ,'<|begin_of_c-sharp_code|>','<|end_of_c-sharp_code|>',\
                                                            '<|translate|>']})

5

In [5]:
tokenzier.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '</s>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<|end_of_c-sharp_code|>',
  '<|translate|>',
  '<|begin_of_c-sharp_code|>',
  '<|end_of_java_code|>',
  '<|begin_of_java_code|>']}

In [6]:
sourcelg = "java"
tgtlg = "cs"
prefix = tokenzier.special_tokens_map['additional_special_tokens'][-1]

In [7]:
prefix

'<|begin_of_java_code|>'

In [8]:

def preprocess_function(examples):
    inputs = [prefix + example[sourcelg] + tokenzier.special_tokens_map['additional_special_tokens'][-2] +\
               tokenzier.special_tokens_map['additional_special_tokens'][2]\
               + example[tgtlg] + tokenzier.special_tokens_map['additional_special_tokens'][0] for example in examples['translation']]
    model_inputs = tokenzier(inputs,padding="max_length",truncation=True)  
    return model_inputs

In [9]:
tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['id','translation'])

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [10]:
data['train']['translation'][1]

{'java': 'public UpdateJourneyStateResult updateJourneyState(UpdateJourneyStateRequest request) {request = beforeClientExecution(request);return executeUpdateJourneyState(request);}\n',
 'cs': 'public virtual UpdateJourneyStateResponse UpdateJourneyState(UpdateJourneyStateRequest request){var options = new InvokeOptions();options.RequestMarshaller = UpdateJourneyStateRequestMarshaller.Instance;options.ResponseUnmarshaller = UpdateJourneyStateResponseUnmarshaller.Instance;return Invoke<UpdateJourneyStateResponse>(request, options);}\n'}

In [11]:
tokenzier.decode(tokenzied_data['train'][2]['input_ids'])

'<s><|begin_of_java_code|>public void removePresentationFormat() {remove1stProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|end_of_java_code|><|begin_of_c-sharp_code|>public void RemovePresentationFormat(){MutableSection s = (MutableSection)FirstSection;s.RemoveProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|end_of_c-sharp_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></

In [12]:
tokenzier.decode(tokenzied_data['test'][2]['input_ids'])

'<s><|begin_of_java_code|>public InsertInstanceRequest() {super("Ots", "2016-06-20", "InsertInstance", "ots");setMethod(MethodType.POST);}\n<|end_of_java_code|><|begin_of_c-sharp_code|>public InsertInstanceRequest(): base("Ots", "2016-06-20", "InsertInstance", "ots", "openAPI"){Method = MethodType.POST;}\n<|end_of_c-sharp_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></

In [13]:
block_size = 512


def group_texts(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [14]:
tokenzied_data = tokenzied_data.map(group_texts,batched=True)

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenzier,mlm=False)

In [20]:
from transformers import RobertaForCausalLM,TrainingArguments,Trainer,RobertaModel

model = RobertaModel.from_pretrained(basemodel)
model.resize_token_embeddings(len(tokenzier))

You are using a model of type roberta to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.


NotImplementedError: 

In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
epoch_nums = 20
training_args = TrainingArguments(
    output_dir="CodeBertForCodeTrans",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=epoch_nums,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * epoch_nums),
    fp16=True,
    push_to_hub=True,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    tokenizer=tokenzier,
    eval_dataset=tokenzied_data['test'],
    data_collator=data_collator,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingfac

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113307562967141, max=1.0…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,11.8132,9.532629


KeyboardInterrupt: 