In [1]:
import os
from datasets import load_dataset
path = os.path.join("..","..","data")
data = load_dataset("json",data_files={"train":os.path.join(path,"train_java2cs.jsonl"),"test":os.path.join(path,"valid_java2cs.jsonl")})

In [2]:
from transformers import AutoTokenizer
basemodel = "codellama/CodeLlama-7b-hf"
tokenzier = AutoTokenizer.from_pretrained(basemodel,model_max_length = 512)
tokenzier.pad_token = tokenzier.eos_token
tokenzier.padding_side = "right"

In [3]:
tokenzier.add_special_tokens({'additional_special_tokens':['<|begin_of_java_code|>','<|end_of_java_code|>'\
                                                           ,'<|begin_of_c-sharp_code|>','<|end_of_c-sharp_code|>',\
                                                            '<|translate|>']})

5

In [4]:
tokenzier.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '</s>',
 'additional_special_tokens': ['<|begin_of_java_code|>',
  '<|end_of_java_code|>',
  '<|begin_of_c-sharp_code|>',
  '<|end_of_c-sharp_code|>',
  '<|translate|>']}

In [5]:
sourcelg = "java"
tgtlg = "cs"
prefix = tokenzier.special_tokens_map['additional_special_tokens'][-1]

In [6]:
prefix

'<|translate|>'

In [7]:

def preprocess_function(examples):
    inputs = [prefix + example[sourcelg] + tokenzier.special_tokens_map['additional_special_tokens'][-2] +\
               tokenzier.special_tokens_map['additional_special_tokens'][2]\
               + example[tgtlg] + tokenzier.special_tokens_map['additional_special_tokens'][0] for example in examples['translation']]
    model_inputs = tokenzier(inputs,padding="max_length",truncation=True)  
    return model_inputs

In [8]:
tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['id','translation'])

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [9]:
data['train']['translation'][1]

{'java': 'public UpdateJourneyStateResult updateJourneyState(UpdateJourneyStateRequest request) {request = beforeClientExecution(request);return executeUpdateJourneyState(request);}\n',
 'cs': 'public virtual UpdateJourneyStateResponse UpdateJourneyState(UpdateJourneyStateRequest request){var options = new InvokeOptions();options.RequestMarshaller = UpdateJourneyStateRequestMarshaller.Instance;options.ResponseUnmarshaller = UpdateJourneyStateResponseUnmarshaller.Instance;return Invoke<UpdateJourneyStateResponse>(request, options);}\n'}

In [10]:
tokenzier.decode(tokenzied_data['train'][2]['input_ids'])

'<s><|translate|> public void removePresentationFormat() {remove1stProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|end_of_c-sharp_code|><|begin_of_c-sharp_code|> public void RemovePresentationFormat(){MutableSection s = (MutableSection)FirstSection;s.RemoveProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|begin_of_java_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s

In [11]:
tokenzier.decode(tokenzied_data['test'][2]['input_ids'])

'<s><|translate|> public InsertInstanceRequest() {super("Ots", "2016-06-20", "InsertInstance", "ots");setMethod(MethodType.POST);}\n<|end_of_c-sharp_code|><|begin_of_c-sharp_code|> public InsertInstanceRequest(): base("Ots", "2016-06-20", "InsertInstance", "ots", "openAPI"){Method = MethodType.POST;}\n<|begin_of_java_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s

In [12]:
block_size = 512
def group_texts(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [13]:
tokenzied_data = tokenzied_data.map(group_texts,batched=True)

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoModelForCausalLM,TrainingArguments, Trainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    PeftType,
    TaskType
)
import torch
peft_type = PeftType.LORA
config = LoraConfig(
        r=8,
        lora_alpha=16,
        inference_mode=False,
        lora_dropout=0.1,
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
        "q_proj",
        "v_proj",
    ],
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
from transformers import BitsAndBytesConfig
babcfig = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
model = AutoModelForCausalLM.from_pretrained(basemodel,
        device_map = "cuda:0",
        quantization_config = babcfig)

model.resize_token_embeddings(len(tokenzier))
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,194,304 || all params: 6,742,781,952 || trainable%: 0.06220435466930549




In [16]:
#from huggingface_hub import notebook_login

#notebook_login()

In [17]:
num_epochs = 5
training_args = TrainingArguments(
    output_dir="CodeLlama7bForCodeTransLoRA",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * num_epochs),
    fp16=True,
    #push_to_hub = True,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"
)

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenzier,mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    tokenizer=tokenzier,
    data_collator=data_collator,
    eval_dataset = tokenzied_data['test']
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,0.5639,0.437669
2,0.4175,0.355897
3,0.371,0.33698
4,0.3402,0.326941
5,0.3309,0.325875




TrainOutput(global_step=6435, training_loss=0.5011731246319869, metrics={'train_runtime': 37832.3798, 'train_samples_per_second': 1.361, 'train_steps_per_second': 0.17, 'total_flos': 1.0455040269484032e+18, 'train_loss': 0.5011731246319869, 'epoch': 5.0})

In [19]:
#trainer.push_to_hub()
