In [4]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='/home/uuz5szh/Desktop/test/code/codeGenerator/data/codetotest_train.jsonl')

dataset = dataset['train'].train_test_split(test_size=0.1)


In [5]:
def formatting(example):
    return f"""### C++ Function
{example['source']}

### Google Test
{example['target']}"""


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer

model_name =  "/home/uuz5szh/Desktop/test/code/codeGenerator/data/coder-instruct"
def save_output(text, output_path):
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="cpu",
    local_files_only=True 
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

def preprocess(example):
    prompt = formatting(example)
    return tokenizer(prompt, truncation=True, padding='max_length', max_length=64)

tokenized = dataset.map(preprocess, remove_columns=dataset["train"].column_names)


In [None]:

training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=20,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=False, 
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=1
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"].select(range(20)),
    eval_dataset=tokenized["test"].select(range(5)),
    processing_class=tokenizer
)

# for i in range(0, len(tokenized["train"]), 100):
#     sub_train = tokenized["train"].select(range(i, min(i+100, len(tokenized["train"]))))
#     trainer.train_dataset = sub_train
#     trainer.train()
trainer.train()

Truncating train dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
def generate(prompt, max_new_tokens=2048):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def extract_gtest_only(full_output):
    split_keyword = "### Google Test"
    if split_keyword in full_output:
        parts = full_output.split(split_keyword)
        if len(parts) > 1:
            return parts[1].strip()
    return full_output.strip()  

def generate_gtest_from_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        cpp_code = f.read()
    prompt = f"### C++ Function\n{cpp_code}\n##only need pure gtest code generated\n### Google Test\n"
    full_output = generate(prompt)
    gtest_code = extract_gtest_only(full_output)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(gtest_code)

    print(f"GTest code has been written to：{output_path}")


In [16]:
generate_gtest_from_file("input.cpp", "my_gtest_output.cpp")


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


GTest code has been written to：my_gtest_output.cpp
