In [1]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='/home/uuz5szh/Desktop/test/code/codeGenerator/data/codetotest_train.jsonl')

dataset = dataset['train'].train_test_split(test_size=0.1)


In [2]:
def formatting(example):
    return f"""### C++ Function
{example['source']}

### Google Test
{example['target']}"""


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer

model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu", 
    trust_remote_code=True
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

def preprocess(example):
    prompt = formatting(example)
    return tokenizer(prompt, truncation=True, padding='max_length', max_length=64)

tokenized = dataset.map(preprocess, remove_columns=dataset["train"].column_names)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Map:   0%|          | 0/3138 [00:00<?, ? examples/s]

Map:   0%|          | 0/349 [00:00<?, ? examples/s]

In [4]:

training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=20,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=False, 
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=1
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"].select(range(20)),
    eval_dataset=tokenized["test"].select(range(5)),
    processing_class=tokenizer
)

trainer.train()

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,1.737297
2,No log,1.686268
3,No log,1.662539




TrainOutput(global_step=6, training_loss=1.5914270083109539, metrics={'train_runtime': 95.4374, 'train_samples_per_second': 0.629, 'train_steps_per_second': 0.063, 'total_flos': 29536923156480.0, 'train_loss': 1.5914270083109539})

In [None]:
def generate(prompt, max_new_tokens=2048):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
example_func1 = "int max(int a, int b) { return a > b ? a : b; }"

example_func = """
#include <iostream>
#include <vector>
#include <algorithm>

class Statistics {
public:
    Statistics(const std::vector<int>& data) : data_(data) {}

    int max() const {
        return *std::max_element(data_.begin(), data_.end());
    }

    int min() const {
        return *std::min_element(data_.begin(), data_.end());
    }

    double average() const {
        if (data_.empty()) return 0.0;
        double sum = 0;
        for (int num : data_) sum += num;
        return sum / data_.size();
    }

    void print() const {
        std::cout << "Max: " << max() << std::endl;
        std::cout << "Min: " << min() << std::endl;
        std::cout << "Average: " << average() << std::endl;
    }

private:
    std::vector<int> data_;
};

int main() {
    std::vector<int> values = {3, 5, 7, 2, 8, 10, 6};
    Statistics stats(values);
    stats.print();
    return 0;
}
"""

prompt = f"### C++ Function\n{example_func}\n\n### Google Test\n"
prompt1 = f"### C++ Function\n{example_func1}\n\n### Google Test\n"
print(generate(prompt1))
print('#################################################')
print(generate(prompt))



Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


### C++ Function
int max(int a, int b) { return a > b ? a : b; }

### Google Test
#include "gtest/gtest.h"

TEST(MaxTest, PositiveNumbers) {
  EXPECT_EQ(2, max(1, 2));
}

TEST(MaxTest, NegativeNumbers) {
  EXPECT_EQ(-1, max(-2, -1));
}

TEST(MaxTest, Zeroes) {
  EXPECT_EQ(0, max(0, 0));
}

TEST(MaxTest, NegativeAndPositive) {
  EXPECT_EQ(3, max(-1, 3));
}

int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#################################################
### C++ Function

#include <iostream>
#include <vector>
#include <algorithm>

class Statistics {
public:
    Statistics(const std::vector<int>& data) : data_(data) {}

    int max() const {
        return *std::max_element(data_.begin(), data_.end());
    }

    int min() const {
        return *std::min_element(data_.begin(), data_.end());
    }

    double average() const {
        if (data_.empty()) return 0.0;
        double sum = 0;
        for (int num : data_) sum += num;
  

In [9]:
def generate(prompt, max_new_tokens=2048):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def extract_gtest_only(full_output):

    split_keyword = "### Google Test"
    if split_keyword in full_output:
        parts = full_output.split(split_keyword)
        if len(parts) > 1:
            return parts[1].strip()
    return full_output.strip()  

def generate_gtest_from_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        cpp_code = f.read()
    prompt = f"### C++ Function\n{cpp_code}\n##only need pure gtest code generated\n### Google Test\n"
    full_output = generate(prompt)
    gtest_code = extract_gtest_only(full_output)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(gtest_code)

    print(f"GTest 代码已生成并写入文件：{output_path}")


In [10]:
generate_gtest_from_file("input.cpp", "my_gtest_output.cpp")


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


GTest 代码已生成并写入文件：my_gtest_output.cpp
