In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Token budget per example; used as the tokenizer's max_length when rows are prepared.
max_length = 1000

# Hugging Face Hub checkpoint that is fine-tuned in this notebook.
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

# The tokenizer and the model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Weights are loaded in bfloat16 and placed automatically via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

In [None]:
# Instruction template shown to the model. The single "{}" placeholder is
# filled with the Solidity code of a row via prompt.format(...).
# NOTE(review): the template text contains the misspellings "vulnearblity" and
# "vulnearbility". They are kept verbatim because editing them changes the
# exact text the model is trained and evaluated on.
prompt = """
Below are one or more Solidity codeblocks. The codeblocks might contain vulnerable code.
If there is a vulnerability please provide a description of the vulnearblity in terms of the code that is responsible for it.
Describe how an attacker would be able to take advantage of the vulnerability so the explanation is even more clear.

Output only the description of the vulnerability and the attacking vector. No additional information is needed.

If there is no vulnerability output "There is no vulnearbility".

Codeblocks:
{}

"""

In [None]:
from datasets import load_dataset

# Load the audit dataset; its "train" and "test" splits are used further down.
dataset = load_dataset("msc-smart-contract-auditing/audits-with-reasons")

In [None]:
def unescape_newlines(cell):
    """Turn literal backslash-n sequences in a cell into real newlines.

    The function is applied elementwise to every cell of a DataFrame, so it
    must tolerate whatever value a cell holds.

    Args:
        cell: A single cell value. Strings are unescaped; anything else
            (None, NaN, numbers, ...) is passed through untouched.

    Returns:
        The string with every two-character sequence ``\\n`` replaced by a
        newline character, or the original value when it is not a string.
    """
    # Only strings have the text-replace semantics we want; calling .replace
    # on a float NaN or an int would raise AttributeError.
    if not isinstance(cell, str):
        return cell
    return cell.replace("\\n", "\n")

def prepare(row):
    """Tokenize one dataset row into a causal-LM training example.

    The training sequence is the filled-in prompt followed by the row's
    description and the tokenizer's EOS token. Labels mirror ``input_ids``
    except that prompt tokens and padding are set to -100, so the loss is
    computed only on the description the model is supposed to generate.

    Args:
        row: Mapping-like row with a "code" entry (inserted into the prompt)
            and a "description" entry (the target text).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) padded or
        truncated to ``max_length`` tokens, with an added ``labels`` list of
        the same length.

    Note:
        Truncation cuts from the end of the sequence, so for very long code
        the description may be partly or completely truncated away.
    """
    prompt_text = prompt.format(row["code"])

    # A missing (non-string) description is treated as an empty completion.
    description = row["description"]
    target_text = description if isinstance(description, str) else ""
    eos_text = tokenizer.eos_token or ""

    # A causal LM learns from a single sequence: prompt followed by answer.
    data = tokenizer(
        text=prompt_text + target_text + eos_text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Number of (non-padding) tokens that belong to the prompt alone.
    prompt_len = len(
        tokenizer(
            text=prompt_text,
            truncation=True,
            max_length=max_length,
        )["input_ids"]
    )

    # Walk the real tokens via the attention mask so the masking is correct
    # for both left- and right-padded tokenizers.
    labels = []
    real_seen = 0
    for token_id, is_real in zip(data["input_ids"], data["attention_mask"]):
        if not is_real:
            labels.append(-100)  # padding never contributes to the loss
            continue
        labels.append(token_id if real_seen >= prompt_len else -100)
        real_seen += 1

    data["labels"] = labels
    return data

# Convert both splits to pandas and unescape newlines in every cell.
train_dataset, test_dataset = (
    dataset[split].to_pandas().map(unescape_newlines)
    for split in ("train", "test")
)

# Tokenize row by row; each resulting element is the output of prepare().
train_prompts = train_dataset.apply(prepare, axis=1)
test_prompts = test_dataset.apply(prepare, axis=1)

In [None]:
from peft import LoraConfig, get_peft_model

from peft import PeftModel

# Configuration for LoRA
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=32,  # scaling factor
    # Apply LoRA to the attention and MLP projection layers.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_dropout=0.05,  # dropout rate for LoRA layers
    task_type="CAUSAL_LM",  # the base model is an AutoModelForCausalLM
)

# Attach the adapters. Without this step the config is never used and the
# Trainer would update the full base model instead of the LoRA weights.
# The isinstance guard keeps the cell safe to re-run (no double wrapping).
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import Trainer, TrainingArguments

# Use bf16 mixed precision when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Hyperparameters for a short run: 20 optimizer steps, each accumulating
# 4 micro-batches of 2 examples per device.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    max_steps=20,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_prompts,
    eval_dataset=test_prompts,
)

trainer.train()


In [None]:
# Persist the model and the tokenizer side by side in the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./finetuned_model")

# Evaluation
results = trainer.evaluate()
print(results)