In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

max_length = 1000
# Load the model
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    # device_map='auto'
).cuda()

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


In [11]:
prompt = \
"""
Below are one or more Solidity codeblocks. The codeblocks might contain vulnerable code.
If there is a vulnerability please provide a description of the vulnearblity in terms of the code that is responsible for it.
Describe how an attacker would be able to take advantage of the vulnerability so the explanation is even more clear.

Output only the description of the vulnerability and the attacking vector. No additional information is needed.

If there is no vulnerability output "There is no vulnearbility".

Codeblocks:
{}

"""

In [3]:
from datasets import load_dataset

dataset = load_dataset(
    "msc-smart-contract-audition/audits-with-reasons",
)

In [18]:
def unescape_newlines(cell):
    if cell is None:
        return None
    return cell.replace("\\n", "\n")

def prepare(row):
    data = tokenizer(
        text=prompt.format(row["code"]),
        text_target=row["description"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    data['labels'] = data['input_ids'].copy()
    return data

train_dataset = dataset["train"].to_pandas().map(unescape_newlines)
test_dataset = dataset["test"].to_pandas().map(unescape_newlines)

train_prompts = train_dataset.apply(prepare, axis=1)
test_prompts = test_dataset.apply(prepare, axis=1)

In [6]:
from peft import LoraConfig, get_peft_model

# Configuration for LoRA
lora_config = LoraConfig(
    r=16,  # rank
    lora_alpha=32,  # scaling factor
    # target_modules=["q_proj", "v_proj"],  # Specify which modules to apply LoRA
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_dropout=0.05,  # dropout rate for LoRA layers
)

In [19]:
from transformers import Trainer, TrainingArguments
# from trl import SFTTrainer


trainer = Trainer(
    model=model,
    args=TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 20,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
    train_dataset=train_prompts,
    eval_dataset=test_prompts,
)

trainer.train()


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB. GPU 0 has a total capacity of 11.69 GiB of which 15.00 MiB is free. Including non-PyTorch memory, this process has 11.64 GiB memory in use. Of the allocated memory 10.77 GiB is allocated by PyTorch, and 724.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

# Evaluation
results = trainer.evaluate()
print(results)