In [1]:
model_name = 'unsloth/llama-3-8b-Instruct-bnb-4bit'
model_alias = 'llama3-8b'

In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv
import re
from pathlib import Path

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
WORK_DIR = Path('/vol/bitbucket/kza23/finetuning')

In [3]:
max_seq_length = 20 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA A30 MIG 2g.12gb. Max memory: 11.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [4]:
test_dataset = load_dataset(
    "msc-smart-contract-audition/audits-with-reasons",
    split="test"
)

test_dataset

Dataset({
    features: ['code', 'description', 'recommendation', 'type', 'functionality'],
    num_rows: 437
})

In [5]:
query_template = \
"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below are one or more Solidity codeblocks. The codeblocks might contain vulnerable code.
If there is a vulnerability please provide a description of the vulnearblity in terms of the code that is responsible for it.
Describe how an attacker would be able to take advantage of the vulnerability so the explanation is even more clear.

If there is no vulnerability output "There is no vulnearbility".
Do not output any ways to mitigate or fix the vulnerability, only the vulnerability itself.

Think step by step about the code and the possible vulnerabilities.
<|start_header_id|>user<|end_header_id|>
Codeblocks:
{}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Description of the vulnerability:
"""

In [7]:
df_test = test_dataset.to_pandas()

queries = df_test.apply(lambda row: query_template.format(row['code'].replace('\\n', '\n')), axis=1)

print(queries.iloc[0])


<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below are one or more Solidity codeblocks. The codeblocks might contain vulnerable code.
If there is a vulnerability please provide a description of the vulnearblity in terms of the code that is responsible for it.
Describe how an attacker would be able to take advantage of the vulnerability so the explanation is even more clear.

If there is no vulnerability output "There is no vulnearbility".
Do not output any ways to mitigate or fix the vulnerability, only the vulnerability itself.

Think step by step about the code and the possible vulnerabilities.
<|start_header_id|>user<|end_header_id|>
Codeblocks:
```
function repayAccountPrimeDebtAtSettlement(
    PrimeRate memory pr,
    VaultStateStorage storage primeVaultState,
    uint16 currencyId,
    address vault,
    address account,
    int256 accountPrimeCash,
    int256 accountPrimeStorageValue
) internal returns (int256 finalPrimeDebtStorageValue, bool didTransfer) {
    

In [None]:
with open(WORK_DIR/f"{model_alias}-outputs.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "output", "real"])

    for idx, (query, real) in tqdm(enumerate(zip(queries, test_dataset['description'])), total=len(queries)):
        inputs = tokenizer(query, return_tensors="pt", truncation=True).to("cuda")
        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )
        decoded_output = tokenizer.decode(
            output_tokens[0],
            skip_special_tokens=True,
            pad_token_id=tokenizer.pad_token_id,
        )
        description = decoded_output.split("Description of the vulnerability:\n")[1].strip().replace("\n", "\\n")
        writer.writerow([idx, description, real])
