In [5]:
# Short alias used to build the CSV file names, and the model identifier
# handed to the loader.
model_alias = "0-shot-8b-instruct"
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit"

In [36]:
!pip install --upgrade pip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pip
  Downloading pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.1
    Uninstalling pip-24.1.1:
      Successfully uninstalled pip-24.1.1
Successfully installed pip-24.2


In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-p_ggmp0q/unsloth_6e59fd93498d4106960052c457a33c08
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-p_ggmp0q/unsloth_6e59fd93498d4106960052c457a33c08
  Resolved https://github.com/unslothai/unsloth.git to commit a7bfbe7927ea75f959e1d7c84e7bf50945d405ff
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv
import re
from pathlib import Path

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Root directory of the project files; the prompt templates are read from
# WORK_DIR/'prompts/...'.
# NOTE(review): absolute, user-specific path — the notebook only runs where
# this directory exists.
WORK_DIR: Path = Path('/vol/bitbucket/kza23/finetuning')

In [6]:
# Loader settings (kept as module-level names).
load_in_4bit = True  # 4-bit quantisation to reduce memory usage; can be False
dtype = None  # None = auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
# NOTE(review): 20 is much shorter than the evaluation prompts built later in
# this notebook — confirm this value is intended.
max_seq_length = 20

loader_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

# Prepare the loaded model for generation.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA A30 MIG 2g.12gb. Max memory: 11.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [7]:
# Load the generated descriptions. Rows without a reference description get
# the sentinel text that the evaluation loop compares against (the spelling
# must stay exactly as written here).
results = pd.read_csv(f"{model_alias}-descriptions.csv").fillna(
    {"real": "There is no vulnearbility"}
)
results

Unnamed: 0,output,real
0,The vulnerability is in the `netPrimeDebtChang...,The `repayAccountPrimeDebtAtSettlement()` func...
1,The vulnerability is in the `claimableUnlocked...,The `lockCapital` mechanism in the ProtectionP...
2,The vulnerability is in the `_update` function...,The vulnerability occurs when the `_invariant`...
3,The codeblock contains a vulnerability in the ...,There is no vulnearbility
4,The vulnerability is in the `_credit` function...,The vulnerability arises when the account bein...
...,...,...
432,The vulnerability is in the `require` statemen...,There is no vulnearbility
433,The vulnerability is in the `getPriceUSD` func...,The StableOracleWBTC contract relies on a BTC/...
434,The vulnerability is in the `MigrateWithdrawal...,The `MigrateWithdrawal` function in the `migra...
435,The vulnerability is in the `addLiquidity` fun...,The createMarket transaction lacks a crucial e...


In [8]:
# Show the kernel's working directory: the CSV files in this notebook are
# opened with relative paths, so they resolve against it.
!pwd

/homes/kza23


In [29]:
# Evaluation prompt template (contains the `{}` slots filled in later).
eval_prompt = (WORK_DIR / 'prompts/evaluation.txt').read_text()

# The second-to-last line of the template is the marker later used to cut
# the prompt off the decoded model output.
split_token = eval_prompt.rsplit('\n', 2)[-2]

# One text file per criterion: criterion1.txt .. criterion3.txt.
criteria = [
    (WORK_DIR / f'prompts/criterion{idx}.txt').read_text()
    for idx in range(1, 4)
]


In [30]:
# Echo the template, the marker line and every criterion, each followed by a newline.
print(eval_prompt, split_token, *criteria, sep='\n')

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are given 2 smart-contract audits written by a professional auditor and a student.
You are assigned a single criterion as a part of a larger audit.
Your task is to determine wether this criterion is satisfied or not.

First analyze wether the criterion is met or not, without outputing a final verdict.
Only once you have output your reasoning, provide a final verdict by outputing FAIL or PASS - nothing else is allowed.
In cases where the crietrion is partially met you should determine wether FAIL or PASS would be a more accurate verdict in the specific case.
Stick to this formatting:
Reasoning: ...
Verdict: PASS/FAIL

<|start_header_id|>user<|end_header_id|>
Criterion:
{}

Professional audit:
{}

Student audit:
{}
<|start_header_id|>assistant<|end_header_id|>
Reasoning:

Reasoning:
Fulfilled if the two audits describe the same vulnerability. This is about the nature of vulnerability, rather than the specifics in the code.
F

In [77]:
def _split_verdict(completion):
    """Split a judge completion into ``(reasoning, verdict)``.

    The completion must contain exactly one ``'Verdict: '`` marker. Otherwise
    the verdict is ``"N/A"`` and the text before the first marker (or the
    whole text, if there is none) is returned as the reasoning.
    """
    parts = completion.split('Verdict: ')
    if len(parts) == 2:
        return parts[0], parts[1]
    return parts[0], "N/A"


def run_criteria(query):
    """Judge one prompt against every entry of the global ``criteria`` list.

    Args:
        query: Prompt text with a single remaining ``{}`` slot for the
            criterion. Literal braces in it must be escaped as ``{{`` / ``}}``
            (``run_query`` does this), because the slot is filled with
            ``str.format``.

    Returns:
        ``(verdicts, reasoning)``: ``verdicts`` has one string per criterion
        (the text after ``'Verdict: '``, or ``"N/A"`` when it cannot be
        parsed); ``reasoning`` concatenates the per-criterion reasoning under
        ``Criterion <n>:`` headings.

    Uses the module-level ``tokenizer``, ``model``, ``criteria`` and
    ``split_token``.
    """
    marker = f"{split_token}\n"
    ret_reasoning = ''
    verdicts = []
    for idx, criterion in enumerate(criteria):
        # str.format inserts the criterion AND turns the escaped `{{` / `}}`
        # back into single braces. Un-escaping a second time (as the previous
        # `.replace` calls did) would corrupt genuinely adjacent braces such
        # as the `}}` closing two nested code blocks.
        format_input = query.format(criterion)
        inputs = tokenizer(format_input, return_tensors="pt", truncation=True).to("cuda")
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=512,
            pad_token_id=tokenizer.pad_token_id,
            use_cache = True
        )

        decoded_output = tokenizer.decode(
            output_tokens[0],
            skip_special_tokens=True,
            pad_token_id=tokenizer.pad_token_id,
        )

        # Keep everything after the FIRST marker, so a completion that repeats
        # the marker is not cut short. A missing marker is reported as "N/A"
        # instead of raising IndexError in the middle of a long run.
        _, marker_found, completion = decoded_output.partition(marker)
        if marker_found:
            reasoning, verdict = _split_verdict(completion.strip())
        else:
            reasoning, verdict = decoded_output, "N/A"

        ret_reasoning += f'Criterion {idx+1}:\n{reasoning.strip()}\n'
        verdicts.append(verdict.strip())
    return verdicts, ret_reasoning

def run_query(data, query, show_tqdm=True, start=88):
    """Yield one result row per description, beginning at row ``start``.

    Args:
        data: DataFrame with string columns ``'real'`` (reference description)
            and ``'output'`` (generated description).
        query: Prompt template with three ``{}`` slots: criterion, reference
            description, generated description. The criterion slot is left
            open for ``run_criteria``.
        show_tqdm: Wrap the iteration in a ``tqdm`` progress bar.
        start: Non-negative positional index of the first row to evaluate.
            Defaults to 88, the offset that used to be hard-coded here; pass
            0 to evaluate every row.

    Yields:
        ``[cr0, cr1, cr2, cr3, description]`` lists. ``cr0`` records whether
        both descriptions agree on a vulnerability being present at all.
        Newlines in the description are written as a literal ``\\n``.
    """
    # Sentinel written into the data for "no vulnerability" (spelling is part
    # of the data and must not be corrected here).
    no_vuln = "There is no vulnearbility"

    def _prepare(text):
        # Restore escaped newlines and escape braces, so the later
        # str.format call in run_criteria leaves the text intact.
        return text.replace('\\n', '\n').replace('{', '{{').replace('}', '}}')

    # Slice the frame itself and iterate its rows directly. Previously only
    # the prompts were sliced while `data.iloc[idx]` restarted at 0, so the
    # sentinel checks looked at different rows than the prompts came from.
    pending = data.iloc[start:]
    records = list(zip(pending['real'], pending['output']))

    if show_tqdm:
        iterator = tqdm(records, total=len(records))
    else:
        iterator = records

    for real, output in iterator:
        real_contains_vuln = real != no_vuln
        output_contains_vuln = output != no_vuln

        if real_contains_vuln != output_contains_vuln:
            yield ["FAIL", "FAIL", "FAIL", "FAIL", "Criterion 0: FAIL\\n One of the descriptions does not contain a vulnerability."]
            continue

        if not real_contains_vuln and not output_contains_vuln:
            yield ["PASS", "PASS", "PASS", "PASS", "No vulnerabilities to compare."]
            continue

        prompt = query.format("{}", _prepare(real), _prepare(output))
        verdicts, reasoning = run_criteria(prompt)

        yield ["PASS"] + verdicts + [reasoning.replace("\n", "\\n")]

In [78]:
# Stream the evaluation rows into the results CSV: header first, then one
# row per item produced by run_query.
results_path = f"{model_alias}-descriptions-results-v2.csv"
with open(results_path, "w", newline="") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(["cr0", "cr1", "cr2", "cr3", "description"])
    csv_writer.writerows(run_query(results, eval_prompt))

100%|█████████████████████████████████████████████████████████████████████████████████████████| 349/349 [1:08:01<00:00, 11.69s/it]
