In [None]:
!pip install --upgrade transformers
!pip install --upgrade optimum
!pip install --upgrade auto-gptq

In [103]:
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import re

In [87]:
model_alias = '0-shot-8b-instruct'

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct-GPTQ-Int4",
    torch_dtype="auto",
    device_map="auto",
    cache_dir='/vol/bitbucket/kza23/huggingface'
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct-GPTQ-Int4", cache_dir='/vol/bitbucket/kza23/huggingface')

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/84.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.61G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at Qwen/Qwen2-7B-Instruct-GPTQ-Int4 were not used when initializing Qwen2ForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11.mlp.gate_proj.bias', 'model.layers.11.mlp.up_proj.bias', 'model.layers.11.self_attn.o_proj.bias', 'model.layers.12.mlp.down_proj.bias', 'model.layers.12.mlp.gate_proj.bias', 'model.layers.12.mlp.up_proj.bias', 'model.layers.12.self_attn.o_proj.bias', 'model.layers.13.mlp.down_proj.bias', 'model.layers.13.mlp.gate_proj.bias', 'model.layers.13.mlp.up_proj.bias', 'model.

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
SEQ_END = tokenizer.eos_token
print(SEQ_END)

<|im_end|>


In [194]:
system = \
"""You are a security auditor and your task is to mark an audit written by a student based on 3 critera.
"""
user = \
"""
Your audit:
{}<|im_end|>

Student's audit:
{}<|im_end|>


First reason about whether each criterion is met or not.
If criteria are not fully satisifed, determine which is the more appropriate label from "PASS" or "FAIL" (nothing else is allowed).
Only after you have explained the reasoning behind each criterion, output a list labeling each criterion with either "PASS" or "FAIL".

Explain whether the student's audit satisfies the following criteria:
Criterion 1: Fulfilled if the two audits describe an identical vulnerability. This is about the nature of vulnerability and how it affects the contract, rather than the specifics of the code.
Criterion 2: Fulfilled if the student describes the same code scope or a subset of yours. In other words partial scope is also acceptable. Otherwise, this criterion is not met.
Criterion 3: Fulfilled if the two audits describe identical attacking strategy. The description does NOT need to be identical just the idea of the attack.

Afterwards, output a list containing each criterion and its verdict
"""

assistant_prefix = "Criterion 1:"

In [195]:
def generate_text(entries):
    queries = entries.apply(lambda row: user.format(
        row['real'].replace('\\n', '\n'),
        row['output'].replace('\\n', '\n'),
    ), axis=1)


    for query in queries:
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": query},
            # {"role": "assistant", "content": assistant_prefix}
        ]

        yield tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )

In [189]:
test_dataset = load_dataset(
    "msc-smart-contract-audition/audits-with-reasons",
    split="test"
)

test_dataset

Dataset({
    features: ['code', 'description', 'recommendation', 'type', 'functionality'],
    num_rows: 437
})

In [168]:
data = pd.read_csv(f"{model_alias}-descriptions.csv")

In [169]:
correct_output = pd.DataFrame([
    [197, 'PASS', 'PASS', 'PASS', 'PASS'],
    [240, 'PASS', 'FAIL', 'PASS', 'FAIL'],
    [314, 'PASS', 'FAIL', 'FAIL', 'FAIL'],
    [284, 'PASS', 'PASS', 'PASS', 'PASS'],
    [389, 'PASS', 'PASS', 'PASS', 'PASS'],
    [57,  'PASS', 'PASS', 'PASS', 'PASS'], # The codeblocks actually contain example code demonstrating the vulnerability so this one is not a fair comparison
    [316, 'PASS', 'PASS', 'PASS', 'PASS'],
    [392, 'PASS', 'PASS', 'FAIL', 'PASS'],
    [379, 'PASS', 'PASS', 'PASS', 'PASS'],
    [135, 'PASS', 'FAIL', 'FAIL', 'FAIL'],
    [32,  'PASS', 'FAIL', 'PASS', 'FAIL'],
    [2,   'PASS', 'FAIL', 'PASS', 'FAIL'],
    [46,  'PASS', 'FAIL', 'FAIL', 'FAIL'],
], columns=["index", "cr0", "cr1", "cr2", "cr3"]).set_index("index")
correct_output

Unnamed: 0_level_0,cr0,cr1,cr2,cr3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
197,PASS,PASS,PASS,PASS
240,PASS,FAIL,PASS,FAIL
314,PASS,FAIL,FAIL,FAIL
284,PASS,PASS,PASS,PASS
389,PASS,PASS,PASS,PASS
57,PASS,PASS,PASS,PASS
316,PASS,PASS,PASS,PASS
392,PASS,PASS,FAIL,PASS
379,PASS,PASS,PASS,PASS
135,PASS,FAIL,FAIL,FAIL


In [179]:
test_data = data.join(correct_output, how="inner")
test_data = test_data.join(test_dataset.to_pandas()["code"], how="inner")
test_data

Unnamed: 0,output,real,cr0,cr1,cr2,cr3,code
2,The vulnerability is in the `_update` function...,The vulnerability occurs when the `_invariant`...,PASS,FAIL,PASS,FAIL,```\nfunction _invariant(\n Context memory ...
32,The codeblock is vulnerable to an integer over...,The vulnerability arises from an incorrect cal...,PASS,FAIL,PASS,FAIL,```\nuint256 borrowingCollateral = cache.borro...
46,The code is vulnerable to a reentrancy attack....,The `setValidatorAddress` function in the smar...,PASS,FAIL,FAIL,FAIL,```\nfunction setValidatorAddress(uint128 vali...
57,The vulnerability is in the `clearRequest` fun...,The `CoolerCallback.isCoolerCallback()` functi...,PASS,PASS,PASS,PASS,```\nfunction clearRequest(\n uint256 reqID...
135,The code is vulnerable to a denial-of-service ...,The D3Oracle vulnerability occurs when the Cha...,PASS,FAIL,FAIL,FAIL,"```\n(uint80 roundID, int256 price, uint256 up..."
197,The code is vulnerable to a logical error.\n\n...,The `processAuction()` function in the `VaultA...,PASS,PASS,PASS,PASS,```\nbool cancelled = l.Auction.isCancelled(la...
240,The codeblock contains a hardcoded address of ...,The protocol's `Swap` library contains a hardc...,PASS,FAIL,PASS,FAIL,```\naddress internal constant WETH = 0xC02aaA...
284,The vulnerability is in the `finalise` functio...,The vulnerability in the RocketMinipoolDelegat...,PASS,PASS,PASS,PASS,```\nfinalise() -->\n status == MinipoolStatu...
314,The vulnerability is in the `initiateFlashloan...,"The vulnerability, known as FlasherFTM - Unsol...",PASS,FAIL,FAIL,FAIL,```\n/**\n * @dev Routing Function for Flashlo...
316,The code is vulnerable to an integer overflow ...,The vulnerability allows an attacker to create...,PASS,PASS,PASS,PASS,```\nuint112 virtualEth = type(uint112).max;\n...


In [192]:
def run_query(data):
    for idx, text in enumerate(generate_text(data)):
        real_contains_vuln = data.iloc[idx]["real"] != "There is no vulnearbility"
        output_contains_vuln = data.iloc[idx]["output"] != "There is no vulnearbility"

        if real_contains_vuln != output_contains_vuln:
            yield ["FAIL", "FAIL", "FAIL", "FAIL", "Criterion 0: FAIL\\n One of the descriptions does not contain a vulnerability."]
            continue

        if not real_contains_vuln and not output_contains_vuln:
            yield ["PASS", "PASS", "PASS", "PASS", "No vulnerabilities to compare."]
            continue

        model_inputs = tokenizer([text], return_tensors="pt").to(device)

        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512
        )

        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        decoded_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        result = decoded_output

        pattern = re.compile(r'Criterion \d+: (PASS|FAIL)')
        criteria = ["PASS"] + pattern.findall(result)[-3:] # Add a pass for the 0th criterion

        yield criteria + [result.replace("\n", "\\n")]

In [196]:
for idx, result in enumerate(run_query(test_data)):
    entry = test_data.iloc[idx]

    verdict = result[:-1]
    correct = list(entry[['cr0', 'cr1', 'cr2', 'cr3']])

    if verdict == correct:
        print(f"Test {idx+1}/{len(test_data)}: passed!")
    else:
        print(f"Test {idx+1}/{len(test_data)}: failed!\n")

        print(f"Verdict: {verdict}")
        print(f"Correct: {correct}")
        print("=" * 200)
        print(entry['real'].replace('\\n', '\n'))
        print("-" * 200)
        print(entry['output'].replace('\\n', '\n'))
        print("-" * 200)
        print(result[-1].replace('\\n', '\n'))
        print("=" * 200)
        print()

Test 1/13: failed!

Verdict: ['PASS', 'PASS', 'FAIL', 'PASS']
Correct: ['PASS', 'FAIL', 'PASS', 'FAIL']
The vulnerability occurs when the `_invariant` function is bypassed for protected position updates, allowing an attacker to exploit the system by sending a large number of pending position updates. This can lead to a denial-of-service (DoS) attack, as the system becomes overwhelmed by the excessive pending updates.

The `_invariant` function checks for a limit on the number of pending position updates, but this check is bypassed for protected position updates. This allows an attacker to send a large number of pending updates, which can cause the system to become overwhelmed and eventually revert with an error.

The `_settle` function is responsible for settling the pending position updates, but it does not check for the limit on pending updates. This means that an attacker can continue to send pending updates, causing the system to become increasingly overwhelmed.

The `update` funct