In [1]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Name under which the CSV dialect for the generated output files is registered.
DIALECT_NAME = "db_dialect"

# Comma-separated fields that are never quoted; characters that would otherwise
# need quoting are prefixed with a backslash instead.
_dialect_options = dict(delimiter=",", escapechar="\\", quoting=csv.QUOTE_NONE)
csv.register_dialect(DIALECT_NAME, **_dialect_options)

In [3]:
# Loader settings for the pre-quantized Llama-3 8B Instruct checkpoint.
# NOTE(review): 20 tokens is far shorter than the prompts built later in this
# notebook — confirm this value is intended.
max_seq_length = 20
dtype = None  # None = auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = True  # 4-bit quantization to reduce memory usage; may be False

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

# Put the model into Unsloth's inference mode (this notebook only generates).
FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A30 MIG 2g.12gb. Max memory: 11.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the "train" split of the vulnerable-functions dataset.
# NOTE(review): `escapechar` is presumably forwarded to the underlying CSV
# reader so that backslash-escaped characters parse correctly — verify.
_load_options = {"split": "train", "escapechar": "\\"}
dataset_vulnerable = load_dataset(
    "msc-smart-contract-audition/vulnerable-functions-base", **_load_options
)

In [18]:
# Prompt used when the audit row already has a recommendation.
# Positional placeholders, in order: vulnerability title, description, existing
# mitigation. The prompt ends with "Improved mitigation:\n" so that the model
# continues directly with the improved text.
# NOTE(review): the system turn is not terminated with <|eot_id|> and the
# literal <|begin_of_text|> may duplicate the BOS token the tokenizer adds —
# confirm against the Llama 3 prompt format before changing.
query_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below vulnerability audit contains an audit name, a description of a vulnerability and a mitigation.
You need to improve the mitigation based on the information provided.
1. Expand the mitigation in such a way that it is very comprehensive and easy to understand.
2. Do not dumb it down, still use technical terms where necessary.
3. Do not include the code blocks, but you could still use inline code for your explanation. Surround the inline code using single backticks.
4. In your response do not quote the title of the vulnerability, add anything about the description or the code. Just output the enhanced mitigation
5. Do not make up any information about the vulnerability that is not in the provided context.
<|start_header_id|>user<|end_header_id|>
Vulnerability title: {}
Vulnerability description:
{}
Vulnerability mitigation:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Improved mitigation:
"""

# Prompt used when the audit row has no recommendation.
# Positional placeholders, in order: vulnerability title, description.
# The user turn is closed with <|eot_id|>, exactly as in `query_template`, so
# both prompts hand over to the assistant turn in the same way.
query_template_empty = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below vulnerability audit contains an audit name and description of the vulnerability.
Write one suggestion for the mitigation of the vulnerability.
1. Expand the mitigation in such a way that it is very comprehensive and easy to understand.
2. Do not dumb it down, still use technical terms where necessary.
3. Do not include the code blocks, but you could still use inline code for your explanation. Surround the inline code using single backticks.
4. In your response do not quote the title of the vulnerability, add anything about the description or the code. Just output the enhanced mitigation
5. Do not make up any information about the vulnerability that is not in the provided context.
<|start_header_id|>user<|end_header_id|>
Vulnerability title: {}
Vulnerability description:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Improved mitigation:
"""


In [15]:
def apply_template(row):
    """Build the generation prompt for one audit record.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            "name", "description" and "recommendation" entries.

    Returns:
        ``query_template`` filled with the title, description and existing
        recommendation, or ``query_template_empty`` filled with the title and
        description when the recommendation is missing (``None`` or NaN).
    """

    def unescape(text):
        # Turn literal backslash-n sequences into real line breaks.
        return text.replace("\\n", "\n")

    title = row["name"]
    description = unescape(row["description"])
    recommendation = row["recommendation"]

    # pd.isna covers None as well as the NaN pandas may use for missing values;
    # a plain `is None` check would let NaN through and fail on `.replace`.
    if pd.isna(recommendation):
        return query_template_empty.format(title, description)

    return query_template.format(title, description, unescape(recommendation))


# Build one prompt per dataset row, then discard the row at position 534.
# NOTE(review): the reason for excluding row 534 is not visible here — document it.
df_vulnerable = dataset_vulnerable.to_pandas()
queries = df_vulnerable.apply(apply_template, axis=1).drop(df_vulnerable.index[534])

[400]
curve admin can drain pool via reentrancy (equal to execute emergency withdraw and rug tokenmak fund by third party)


In [24]:
# Show the prompt stored under index label 188.
print(queries.loc[188])


<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below vulnerability audit contains an audit name, a description of a vulnerability and a mitigation.
You need to improve the mitigation based on the information provided.
1. Expand the mitigation in such a way that it is very comprehensive and easy to understand.
2. Do not dumb it down, still use technical terms where necessary.
3. Do not include the code blocks, but you could still use inline code for your explanation. Surround the inline code using single backticks.
4. In your reponse do not quote the title of the vulnerability, add anything about the description or the code. Just output the enhanced mitigation
5. Do not make up any information about the vulnerability that is not in the provided context.
<|start_header_id|>user<|end_header_id|>
Vulnerability title: If the vault's side token balance is 0 or a tiny amount, then most if not all IG Bear trades will revert due to incorrect check of computation error during d

In [None]:
# Generate an enhanced recommendation for every prompt in one chunk and write
# the results to a per-chunk CSV file, one row per prompt.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len
chunk_queries = queries[start:end]

# Explicit UTF-8 so that non-ASCII model output is written the same way
# regardless of the platform's default encoding.
with open(
    f"enhanced-recommendation-{current_chunk}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["recommendation"])

    # tqdm derives the total from len(chunk_queries), so a final chunk shorter
    # than chunk_len is reported correctly.
    for query in tqdm(chunk_queries):
        inputs = tokenizer(query, return_tensors="pt", truncation=True).to("cuda")
        prompt_len = inputs["input_ids"].shape[1]
        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )
        # The generated sequence starts with the prompt tokens. Decoding only
        # what follows them avoids splitting on the "Improved mitigation:"
        # marker, which dropped text when the model repeated the marker and
        # raised IndexError when the marker was missing.
        mitigation = tokenizer.decode(
            output_tokens[0][prompt_len:], skip_special_tokens=True
        ).strip()
        # Store line breaks as literal "\n" so each record stays on one CSV line.
        writer.writerow([mitigation.replace("\n", "\\n")])


In [23]:
# Spot check: generate and print the recommendation for a single prompt
# (position 399 of `queries`).
for query in queries[399:400]:
    inputs = tokenizer(query, return_tensors="pt", truncation=True).to("cuda")
    prompt_len = inputs["input_ids"].shape[1]
    output_tokens = model.generate(
        **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
    )
    print(query)
    # The generated sequence starts with the prompt tokens; decode only the
    # continuation instead of splitting the full text on the
    # "Improved mitigation:" marker, which fails if the marker is missing and
    # drops text if the model repeats it.
    recommendation = tokenizer.decode(
        output_tokens[0][prompt_len:], skip_special_tokens=True
    ).strip()
    print(recommendation)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below vulnerability audit contains an audit name, a description of a vulnerability and a mitigation.
You need to improve the mitigation based on the information provided.
1. Expand the mitigation in such a way that it is very comprehensive and easy to understand.
2. Do not dumb it down, still use technical terms where necessary.
3. Do not include the code blocks, but you could still use inline code for your explanation. Surround the inline code using single backticks.
4. In your reponse do not quote the title of the vulnerability, add anything about the description or the code. Just output the enhanced mitigation
5. Do not make up any information about the vulnerability that is not in the provided context.
<|start_header_id|>user<|end_header_id|>
Vulnerability title: Slashing during `LSTCalculatorBase.sol` deployment can show bad apr for months
Vulnerability description:
The contract `LSTCalculatorBase.sol` has some funct