In [None]:
# import torch
# # major_version, minor_version = torch.cuda.get_device_capability()
# # print(major_version)
# # Must install separately since Colab has torch 2.2.1, which breaks packages
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# # if major_version >= 8:
# #     # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
# #     !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
# # else:
# #     # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
# !pip install --no-deps xformers trl peft accelerate bitsandbytes
# # pass

In [None]:
from unsloth import FastLanguageModel
# Load the 4-bit Llama-3 8B Instruct checkpoint through Unsloth and put it in
# inference mode. `model` and `tokenizer` are the objects the generation cells
# further down call.
# NOTE(review): max_seq_length is 20 although the prompts built later are
# checked against an ~8000-token limit — confirm this value is intentional.
max_seq_length = 20 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
# Both tables come from the same Hub repository: the default configuration
# holds the vulnerable functions, the "verified-functions" configuration holds
# the verified ones. The backslash escape character is handed to load_dataset
# as an extra keyword (presumably forwarded to the CSV reader — confirm).
_hub_repo = "msc-smart-contract-auditing/vulnerable-functions-base"
_shared_load_kwargs = dict(split='train', escapechar='\\')

dataset_vulnerable = load_dataset(_hub_repo, **_shared_load_kwargs)
dataset_verified = load_dataset(_hub_repo, name="verified-functions", **_shared_load_kwargs)

# pandas views of the two datasets for the column-wise processing below
df_vulnerable, df_verified = (
    dataset_vulnerable.to_pandas(),
    dataset_verified.to_pandas(),
)

In [None]:
# Prompt for the label-extraction pass, written with Llama-3 chat special
# tokens. The single `{}` placeholder receives the vulnerability description,
# and the text ends on "Vulnerability type:\n" so the model's continuation is
# the label itself.
# NOTE(review): the literal starts with a newline before <|begin_of_text|>, and
# the tokenizer may prepend its own BOS token on top of the hard-coded one —
# confirm the resulting prompt is what the model expects.
# NOTE(review): the wording typos below ("It needs describes", "specifcs") are
# part of the prompt sent to the model; fixing them would change its outputs.
query_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Given the below vulnerability description, provide a single vulnerability type which can be defined by multiple words.
It needs describes the vulnerability as accurate as possible while avoiding specifcs like contract names, function names, variable names, etc.
Do not explain the reason for your choice, or provide any additional context about the vulnerability itself (this includes notes, why you chose so). Do not add any text or characters like quotations<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Vulnerability Description:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Vulnerability type:
"""

## Label extraction
Multiple prompting approaches were tried:
1. Initially, the model was prompted to extract, from the description, a label that could be used directly for classification. However, it turned out that this outsources too much of the task to the model, and it struggles to produce clean classes that encapsulate all the vulnerabilities. The model was essentially tasked with providing a class for a vulnerability drawn from an unknown set of vulnerability types, which is very difficult without any knowledge of the other vulnerabilities. For instance, these are three labels the model produced for different audits: "Insufficient input validation", "Unvalidated input" and "Unvalidated User-Input". These are three separate instances, and if the model had known about all of them before classifying each one, it would have classified all three the same (e.g. "Insufficient Validation"), which is the desirable result for classification labels.

2. Instead of forcing the model to predict a general label with no context of other labels (i.e. without knowing how general it has to be), the model was tasked with outputting a much more specific type without adhering to generality or homogeneity of labels. The only condition imposed on the model was to not include any specifics about the vulnerability, e.g. function names, contract names, specific variable names, etc. Afterwards, the labels can be extracted from these types using unsupervised approaches. This time all the vulnerabilities are present and, in addition, clustered. The task transforms into providing a general label based on a cluster of labels instead of making up general labels. This is a much more tractable task, as it only involves reasoning on the data provided in the prompt itself.

In [None]:
def _as_label_query(description):
    """Fill the current prompt template with one audit description.

    The dataset stores line breaks as the literal two-character sequence
    backslash + "n"; they are turned back into real newlines first.
    """
    restored = description.replace("\\n", "\n")
    return query_template.format(restored)


_all_label_queries = df_vulnerable['description'].apply(_as_label_query)
# The entry at position 534 is excluded; the later prompt pass drops the same
# row because its description exceeds the 8000-token limit.
_oversized_label = _all_label_queries.index[534]
queries = _all_label_queries.drop(_oversized_label)

In [None]:


# for idx, query in enumerate(queries):
#     if len(tokenizer(query)["input_ids"]) > 8000:
#         print("Query too long!")
#         print(idx, query)
#         break

In [None]:
# test_query = """
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>
# Given the below vulnerability audit, provide a single vulnerability type which can be defined by multiple words.
# The type needs to describe the vulnerability as accurately as possible without including any specifics about the contract, function and variable names.
# Do not provide any explanations or write any notes for your decision.
# Output the result in the following format:
# "Vulnerability type: type"
# Do not add any additional text or characters.<|eot_id|>
# <|start_header_id|>user<|end_header_id|>
# Audit title:
# An attacker can increase liquidity to the position's UniswapNFT to prevent the position from being closed
# Audit body:
# UniswapV3NPM allows the user to increase liquidity to any NFT.
# When closing a position, in `_redeemPosition()`, only the initial liquidity of the NFT will be decreased, and then the NFT will be burned.
# If the liquidity of the NFT is not 0, burning will fail.
# This allows an attacker to add 1 wei liquidity to the position's NFT to prevent the position from being closed, and later when the position expires, the attacker can liquidate it.<|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>
# Vulnerability type:
# """

In [None]:
from tqdm import tqdm
# Label-extraction pass: generate one short vulnerability-type label per prompt
# and store the lower-cased labels in "vulnerability_types.csv" (column "type").
classes = []
for query in tqdm(queries):
    inputs = tokenizer(query, return_tensors = "pt", truncation=True).to("cuda")
    # Number of prompt tokens; generate() returns the prompt followed by the
    # newly generated tokens for this decoder-only model.
    prompt_length = inputs["input_ids"].shape[1]
    output_tokens = model.generate(**inputs, max_new_tokens = 32, pad_token_id=tokenizer.pad_token_id)
    # Decode only the continuation instead of decoding everything and splitting
    # on "Vulnerability type:\n": the split raises IndexError when the marker
    # is missing (e.g. truncated prompt) and picks the wrong segment when the
    # description itself contains the marker.
    completion = tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)
    classes.append(completion.strip().lower())
df = pd.DataFrame(classes, columns=['type'])
df.to_csv("vulnerability_types.csv", index=False)


In [None]:
# Prompt for the second pass: the model is asked to return the audit text with
# the code blocks removed (inline code kept, typos may be fixed). This rebinds
# `query_template`, so the prompt-building cell that follows uses this text.
# The single `{}` placeholder receives the audit description, and the text ends
# on "Vulnerability text:\n" so the continuation is the cleaned description.
# NOTE(review): the literal starts with a newline before <|begin_of_text|>, and
# the tokenizer may prepend its own BOS token on top of the hard-coded one —
# confirm the resulting prompt is what the model expects.
query_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below vulnerability audit contains a description of a vulnerability contained in the code blocks.
All the text needs to be extracted removing just the codeblocks. The inline code should be kept as is. The text needs to remain the same but you may fix any typos or grammatical errors.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Vulnerability audit:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Vulnerability text:
"""

In [None]:
def _as_cleanup_query(description):
    """Insert one audit description into the current prompt template.

    Literal backslash + "n" sequences stored in the dataset are converted to
    real line breaks before the template is filled.
    """
    with_line_breaks = description.replace("\\n", "\n")
    return query_template.format(with_line_breaks)


_all_cleanup_queries = df_vulnerable['description'].apply(_as_cleanup_query)
# Drop this row as it exceeds the 8000 token limit (it has a whole contract in the description)
_too_long_label = _all_cleanup_queries.index[534]
queries = _all_cleanup_queries.drop(_too_long_label)

In [None]:
import csv
# CSV dialect options used when writing the results: quote a field only when
# required, and escape special characters with a backslash rather than by
# doubling the quote character.
csv_params = dict(
    quoting=csv.QUOTE_MINIMAL,
    quotechar='"',
    escapechar='\\',
    doublequote=False,  # must stay off because escapechar is used instead
)



In [None]:
from tqdm import tqdm
# Description-cleanup pass: generate the code-block-free text for every prompt
# and store it in "vulnerability_descriptions.csv" (column "description").
classes = []
for query in tqdm(queries):
    inputs = tokenizer(query, return_tensors = "pt", truncation=True).to("cuda")
    # Number of prompt tokens; generate() returns the prompt followed by the
    # newly generated tokens for this decoder-only model.
    prompt_length = inputs["input_ids"].shape[1]
    output_tokens = model.generate(**inputs, max_new_tokens = 512, pad_token_id=tokenizer.pad_token_id)
    # Decode only the continuation instead of decoding everything and splitting
    # on "Vulnerability text:\n": the split raises IndexError when the marker
    # is missing (e.g. truncated prompt) and picks the wrong segment when the
    # audit text itself contains the marker.
    completion = tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)
    # Store real line breaks as the literal two-character sequence "\n" so each
    # description stays on a single CSV line.
    classes.append(completion.strip().replace("\n", "\\n"))
df = pd.DataFrame(classes, columns=['description'])
df.to_csv("vulnerability_descriptions.csv", index=False, **csv_params)