In [None]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv

In [None]:
# Name under which the CSV dialect used by this notebook's writers is registered.
DIALECT_NAME = "db_dialect"

# Comma-separated, quote only when needed, backslash as the escape character.
_DIALECT_OPTIONS = {
    "delimiter": ",",
    "quoting": csv.QUOTE_MINIMAL,
    "escapechar": "\\",
}
csv.register_dialect(DIALECT_NAME, **_DIALECT_OPTIONS)

In [None]:
# Maximum sequence length (in tokens) handed to Unsloth when loading the model.
# It has to cover the whole prompt (instruction template + the code block being
# explained) plus the tokens generated afterwards. The previous value of 20 is
# shorter than the instruction template alone, so the model could never see a
# complete prompt. 8192 is the native context window of Llama 3 8B.
max_seq_length = 8192
# None lets Unsloth auto-detect the dtype (float16 for Tesla T4/V100,
# bfloat16 for Ampere and newer).
dtype = None
# 4-bit quantization reduces memory usage; can be set to False.
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to Unsloth's inference mode; this notebook only generates.
FastLanguageModel.for_inference(model)

In [None]:
# Code blocks to explain; the generation loop below reads the "code" column.
codeblocks = pd.read_csv(
    filepath_or_buffer="/vol/bitbucket/kza23/cleaned-up-code.csv",
)

In [None]:
# Load the "verified-functions" configuration (train split) from the Hub.
# The backslash escapechar is forwarded to the loader as an extra keyword.
dataset_verified = load_dataset(
    path="msc-smart-contract-auditing/vulnerable-functions-base",
    name="verified-functions",
    split="train",
    escapechar="\\",
)

# Work with the verified functions as a pandas DataFrame from here on.
verified_df = dataset_verified.to_pandas()

In [None]:
# Preview the first rows. A bare last expression uses the notebook's rich
# DataFrame rendering instead of the plain-text dump produced by print().
verified_df.head()

In [None]:
# Prompt templates in the Llama 3 Instruct chat format.
#
# * Every turn is "<|start_header_id|>ROLE<|end_header_id|>\n\nCONTENT<|eot_id|>";
#   the system turn is now terminated with <|eot_id|> (it previously ran straight
#   into the user header).
# * "<|begin_of_text|>" is intentionally NOT part of the templates: the prompts
#   are tokenized with the tokenizer's default special-token handling, which
#   already prepends it, so a literal copy here would produce a second BOS.
# * The assistant turn is left open and pre-filled up to "1. " so the model
#   continues the numbered explanation. The generation cells rely on the
#   "Code functionality:\n" marker being present exactly once.
# * "{}" is the single placeholder filled with the source code via str.format.
# * A trailing backslash inside the literal joins two lines without a newline.
template = """\
<|start_header_id|>system<|end_header_id|>

Below are provided 1 or more code blocks from a smart contract. The task is to explain what is its purpose in plain English.

1. Do NOT use any names of variables, names of parameters or names of functions. Just try to explain the general functionality.
2. Explain what each codeblock does step by step.
3. Give a high-level overview and purpose of the code.
4. First explain the functionality of each code block and after all codeblocks are explained provide a high-level overview of the code over all.<|eot_id|>\
<|start_header_id|>user<|end_header_id|>

Code block to explain:
{}<|eot_id|>\
<|start_header_id|>assistant<|end_header_id|>

Code functionality:
Code block 1:
1. """

# Same layout as `template`, worded for a single verified function.
template_verified = """\
<|start_header_id|>system<|end_header_id|>

Below is a single codeblock from a smart contract. The task is to explain what is its purpose in plain English.

1. Do NOT use any names of variables, names of parameters or names of functions. Just try to explain the general functionality.
2. Explain what the function does step by step.
3. Give a high-level overview and purpose of the function within a wider context.
4. First explain the functionality of the code block and after provide a high-level overview of the code over all.<|eot_id|>\
<|start_header_id|>user<|end_header_id|>

Code block to explain:
{}<|eot_id|>\
<|start_header_id|>assistant<|end_header_id|>

Code functionality:
Code block 1:
1. """


In [None]:
# Generate plain-English explanations for one chunk of `codeblocks` and write
# them, one per row, to functionality-<chunk>.csv.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Marker that precedes the pre-filled start of the assistant answer in the prompt.
ANSWER_MARKER = "Code functionality:\n"
# Text the template pre-fills after the marker; the model continues from it.
answer_prefix = template.split(ANSWER_MARKER)[1]

with open(
    f"functionality-{current_chunk}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["functionality"])

    # No explicit `total`: tqdm takes the length from the slice itself, which
    # stays correct when the last chunk holds fewer than `chunk_len` rows.
    for codeblock in tqdm(codeblocks["code"][start:end]):
        # The CSV stores newlines as the two characters "\" + "n"; restore them.
        source_code = codeblock.replace("\\n", "\n")
        print(source_code)

        prompt = template.format(source_code)

        # Named `inputs` so the builtin `input` is not shadowed.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")

        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )

        # Decode only the newly generated tokens. Splitting the full decoded
        # text on the marker raised IndexError when the prompt was truncated and
        # picked the wrong segment when the code itself contained the marker.
        prompt_len = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(
            output_tokens[0][prompt_len:],
            skip_special_tokens=True,
        )
        # Keep only the text before a repeated marker, as the previous
        # full-text split did.
        generated = generated.split(ANSWER_MARKER)[0]

        functionality = (answer_prefix + generated).strip()
        # Store newlines escaped so every explanation stays on one CSV line.
        writer.writerow([functionality.replace("\n", "\\n")])


In [None]:
# Generate plain-English explanations for one chunk of the verified functions
# and write them, one per row, to functionality-verified-<chunk>.csv.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Marker that precedes the pre-filled start of the assistant answer in the prompt.
ANSWER_MARKER = "Code functionality:\n"
# Text the template pre-fills after the marker; the model continues from it.
answer_prefix = template_verified.split(ANSWER_MARKER)[1]

with open(
    f"functionality-verified-{current_chunk}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["functionality"])

    # No explicit `total`: tqdm takes the length from the slice itself, which
    # stays correct when the last chunk holds fewer than `chunk_len` rows.
    for codeblock in tqdm(verified_df["function"][start:end]):
        # Newlines are stored as the two characters "\" + "n"; restore them.
        source_code = codeblock.replace("\\n", "\n")
        print(source_code)

        prompt = template_verified.format(source_code)

        # Named `inputs` so the builtin `input` is not shadowed.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")

        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )

        # Decode only the newly generated tokens. Splitting the full decoded
        # text on the marker raised IndexError when the prompt was truncated and
        # picked the wrong segment when the code itself contained the marker.
        prompt_len = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(
            output_tokens[0][prompt_len:],
            skip_special_tokens=True,
        )
        # Keep only the text before a repeated marker, as the previous
        # full-text split did.
        generated = generated.split(ANSWER_MARKER)[0]

        functionality = (answer_prefix + generated).strip()
        # Store newlines escaped so every explanation stays on one CSV line.
        writer.writerow([functionality.replace("\n", "\\n")])



In [None]:
# Merge the per-chunk "functionality-verified-<i>.csv" files into one CSV.
# The imports and dialect registration are repeated so this cell can run on a
# fresh kernel without the model-loading cells above.
import pandas as pd
import csv

DIALECT_NAME = "db_dialect"
csv.register_dialect(
    DIALECT_NAME,
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    escapechar="\\",
)

# Number of chunk files to merge (indices 0 .. NUM_CHUNKS - 1).
NUM_CHUNKS = 6

# Output file name
output_file = 'functionality-verified.csv'
header = True
# Open the output once and stream every chunk through this single handle.
# Previously the handle was opened but never used, while to_csv re-opened the
# same path in append mode for each chunk. newline='' leaves line endings to
# the CSV writer; utf-8 matches what to_csv uses when given a path.
with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    # Iterate through each chunk file
    for i in range(NUM_CHUNKS):
        # NOTE(review): chunks are read from the parent directory, while the
        # generation cell writes them to the working directory - confirm the
        # files are moved in between.
        input_file = f'../functionality-verified-{i}.csv'
        print(f"Processing {input_file}")

        # Read the input CSV file
        df = pd.read_csv(input_file, dialect=DIALECT_NAME)

        # Only the first chunk contributes the header row. Writing with the
        # same quoting/escapechar as the chunk files keeps the merged file
        # readable with DIALECT_NAME (escaped "\n" sequences survive).
        df.to_csv(
            outfile,
            index=False,
            header=header,
            quoting=csv.QUOTE_MINIMAL,
            escapechar="\\",
        )
        header = False
