In [None]:
# Hugging Face Hub repository id of the checkpoint that this notebook loads.
model_name = "msc-smart-contract-auditing/deepseek-coder-6.7b-vulnerability"
# Short label for that checkpoint; it names the per-model output directory.
model_alias = "deepseek-coder-6.7b-finetuned"

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import csv
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Per-model output directory; the CSV files produced by the generation cells
# are written here.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
WORK_DIR = Path(f'/vol/bitbucket/kza23/finetuning/{model_alias}')
# parents=True also creates missing intermediate directories, so the cell does
# not raise FileNotFoundError on a machine where `finetuning/` does not exist yet.
WORK_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Tokenizer and weights are both fetched from the same repository (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Weights are loaded as bfloat16; device_map="auto" leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

In [None]:
# Load only the "test" split of the audits dataset; every generation cell
# below reads its inputs and reference texts from it.
test_dataset = load_dataset("msc-smart-contract-auditing/audits-with-reasons", split="test")

# Bare last expression so the notebook renders the dataset summary.
test_dataset

## Descriptions

In [None]:
# Instruction template for the vulnerability-description task. The single `{}`
# placeholder is filled with the Solidity code of one test sample.
# NOTE(review): the text contains typos ("vulnearblity", "vulnearbility"). They
# are kept verbatim because the fine-tuned model may have been trained on this
# exact wording — confirm against the training prompt before correcting them.
prompt = """
Below are one or more Solidity codeblocks. The codeblocks might contain vulnerable code.
If there is a vulnerability please provide a description of the vulnearblity in terms of the code that is responsible for it.
Describe how an attacker would be able to take advantage of the vulnerability so the explanation is even more clear.

Output only the description of the vulnerability and the attacking vector. No additional information is needed.

If there is no vulnerability output "There is no vulnearbility".

Codeblocks:
{}

"""

In [None]:
def _description_query(row):
    """Render the description prompt for one dataset row."""
    # Replace literal backslash-n sequences (two characters) in the stored
    # code with real newlines before inserting it into the template.
    return prompt.format(row['code'].replace('\\n', '\n'))


df_test = test_dataset.to_pandas()
queries = df_test.apply(_description_query, axis=1)

# Print one fully rendered prompt as a sanity check; print() keeps the
# newlines readable, unlike the repr of the string.
print(queries.iloc[0])

In [None]:
# Generation below samples (do_sample=True); seed torch's RNG so re-running
# this cell in the same environment draws the same samples.
GENERATION_SEED = 0
torch.manual_seed(GENERATION_SEED)

# Writes one CSV row per test sample: running index, generated description and
# the reference description from the dataset.
# Opening with "w" truncates any previous descriptions.csv, so every result
# must be written back; the original printed the text and left only the header.
with open(WORK_DIR/"descriptions.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "output", "real"])

    for idx, (query, real) in tqdm(enumerate(zip(queries, test_dataset['description'])), total=len(queries)):

        # Single-turn chat: the rendered prompt is the only user message.
        messages = [
            { 'role': 'user', 'content': query }
        ]
        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
        outputs = model.generate(inputs, max_new_tokens=512, do_sample=True, top_k=25, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
        # Drop the leading prompt tokens so only the newly generated text is decoded.
        description = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
        writer.writerow([idx, description, real])

## Recommendations

In [None]:
# Instruction template for the recommendation task. It has two `{}`
# placeholders, filled in order with the Solidity code and the vulnerability
# description of one test sample.
query_template = """
Below is some solidity code and a description of a vulnerability that the code contains.

Explain how to mitigate or fix the vulnerability.
Codeblocks:
{}

Vulnerability:
{}"""

In [None]:
def _recommendation_query(row):
    """Render the recommendation prompt for one dataset row."""
    # Replace literal backslash-n sequences (two characters) with real
    # newlines in both fields before inserting them into the template.
    code = row['code'].replace('\\n', '\n')
    vulnerability = row['description'].replace('\\n', '\n')
    return query_template.format(code, vulnerability)


# Only rows that have a description can be turned into a recommendation query.
# NOTE: this rebinds `df_test` and `queries` from the Descriptions section, so
# the cells of the two sections must be run in notebook order.
df_test = test_dataset.to_pandas().dropna(subset=['description'])
queries = df_test.apply(_recommendation_query, axis=1)

In [None]:
# Writes one CSV row per query: running index, generated recommendation and
# the reference recommendation.
# `queries` is built only from rows with a non-null description, so the
# references are looked up by the queries' own row labels. Zipping against the
# unfiltered `test_dataset['recommendation']` would pair every query after the
# first dropped row with the wrong reference.
# NOTE: `id` is the position within the filtered queries, not the dataset row.
references = df_test.loc[queries.index, 'recommendation']

with open(WORK_DIR/"recommendations.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "output", "real"])

    for idx, (query, real) in tqdm(enumerate(zip(queries, references)), total=len(queries)):

        # Single-turn chat: the rendered prompt is the only user message.
        messages = [
            { 'role': 'user', 'content': query }
        ]
        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
        # Greedy decoding (do_sample=False). The sampling-only arguments
        # top_k/top_p were dropped: they have no effect without sampling.
        outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
        # Drop the leading prompt tokens so only the newly generated text is decoded.
        recommendation = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
        writer.writerow([idx, recommendation, real])