In [None]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv

In [None]:
# Name under which the project's CSV dialect is registered with the csv module.
DIALECT_NAME = "db_dialect"

# Formatting parameters of the dialect: comma-separated fields, quotes only
# where required, and a backslash as the escape character.
_dialect_options = dict(
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    escapechar="\\",
)
csv.register_dialect(DIALECT_NAME, **_dialect_options)

In [None]:
# Sequence length the model is loaded with. It has to hold the whole prompt
# (the instruction template plus one code block) AND the generated tokens
# (the generation cells request max_new_tokens=512). The previous value of 20
# is shorter than the instruction template alone, so it is raised to 2048.
# Choose any! We auto support RoPE Scaling internally!
max_seq_length = 2048
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Load the 4-bit Llama-3 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

In [None]:
# Fetch the "train" split of the vulnerable-functions dataset. The backslash
# escape character is handed to the loader as an extra option.
dataset_vulnerable = load_dataset(
    path="msc-smart-contract-auditing/vulnerable-functions-base",
    escapechar="\\",
    split="train",
)

In [None]:
# Llama-3 chat prompt used to reformat one code block. The single `{}`
# placeholder is filled with the code block via str.format, and the text that
# the model generates after "Formatted code block:\n" is the result.
# In the Llama-3 chat format every turn is terminated by <|eot_id|>; the
# system turn is closed explicitly before the user header starts.
fix_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below is a code blocks which might have improper formatting. The task is to fix the formatting and remove comments which do not describe the code. Apart from formatting all of the code needs to remain the same.

1. Ensure that the code is properly indented.
2. Ensure that there are no leading tabs - the outer most block should start with no whitespace infront.
3. If a line is too long break it in multiple lines.
4. Keep all the code the same even if it seems unnecessary in the context.
5. Remove comments which contain no information about the code.
6. Remove any comments which contain URLs.
7. Do not add any backticks as this will be done automatically. If you do it will break the formatting. DO NOT ADD BACKTICKS.

Do not explain your changes at the end just output the formatted code block.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Code block:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Formatted code block:
"""



In [None]:
def extract_codeblocks(description):
    """Yield the contents of every fenced code block found in ``description``.

    A line whose stripped text starts with three backticks toggles between
    "inside a block" and "outside a block"; the fence lines themselves are
    never part of the output (so a language tag after the opening fence is
    dropped with it).

    Args:
        description: Text that may contain ``` fenced code blocks, with lines
            separated by "\\n".

    Yields:
        str: The lines between an opening and a closing fence, each terminated
        by "\\n". An empty block yields "". A block whose closing fence is
        missing is not yielded.
    """
    inside_fence = False
    collected = []

    for line in description.split("\n"):
        if line.strip().startswith("```"):
            if inside_fence:
                # Closing fence: emit the finished block and start afresh.
                yield "".join(collected)
                collected = []
            inside_fence = not inside_fence
            continue

        if inside_fence:
            collected.append(line + "\n")

In [None]:
# Work on the "description" column as a pandas Series. Row 534 is dropped
# because it is too long, and literal backslash-n sequences in the text are
# expanded into real newlines.
descriptions = (
    dataset_vulnerable.to_pandas()
    .drop(index=534)["description"]
    .apply(lambda text: text.replace("\\n", "\n"))
)

In [None]:
# Process the descriptions in fixed-size chunks; each chunk gets its own CSV
# file so a long run can be resumed chunk by chunk.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Slice by position: the index labels have a gap where a row was dropped, so
# positions and labels do not line up.
chunk = descriptions.iloc[start:end]

with open(
    f"cleaned-up-code-{current_chunk+1}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["code"])

    # The final chunk can be shorter than chunk_len, so size the progress bar
    # from the rows that are actually present.
    for description in tqdm(chunk, total=len(chunk)):
        formatted_blocks = []
        for codeblock in extract_codeblocks(description):
            prompt = fix_template.format(codeblock)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
            output_tokens = model.generate(
                **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
            )
            decoded_output = tokenizer.decode(
                output_tokens[0],
                skip_special_tokens=True,
                pad_token_id=tokenizer.pad_token_id,
            )

            # The decoded text still contains the prompt; keep only what
            # follows the assistant cue.
            raw_output = decoded_output.split("Formatted code block:\n")[1].strip()

            # Add only the fences that are missing, so output that already
            # carries its own backticks is not wrapped a second time.
            if not raw_output.startswith("```"):
                raw_output = "```\n" + raw_output
            if not raw_output.endswith("```"):
                raw_output += "\n```"

            formatted_blocks.append(raw_output)

        # One CSV row per description: all of its blocks joined, with real
        # newlines encoded as a literal backslash-n.
        codeblocks = "\n".join(formatted_blocks).replace("\n", "\\n")
        writer.writerow([codeblocks])


In [None]:
# One-off, fully rendered prompt used to try the model on a hand-written
# sample. It is deliberately kept under its own name: it has no `{}`
# placeholder and contains literal Solidity braces, so binding it to
# `fix_template` would make every later `fix_template.format(codeblock)`
# call raise.
example_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below is a code blocks which might have improper formatting. The task is to fix the formatting and remove comments which do not describe the code. Apart from formatting all of the code needs to remain the same.
1. Ensure that the code is properly indented.
2. Ensure that there are no leading tabs - the outer most block should start with no whitespace infront.
3. If a line is too long break it in multiple lines.
4. Keep all the code the same even if it seems unnecessary in the context.
5. Remove comments which contain no information about the code.
6. Remove any comments which contain URLs.

Do not explain your changes at the end just output the formatted code block.
<|start_header_id|>user<|end_header_id|>
Code block:
```
    // File input.sol:12
            uint inputTotalUSDValueE36;
                for (uint i; i < openTokenInfos.length; ) {
                 inputTotalUSDValueE36 += openTokenInfos[i].inputAmt * tokenPriceE36s[i];
                      borrowTotalUSDValueE36 += openTokenInfos[i].borrowAmt * tokenPriceE36s[i];
                 unchecked {
            ++i; // See https://github.com/MSc-Smart-Contract-Audition/data-scraper/blob/main/verified/src/compile_contracts.py
                }
            }
                // 1.3 calculate net pnl (including strategy users & borrow profit)
            positionOpenUSDValueE36 = inputTotalUSDValueE36 + borrowTotalUSDValueE36;
            netPnLE36 = positionCurUSDValueE36.toInt256() - positionOpenUSDValueE36.toInt256();
```
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Formatted code block:
```
"""


inputs = tokenizer(example_prompt, return_tensors="pt", truncation=True).to("cuda")
output_tokens = model.generate(
    **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
)
decoded_output = tokenizer.decode(
    output_tokens[0], skip_special_tokens=True, pad_token_id=tokenizer.pad_token_id
)


# Show only the part after the assistant cue (the decoded text still
# contains the prompt).
print(decoded_output.split("Formatted code block:\n")[1].strip())

In [None]:
def extract_codeblocks(description):
    """Generate the body of each ``` fenced block contained in ``description``.

    Fence lines (stripped text starting with three backticks) switch the
    scanner on and off and are not included in the result. Each yielded string
    consists of the enclosed lines, every one followed by "\\n". A block that
    is opened but never closed produces nothing.
    """
    # None while outside a fence; otherwise the text gathered so far.
    buffer = None

    for raw_line in description.split("\n"):
        is_fence = raw_line.strip().startswith("```")

        if is_fence and buffer is None:
            buffer = ""
        elif is_fence:
            yield buffer
            buffer = None
        elif buffer is not None:
            buffer += raw_line + "\n"

In [None]:
# Spot check: run the formatting prompt on the first description that
# mentions the snippet below and print the result.
content = "function sellUnderlying"

matching_descriptions = descriptions[descriptions.str.contains(content)]
item = matching_descriptions.iloc[0]

formatted_blocks = []
for codeblock in extract_codeblocks(item):
    print("CODEBLOCK:", codeblock)
    print("====")

    prompt = fix_template.format(codeblock)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    output_tokens = model.generate(
        **inputs,
        max_new_tokens=512,
        pad_token_id=tokenizer.pad_token_id,
    )
    decoded_output = tokenizer.decode(
        output_tokens[0],
        skip_special_tokens=True,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Everything after the assistant cue is the model's answer.
    answer_parts = decoded_output.split("Formatted code block:\n")
    raw_output = answer_parts[1].strip()

    # Add an opening fence if absent, then (checked on the updated text) a
    # closing fence if absent.
    opening_fence = "" if raw_output.startswith("```") else "```\n"
    raw_output = opening_fence + raw_output
    closing_fence = "" if raw_output.endswith("```") else "\n```"
    raw_output = raw_output + closing_fence

    formatted_blocks.append(raw_output)

print(len(formatted_blocks))

# Printed with real newlines so the result is readable in the notebook.
codeblocks = "\n".join(formatted_blocks)
print(codeblocks)

In [None]:
import pandas as pd
import csv

# Same dialect the chunk files were written with (backslash escape character).
DIALECT_NAME = "db_dialect"
csv.register_dialect(
    DIALECT_NAME,
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    escapechar="\\",
)

# Output file name
output_file = 'cleaned-up-code.csv'
header = True

# NOTE(review): the chunk-writing cell names its files
# "cleaned-up-code-{current_chunk+1}.csv" in the working directory, whereas
# this loop reads "../cleaned-up-code-0.csv" .. "../cleaned-up-code-20.csv".
# Confirm that the range and directory match the files actually produced.

# Open the output once (creating/overwriting it) and write every chunk through
# this single handle, instead of re-opening the same path in append mode for
# each chunk. newline="" is what pandas expects from a caller-supplied handle.
with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    # Iterate through each chunk file
    for i in range(21):
        input_file = f'../cleaned-up-code-{i}.csv'
        print(f"Processing {input_file}")

        # Read the input CSV file
        df = pd.read_csv(input_file, dialect=DIALECT_NAME)

        # Only the first chunk contributes the header row
        df.to_csv(outfile, index=False, header=header)
        header = False


In [None]:

test1 = """
```\\n```\\n{\\n  \"@openzeppelin/contracts\": \"4.3.1\"\,\\n  \"@openzeppelin/test-helpers\": \"0.5.6\"\,\\n  \"@openzeppelin/contracts-upgradeable\": \"4.3.1\"\\n}\\n```\\n```
"""

# Sample CSV record kept for manual inspection. Its "\n" sequences are real
# newlines in this literal, and parts of the text contain "nn" between
# sentences.
test2 = """
365,"```\n{\n  ""@openzeppelin/contracts"": ""4.3.1"",\n  ""@openzeppelin/test-helpers"": ""0.5.6"",\n  ""@openzeppelin/contracts-upgradeable"": ""4.3.1""\n}\n```","The UUPSUpgradeable vulnerability is a critical severity bug discovered in OpenZeppelin's contracts, specifically in versions 4.1.0 to 4.3.1. The affected contracts include `@openzeppelin/contracts` and `@openzeppelin/contracts-upgradeable`, which are widely used in the kyber-swap contracts, as evident from the `package.json` file.\n\nThe vulnerability allows an attacker to manipulate the upgrade process of UUPSUpgradeable contracts, enabling them to execute arbitrary code during the upgrade process. This can lead to unauthorized changes to the contract's behavior, potentially resulting in the theft of funds or unauthorized access to sensitive data.\n\nThe affected contracts, `PoolOracle.sol` and `TokenPositionDescriptor.sol`, are both UUPSUpgradeable and require immediate attention to ensure the security of the kyber-swap contracts. It is essential to upgrade these contracts to a fixed version, specifically `@openzeppelin/contracts` version 4.3.2 or higher, and `@openzeppelin/contracts-upgradeable` version 4.3.2 or higher, to mitigate this vulnerability.","To mitigate the UUPSUpgradeable vulnerability in OpenZeppelin Contracts, follow these steps:nn1. **Update the OpenZeppelin library to the latest version**: Ensure that you are using the latest version of the OpenZeppelin Contracts library, which is version 4.3.2 or higher. This will ensure that you have the fix for the UUPSUpgradeable vulnerability.nn2. **Verify the UUPS implementation contracts**: Review the OpenZeppelin UUPS documentation to understand the correct implementation of UUPSUpgradeable contracts. This will help you identify any potential issues and ensure that your contracts are properly configured.nn3. 
**Check the affected contracts**: Identify the contracts that are affected by the UUPSUpgradeable vulnerability, such as PoolOracle.sol and TokenPositionDescriptor.sol. Review the code and ensure that they are using the latest version of the OpenZeppelin Contracts library.nn4. **Implement the UUPSUpgradeable contracts correctly**: Follow the OpenZeppelin UUPS documentation to implement the UUPSUpgradeable contracts correctly. This includes setting the `UUPSUpgradeable` contract as the base contract for your upgradeable contracts.nn5. **Test the UUPSUpgradeable contracts**: Thoroughly test the UUPSUpgradeable contracts to ensure that they are functioning correctly and that the upgrade process is working as expected.nn6. **Monitor the UUPSUpgradeable contracts**: Continuously monitor the UUPSUpgradeable contracts to ensure that they are not vulnerable to any new attacks or exploits.nnBy following these steps, you can effectively mitigate the UUPSUpgradeable vulnerability in OpenZeppelin Contracts and ensure the security of your smart contracts.",upgradeable contract vulnerability
"""


# Print both samples with every literal backslash-n turned into a real
# newline, so the encoded text can be read as multi-line output.
for sample in (test1, test2):
    decoded_sample = sample.replace('\\n', '\n')
    print(decoded_sample)