In [5]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv

ModuleNotFoundError: No module named 'unsloth'

In [4]:
# Name under which the CSV dialect below is registered with the csv module.
DIALECT_NAME = "db_dialect"

# Comma-delimited and never quoted: the writer escapes special characters
# with a backslash instead of wrapping fields in quotes.
csv.register_dialect(DIALECT_NAME, escapechar="\\", quoting=csv.QUOTE_NONE, delimiter=",")

NameError: name 'csv' is not defined

In [3]:
# Loader settings, kept at module level so later cells can reuse them.
# NOTE(review): 20 tokens is far shorter than the prompt template used in this
# notebook — confirm this value is intended.
max_seq_length = 20
# None requests auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization reduces memory usage; can be set to False.
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
)

# Switch the model to Unsloth's inference mode (advertised as 2x faster).
FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A30 MIG 2g.12gb. Max memory: 11.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the "train" split of the vulnerable-functions dataset.
# NOTE(review): escapechar is presumably forwarded to the underlying CSV reader so
# backslash-escaped fields parse correctly — confirm against the datasets CSV loader.
dataset_vulnerable = load_dataset(
    "msc-smart-contract-audition/vulnerable-functions-base",
    split="train",
    escapechar="\\",
)

In [5]:
# Llama 3 chat prompt used to reformat one code block at a time.
# - `{}` is filled with a single fenced code block via `fix_template.format(...)`.
# - Every turn is closed with <|eot_id|>; the system turn previously was not,
#   which left the system and user messages unseparated.
# - The trailing "Formatted code block:" line plus an opening fence primes the
#   answer; callers split the decoded text on that marker to isolate the result.
fix_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
The below is a code blocks which might have improper formatting. The task is to fix the formatting and remove comments which do not describe the code. Apart from formatting all of the code needs to remain the same.
1. Ensure that the code is properly indented.
2. Ensure that there are no leading tabs - the outer most block should start with no whitespace infront.
3. If a line is too long break it in multiple lines.
4. Keep all the code the same even if it seems unnecessary in the context.
5. Remove comments which contain no information about the code.
6. Remove any comments which contain URLs.

Do not explain your changes at the end just output the formatted code block.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Code block:
{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
Formatted code block:
```
"""



In [6]:
def extract_codeblocks(description):
    """Yield each fenced code block found in ``description``.

    A fence is any line whose stripped text starts with three backticks.
    Fences toggle between "outside" and "inside" a block, so an opening fence
    may carry a language tag (e.g. ```` ```solidity ````).

    Args:
        description: Text to scan; it is split on ``"\\n"``.

    Yields:
        str: One block per opening/closing fence pair, including both fence
        lines, with every line terminated by ``"\\n"``. A block that is opened
        but never closed is not yielded.
    """
    block_lines = []
    inside_block = False

    for line in description.split("\n"):
        is_fence = line.strip().startswith("```")

        if inside_block:
            # Lines inside a block are kept verbatim, closing fence included.
            block_lines.append(line)
            if is_fence:
                yield "\n".join(block_lines) + "\n"
                block_lines = []
                inside_block = False
        elif is_fence:
            # Opening fence: start collecting, keeping the fence line itself.
            inside_block = True
            block_lines.append(line)

In [7]:
# Build the descriptions Series from the dataset: row 534 is dropped (too long)
# and literal "\n" escape sequences are turned back into real newlines.
descriptions = (
    dataset_vulnerable.to_pandas()
    .drop(index=534)["description"]
    .apply(lambda text: text.replace("\\n", "\n"))
)

In [8]:
# Process the descriptions in fixed-size chunks; each chunk gets its own CSV file.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Slice once so the progress-bar total reflects the rows actually processed;
# the last chunk can hold fewer than chunk_len descriptions.
chunk_descriptions = descriptions[start:end]

# Explicit UTF-8 keeps the output independent of the platform's default encoding.
with open(
    f"cleaned-up-code-{current_chunk+1}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["code"])

    for description in tqdm(chunk_descriptions, total=len(chunk_descriptions)):
        formatted_blocks = []
        for codeblock in extract_codeblocks(description):
            # fix_template must still contain its `{}` placeholder at this point.
            prompt = fix_template.format(codeblock)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
            output_tokens = model.generate(
                **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
            )
            # pad_token_id is a generation option, not a decode option, so it is
            # no longer passed to decode().
            decoded_output = tokenizer.decode(
                output_tokens[0],
                skip_special_tokens=True,
            )
            # Keep only the text that follows the answer marker from the prompt.
            formatted_blocks.append(
                decoded_output.split("Formatted code block:\n")[1].strip()
            )

        # One CSV row per description: blocks are joined and real newlines are
        # stored as literal "\n" so each row stays on a single line.
        codeblocks = "\n".join(formatted_blocks).replace("\n", "\\n")
        writer.writerow([codeblocks])


100%|█████████████████████████████████████████████| 10/10 [01:20<00:00,  8.08s/it]


In [64]:
# Smoke test: run the formatting prompt on one hand-picked, badly indented snippet.
# The prompt is built from the shared `fix_template` rather than reassigning it, so
# the template keeps its `{}` placeholder for the cells that call
# `fix_template.format(codeblock)`. A template with the Solidity braces baked in
# cannot be passed through str.format.
sample_codeblock = """```
    // File input.sol:12
            uint inputTotalUSDValueE36;
                for (uint i; i < openTokenInfos.length; ) {
                 inputTotalUSDValueE36 += openTokenInfos[i].inputAmt * tokenPriceE36s[i];
                      borrowTotalUSDValueE36 += openTokenInfos[i].borrowAmt * tokenPriceE36s[i];
                 unchecked {
            ++i; // See https://github.com/MSc-Smart-Contract-Audition/data-scraper/blob/main/verified/src/compile_contracts.py
                }
            }
                // 1.3 calculate net pnl (including strategy users & borrow profit)
            positionOpenUSDValueE36 = inputTotalUSDValueE36 + borrowTotalUSDValueE36;
            netPnLE36 = positionCurUSDValueE36.toInt256() - positionOpenUSDValueE36.toInt256();
```
"""
sample_prompt = fix_template.format(sample_codeblock)


inputs = tokenizer(sample_prompt, return_tensors="pt", truncation=True).to("cuda")
output_tokens = model.generate(
    **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
)
# pad_token_id is a generation option, not a decode option, so it is no longer
# passed to decode().
decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)


# Show only the text that follows the answer marker from the prompt.
print(decoded_output.split("Formatted code block:\n")[1].strip())

```
uint inputTotalUSDValueE36;

for (uint i; i < openTokenInfos.length; ) {
    inputTotalUSDValueE36 += openTokenInfos[i].inputAmt * tokenPriceE36s[i];
    borrowTotalUSDValueE36 += openTokenInfos[i].borrowAmt * tokenPriceE36s[i];
    unchecked {
        ++i;
    }
}

positionOpenUSDValueE36 = inputTotalUSDValueE36 + borrowTotalUSDValueE36;
netPnLE36 = positionCurUSDValueE36.toInt256() - positionOpenUSDValueE36.toInt256();
```


In [84]:
# Chunked clean-up run: formats every code block of each description in the
# selected chunk and writes one CSV row per description.
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Take the slice up front: the progress bar then reports the real number of
# rows, which is smaller than chunk_len for a short final chunk.
chunk_descriptions = descriptions[start:end]

# Explicit UTF-8 so the file does not depend on the platform default encoding.
with open(
    f"cleaned-up-code-{current_chunk+1}.csv", "w", newline="", encoding="utf-8"
) as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["code"])

    for description in tqdm(chunk_descriptions, total=len(chunk_descriptions)):
        formatted_blocks = []
        for codeblock in extract_codeblocks(description):
            # Requires fix_template to still hold the `{}` placeholder; a template
            # with literal braces in it would make str.format fail here.
            prompt = fix_template.format(codeblock)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
            output_tokens = model.generate(
                **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
            )
            # decode() takes no pad_token_id option; only generate() uses it.
            decoded_output = tokenizer.decode(
                output_tokens[0],
                skip_special_tokens=True,
            )
            # Everything after the answer marker is the model's formatted block.
            formatted_blocks.append(
                decoded_output.split("Formatted code block:\n")[1].strip()
            )

        # Join the blocks and store real newlines as literal "\n" so that each
        # description occupies exactly one CSV row.
        codeblocks = "\n".join(formatted_blocks).replace("\n", "\\n")
        writer.writerow([codeblocks])


  0%|▏                                                  | 5/1971 [00:25<2:48:55,  5.16s/it]


In [2]:
import pandas as pd
import csv

# Dialect used to read the chunk files: comma-delimited, never quoted,
# backslash as the escape character.
DIALECT_NAME = "db_dialect"
csv.register_dialect(
    DIALECT_NAME,
    delimiter=",",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

# Output file name
output_file = 'enhanced-recommendations.csv'
header = True
# Open the output once (creating/overwriting it) and write every chunk through
# this single handle, instead of reopening the same path in append mode for each
# chunk while this handle is still open.
# newline='' is what pandas expects for a text handle passed to to_csv;
# UTF-8 matches the encoding to_csv uses by default when given a path.
with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    # Iterate through each chunk file
    for i in range(21):
        input_file = f'../enhanced-recommendation-{i}.csv'
        print(f"Processing {input_file}")

        # Read the input CSV file
        df = pd.read_csv(input_file, dialect=DIALECT_NAME)

        # Only the first chunk writes the header row
        df.to_csv(outfile, index=False, header=header)
        header = False


Processing ../enhanced-recommendation-0.csv
Processing ../enhanced-recommendation-1.csv
Processing ../enhanced-recommendation-2.csv
Processing ../enhanced-recommendation-3.csv
Processing ../enhanced-recommendation-4.csv
Processing ../enhanced-recommendation-5.csv
Processing ../enhanced-recommendation-6.csv
Processing ../enhanced-recommendation-7.csv
Processing ../enhanced-recommendation-8.csv
Processing ../enhanced-recommendation-9.csv
Processing ../enhanced-recommendation-10.csv
Processing ../enhanced-recommendation-11.csv
Processing ../enhanced-recommendation-12.csv
Processing ../enhanced-recommendation-13.csv
Processing ../enhanced-recommendation-14.csv
Processing ../enhanced-recommendation-15.csv
Processing ../enhanced-recommendation-16.csv
Processing ../enhanced-recommendation-17.csv
Processing ../enhanced-recommendation-18.csv
Processing ../enhanced-recommendation-19.csv
Processing ../enhanced-recommendation-20.csv
