In [1]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import csv

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Name under which the CSV dialect used by the csv writers in this notebook is
# registered.
DIALECT_NAME = "db_dialect"

# Comma-separated fields, quoting only where a field needs it, and a backslash
# as the escape character.
_dialect_options = {
    "delimiter": ",",
    "quoting": csv.QUOTE_MINIMAL,
    "escapechar": "\\",
}
csv.register_dialect(DIALECT_NAME, **_dialect_options)

In [3]:
# Load the 4-bit Llama-3 8B Instruct checkpoint through Unsloth and switch it to
# inference mode.
#
# max_seq_length was previously 20 tokens, which is shorter than the prompt
# template alone. The generation cells call the tokenizer with truncation=True,
# so with such a small limit the prompt would be cut off long before the code
# block (and the "Code functionality:" marker) is reached.
# NOTE(review): assumes Unsloth uses max_seq_length as the tokenizer's maximum
# length - confirm. Raise the value further if code blocks are longer than this.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A30 MIG 2g.12gb. Max memory: 11.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Read the code blocks to be explained; the generation cell below uses the
# "code" column of this frame.
codeblocks_csv_path = "/vol/bitbucket/kza23/cleaned-up-code.csv"
codeblocks = pd.read_csv(codeblocks_csv_path)

In [7]:
# Fetch the "train" split of the "verified-functions" configuration and keep a
# pandas copy of it for row-wise iteration.
# NOTE(review): escapechar is presumably forwarded to the underlying CSV
# reader - confirm against the datasets documentation.
dataset_verified = load_dataset(
    "msc-smart-contract-audition/vulnerable-functions-base",
    name="verified-functions",
    split="train",
    escapechar="\\",
)
verified_df = dataset_verified.to_pandas()

In [9]:
# Sanity check: show the first rows of the verified-functions frame.
verified_preview = verified_df.head()
print(verified_preview)

                                            function
0  ```\nconstructor(string memory name_, string m...
1  ```\nconstructor(address[] memory tokensToList...
2          ```\nreceive() external payable {}\n```\n
3  ```\nfunction swap(\n  address inputToken,\n  ...
4  ```\nfunction addLiquidity(address inputToken,...


In [14]:
# Prompt templates in the Llama 3 chat format. Each template has a single "{}"
# placeholder that receives the code to explain, and ends with the beginning of
# the assistant's answer ("Code functionality:\nCode block 1:\n1. ") so that the
# model continues a numbered explanation. The generation cells locate the answer
# by splitting the decoded text on "Code functionality:\n", so that marker must
# stay unchanged.
#
# The system turn is terminated with <|eot_id|> (it was previously left open,
# so the system and user turns were not separated the way the chat format
# requires).
# NOTE(review): the tokenizer may prepend its own <|begin_of_text|> token in
# addition to the one spelled out here - confirm whether that is intended.

# Template for inputs that may contain several code blocks.
template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below are provided 1 or more code blocks from a smart contract. The task is to explain what is its purpose in plain English.

1. Do NOT use any names of variables, names of parameters or names of functions. Just try to explain the general functionality.
2. Explain what each codeblock does step by step.
3. Give a high-level overview and purpose of the code.
4. First explain the functionality of each code block and after all codeblocks are explained provide a high-level overview of the code over all.<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Code block to explain:
{}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Code functionality:
Code block 1:
1. """

# Template for inputs that consist of exactly one function.
template_verified = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Below is a single codeblock from a smart contract. The task is to explain what is its purpose in plain English.

1. Do NOT use any names of variables, names of parameters or names of functions. Just try to explain the general functionality.
2. Explain what the function does step by step.
3. Give a high-level overview and purpose of the function within a wider context.
4. First explain the functionality of the code block and after provide a high-level overview of the code over all.<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Code block to explain:
{}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Code functionality:
Code block 1:
1. """


In [37]:
# Generate a plain-English explanation for every code block in one chunk of
# `codeblocks` and write the results, one row per code block and in input
# order, to "functionality-<current_chunk>.csv".
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Marker taken from the prompt template: the text that follows its first
# occurrence in the decoded output is what gets stored.
answer_marker = "Code functionality:\n"

# Slice once so the progress bar shows the real number of rows; a final chunk
# can be shorter than chunk_len.
chunk_codeblocks = codeblocks['code'][start:end]

with open(f"functionality-{current_chunk}.csv", "w", newline="") as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["functionality"])

    for codeblock in tqdm(chunk_codeblocks, total=len(chunk_codeblocks)):

        # Turn literal backslash-n sequences back into real newlines.
        code = codeblock.replace("\\n", "\n")
        print(code)

        prompt = template.format(code)

        # Named `inputs` so the builtin `input` is not shadowed.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")

        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )

        # The decoded text contains the prompt followed by the generated tokens.
        decoded_output_purpose = tokenizer.decode(
            output_tokens[0],
            skip_special_tokens=True,
        )

        parts = decoded_output_purpose.split(answer_marker)
        if len(parts) < 2:
            # Fail with an explanation instead of an opaque IndexError.
            raise ValueError(
                "Marker 'Code functionality:' not found in the decoded output; "
                "the prompt was probably truncated by the tokenizer "
                "(check max_seq_length)."
            )
        functionality = parts[1].strip()

        # Store the answer on a single CSV line by escaping newlines.
        writer.writerow([functionality.replace("\n", "\\n")])


  0%|                                                                      | 0/100 [00:00<?, ?it/s]

```
@external
@nonreentrant('lock')
def remove_liquidity_one_coin(
    token_amount: uint256, 
    i: uint256, 
    min_amount: uint256, 
    use_eth: bool = False, 
    receiver: address = msg.sender
) -> uint256:
    A_gamma: uint256[2] = self._A_gamma()

    dy: uint256 = 0
    D: uint256 = 0
    p: uint256 = 0
    xp: uint256[N_COINS] = empty(uint256[N_COINS])
    future_A_gamma_time: uint256 = self.future_A_gamma_time
    dy, p, D, xp = self._calc_withdraw_one_coin(A_gamma, token_amount, i, future_A_gamma_time > 0, True)
    assert dy >= min_amount, "Slippage"

    if block.timestamp >= future_A_gamma_time:
        self.future_A_gamma_time = 1

    self.balances[i] -= dy
    CurveToken(self.token).burnFrom(msg.sender, token_amount)

    coin: address = self.coins[i]
    if use_eth and coin == WETH20:
        raw_call(receiver, b"", value=dy)
    else:
        if coin == WETH20:
            WETH(WETH20).deposit(value=dy)
        response: Bytes[32] = raw_call(
            coin, 
  

  0%|                                                                      | 0/100 [00:19<?, ?it/s]

Purpose: Code block 1:
1.  This function is used to remove liquidity from a pool.
2.  It takes in several parameters: the amount of tokens to remove, the index of the coin to remove, the minimum amount of tokens to remove, and an optional boolean flag to specify whether to use ETH or not.
3.  The function calculates the amount of tokens to be removed (`dy`) and the amount of tokens to be transferred (`xp`) based on the pool's parameters and the amount of tokens to remove.
4.  It then checks if the block timestamp is greater than or equal to the future A-gamma time. If it is, it updates the future A-gamma time.
5.  The function then subtracts the amount of tokens to be removed from the pool's balance and burns the tokens from the sender's account.
6.  If the use_eth flag is set to True, it uses the raw_call function to transfer the tokens to the receiver. Otherwise, it uses the raw_call function to transfer the tokens to the receiver, but only if the coin is WETH20. If the coin is not W




In [None]:
# Generate a plain-English explanation for every function in one chunk of
# `verified_df` and write the results, one row per function and in input order,
# to "functionality-verified-<current_chunk>.csv".
chunk_len = 100
current_chunk = 0
start = current_chunk * chunk_len
end = (current_chunk + 1) * chunk_len

# Marker taken from the prompt template: the text that follows its first
# occurrence in the decoded output is what gets stored.
answer_marker = "Code functionality:\n"

# Slice once so the progress bar shows the real number of rows; a final chunk
# can be shorter than chunk_len.
chunk_functions = verified_df['function'][start:end]

with open(f"functionality-verified-{current_chunk}.csv", "w", newline="") as f:
    writer = csv.writer(f, dialect=DIALECT_NAME)

    # Write the header row
    writer.writerow(["functionality"])

    for codeblock in tqdm(chunk_functions, total=len(chunk_functions)):

        # Turn literal backslash-n sequences back into real newlines.
        code = codeblock.replace("\\n", "\n")
        print(code)
        prompt = template_verified.format(code)

        # Named `inputs` so the builtin `input` is not shadowed.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")

        output_tokens = model.generate(
            **inputs, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id
        )

        # The decoded text contains the prompt followed by the generated tokens.
        decoded_output_purpose = tokenizer.decode(
            output_tokens[0],
            skip_special_tokens=True,
        )

        parts = decoded_output_purpose.split(answer_marker)
        if len(parts) < 2:
            # Fail with an explanation instead of an opaque IndexError.
            raise ValueError(
                "Marker 'Code functionality:' not found in the decoded output; "
                "the prompt was probably truncated by the tokenizer "
                "(check max_seq_length)."
            )
        functionality = parts[1].strip()

        # Store the answer on a single CSV line by escaping newlines.
        writer.writerow([functionality.replace("\n", "\\n")])



In [1]:
import pandas as pd
import csv

# Merge the per-chunk "functionality-verified-<i>.csv" files from the parent
# directory into a single CSV, keeping the header of the first chunk only.
DIALECT_NAME = "db_dialect"
csv.register_dialect(
    DIALECT_NAME,
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    escapechar="\\",
)

# Output file name
output_file = 'functionality-verified.csv'
# Number of chunk files to merge (chunks 0 .. num_chunks - 1)
num_chunks = 6
header = True
# Open the output file once (creating/overwriting it) and write every chunk
# through this handle. newline='' leaves line endings to the CSV writer, as
# pandas asks for when it is given an open file object.
# NOTE(review): chunks are read with the backslash-escape dialect but written
# with the pandas defaults - confirm that downstream readers expect this.
with open(output_file, 'w', newline='') as outfile:
    # Iterate through each chunk file
    for i in range(num_chunks):
        input_file = f'../functionality-verified-{i}.csv'
        print(f"Processing {input_file}")

        # Read the input CSV file
        df = pd.read_csv(input_file, dialect=DIALECT_NAME)

        # For the first file, include the header
        df.to_csv(outfile, index=False, header=header)
        header = False


Processing ../functionality-verified-0.csv
Processing ../functionality-verified-1.csv
Processing ../functionality-verified-2.csv
Processing ../functionality-verified-3.csv
Processing ../functionality-verified-4.csv
Processing ../functionality-verified-5.csv
