In [64]:
# Name of the dataset produced by this notebook.
DATASET_NAME = 'enhanced-vulnerability-audits'

# Add root to path
import sys
import warnings

root_name = 'data-lifecycle'
current_path = sys.path[0]
root_index = current_path.find(root_name)
if root_index == -1:
    # str.find() returns -1 when the marker is absent; slicing with that value
    # would yield a meaningless prefix of current_path, so sys.path is left
    # untouched and the problem is reported instead.
    root_path = None
    warnings.warn(
        f"'{root_name}' not found in sys.path[0] ({current_path!r}); "
        "the project root was not added to sys.path"
    )
else:
    # Keep everything up to and including the root directory name.
    root_path = current_path[:root_index + len(root_name)]
    if root_path not in sys.path:
        sys.path.insert(0, root_path)

import pandas as pd
import csv
from pathlib import Path
from datasets import Dataset, load_dataset
import pandas as pd
from common.directories import DATASET_DIR

# Relative path to the parent directory; the CSV files read and written by
# this notebook are resolved against it.
DIR: Path = Path("../")

In [65]:
# Name under which the csv dialect below is registered.
CODE_DIALECT = "code_dialect"

# Comma-delimited fields; double quotes are applied only where a field needs them.
_code_dialect_options = dict(
    delimiter=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
)
csv.register_dialect(CODE_DIALECT, **_code_dialect_options)

### Load vulnerable dataset

In [66]:
# Name under which the backslash-escaped csv dialect below is registered.
DIALECT = "db_dialect"

# Comma-delimited fields, minimal quoting, backslash as the escape character.
_db_dialect_options = {
    "delimiter": ",",
    "quoting": csv.QUOTE_MINIMAL,
    "escapechar": "\\",
}
csv.register_dialect(DIALECT, **_db_dialect_options)

In [67]:
def _load_table(filename, dialect):
    """Read the CSV file `filename` located in DIR using the named csv dialect."""
    return pd.read_csv(DIR / filename, dialect=dialect)


# Tables parsed with the plain quoted dialect.
code = _load_table("cleaned-up-code.csv", CODE_DIALECT)
functionality = _load_table("functionality.csv", CODE_DIALECT)

# Tables parsed with the backslash-escaped dialect.
descriptions = _load_table("enhanced-vulnerability-descriptions.csv", DIALECT)
mitigations = _load_table("enhanced-recommendations.csv", DIALECT)
types = _load_table("vulnerability-types.csv", DIALECT)

In [68]:
# Row count of each loaded table, one per line, in the order they were read.
for table in (code, descriptions, mitigations, types, functionality):
    print(len(table))

1971
1971
1971
1971
1971


In [69]:
# The five tables are joined column-wise, so row i of each must describe the
# same sample. pd.concat(axis=1) aligns on the index and fills gaps with NaN,
# so a row-count mismatch would go unnoticed; fail loudly instead.
vulnerable_parts = {
    "code": code,
    "descriptions": descriptions,
    "mitigations": mitigations,
    "types": types,
    "functionality": functionality,
}
vulnerable_part_lengths = {name: len(part) for name, part in vulnerable_parts.items()}
if len(set(vulnerable_part_lengths.values())) != 1:
    raise ValueError(
        f"Vulnerable tables differ in row count: {vulnerable_part_lengths}"
    )

combined_vulnerable_df = pd.concat(list(vulnerable_parts.values()), axis=1)

In [70]:
# Plain-text preview of the column-wise merged vulnerable table.
vulnerable_preview = str(combined_vulnerable_df)
print(vulnerable_preview)

                                                   code  \
0     ```\n(\n    netPnLE36,\n    lenderProfitUSDVal...   
1     ```\nuint deltaTime;\n\nif (pos.startLiqTimest...   
2     ```\nfunction _mintInternal(address _receiver,...   
3     ```\nuint inputTotalUSDValueE36;\n\nfor (uint ...   
4     ```\nfunction increaseLiquidity(IncreaseLiquid...   
...                                                 ...   
1966  ```\nDepositVault.sol\nfunction withdraw(uint2...   
1967  ```\nfunction deposit(uint256 amount, address ...   
1968  ```\nfunction withdraw(uint256 _amount) extern...   
1969  ```\n/**\n * @title GeoEmaAndCumSmaPump\n * @a...   
1970  ```\nsrc/libraries/LibLastReserveBytes.sol\nui...   

                                            description  \
0     When a position is liquidated, the liquidator ...   
1     This vulnerability allows an attacker to manip...   
2     This vulnerability allows an attacker to manip...   
3     This vulnerability allows an attacker to manip...

### Load verified data

In [71]:
# Load the train split of the "verified-functions" configuration.
# NOTE(review): escapechar is passed through as an extra keyword argument,
# presumably to the underlying CSV loader — confirm.
verified_dataset = load_dataset(
    "msc-smart-contract-audition/vulnerable-functions-base",
    name="verified-functions",
    split="train",
    escapechar="\\",
)

# Convert to a DataFrame so it can be merged with the CSV-based tables.
verified_df = verified_dataset.to_pandas()

In [72]:
# Functionality table for the verified functions, read from DIR.
functionality_verified = pd.read_csv(
    DIR / "functionality-verified.csv",
    dialect=CODE_DIALECT,
)

# Take the "function" column and expose it under the name "code".
code_verified = verified_df["function"].rename("code")

In [73]:
# The two inputs are joined column-wise, so they must have one row per
# function each. pd.concat(axis=1) would pad a shorter input with NaN
# rather than fail, so check the row counts first.
if len(code_verified) != len(functionality_verified):
    raise ValueError(
        "Verified tables differ in row count: "
        f"code={len(code_verified)}, functionality={len(functionality_verified)}"
    )

# Join column-wise and label every verified function as "no vulnerability".
combined_verified_df = pd.concat(
    [code_verified, functionality_verified],
    axis=1,
).assign(type="no vulnerability")

In [74]:
# Plain-text preview of the merged verified table.
verified_preview = str(combined_verified_df)
print(verified_preview)

                                                  code  \
0    ```\nconstructor(string memory name_, string m...   
1    ```\nconstructor(address[] memory tokensToList...   
2            ```\nreceive() external payable {}\n```\n   
3    ```\nfunction swap(\n  address inputToken,\n  ...   
4    ```\nfunction addLiquidity(address inputToken,...   
..                                                 ...   
933  ```\nfunction firstPublicSaleBatchMint(uint256...   
934  ```\nfunction lastPublicSaleBatchMint(uint256 ...   
935  ```\nfunction sendPaymentForReimbursement() ex...   
936  ```\nfunction withdrawPayment() external retur...   
937  ```\nfunction reimbursementAirdrop(\n    addre...   

                                         functionality              type  
0    Code block 1:\n1.  This code block is a constr...  no vulnerability  
1    Code block 1:\n1.  The code is a constructor f...  no vulnerability  
2    Code block 1:\n1.  The code block is a functio...  no vulnerability  
3  

## Combine vulnerable and verified data

In [75]:
# Stack the vulnerable rows on top of the verified rows and renumber the index from 0.
combined_df = pd.concat(
    objs=[combined_vulnerable_df, combined_verified_df],
    axis=0,
    ignore_index=True,
)


In [76]:
# Show the first five rows of the combined table.
combined_df.head(n=5)

Unnamed: 0,code,description,recommendation,type,functionality
0,"```\n(\n netPnLE36,\n lenderProfitUSDVal...","When a position is liquidated, the liquidator ...","To address the vulnerability, we need to ensur...",token locking,Code block 1:\n1. It calculates some values r...
1,```\nuint deltaTime;\n\nif (pos.startLiqTimest...,This vulnerability allows an attacker to manip...,"To mitigate this vulnerability, consider imple...",arithmetic manipulation,Code block 1:\n1. It checks if the start time...
2,"```\nfunction _mintInternal(address _receiver,...",This vulnerability allows an attacker to manip...,To prevent the first depositor from being fron...,rounding error,Code block 1:\n1. It sets a timestamp for a s...
3,```\nuint inputTotalUSDValueE36;\n\nfor (uint ...,This vulnerability allows an attacker to manip...,"To mitigate the vulnerability, it is recommend...",arithmetic manipulation,Code block 1:\n1. Initialize a variable `inpu...
4,```\nfunction increaseLiquidity(IncreaseLiquid...,The UniswapV3NPM contract allows users to incr...,To prevent an attacker from increasing the liq...,arbitrary liquidity manipulation,Code block 1:\n1. This function is used to in...


In [77]:
# Show the last five rows of the combined table.
combined_df.tail(n=5)

Unnamed: 0,code,description,recommendation,type,functionality
2904,```\nfunction firstPublicSaleBatchMint(uint256...,,,no vulnerability,Code block 1:\n1. The function is triggered w...
2905,```\nfunction lastPublicSaleBatchMint(uint256 ...,,,no vulnerability,Code block 1:\n1. The function is triggered w...
2906,```\nfunction sendPaymentForReimbursement() ex...,,,no vulnerability,Code block 1:\n1. The function `sendPaymentFo...
2907,```\nfunction withdrawPayment() external retur...,,,no vulnerability,Code block 1:\n1. The function `withdrawPayme...
2908,```\nfunction reimbursementAirdrop(\n addre...,,,no vulnerability,Code block 1:\n1. The function `reimbursement...


In [79]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42

# Write the full combined table as CSV, including the header row and the row index.
combined_df.to_csv(DIR / "enhanced-dataset.csv", header=True, index=True)

# Convert to a Hugging Face Dataset, hold out 15% as the test split, and save both splits.
combined_dataset = Dataset.from_pandas(combined_df)
split_dataset = combined_dataset.train_test_split(test_size=0.15, seed=SPLIT_SEED)
split_dataset.save_to_disk(DATASET_DIR / DATASET_NAME)

Saving the dataset (0/1 shards):   0%|          | 0/2472 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/437 [00:00<?, ? examples/s]