In [None]:
# Name of the sub-directory (under DATASET_DIR) the final split dataset is saved to.
DATASET_NAME = 'enhanced-vulnerability-audits'

# Add root to path
# Everything in sys.path[0] up to and including the 'data-lifecycle' directory
# is treated as the project root and put at the front of sys.path
# (presumably so that the `common` package can be imported -- confirm).
import sys
import warnings

current_path = sys.path[0]
root_name = 'data-lifecycle'
root_index = current_path.find(root_name)
if root_index == -1:
    # str.find() returns -1 on a miss; slicing with that value would yield a
    # truncated, meaningless prefix of current_path, so leave sys.path alone.
    root_path = None
    warnings.warn(
        f"'{root_name}' not found in sys.path[0] ({current_path!r}); "
        "sys.path was left unchanged."
    )
else:
    root_path = current_path[:root_index + len(root_name)]
    if root_path not in sys.path:
        sys.path.insert(0, root_path)

import pandas as pd
import csv
from pathlib import Path
from datasets import Dataset, load_dataset
import pandas as pd
from common.directories import DATASET_DIR

# Base directory for the CSV files this notebook reads and writes:
# the parent of the current working directory.
DIR = Path("..")

In [None]:
CODE_DIALECT = "code_dialect"

# Comma-separated dialect; fields are wrapped in double quotes only when needed.
_code_dialect_options = dict(
    quoting=csv.QUOTE_MINIMAL,
    quotechar='"',
    delimiter=",",
)
csv.register_dialect(CODE_DIALECT, **_code_dialect_options)

### Load vulnerable dataset

In [None]:
DIALECT = "db_dialect"

# Comma-separated dialect with minimal quoting that uses a backslash as the
# escape character.
csv.register_dialect(DIALECT, escapechar="\\", quoting=csv.QUOTE_MINIMAL, delimiter=",")

In [None]:
def _load_csv(file_name, dialect_name):
    """Read `file_name` from DIR into a DataFrame using the named csv dialect."""
    return pd.read_csv(DIR / file_name, dialect=dialect_name)


# The code and functionality files use CODE_DIALECT; the other three use DIALECT.
code = _load_csv("cleaned-up-code.csv", CODE_DIALECT)
descriptions = _load_csv("enhanced-vulnerability-descriptions.csv", DIALECT)
mitigations = _load_csv("enhanced-recommendations.csv", DIALECT)
types = _load_csv("vulnerability-types.csv", DIALECT)
functionality = _load_csv("functionality.csv", CODE_DIALECT)

In [None]:
# Row count of each loaded frame, one per line, in load order.
for loaded_frame in (code, descriptions, mitigations, types, functionality):
    print(len(loaded_frame))

In [None]:
# Place the five frames side by side (column-wise); rows are aligned on index.
vulnerable_parts = [code, descriptions, mitigations, types, functionality]
combined_vulnerable_df = pd.concat(vulnerable_parts, axis="columns")

In [None]:
# Preview the column-wise combined frame. A bare last expression lets the
# notebook render its rich table display instead of plain printed text.
combined_vulnerable_df

### Load verified data

In [None]:
# Fetch the "verified-functions" configuration (train split) and convert it to
# a DataFrame. `escapechar` is passed through as an extra keyword to
# load_dataset -- presumably consumed by the CSV loader; confirm.
verified_dataset = load_dataset(
    "msc-smart-contract-audition/vulnerable-functions-base",
    name="verified-functions",
    split="train",
    escapechar="\\",
)
verified_df = verified_dataset.to_pandas()

In [None]:
# Take the 'function' column and rename the resulting Series to "code".
function_column = verified_df["function"]
code_verified = function_column.rename("code")

# Functionality data for the verified functions, read with CODE_DIALECT.
functionality_verified = pd.read_csv(
    DIR / "functionality-verified.csv",
    dialect=CODE_DIALECT,
)

In [None]:
# Join the verified code with its functionality column-wise, then set the
# "type" column to the constant label "no vulnerability" on every row.
combined_verified_df = pd.concat(
    [code_verified, functionality_verified],
    axis="columns",
).assign(type="no vulnerability")

In [None]:
# Preview the verified frame. A bare last expression lets the notebook render
# its rich table display instead of plain printed text.
combined_verified_df

## Combine

In [None]:
# Stack the vulnerable and verified frames row-wise and renumber the index
# from zero.
frames_to_stack = [combined_vulnerable_df, combined_verified_df]
combined_df = pd.concat(frames_to_stack, axis="index", ignore_index=True)


In [None]:
# First five rows of the combined frame.
combined_df.head(n=5)

In [None]:
# Last five rows of the combined frame.
combined_df.tail(n=5)

In [None]:
# Fixed seed so the train/test partition is identical on every
# Restart & Run All; without it the shuffle differs between runs.
SPLIT_SEED = 42

# Write the full combined table as CSV (with header row and index column).
combined_df.to_csv(DIR / "enhanced-dataset.csv", header=True, index=True)

# Convert to a `datasets.Dataset`, hold out 15% of the rows as the test split,
# and save both splits under DATASET_DIR / DATASET_NAME.
combined_dataset = Dataset.from_pandas(combined_df)
split_dataset = combined_dataset.train_test_split(test_size=0.15, seed=SPLIT_SEED)
split_dataset.save_to_disk(DATASET_DIR / DATASET_NAME)