In [1]:
# ------------------------------------------------------------------
# 0. Imports & paths
# ------------------------------------------------------------------
import json, pathlib
import pandas as pd
from sklearn.model_selection import train_test_split

# Source dataset; a ".json" file holding one JSON array is accepted as well.
RAW_FILE: pathlib.Path = pathlib.Path("../data/processed/master_data.jsonl")

# Folder that receives the split files. It is created up front, together with
# any missing parent folders, and an already existing folder is not an error.
SPLIT_DIR: pathlib.Path = pathlib.Path("../data/splits")
SPLIT_DIR.mkdir(exist_ok=True, parents=True)

# ------------------------------------------------------------------
# 1. Load JSONL *or* JSON array
# ------------------------------------------------------------------
def load_raw(path: pathlib.Path) -> pd.DataFrame:
    """Load a dataset file into a DataFrame.

    Two layouts are supported, selected by the file extension
    (compared case-insensitively):

    * ``.jsonl`` - one JSON object per line; whitespace-only lines are skipped.
    * ``.json``  - the whole file is a single JSON document (e.g. an array of
      objects) that is handed to ``pd.DataFrame`` as-is.

    Files are always decoded as UTF-8 rather than with the platform default
    encoding, so non-ASCII characters load the same way on every OS.

    Args:
        path: Location of the file to read.

    Returns:
        A DataFrame built from the parsed rows.

    Raises:
        ValueError: If the extension is neither ``.jsonl`` nor ``.json``.
    """
    suffix = path.suffix.lower()
    if suffix == ".jsonl":
        with path.open(encoding="utf-8") as f:
            rows = [json.loads(line) for line in f if line.strip()]
    elif suffix == ".json":
        rows = json.loads(path.read_text(encoding="utf-8"))
    else:
        raise ValueError(f"Unsupported extension: {path.suffix}")
    return pd.DataFrame(rows)

# Read the master dataset once, then report its row count and column names.
df = load_raw(RAW_FILE)
print("Loaded", df.shape[0], "rows -> columns:", df.columns.tolist())

# ------------------------------------------------------------------
# 2. Column constants (from your schema)
# ------------------------------------------------------------------
VULN_COL  = "vulnerable_code"
FIXED_COL = "fixed_code"
LABEL_COL = "is_vuln"
CODE_COL  = "Function before"        # name used by Task-5 notebook
CVSS_COL = "cvss"
# NOTE: if the dataset ships a real score column (e.g. "cvss_v3"), point
# CVSS_COL at that column name instead.

# Validate the schema with an explicit exception instead of `assert`:
# assertions are stripped when Python runs with -O, and "cwe_id" is needed
# by the mapping below, so it is checked here as well.
required_cols = {VULN_COL, FIXED_COL, "cwe_id"}
missing_cols = required_cols.difference(df.columns)
if missing_cols:
    raise ValueError(f"Expected columns not found: {sorted(missing_cols)}")

# simple mapping by CWE (expand as needed)
cwe2cvss = {
    "CWE-119": 7.5,   # buffer overflow → high
    "CWE-20":  6.0,   # input validation → medium
}
# Derive a per-record score from the CWE id; ids that are not in the mapping
# fall back to 5.0. The result is stored in the literal "cvss" column.
df["cvss"] = df["cwe_id"].map(cwe2cvss).fillna(5.0)  # default 5.0
# ------------------------------------------------------------------
# 3. Build labelled table
# ------------------------------------------------------------------
# Score of each source record, or a flat 5.0 when the score column is absent.
# It is attached to both halves BEFORE concatenation: assigning df[CVSS_COL]
# to the concatenated frame afterwards aligns on the index, and because that
# frame is twice as long as `df`, every row past len(df) would end up as NaN.
cvss_per_record = df[CVSS_COL] if CVSS_COL in df.columns else 5.0

# Vulnerable variant of every record -> label 1.
vuln_df = (
    df[[VULN_COL]]
    .rename(columns={VULN_COL: CODE_COL})
    .assign(**{LABEL_COL: 1, "cvss": cvss_per_record})
)

# Fixed variant of every record -> label 0 (carries the same record score).
clean_df = (
    df[[FIXED_COL]]
    .rename(columns={FIXED_COL: CODE_COL})
    .assign(**{LABEL_COL: 0, "cvss": cvss_per_record})
)

combo_df = (
    pd.concat([vuln_df, clean_df], ignore_index=True)
      .dropna(subset=[CODE_COL])          # drop rows whose code is missing (NaN/None)
)
print("Labelled table:", combo_df.shape)

# ------------------------------------------------------------------
# 4. Stratified 80 / 10 / 10 split
# ------------------------------------------------------------------
# First cut: hold out 20% of the rows, keeping the label ratio unchanged.
train_df, temp_df = train_test_split(
    combo_df, test_size=0.20, random_state=42, stratify=combo_df[LABEL_COL]
)

# Second cut: halve the hold-out into validation and test sets, again
# stratified on the label (each is roughly 10% of the full table).
valid_df, test_df = train_test_split(
    temp_df, test_size=0.50, random_state=42, stratify=temp_df[LABEL_COL]
)

print(f"train:{len(train_df)}  valid:{len(valid_df)}  test:{len(test_df)}")

# ------------------------------------------------------------------
# 5. Write out JSONL files
# ------------------------------------------------------------------
def to_jsonl(frame: pd.DataFrame, outfile: pathlib.Path) -> None:
    """Write *frame* to *outfile* as UTF-8 JSON Lines, one object per row.

    Each object has three keys: the code column (``CODE_COL``), the label
    column (``LABEL_COL``, written as an int) and ``"cvss"`` (written as a
    float). A missing score (NaN / None) is written as JSON ``null``:
    ``json.dumps`` would otherwise emit the bare token ``NaN``, which is not
    valid JSON and is rejected by strict parsers.

    Args:
        frame: Table containing the ``CODE_COL``, ``LABEL_COL`` and ``"cvss"``
            columns.
        outfile: Destination path; an existing file is overwritten.
    """
    with outfile.open("w", encoding="utf-8") as w:
        for _, row in frame.iterrows():
            raw_score = row["cvss"]
            record = {
                CODE_COL: row[CODE_COL],
                LABEL_COL: int(row[LABEL_COL]),
                "cvss": None if pd.isna(raw_score) else float(raw_score),
            }
            # ensure_ascii=False keeps non-ASCII source characters readable.
            w.write(json.dumps(record, ensure_ascii=False) + "\n")

# Persist every split under SPLIT_DIR, in train / valid / test order.
for split_frame, split_name in (
    (train_df, "train.jsonl"),
    (valid_df, "valid.jsonl"),
    (test_df, "test.jsonl"),
):
    to_jsonl(split_frame, SPLIT_DIR / split_name)

print("✅  JSONL splits written to", SPLIT_DIR)


Loaded 352 rows -> columns: ['cwe_id', 'cwe_description', 'vulnerable_code', 'fixed_code', 'analysis', ',cwe_description']
Labelled table: (704, 3)
train:563  valid:70  test:71
✅  JSONL splits written to ..\data\splits
