In [1]:
# %% [markdown]
# ## Task 3 – Embed & Index
# Converts every code snippet into a 128-d CodeBERT vector and stores it in a FAISS
# index for instant similarity search.

# %% [code] ▸ 1 Imports & paths
import json, pathlib, time
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import faiss, torch

from src.embed_utils import CodeBERTEmbedder, write_faiss_and_meta

RAW_FILE  = pathlib.Path("../data/processed/master_data.jsonl")   # or one of the split files
OUT_DIR   = pathlib.Path("../data/embeddings")
OUT_DIR.mkdir(parents=True, exist_ok=True)

CODE_FIELD = "vulnerable_code"   # same as Task 5 expects
VECTOR_DIM = 128
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# %% [code] ▸ 2 Load raw or split files
def load_jsonl(path: pathlib.Path) -> pd.DataFrame:
    with path.open() as f:
        rows = [json.loads(l) for l in f if l.strip()]
    return pd.DataFrame(rows)

df = load_jsonl(RAW_FILE) if RAW_FILE.suffix == ".jsonl" else pd.concat(
        [load_jsonl(p) for p in pathlib.Path("../data/splits").glob("*.jsonl")],
        ignore_index=True
     )
print("Loaded", len(df), "rows")
display(df.head(2))

# %% [code] ▸ 3 Embed
embed = CodeBERTEmbedder(device=DEVICE)
vecs, meta = [], []

t0 = time.time()
for i, row in tqdm(df.iterrows(), total=len(df), desc="Embedding"):
    vecs.append(embed(row[CODE_FIELD]))
    meta.append({
        "row_id": int(i),
        "cwe_id": row.get("cwe_id", "NA"),
        "label":  row.get("is_vuln", "NA")      # 1 / 0 if present
    })
vecs = np.stack(vecs)                          # (N, 768)
print(f"Embedding done in {time.time()-t0:.1f}s")

# ▸ 4 PCA → at most min(samples, 768) dims
SAMPLES, ORIG_DIM = vecs.shape   # e.g. (39, 768)

if VECTOR_DIM >= ORIG_DIM:          # asking for >=768 → skip PCA
    print("Skipping PCA (VECTOR_DIM ≥ original dim).")
    final_vecs = normalize(vecs)

else:
    # ensure n_components ≤ samples-1  (sklearn restriction)
    n_components = min(VECTOR_DIM, SAMPLES - 1)
    if n_components < VECTOR_DIM:
        print(f"Dataset is small (N={SAMPLES}); "
              f"shrinking PCA dim to {n_components}.")
    else:
        print(f"Running PCA to {n_components} dims …")

    from sklearn.decomposition import PCA
    final_vecs = PCA(n_components=n_components,
                     random_state=42).fit_transform(vecs).astype(np.float32)
    final_vecs = normalize(final_vecs)

# %% [code] ▸ 5 Save index + metadata & smoke test
write_faiss_and_meta(
    vecs,
    meta,
    out_index=OUT_DIR / "embeddings.faiss",
    out_meta =OUT_DIR / "embeddings.jsonl"
)

index = faiss.read_index(str(OUT_DIR / "embeddings.faiss"))
D, I = index.search(vecs[:1], k=3)
with open(OUT_DIR / "embeddings.jsonl") as f:
    meta_list = [json.loads(l) for l in f]
print("Top-3 CWE IDs:", [meta_list[idx]["cwe_id"] for idx in I[0]])


  from .autonotebook import tqdm as notebook_tqdm


Loaded 352 rows


Unnamed: 0,cwe_id,cwe_description,vulnerable_code,fixed_code,analysis,",cwe_description"
0,CWE-119,The product performs operations on a memory bu...,#include <string.h>\n#include <stdint.h>\n#inc...,#include <string.h>\n#include <stdint.h>\n#inc...,,
1,CWE-119,The product performs operations on a memory bu...,#include <string.h>\n#include <stdint.h>\n#inc...,#include <string.h>\n#include <stdint.h>\n#inc...,,


Embedding: 100%|██████████| 352/352 [01:39<00:00,  3.55it/s]

Embedding done in 99.1s
Running PCA to 128 dims …
💾  Saved 352 × 768 index → ..\data\embeddings\embeddings.faiss
Top-3 CWE IDs: ['CWE-119', 'CWE-119', 'CWE-119']



