In [6]:
# ============================================
# CONFIG — edit as needed
# ============================================
IN_DIR = "../Datasets/Ingestor"  # input folder of CSVs
COMBINED_OUT = "../Datasets/XGB_Train/pairs_combined_with_ids.csv"  # csv or .parquet

# Core
GLOB_PATTERN = "*.csv"
RECURSIVE = False
ID_COL = None              # e.g., "id"; if None, synthetic id "filename::rowindex" is created
NEG_PER_POS = 9            # 1 positive + 9 negatives per anchor row

# Column split options
COLUMN_SPLIT_MODE = "half"       # "half" = split numeric columns into two halves
COLUMN_SPLIT_SHUFFLE = False     # shuffle numeric columns before splitting
EXPLICIT_COLS_A = []             # optional explicit assignment
EXPLICIT_COLS_B = []             # optional explicit assignment

# Row cap per CSV
ROW_LIMIT = 10_000               # cap each CSV before pairing
ROW_LIMIT_MODE = "head"          # "head" or "sample"

# Date parsing → Unix timestamp (only for non-numeric cols)
DATE_MIN_VALID_FRACTION = 0.50   # convert to date if ≥50% parse successfully
NUMERIC_MIN_VALID_FRACTION = 0.98  # if ≥98% values numeric -> treat as numeric (don’t date-convert)
UNIX_UNIT = "s"                  # "s" for seconds (recommended) or "ms" for milliseconds

# Logging
LOG_EVERY_FILES = 10
LOG_LEVEL = "INFO"
LOG_FILE = None

# Columns to DROP from the OUTPUT file
DROP_FROM_OUTPUT = [
    "num_nan_a", "num_nan_b", "num_nan_mismatch",
    "row_idx_A", "row_idx_B", "label_type", "source_file"
]


# ============================================
# IMPLEMENTATION (notebook-friendly)
# ============================================
from __future__ import annotations

import glob
import logging
import os
import time
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd


def setup_logging(level: str = "INFO", logfile: Optional[str] = None):
    lvl = getattr(logging, level.upper(), logging.INFO)
    fmt = "%Y-%m-%d %H:%M:%S | %(levelname)-7s | %(message)s"
    handlers = [logging.StreamHandler()]
    if logfile:
        os.makedirs(os.path.dirname(os.path.abspath(logfile)) or ".", exist_ok=True)
        handlers.append(logging.FileHandler(logfile, mode="w", encoding="utf-8"))
    logging.basicConfig(level=lvl, format=fmt, handlers=handlers)


# ---------- preprocessing helpers ----------
def _clean_numeric_like_text(s: pd.Series) -> pd.Series:
    """Remove common decorators like commas/spaces before numeric coercion."""
    return (s.astype(str)
              .str.replace(",", "", regex=False)
              .str.replace(" ", "", regex=False)
              .str.replace("\u00A0", "", regex=False))  # non-breaking space


def detect_numeric_columns(df: pd.DataFrame, *, id_col: Optional[str], min_valid_fraction: float) -> List[str]:
    """
    Columns that are already numeric (or safely numeric after cleaning) — we will NEVER date-convert these.
    A column is 'numeric' if pd.to_numeric(..., errors='coerce') yields >= min_valid_fraction non-NaN.
    """
    cols = [c for c in df.columns if (id_col is None or c != id_col)]
    numeric_cols = []
    for c in cols:
        s_num = pd.to_numeric(_clean_numeric_like_text(df[c]), errors="coerce")
        if s_num.notna().mean() >= min_valid_fraction:
            numeric_cols.append(c)
    return numeric_cols


def datetime_series_to_unix(dt: pd.Series, unit: str) -> pd.Series:
    """
    Convert a timezone-aware datetime64[ns] Series to Unix timestamps with NaN for NaT,
    without using deprecated .view on Series.
    """
    # ndarray of datetime64[ns]
    arr_dt = dt.to_numpy(dtype="datetime64[ns]")
    # Mask NaT
    mask_nat = np.isnat(arr_dt)
    # Convert to int ns (NaT becomes very negative int -> we’ll null it)
    arr_ns = arr_dt.astype("datetime64[ns]").astype("int64")
    arr_ns = arr_ns.astype("float64")
    arr_ns[mask_nat] = np.nan
    if unit == "ms":
        return pd.Series(arr_ns / 1e6, index=dt.index, dtype="float64")
    else:  # seconds
        return pd.Series(arr_ns / 1e9, index=dt.index, dtype="float64")


def convert_only_non_numeric_dates_to_unix(
    df: pd.DataFrame,
    *,
    id_col: Optional[str],
    numeric_min_valid_fraction: float,
    date_min_valid_fraction: float,
    unit: str,
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Only columns that are NOT numeric (per numeric_min_valid_fraction) are tested as dates.
    If ≥ date_min_valid_fraction parse, convert them to Unix timestamps.
    Returns the converted DataFrame and the list of columns converted.
    """
    df = df.copy()
    candidate_cols = [c for c in df.columns if (id_col is None or c != id_col)]

    # 1) Find columns that are already numeric enough — skip them
    numeric_cols = set(detect_numeric_columns(df, id_col=id_col, min_valid_fraction=numeric_min_valid_fraction))

    converted = []
    for c in candidate_cols:
        if c in numeric_cols:
            continue  # already numeric — do not date-convert

        # 2) Try parse as datetime (strict parser by default in recent pandas)
        dt = pd.to_datetime(df[c], errors="coerce", utc=True)
        frac_ok = dt.notna().mean()
        if frac_ok >= date_min_valid_fraction:
            df[c] = datetime_series_to_unix(dt, unit=unit)
            converted.append(c)
        # else: leave it as is (likely categorical/text)

    return df, converted


def as_numeric(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Coerce given columns to numeric using a lenient cleaning pass."""
    out = df.copy()
    for c in cols:
        out[c] = pd.to_numeric(_clean_numeric_like_text(out[c]), errors="coerce")
    return out


def zscore(arr: np.ndarray) -> np.ndarray:
    m = np.nanmean(arr, axis=0)
    s = np.nanstd(arr, axis=0)
    s[s == 0] = np.nan
    Z = (arr - m) / s
    return np.where(np.isnan(Z), 0.0, Z)


def percentile_rank_matrix(arr: np.ndarray) -> np.ndarray:
    n, d = arr.shape
    out = np.full((n, d), 0.5, dtype=float)
    for j in range(d):
        col = arr[:, j]
        mask = ~np.isnan(col)
        if mask.sum() <= 1:
            continue
        order = np.argsort(col[mask], kind="mergesort")
        ranks = np.empty_like(order, dtype=float)
        ranks[order] = np.arange(order.size)
        pct = ranks / (mask.sum() - 1)
        out[mask, j] = pct
    return out


def safe_ratio(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    den = np.where(np.abs(b) < 1e-12, np.nan, b)
    return a / den


def pair_features_numeric(a_raw, b_raw, a_z, b_z, a_pct, b_pct) -> dict:
    """Compute pairwise similarity/difference features for two numeric vectors."""
    absdiff = np.abs(a_raw - b_raw)
    reldiff = np.abs(safe_ratio(a_raw - b_raw, (np.abs(a_raw) + np.abs(b_raw)) / 2 + 1e-12))
    zdiff   = np.abs(a_z - b_z)
    pcdiff  = np.abs(a_pct - b_pct)

    def nanmean(x):
        m = np.nanmean(x)
        return float(m) if np.isfinite(m) else np.nan
    def nansum(x): return float(np.nansum(x))

    L1_raw = nansum(absdiff)
    L2_raw = float(np.sqrt(np.nansum(absdiff**2)))
    L1_z   = nansum(zdiff)
    L2_z   = float(np.sqrt(np.nansum(zdiff**2)))
    L1_p   = nansum(pcdiff)
    L2_p   = float(np.sqrt(np.nansum(pcdiff**2)))

    def safe_cos(a, b):
        na = np.linalg.norm(a); nb = np.linalg.norm(b)
        if na < 1e-12 or nb < 1e-12: return np.nan
        return float(np.dot(a, b) / (na * nb))

    return {
        "absdiff_mean": nanmean(absdiff),
        "absdiff_median": float(np.nanmedian(absdiff)),
        "reldiff_mean": nanmean(reldiff),
        "zdiff_mean": nanmean(zdiff),
        "pcdiff_mean": nanmean(pcdiff),
        "L1_raw": L1_raw, "L2_raw": L2_raw,
        "L1_z": L1_z,     "L2_z": L2_z,
        "L1_pct": L1_p,   "L2_pct": L2_p,
        # diagnostics (we will DROP them from the final file)
        "num_nan_a": float(np.isnan(a_raw).sum()),
        "num_nan_b": float(np.isnan(b_raw).sum()),
        "num_nan_mismatch": float((np.isnan(a_raw) ^ np.isnan(b_raw)).sum()),
    }


# ---------- column split ----------
def split_numeric_columns(
    df: pd.DataFrame,
    *,
    id_col: Optional[str],
    mode: str = "half",
    shuffle: bool = False,
    seed: int = 42,
    explicit_A: Optional[List[str]] = None,
    explicit_B: Optional[List[str]] = None,
    min_valid_fraction: float = 0.01,  # keep cols with at least 1% non-NaN after coercion
) -> Tuple[List[str], List[str]]:
    """
    Return (cols_A, cols_B) disjoint numeric feature lists.
    Drops columns that become all-NaN after numeric coercion.
    """
    all_cols = df.columns.tolist()
    feature_cols = [c for c in all_cols if (id_col is None or c != id_col)]

    # Coerce and measure coverage
    valid_frac = {}
    for c in feature_cols:
        s = pd.to_numeric(_clean_numeric_like_text(df[c]), errors="coerce")
        valid_frac[c] = s.notna().mean()

    numeric_ok = [c for c in feature_cols if valid_frac[c] >= min_valid_fraction]

    if explicit_A and explicit_B:
        missing = [c for c in (explicit_A + explicit_B) if c not in numeric_ok]
        if missing:
            raise ValueError(f"Explicit columns not usable (non-numeric or too many NaNs): {missing}")
        overlap = set(explicit_A).intersection(explicit_B)
        if overlap:
            raise ValueError(f"Explicit A/B overlap: {overlap}")
        return list(explicit_A), list(explicit_B)

    cols = numeric_ok.copy()
    if shuffle:
        rng = np.random.default_rng(seed)
        rng.shuffle(cols)

    dropped = [c for c in feature_cols if c not in numeric_ok]
    if dropped:
        logging.warning(f"Dropping {len(dropped)} non-usable column(s): {dropped[:10]}{'...' if len(dropped)>10 else ''}")

    mid = len(cols) // 2
    cols_A, cols_B = cols[:mid], cols[mid:]
    if len(cols_A) == 0 or len(cols_B) == 0:
        raise ValueError("Column split resulted in an empty side; need at least two usable numeric columns.")
    return cols_A, cols_B


# ---------- builders (column-split pairing) ----------
def build_pairs_from_single_df_column_split(
    df: pd.DataFrame,
    *,
    id_col: Optional[str],
    negatives_per_positive: int,
    seed: int,
    source_file: str,
    column_split_mode: str = "half",
    column_split_shuffle: bool = False,
    explicit_cols_A: Optional[List[str]] = None,
    explicit_cols_B: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """
    One CSV:
    - ensure ID,
    - convert ONLY non-numeric, date-like columns to Unix,
    - split numeric columns into A/B,
    - for each row i: pos = (i,i); negs = (i, j≠i) K times,
    - compute features; return X, y, idx.
    """
    rng = np.random.default_rng(seed)

    # Ensure ID column
    if id_col is None or id_col not in df.columns:
        df = df.copy()
        df["_row_id"] = np.arange(len(df)).astype(str)
        stem = os.path.splitext(os.path.basename(source_file))[0]
        df["_row_id"] = stem + "::" + df["_row_id"]
        id_col_eff = "_row_id"
        logging.debug(f"[{source_file}] Synthetic ID '_row_id' created with file prefix")
    else:
        id_col_eff = id_col

    # 1) Convert ONLY non-numeric, date-like columns to Unix timestamp
    df, converted = convert_only_non_numeric_dates_to_unix(
        df,
        id_col=id_col_eff,
        numeric_min_valid_fraction=NUMERIC_MIN_VALID_FRACTION,
        date_min_valid_fraction=DATE_MIN_VALID_FRACTION,
        unit=UNIX_UNIT,
    )
    if converted:
        logging.info(f"Converted {len(converted)} non-numeric date-like column(s) to Unix {UNIX_UNIT}: {converted}")

    # 2) Split numeric columns into A and B
    cols_A, cols_B = split_numeric_columns(
        df, id_col=id_col_eff,
        mode=column_split_mode, shuffle=column_split_shuffle, seed=seed,
        explicit_A=explicit_cols_A, explicit_B=explicit_cols_B,
    )
    logging.info(f"[{os.path.basename(source_file)}] Column split: A={len(cols_A)} cols, B={len(cols_B)} cols")

    # 3) Coerce chosen cols to numeric (cleaning commas/spaces)
    df = as_numeric(df, cols_A + cols_B)

    A = df[[id_col_eff] + cols_A].copy()
    B = df[[id_col_eff] + cols_B].copy()
    XA = A[cols_A].to_numpy(dtype=float)
    XB = B[cols_B].to_numpy(dtype=float)

    # 4) Standardize per side
    ZA = zscore(XA.copy()); ZB = zscore(XB.copy())
    PA = percentile_rank_matrix(XA.copy()); PB = percentile_rank_matrix(XB.copy())

    n = len(df)
    if n < 2:
        raise ValueError("Need at least 2 rows to form negatives.")

    # 5) Build pairs for every row
    pos_pairs, neg_pairs = [], []
    all_indices = np.arange(n)
    for i in range(n):
        pos_pairs.append((i, i))
        size = min(negatives_per_positive, n - 1)
        if size > 0:
            candidates = np.delete(all_indices, i)
            choices = rng.choice(candidates, size=size, replace=False)
            for j in choices:
                neg_pairs.append((i, j))

    # 6) Compute features + labels + index
    X_rows, y_rows, idx_rows = [], [], []
    for (i, j) in pos_pairs:
        feats = pair_features_numeric(XA[i], XB[j], ZA[i], ZB[j], PA[i], PB[j])
        X_rows.append(feats); y_rows.append(1)
        idx_rows.append({
            "idA": A[id_col_eff].iloc[i], "idB": B[id_col_eff].iloc[j],
            "row_idx_A": i, "row_idx_B": j, "label_type": "pos",
            "source_file": os.path.basename(source_file)
        })
    for (i, j) in neg_pairs:
        feats = pair_features_numeric(XA[i], XB[j], ZA[i], ZB[j], PA[i], PB[j])
        X_rows.append(feats); y_rows.append(0)
        idx_rows.append({
            "idA": A[id_col_eff].iloc[i], "idB": B[id_col_eff].iloc[j],
            "row_idx_A": i, "row_idx_B": j, "label_type": "neg",
            "source_file": os.path.basename(source_file)
        })

    return pd.DataFrame(X_rows), pd.Series(y_rows, name="label"), pd.DataFrame(idx_rows)


def _apply_row_cap(df: pd.DataFrame, cap: int, mode: str, seed: int) -> pd.DataFrame:
    if cap is None or len(df) <= cap:
        return df
    if mode == "sample":
        return df.sample(n=cap, random_state=seed)
    return df.head(cap)


def save_combined_with_ids_and_drop(X: pd.DataFrame, y: pd.Series, idx: pd.DataFrame, out_path: str, drop_cols: List[str]):
    """
    Concatenate IDs/meta + features + label, drop requested columns, and save.
    """
    df_combined = pd.concat(
        [idx.reset_index(drop=True), X.reset_index(drop=True), y.reset_index(drop=True)], axis=1
    )
    # Drop requested columns if present
    drop_present = [c for c in drop_cols if c in df_combined.columns]
    if drop_present:
        df_combined = df_combined.drop(columns=drop_present)
        logging.info(f"Dropped from output: {drop_present}")

    os.makedirs(os.path.dirname(os.path.abspath(out_path)) or ".", exist_ok=True)
    ext = os.path.splitext(out_path)[1].lower()
    if ext == ".parquet":
        df_combined.to_parquet(out_path, index=False)
    else:
        df_combined.to_csv(out_path, index=False)
    logging.info(f"Combined saved: {out_path} | rows={len(df_combined):,}, cols={df_combined.shape[1]:,}")


def build_pairs_from_dir_column_split(
    in_dir: str,
    *,
    glob_pattern: str = "*.csv",
    recursive: bool = False,
    id_col: Optional[str] = None,
    negatives_per_positive: int = 9,
    seed: int = 42,
    combined_out: str = "Thesis II/Datasets/XGB_Train/pairs_combined_with_ids.csv",
    log_every_files: int = 10,
    row_limit: Optional[int] = None,
    row_limit_mode: str = "head",
    column_split_mode: str = "half",
    column_split_shuffle: bool = False,
    explicit_cols_A: Optional[List[str]] = None,
    explicit_cols_B: Optional[List[str]] = None,
):
    """
    Multi-CSV orchestrator (column split pipeline with selective date→Unix conversion).
    """
    t0 = time.perf_counter()

    # Find files
    search = os.path.join(in_dir, "**", glob_pattern) if recursive else os.path.join(in_dir, glob_pattern)
    files = sorted(glob.glob(search, recursive=recursive))
    if not files:
        raise FileNotFoundError(f"No files matched: {search}")
    logging.info(f"Found {len(files):,} CSV files")

    all_X, all_y, all_idx = [], [], []
    total_pos, total_neg = 0, 0

    for k, f in enumerate(files, 1):
        df = pd.read_csv(f)

        # Row cap
        original_rows = len(df)
        df = _apply_row_cap(df, row_limit, row_limit_mode, seed)
        if len(df) < original_rows:
            logging.info(f"[{k}/{len(files)}] {os.path.basename(f)} — capped {original_rows:,} -> {len(df):,} rows")

        if len(df) < 2:
            logging.warning(f"[{k}/{len(files)}] {os.path.basename(f)} has <2 rows after capping; skipping.")
            continue

        logging.info(f"[{k}/{len(files)}] {os.path.basename(f)} — rows={len(df):,}, cols={len(df.columns):,}")

        X, y, idx = build_pairs_from_single_df_column_split(
            df=df,
            id_col=id_col,
            negatives_per_positive=negatives_per_positive,
            seed=seed,
            source_file=f,
            column_split_mode=column_split_mode,
            column_split_shuffle=column_split_shuffle,
            explicit_cols_A=explicit_cols_A,
            explicit_cols_B=explicit_cols_B,
        )

        all_X.append(X); all_y.append(y); all_idx.append(idx)
        pos = int(y.sum()); neg = len(y) - pos
        total_pos += pos; total_neg += neg

        if k % log_every_files == 0:
            logging.info(f"  Progress: {k:,}/{len(files):,} files | "
                         f"pairs so far={total_pos+total_neg:,} (pos={total_pos:,}, neg={total_neg:,})")

    if not all_y:
        raise RuntimeError("No valid CSVs produced pairs (check input files).")

    # Concatenate
    X = pd.concat(all_X, axis=0, ignore_index=True)
    y = pd.concat(all_y, axis=0, ignore_index=True)
    idx = pd.concat(all_idx, axis=0, ignore_index=True)

    logging.info(f"FINAL — pairs={len(y):,} (pos={int(y.sum()):,}, neg={len(y)-int(y.sum()):,}), features={X.shape[1]:,}")

    # Save combined, dropping unwanted columns
    save_combined_with_ids_and_drop(X, y, idx, combined_out, drop_cols=DROP_FROM_OUTPUT)

    logging.info(f"Total elapsed: {time.perf_counter() - t0:.2f}s")
    return X, y, idx

In [None]:
# ============================================
# RUN — execute this cell
# ============================================
setup_logging(LOG_LEVEL, LOG_FILE)

X_all, y_all, idx_all = build_pairs_from_dir_column_split(
    in_dir=IN_DIR,
    glob_pattern=GLOB_PATTERN,
    recursive=RECURSIVE,
    id_col=ID_COL,
    negatives_per_positive=NEG_PER_POS,
    seed=42,
    combined_out=COMBINED_OUT,
    log_every_files=LOG_EVERY_FILES,
    row_limit=ROW_LIMIT,
    row_limit_mode=ROW_LIMIT_MODE,
    column_split_mode=COLUMN_SPLIT_MODE,
    column_split_shuffle=COLUMN_SPLIT_SHUFFLE,
    explicit_cols_A=EXPLICIT_COLS_A,
    explicit_cols_B=EXPLICIT_COLS_B,
)

print("Done.")
print("Saved:", COMBINED_OUT)
print("Shapes — X:", X_all.shape, "| y:", y_all.shape, "| idx:", idx_all.shape)


2025-09-28 22:50:37 | INFO    | Found 25 CSV files
2025-09-28 22:50:38 | INFO    | [1/25] AAPL.csv — capped 987,754 -> 10,000 rows
2025-09-28 22:50:38 | INFO    | [1/25] AAPL.csv — rows=10,000, cols=8
2025-09-28 22:50:38 | INFO    | Converted 1 non-numeric date-like column(s) to Unix s: ['ts']
2025-09-28 22:50:38 | INFO    | [AAPL.csv] Column split: A=4 cols, B=4 cols
2025-09-28 22:50:47 | INFO    | [2/25] AMZN.csv — capped 824,787 -> 10,000 rows
2025-09-28 22:50:47 | INFO    | [2/25] AMZN.csv — rows=10,000, cols=8
2025-09-28 22:50:47 | INFO    | Converted 1 non-numeric date-like column(s) to Unix s: ['ts']
2025-09-28 22:50:47 | INFO    | [AMZN.csv] Column split: A=4 cols, B=4 cols
2025-09-28 22:50:56 | INFO    | [3/25] BA.csv — capped 664,841 -> 10,000 rows
2025-09-28 22:50:56 | INFO    | [3/25] BA.csv — rows=10,000, cols=8
2025-09-28 22:50:56 | INFO    | Converted 1 non-numeric date-like column(s) to Unix s: ['ts']
2025-09-28 22:50:56 | INFO    | [BA.csv] Column split: A=4 cols, B=4 

Done.
Saved: ../Datasets/XGB_Train/pairs_combined_with_ids.csv
Shapes — X: (2500000, 14) | y: (2500000,) | idx: (2500000, 6)


In [1]:
# ===========================
# CONFIG
# ===========================
COMBINED_PATH = "../Datasets/XGB_Train/pairs_combined_with_ids.csv"
OUT_DIR = "../Datasets/XGB_Train"

TRAIN_FRACTION = 0.75
VAL_FRACTION   = 0.10
TEST_FRACTION  = 0.15
SEED = 42

assert abs(TRAIN_FRACTION + VAL_FRACTION + TEST_FRACTION - 1.0) < 1e-9, "Fractions must sum to 1.0"

# ===========================
# IMPLEMENTATION
# ===========================
import os
import pandas as pd
from sklearn.model_selection import train_test_split

os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(COMBINED_PATH)
assert "label" in df.columns, "Expected a 'label' column in the combined dataset."

# If you want to model only on the numeric feature columns, you can select them here.
# The splitter only needs 'label' to stratify, but we keep the full rows so you
# can decide later what columns to use in training.
y = df["label"]

# --- First split: train vs temp (val+test) ---
test_val_size = VAL_FRACTION + TEST_FRACTION
train_df, temp_df = train_test_split(
    df, test_size=test_val_size, random_state=SEED, stratify=y
)

# --- Second split: val vs test (from temp) ---
# We need to compute the *relative* size of val inside temp
rel_val = VAL_FRACTION / (VAL_FRACTION + TEST_FRACTION)
temp_y = temp_df["label"]
val_df, test_df = train_test_split(
    temp_df, test_size=(1 - rel_val), random_state=SEED, stratify=temp_y
)

# Sanity checks: class balance & sizes
def summarize(split_name, d):
    counts = d["label"].value_counts(dropna=False).sort_index()
    pct = (counts / len(d)).round(4)
    print(f"{split_name:>5} | rows={len(d):,} | label counts: {counts.to_dict()} | ratio: {pct.to_dict()}")

summarize("train", train_df)
summarize(" val ", val_df)
summarize("test ", test_df)

# Save splits (CSV). You can change to .parquet if preferred.
train_path = os.path.join(OUT_DIR, "train.csv")
val_path   = os.path.join(OUT_DIR, "val.csv")
test_path  = os.path.join(OUT_DIR, "test.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print("\nSaved:")
print(" -", train_path)
print(" -", val_path)
print(" -", test_path)


train | rows=1,875,000 | label counts: {0: 1687500, 1: 187500} | ratio: {0: 0.9, 1: 0.1}
 val  | rows=250,000 | label counts: {0: 225000, 1: 25000} | ratio: {0: 0.9, 1: 0.1}
test  | rows=375,000 | label counts: {0: 337500, 1: 37500} | ratio: {0: 0.9, 1: 0.1}

Saved:
 - ../Datasets/XGB_Train/train.csv
 - ../Datasets/XGB_Train/val.csv
 - ../Datasets/XGB_Train/test.csv


## XGB Model

In [2]:
# Imports + small GPU/CPU helper

import os
import numpy as np
import pandas as pd
from math import floor


from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score,
    precision_recall_fscore_support, confusion_matrix
)
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

def gpu_params_for_xgb():
    """
    Return (params_update_dict, description).
    Prefers GPU if your xgboost build supports it.
    """
    ver = tuple(int(x) for x in xgb.__version__.split(".")[:2])
    if ver >= (2, 0):
        # XGBoost 2.x
        return ({"device": "cuda", "tree_method": "hist"}, "XGB>=2: device=cuda, tree_method=hist")
    else:
        # XGBoost 1.x
        return ({"tree_method": "gpu_hist", "predictor": "gpu_predictor"}, "XGB<2: tree_method=gpu_hist, predictor=gpu_predictor")

def try_gpu_or_fallback(base_params):
    """Try a 1-iter train to confirm GPU; fall back to CPU hist if not available."""
    gpu_update, note = gpu_params_for_xgb()
    params = base_params.copy()
    params.update(gpu_update)
    try:
        dm = xgb.DMatrix(np.array([[0.0],[1.0]], dtype=np.float32), label=np.array([0,1], dtype=np.int32))
        xgb.train(params, dm, num_boost_round=1)
        print(f"[INFO] Using GPU -> {note}")
        return params
    except Exception as e:
        print(f"[WARN] GPU not available ({e}). Falling back to CPU (tree_method='hist').")
        params = base_params.copy()
        params.update({"tree_method": "hist"})
        return params


In [3]:
# Paths & core config

DATA_DIR   = "../Datasets/XGB_Train"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
VAL_PATH   = f"{DATA_DIR}/val.csv"
TEST_PATH  = f"{DATA_DIR}/test.csv"

MODEL_OUT    = f"{DATA_DIR}/xgb_model.json"
FEATURES_OUT = f"{DATA_DIR}/xgb_features.txt"

SEED = 42

# Threshold criterion: "f1" (balanced) or "youden" (TPR - FPR)
THRESHOLD_CRITERION = "f1"

# Standardization (XGBoost doesn't require it; keep True if you want it)
USE_SCALER = True


In [4]:
# Load splits

train_df = pd.read_csv(TRAIN_PATH)
val_df   = pd.read_csv(VAL_PATH)
test_df  = pd.read_csv(TEST_PATH)

assert "label" in train_df.columns and "label" in val_df.columns and "label" in test_df.columns
print(f"Loaded: train={len(train_df):,}, val={len(val_df):,}, test={len(test_df):,}")


Loaded: train=1,875,000, val=250,000, test=375,000


In [5]:
# Select numeric features (drop obvious IDs/meta)

id_like = {"idA", "idB", "row_idx_A", "row_idx_B", "label_type", "source_file"}
drop_cols = [c for c in train_df.columns if c in id_like or c == "label"]

num_cols = [c for c in train_df.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(train_df[c])]
assert len(num_cols) > 0, "No numeric feature columns found."

X_train = train_df[num_cols].to_numpy(dtype=float)
y_train = train_df["label"].to_numpy(dtype=int)

X_val   = val_df[num_cols].to_numpy(dtype=float)
y_val   = val_df["label"].to_numpy(dtype=int)

X_test  = test_df[num_cols].to_numpy(dtype=float)
y_test  = test_df["label"].to_numpy(dtype=int)

print(f"Features used ({len(num_cols)}): {num_cols[:10]}{' ...' if len(num_cols)>10 else ''}")

# Optional standardization
if USE_SCALER:
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val   = scaler.transform(X_val)
    X_test  = scaler.transform(X_test)


Features used (12): ['absdiff_mean', 'absdiff_median', 'reldiff_mean', 'zdiff_mean', 'pcdiff_mean', 'L1_raw', 'L2_raw', 'L1_z', 'L2_z', 'L1_pct'] ...


In [6]:
# Compute scale_pos_weight from TRAIN split

pos = y_train.sum()
neg = len(y_train) - pos
spw = (neg / max(pos, 1)) if pos > 0 else 1.0
print(f"scale_pos_weight (train): {spw:.3f}  (pos={pos}, neg={neg})")

# Define presets and pick one

PRESETS = {
    "baseline": {
        "learning_rate": 0.05,
        "max_depth": 6,
        "min_child_weight": 5,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "gamma": 0.0,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0,
        # uses scale_pos_weight from Cell 5
    },
    "fp_averse": {  # precision-leaning
        "learning_rate": 0.05,
        "max_depth": 5,         # ↓
        "min_child_weight": 8,  # ↑
        "subsample": 0.8,
        "colsample_bytree": 0.7, # ↓
        "gamma": 1.0,           # ↑
        "reg_alpha": 0.0,
        "reg_lambda": 3.0,      # ↑
    },
    "recall_oriented": {
        "learning_rate": 0.03,  # ↓
        "max_depth": 7,         # ↑
        "min_child_weight": 2,  # ↓
        "subsample": 0.9,       # ↑
        "colsample_bytree": 0.9,# ↑
        "gamma": 0.0,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0,
        # tip: you can reduce spw effect by multiplying it later (e.g., 0.7*spw)
    },
    "overfit_guarded": {
        "learning_rate": 0.06,  # ↑
        "max_depth": 4,         # ↓
        "min_child_weight": 10, # ↑
        "subsample": 0.7,       # ↓
        "colsample_bytree": 0.7,# ↓
        "gamma": 2.0,           # ↑
        "reg_alpha": 0.5,       # ↑ (L1)
        "reg_lambda": 8.0,      # ↑ (L2)
    },
    "gpu_large": {
        "learning_rate": 0.05,
        "max_depth": 6,
        "min_child_weight": 6,  # ↑
        "subsample": 0.85,      # ↑
        "colsample_bytree": 0.8,
        "gamma": 0.5,           # ↑
        "reg_alpha": 0.0,
        "reg_lambda": 2.0,      # ↑
        "max_bin": 512,         # new (GPU hist quality)
    },
}

PRESET_NAME = "recall_oriented"  # <-- choose: baseline | fp_averse | recall_oriented | overfit_guarded | gpu_large 
preset = PRESETS[PRESET_NAME]
print("Using preset:", PRESET_NAME, "\n", preset)


scale_pos_weight (train): 9.000  (pos=187500, neg=1687500)
Using preset: recall_oriented 
 {'learning_rate': 0.03, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.0, 'reg_alpha': 0.0, 'reg_lambda': 1.0}


In [7]:
# ---- knobs you can tweak ----
NUM_BOOST_ROUND = 40_000     # max rounds (early stopping may halt sooner)
PATIENCE        = 1_000       # rounds with no val AUC improvement before stopping
VERBOSE_EVAL    = 200         # print every N rounds
USE_LR_SCHEDULE = True        # set False to keep fixed learning_rate from params

# Learning-rate schedule options (choose one style below)
def lr_step_decay(base_lr, step_size=2_000, decay=0.5):
    """
    Halve lr every `step_size` rounds: 0..1999: lr, 2000..3999: lr*0.5, etc.
    """
    def _sched(round_idx):
        from math import floor
        k = floor(round_idx / step_size)
        return base_lr * (decay ** k)
    return _sched

def lr_cosine_decay(base_lr, min_lr=0.005, total_rounds=50_000):
    """
    Cosine annealing from base_lr -> min_lr over `total_rounds`, then flat.
    """
    import math
    def _sched(round_idx):
        if round_idx >= total_rounds:
            return min_lr
        cos = (1 + math.cos(math.pi * round_idx / total_rounds)) / 2.0
        return min_lr + (base_lr - min_lr) * cos
    return _sched




# ---- build params with GPU/CPU selection (reuse from earlier cells) ----
base_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "random_state": SEED,
    "scale_pos_weight": base_params.get("scale_pos_weight", spw) if 'base_params' in globals() else spw,
}
# if you already created `preset` in Cell 6:
params = base_params.copy(); params.update(preset)
params = try_gpu_or_fallback(params)

# Pick ONE schedule to use if USE_LR_SCHEDULE is True (now params is defined)
if USE_LR_SCHEDULE:
    lr_scheduler = lr_step_decay(
        base_lr=params.get("learning_rate", 0.05),
        step_size=3_000,
        decay=0.5
    )
    # Or choose cosine:
    # lr_scheduler = lr_cosine_decay(
    #     base_lr=params.get("learning_rate", 0.05),
    #     min_lr=0.005,
    #     total_rounds=60_000
    # )
else:
    lr_scheduler = None


# ---- data matrices ----
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=num_cols)
dval   = xgb.DMatrix(X_val,   label=y_val,   feature_names=num_cols)
dtest  = xgb.DMatrix(X_test,  label=y_test,  feature_names=num_cols)   # NEW


# ---- callbacks ----
callbacks = [
    xgb.callback.EarlyStopping(
        rounds=PATIENCE,
        save_best=True,          # keep best iteration
        maximize=True            # AUC is to be maximized
    )
]
if USE_LR_SCHEDULE:
    callbacks.append(xgb.callback.LearningRateScheduler(lr_scheduler))

# ---- train ----
evals = [(dtrain, "train"), (dval, "val")]
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=evals,
    verbose_eval=VERBOSE_EVAL,
    callbacks=callbacks
)

print(f"Stopped at iteration: {bst.best_iteration}  | Best val AUC: {bst.best_score}")

[INFO] Using GPU -> XGB>=2: device=cuda, tree_method=hist
[0]	train-auc:0.99509	val-auc:0.92082
[200]	train-auc:0.99520	val-auc:0.95759
[400]	train-auc:0.99539	val-auc:0.95441
[600]	train-auc:0.99568	val-auc:0.93870
[800]	train-auc:0.99595	val-auc:0.92669
[1000]	train-auc:0.99618	val-auc:0.92204
[1200]	train-auc:0.99641	val-auc:0.91777
[1248]	train-auc:0.99646	val-auc:0.91696
Stopped at iteration: 248  | Best val AUC: 0.9576907373333333


In [8]:
# Pick decision threshold from validation set

def pick_threshold(y_true, p, method="f1"):
    thresholds = np.linspace(0.01, 0.99, 99)
    best_t, best_score = 0.5, -1
    for t in thresholds:
        y_hat = (p >= t).astype(int)
        if method == "f1":
            prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_hat, average="binary", zero_division=0)
            score = f1
        elif method == "youden":
            tp = np.sum((y_true == 1) & (y_hat == 1))
            fn = np.sum((y_true == 1) & (y_hat == 0))
            fp = np.sum((y_true == 0) & (y_hat == 1))
            tn = np.sum((y_true == 0) & (y_hat == 0))
            tpr = tp / max(tp + fn, 1)  # recall
            fpr = fp / max(fp + tn, 1)
            score = tpr - fpr
        else:
            raise ValueError("Unknown method")
        if score > best_score:
            best_score, best_t = score, t
    return best_t, best_score

dval = xgb.DMatrix(X_val, label=y_val, feature_names=num_cols)
val_pred_proba = bst.predict(dval, iteration_range=(0, bst.best_iteration+1))

best_thresh, best_val_score = pick_threshold(y_val, val_pred_proba, method=THRESHOLD_CRITERION)
print(f"Chosen threshold ({THRESHOLD_CRITERION}): {best_thresh:.3f}  | score={best_val_score:.4f}")

# --- NEW: add compat_score to each split and overwrite the same files ---
train_pred_proba = bst.predict(dtrain, iteration_range=(0, bst.best_iteration + 1))
test_pred_proba  = bst.predict(dtest,  iteration_range=(0, bst.best_iteration + 1))

train_df["compat_score"] = train_pred_proba.astype(float)
val_df["compat_score"]   = val_pred_proba.astype(float)   # already computed above
test_df["compat_score"]  = test_pred_proba.astype(float)

# Overwrite in place (no new files)
train_df.to_csv(TRAIN_PATH, index=False)
val_df.to_csv(VAL_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)
print("[INFO] compat_score added and splits overwritten:", TRAIN_PATH, VAL_PATH, TEST_PATH)


Chosen threshold (f1): 0.770  | score=0.7412
[INFO] compat_score added and splits overwritten: ../Datasets/XGB_Train/train.csv ../Datasets/XGB_Train/val.csv ../Datasets/XGB_Train/test.csv


In [9]:
def evaluate_split(name, y_true, proba, threshold):
    y_pred = (proba >= threshold).astype(int)
    auc = roc_auc_score(y_true, proba)
    ap  = average_precision_score(y_true, proba)
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    print(f"\n[{name}]")
    print(f"ROC-AUC={auc:.4f} | PR-AUC={ap:.4f} | ACC={acc:.4f} | P={prec:.4f} | R={rec:.4f} | F1={f1:.4f}")
    print("Confusion matrix (tn, fp; fn, tp):")
    print(cm)
    return {"auc": auc, "ap": ap, "acc": acc, "prec": prec, "rec": rec, "f1": f1, "cm": cm}

# VAL
val_metrics = evaluate_split("VAL", y_val, val_pred_proba, best_thresh)

# TEST
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=num_cols)
test_pred_proba = bst.predict(dtest, iteration_range=(0, bst.best_iteration+1))
test_metrics = evaluate_split("TEST", y_test, test_pred_proba, best_thresh)



[VAL]
ROC-AUC=0.9577 | PR-AUC=0.7603 | ACC=0.9444 | P=0.6932 | R=0.7963 | F1=0.7412
Confusion matrix (tn, fp; fn, tp):
[[216191   8809]
 [  5093  19907]]

[TEST]
ROC-AUC=0.9572 | PR-AUC=0.7567 | ACC=0.9438 | P=0.6909 | R=0.7929 | F1=0.7384
Confusion matrix (tn, fp; fn, tp):
[[324197  13303]
 [  7767  29733]]


In [10]:
# Save artifacts
bst.save_model(MODEL_OUT)
with open(FEATURES_OUT, "w", encoding="utf-8") as f:
    for c in num_cols:
        f.write(f"{c}\n")

print("\nSaved:")
print(" - Model:", MODEL_OUT)
print(" - Features list:", FEATURES_OUT)

# Feature importance (gain)
importance = bst.get_score(importance_type="gain")
imp_df = (pd.DataFrame({"feature": list(importance.keys()),
                        "gain": list(importance.values())})
            .sort_values("gain", ascending=False)
            .reset_index(drop=True))
print("\nTop features by gain:")
display(imp_df.head(15))



Saved:
 - Model: ../Datasets/XGB_Train/xgb_model.json
 - Features list: ../Datasets/XGB_Train/xgb_features.txt

Top features by gain:


Unnamed: 0,feature,gain
0,compat_score,19084.712891
1,pcdiff_mean,548.583984
2,L2_pct,222.523087
3,zdiff_mean,197.823044
4,L2_z,146.249954
5,reldiff_mean,88.446213
6,absdiff_mean,47.208988
7,L1_z,33.245438
8,absdiff_median,20.655066
9,L1_pct,18.690516


### Testing Metrics

In [11]:
import pandas as pd

TEST_PATH = "../Datasets/XGB_Train/test.csv"  # adjust if needed
THRESH = 0.80

df = pd.read_csv(TEST_PATH)
assert {"idA","idB","compat_score"}.issubset(df.columns), "test.csv must have idA, idB, compat_score"

eq  = df["idA"].astype(str) == df["idB"].astype(str)
gt  = df["compat_score"].astype(float) > THRESH   # strict '>' as requested

# For idA == idB
total_eq = int(eq.sum())
hit_eq   = int((eq & gt).sum())
pct_eq   = 100.0 * hit_eq / total_eq if total_eq else 0.0

print(f"idA==idB: {hit_eq}/{total_eq} rows have compat_score > {THRESH} -> {pct_eq:.2f}%")

idA==idB: 29356/37500 rows have compat_score > 0.8 -> 78.28%


In [12]:
neq       = ~eq
total_neq = int(neq.sum())
hit_neq   = int((neq & gt).sum())
pct_neq   = 100.0 * hit_neq / total_neq if total_neq else 0.0

print(f"idA!=idB: {hit_neq}/{total_neq} rows have compat_score > {THRESH} -> {pct_neq:.2f}%")

idA!=idB: 12773/337500 rows have compat_score > 0.8 -> 3.78%


In [None]:
# ============================================
# NEW CELL 32: 🔍 DATA LEAKAGE TEST - ADD NOISE
# ============================================
print("\n" + "="*80)
print("DATA LEAKAGE DETECTION TEST")
print("="*80)
print("""
Testing strategy:
  1. Add increasing levels of Gaussian noise to test features
  2. Re-evaluate model on noisy data
  3. Compare metrics to original test performance
  
Expected behavior:
  ✓ NO LEAKAGE: Metrics degrade gradually with noise
  ✗ LEAKAGE: Metrics stay high despite noise (model memorized)
""")

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_fscore_support

# Load original test data
test_df_original = pd.read_csv(TEST_PATH)

# Extract features (same as before)
id_like = {"idA", "idB", "row_idx_A", "row_idx_B", "label_type", "source_file", "compat_score"}
drop_cols = [c for c in test_df_original.columns if c in id_like or c == "label"]
num_cols_test = [c for c in test_df_original.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(test_df_original[c])]

X_test_clean = test_df_original[num_cols_test].to_numpy(dtype=float)
y_test_clean = test_df_original["label"].to_numpy(dtype=int)

# Apply same standardization as training
if USE_SCALER:
    X_test_clean_scaled = scaler.transform(X_test_clean)
else:
    X_test_clean_scaled = X_test_clean

print(f"\n✓ Loaded test data: {len(X_test_clean):,} samples, {len(num_cols_test)} features")


# ============================================
# Function to add Gaussian noise
# ============================================
def add_gaussian_noise(X, noise_level=0.1, seed=42):
    """
    Add Gaussian noise to features.
    
    Args:
        X: Feature matrix (n_samples, n_features)
        noise_level: Std dev of noise as fraction of feature std dev
        seed: Random seed
    
    Returns:
        X_noisy: Feature matrix with added noise
    """
    np.random.seed(seed)
    X_noisy = X.copy()
    
    # For each feature, add noise proportional to its std dev
    for j in range(X.shape[1]):
        feature_std = np.nanstd(X[:, j])
        if feature_std > 0:
            noise = np.random.normal(0, noise_level * feature_std, size=X.shape[0])
            X_noisy[:, j] = X[:, j] + noise
    
    return X_noisy


# ============================================
# Test with increasing noise levels
# ============================================
noise_levels = [0.0, 0.05, 0.10, 0.20, 0.30, 0.50, 0.75, 1.0]
results_noise_test = []

print("\n" + "="*80)
print("TESTING WITH INCREASING NOISE LEVELS")
print("="*80)

for noise_level in noise_levels:
    # Add noise
    if noise_level == 0.0:
        X_test_noisy = X_test_clean_scaled
        noise_desc = "CLEAN (no noise)"
    else:
        X_test_noisy = add_gaussian_noise(X_test_clean_scaled, noise_level=noise_level)
        noise_desc = f"Noise σ = {noise_level:.2f}"
    
    # Create DMatrix
    dtest_noisy = xgb.DMatrix(X_test_noisy, label=y_test_clean, feature_names=num_cols_test)
    
    # Predict
    test_pred_noisy = bst.predict(dtest_noisy, iteration_range=(0, bst.best_iteration + 1))
    
    # Evaluate at F2-optimal threshold
    y_pred_noisy = (test_pred_noisy >= best_f2_thresh).astype(int)
    
    # Metrics
    auc = roc_auc_score(y_test_clean, test_pred_noisy)
    tn, fp, fn, tp = confusion_matrix(y_test_clean, y_pred_noisy).ravel()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    # Store results
    results_noise_test.append({
        'noise_level': noise_level,
        'auc': auc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn
    })
    
    # Print summary
    print(f"\n{noise_desc:20s} | AUC: {auc:.4f} | F1: {f1:.4f} | P: {precision:.4f} | R: {recall:.4f}")

print("\n" + "="*80)


# ============================================
# Convert to DataFrame for analysis
# ============================================
results_df = pd.DataFrame(results_noise_test)

print("\nDETAILED RESULTS TABLE:")
print(results_df.to_string(index=False))

# Save results
results_df.to_csv(f"{DATA_DIR}/leakage_test_results.csv", index=False)
print(f"\n✓ Results saved: {DATA_DIR}/leakage_test_results.csv")


# ============================================
# NEW CELL 33: 📊 VISUALIZE NOISE TEST RESULTS
# ============================================
print("\n" + "="*80)
print("VISUALIZING LEAKAGE TEST RESULTS")
print("="*80)

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: AUC degradation
ax = axes[0, 0]
ax.plot(results_df['noise_level'], results_df['auc'], 'o-', linewidth=3, markersize=10, color='steelblue')
ax.axhline(y=results_df['auc'].iloc[0], color='green', linestyle='--', alpha=0.5, label='Baseline (no noise)')
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Random guessing')
ax.set_xlabel('Noise Level (σ)', fontsize=13, fontweight='bold')
ax.set_ylabel('AUC', fontsize=13, fontweight='bold')
ax.set_title('AUC vs Noise Level', fontsize=15, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

# Plot 2: F1 Score degradation
ax = axes[0, 1]
ax.plot(results_df['noise_level'], results_df['f1'], 'o-', linewidth=3, markersize=10, color='darkgreen')
ax.axhline(y=results_df['f1'].iloc[0], color='green', linestyle='--', alpha=0.5, label='Baseline')
ax.set_xlabel('Noise Level (σ)', fontsize=13, fontweight='bold')
ax.set_ylabel('F1 Score', fontsize=13, fontweight='bold')
ax.set_title('F1 Score vs Noise Level', fontsize=15, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

# Plot 3: Precision and Recall
ax = axes[1, 0]
ax.plot(results_df['noise_level'], results_df['precision'], 'o-', linewidth=3, markersize=10, 
        color='blue', label='Precision')
ax.plot(results_df['noise_level'], results_df['recall'], 's-', linewidth=3, markersize=10, 
        color='orange', label='Recall')
ax.set_xlabel('Noise Level (σ)', fontsize=13, fontweight='bold')
ax.set_ylabel('Score', fontsize=13, fontweight='bold')
ax.set_title('Precision & Recall vs Noise Level', fontsize=15, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

# Plot 4: Relative degradation (%)
ax = axes[1, 1]
baseline_auc = results_df['auc'].iloc[0]
baseline_f1 = results_df['f1'].iloc[0]

auc_degradation = (1 - results_df['auc'] / baseline_auc) * 100
f1_degradation = (1 - results_df['f1'] / baseline_f1) * 100

ax.plot(results_df['noise_level'], auc_degradation, 'o-', linewidth=3, markersize=10, 
        color='steelblue', label='AUC degradation')
ax.plot(results_df['noise_level'], f1_degradation, 's-', linewidth=3, markersize=10, 
        color='darkgreen', label='F1 degradation')
ax.set_xlabel('Noise Level (σ)', fontsize=13, fontweight='bold')
ax.set_ylabel('Performance Loss (%)', fontsize=13, fontweight='bold')
ax.set_title('Relative Performance Degradation', fontsize=15, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.axhline(y=50, color='red', linestyle='--', alpha=0.5, label='50% loss')

plt.tight_layout()
plt.savefig(f"{DATA_DIR}/leakage_test_visualization.png", dpi=150, bbox_inches='tight')
print(f"✓ Visualization saved: {DATA_DIR}/leakage_test_visualization.png")
plt.show()


# ============================================
# NEW CELL 34: 🔍 LEAKAGE DIAGNOSIS
# ============================================
print("\n" + "="*80)
print("DATA LEAKAGE DIAGNOSIS")
print("="*80)

# Calculate degradation metrics
baseline_auc = results_df['auc'].iloc[0]
baseline_f1 = results_df['f1'].iloc[0]

auc_at_10pct = results_df[results_df['noise_level'] == 0.10]['auc'].values[0]
auc_at_50pct = results_df[results_df['noise_level'] == 0.50]['auc'].values[0]

auc_drop_10 = ((baseline_auc - auc_at_10pct) / baseline_auc) * 100
auc_drop_50 = ((baseline_auc - auc_at_50pct) / baseline_auc) * 100

f1_at_10pct = results_df[results_df['noise_level'] == 0.10]['f1'].values[0]
f1_at_50pct = results_df[results_df['noise_level'] == 0.50]['f1'].values[0]

f1_drop_10 = ((baseline_f1 - f1_at_10pct) / baseline_f1) * 100
f1_drop_50 = ((baseline_f1 - f1_at_50pct) / baseline_f1) * 100

print(f"\n📊 BASELINE PERFORMANCE (No Noise):")
print(f"   AUC: {baseline_auc:.4f}")
print(f"   F1:  {baseline_f1:.4f}")

print(f"\n📉 DEGRADATION WITH NOISE:")
print(f"   At 10% noise (σ=0.10):")
print(f"      AUC: {auc_at_10pct:.4f} (↓ {auc_drop_10:.1f}%)")
print(f"      F1:  {f1_at_10pct:.4f} (↓ {f1_drop_10:.1f}%)")
print(f"   ")
print(f"   At 50% noise (σ=0.50):")
print(f"      AUC: {auc_at_50pct:.4f} (↓ {auc_drop_50:.1f}%)")
print(f"      F1:  {f1_at_50pct:.4f} (↓ {f1_drop_50:.1f}%)")

# Diagnosis logic
print("\n" + "="*80)
print("LEAKAGE DETECTION VERDICT")
print("="*80)

leakage_detected = False
warnings = []

# Test 1: Is degradation gradual?
if auc_drop_10 < 2.0:
    warnings.append("⚠️  WARNING: AUC barely drops with 10% noise (< 2% loss)")
    warnings.append("   → Model may be memorizing patterns or has data leakage")
    leakage_detected = True

# Test 2: Does it stay unreasonably high?
if auc_at_50pct > 0.85 and baseline_auc > 0.90:
    warnings.append("⚠️  WARNING: AUC stays very high (>0.85) even with 50% noise")
    warnings.append("   → Possible leakage: model shouldn't work this well with heavy noise")
    leakage_detected = True

# Test 3: Check if AUC drops below 0.6 at high noise
auc_at_100pct = results_df[results_df['noise_level'] == 1.0]['auc'].values[0]
if auc_at_100pct > 0.75:
    warnings.append("⚠️  WARNING: AUC > 0.75 even with 100% noise")
    warnings.append("   → Strong indication of data leakage")
    leakage_detected = True

# Test 4: Check for reasonable degradation curve
expected_drop_50 = 25  # We expect at least 25% drop at 50% noise
if auc_drop_50 < expected_drop_50:
    warnings.append(f"⚠️  WARNING: AUC degradation too slow ({auc_drop_50:.1f}% at 50% noise)")
    warnings.append(f"   → Expected: >{expected_drop_50}% degradation")
    leakage_detected = True

# Print verdict
if leakage_detected:
    print("\n🚨 POTENTIAL DATA LEAKAGE DETECTED!")
    print("="*80)
    for warning in warnings:
        print(warning)
    
    print("\n🔍 RECOMMENDED ACTIONS:")
    print("   1. Check if idA/idB are leaking into features")
    print("   2. Verify train/val/test split is based on unique IDs")
    print("   3. Review feature engineering - are you using future information?")
    print("   4. Check for duplicate rows across splits")
    print("   5. Ensure same-file pairs aren't in multiple splits")
    
else:
    print("\n✅ NO DATA LEAKAGE DETECTED!")
    print("="*80)
    print(f"   ✓ Performance degrades gracefully with noise")
    print(f"   ✓ AUC drops {auc_drop_10:.1f}% at 10% noise (reasonable)")
    print(f"   ✓ AUC drops {auc_drop_50:.1f}% at 50% noise (expected)")
    print(f"   ✓ Model learned generalizable patterns, not memorization")
    
    print("\n📈 EXPECTED BEHAVIOR CONFIRMED:")
    print("   • Baseline: Strong performance")
    print("   • 10% noise: Slight degradation (model is robust)")
    print("   • 50% noise: Significant degradation (model can't handle garbage)")
    print("   • 100% noise: Major degradation (as expected)")

print("\n" + "="*80)


# ============================================
# NEW CELL 35: 🧪 ADDITIONAL LEAKAGE TESTS
# ============================================
print("\n" + "="*80)
print("ADDITIONAL LEAKAGE TESTS")
print("="*80)

# Test 1: Check for identical features between train and test
print("\n🔍 TEST 1: Checking for identical rows...")

train_df_check = pd.read_csv(TRAIN_PATH)
test_df_check = pd.read_csv(TEST_PATH)

# Create signature from feature columns only
train_signatures = train_df_check[num_cols_test].apply(
    lambda row: hash(tuple(row)), axis=1
)
test_signatures = test_df_check[num_cols_test].apply(
    lambda row: hash(tuple(row)), axis=1
)

duplicates = set(train_signatures).intersection(set(test_signatures))

if len(duplicates) > 0:
    print(f"   🚨 FOUND {len(duplicates)} identical feature rows in train and test!")
    print("   → CRITICAL: This is data leakage!")
else:
    print(f"   ✓ No identical feature rows found between train and test")


# Test 2: Check if IDs overlap
print("\n🔍 TEST 2: Checking for overlapping IDs...")

train_ids_a = set(train_df_check['idA'].astype(str))
train_ids_b = set(train_df_check['idB'].astype(str))
train_ids_all = train_ids_a.union(train_ids_b)

test_ids_a = set(test_df_check['idA'].astype(str))
test_ids_b = set(test_df_check['idB'].astype(str))
test_ids_all = test_ids_a.union(test_ids_b)

overlapping_ids = train_ids_all.intersection(test_ids_all)

print(f"   Unique IDs in train: {len(train_ids_all):,}")
print(f"   Unique IDs in test:  {len(test_ids_all):,}")
print(f"   Overlapping IDs:     {len(overlapping_ids):,}")

if len(overlapping_ids) > 0:
    overlap_pct = 100 * len(overlapping_ids) / len(test_ids_all)
    print(f"   ⚠️  {overlap_pct:.1f}% of test IDs also appear in training")
    
    if overlap_pct > 50:
        print("   🚨 HIGH OVERLAP: Potential leakage if same entity pairs in both sets")
    else:
        print("   ℹ️  Some overlap is OK if pairs are different (e.g., A-B in train, A-C in test)")
else:
    print("   ✓ No overlapping IDs (completely disjoint)")


# Test 3: Statistical test - are feature distributions too similar?
print("\n🔍 TEST 3: Comparing feature distributions...")

from scipy.stats import ks_2samp

feature_similarity_scores = []

for col in num_cols_test[:10]:  # Check first 10 features
    train_vals = train_df_check[col].dropna().values
    test_vals = test_df_check[col].dropna().values
    
    if len(train_vals) > 100 and len(test_vals) > 100:
        # Kolmogorov-Smirnov test
        statistic, pvalue = ks_2samp(train_vals, test_vals)
        feature_similarity_scores.append({
            'feature': col,
            'ks_statistic': statistic,
            'p_value': pvalue,
            'similar': pvalue > 0.05  # If p > 0.05, distributions are similar
        })

if feature_similarity_scores:
    similar_count = sum(1 for f in feature_similarity_scores if f['similar'])
    print(f"   Features with similar distributions: {similar_count}/{len(feature_similarity_scores)}")
    
    if similar_count == len(feature_similarity_scores):
        print("   ✓ Train and test have similar feature distributions (expected)")
    else:
        print(f"   ℹ️  Some features differ between train/test (may be OK if splits are stratified)")


print("\n" + "="*80)
print("✅ LEAKAGE TESTING COMPLETE")
print("="*80)

print(f"""
Summary of all tests:
  1. Noise robustness:     {'✓ PASS' if not leakage_detected else '✗ FAIL - check noise test'}
  2. Identical rows:       {'✓ PASS' if len(duplicates) == 0 else '✗ FAIL - duplicates found'}
  3. ID overlap:           {'✓ PASS' if len(overlapping_ids) < len(test_ids_all)*0.5 else '⚠️  WARNING - high overlap'}
  4. Distribution check:   {'✓ PASS' if feature_similarity_scores else 'ℹ️  SKIPPED'}

Overall verdict: {'✅ NO LEAKAGE DETECTED' if not leakage_detected and len(duplicates) == 0 else '🚨 POTENTIAL LEAKAGE - INVESTIGATE'}
""")


# ============================================
# NEW CELL 36: 📝 SAVE LEAKAGE TEST REPORT
# ============================================
leakage_report = f"""
{'='*80}
DATA LEAKAGE DETECTION REPORT
{'='*80}

Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

TEST METHODOLOGY
{'-'*80}
1. Added Gaussian noise at levels: {', '.join([f'{x:.0%}' for x in noise_levels[1:]])}
2. Re-evaluated model on noisy test data
3. Measured performance degradation
4. Checked for suspicious patterns

NOISE ROBUSTNESS TEST
{'-'*80}
Baseline (No Noise):
  AUC: {baseline_auc:.4f}
  F1:  {baseline_f1:.4f}

Performance at 10% noise:
  AUC: {auc_at_10pct:.4f} (↓ {auc_drop_10:.1f}%)
  F1:  {f1_at_10pct:.4f} (↓ {f1_drop_10:.1f}%)

Performance at 50% noise:
  AUC: {auc_at_50pct:.4f} (↓ {auc_drop_50:.1f}%)
  F1:  {f1_at_50pct:.4f} (↓ {f1_drop_50:.1f}%)

Performance at 100% noise:
  AUC: {auc_at_100pct:.4f} (↓ {((baseline_auc - auc_at_100pct) / baseline_auc * 100):.1f}%)

LEAKAGE INDICATORS
{'-'*80}
"""

if leakage_detected:
    leakage_report += "🚨 WARNINGS DETECTED:\n"
    for warning in warnings:
        leakage_report += f"{warning}\n"
else:
    leakage_report += "✅ No leakage indicators found\n"
    leakage_report += "   Performance degrades appropriately with noise\n"

leakage_report += f"""

ADDITIONAL TESTS
{'-'*80}
Identical rows in train/test:  {len(duplicates) if 'duplicates' in locals() else 'N/A'}
Overlapping IDs:               {len(overlapping_ids):,} ({100*len(overlapping_ids)/len(test_ids_all):.1f}% of test)

VERDICT
{'-'*80}
"""

if leakage_detected or (len(duplicates) > 0 if 'duplicates' in locals() else False):
    leakage_report += "🚨 POTENTIAL DATA LEAKAGE DETECTED\n\n"
    leakage_report += "RECOMMENDED ACTIONS:\n"
    leakage_report += "1. Review data splitting logic\n"
    leakage_report += "2. Ensure train/test are split by unique entity IDs\n"
    leakage_report += "3. Check feature engineering for temporal leakage\n"
    leakage_report += "4. Verify no duplicate pairs across splits\n"
else:
    leakage_report += "✅ NO DATA LEAKAGE DETECTED\n\n"
    leakage_report += "Model appears to have learned generalizable patterns.\n"
    leakage_report += "Performance degrades appropriately with synthetic noise.\n"

leakage_report += f"\n{'='*80}\n"

# Save report
with open(f"{DATA_DIR}/leakage_test_report.txt", 'w') as f:
    f.write(leakage_report)

print(leakage_report)
print(f"✓ Leakage test report saved: {DATA_DIR}/leakage_test_report.txt")

: 