# Notebook 15 — Dataset Consolidation
## Single `dataset/` folder · 60/20/20 stratified split · 1,342 fraud : 2,684 genuine

**What this does:**
- Pools ALL fraud images from 5 scattered locations → `dataset/raw/fraud/`
- Pools genuine images (capped at 2× fraud) → `dataset/raw/genuine/`
- Copies the remaining genuine images to `dataset/genuine_extended/` (OCR benchmark only)
- Applies 60/20/20 stratified split → `dataset/train/`, `dataset/val/`, `dataset/test/`
- OLD folders (`Main_Dataset/`, `Geniune Document DS/`) are kept as archive — NOT deleted

In [None]:
import os
import shutil
import random
from pathlib import Path
from sklearn.model_selection import train_test_split

# Project root and the consolidated dataset folder that every later cell
# writes into.
BASE_DIR = Path(r"c:\Users\saigo\Desktop\fraud_document_ai")
DATASET_DIR = BASE_DIR / "dataset"

# Build the full folder layout up front: the overflow folder for genuine
# images plus one fraud/genuine pair under each of raw, train, val and test.
required_dirs = [DATASET_DIR / "genuine_extended"]
required_dirs += [
    DATASET_DIR / stage / label
    for stage in ("raw", "train", "val", "test")
    for label in ("fraud", "genuine")
]
for folder in required_dirs:
    # parents/exist_ok make this cell safe to re-run.
    folder.mkdir(parents=True, exist_ok=True)

print("âœ… Directory structure created")

In [None]:
# ── STEP 1: Collect ALL fraud image paths ──────────────────────────────
# Each folder is scanned non-recursively. Nothing is de-duplicated in this
# cell: a file name present in several folders appears several times in
# `all_fraud_paths`.

FRAUD_SOURCES = [
    BASE_DIR / "Main_Dataset" / "train"          / "fraud",
    BASE_DIR / "Main_Dataset" / "val"            / "fraud",
    BASE_DIR / "Main_Dataset" / "test"           / "fraud",
    BASE_DIR / "Main_Dataset" / "augmented"      / "fraud",
    BASE_DIR / "Main_Dataset" / "augmented_debug"/ "fraud",
]

all_fraud_paths = []
for src in FRAUD_SOURCES:
    # is_dir() instead of exists(): a stray *file* at one of these paths
    # would pass exists() and then make iterdir() raise.
    if not src.is_dir():
        continue
    # Sorted so the collected order does not depend on the order in which
    # the filesystem enumerates entries; is_file() keeps out directories
    # that merely have an image-like name.
    imgs = sorted(
        p for p in src.iterdir()
        if p.is_file() and p.suffix.lower() in {".jpg", ".jpeg", ".png"}
    )
    print(f"  {src.relative_to(BASE_DIR)}: {len(imgs)} images")
    all_fraud_paths.extend(imgs)

print(f"\nðŸ“Š Total fraud images found: {len(all_fraud_paths)}")

In [None]:
# ── STEP 2: Collect ALL genuine image paths ────────────────────────────
# Sources are scanned in list order and the first file seen with a given
# name wins, so earlier entries take priority over later ones.

GENUINE_SOURCES = [
    BASE_DIR / "Main_Dataset" / "augmented"      / "genuine",  # best quality first
    BASE_DIR / "Main_Dataset" / "augmented_debug"/ "genuine",
    BASE_DIR / "Main_Dataset" / "train"          / "genuine",
    BASE_DIR / "Main_Dataset" / "val"            / "genuine",
    BASE_DIR / "Main_Dataset" / "test"           / "genuine",
]

# Also gather from the Geniune Document DS batches (one level of batch
# folders plus one level of sub-batch folders inside each).
genuine_ds = BASE_DIR / "Geniune Document DS"
if genuine_ds.is_dir():
    for batch_dir in sorted(genuine_ds.iterdir()):
        # Skip loose files: calling iterdir() on them raises
        # NotADirectoryError and would abort the whole cell.
        if not batch_dir.is_dir():
            continue
        GENUINE_SOURCES.append(batch_dir)
        # Sorted so the priority between sub-batches is stable across runs.
        GENUINE_SOURCES.extend(
            sorted(sub for sub in batch_dir.iterdir() if sub.is_dir())
        )
else:
    print(f"  (skipped) {genuine_ds} not found")

all_genuine_paths = []
seen = set()  # file names already collected, used to drop duplicates
for src in GENUINE_SOURCES:
    if not src.is_dir():
        continue
    for p in sorted(src.iterdir()):
        if not p.is_file() or p.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
            continue
        if p.name in seen:
            continue
        seen.add(p.name)
        all_genuine_paths.append(p)

print(f"ðŸ“Š Total unique genuine images found: {len(all_genuine_paths)}")

In [None]:
# ── STEP 3: Deduplicate fraud, cap genuine at 2× fraud ─────────────────

# Deduplicate fraud by filename (first occurrence wins).
seen_fraud = set()
fraud_unique = []
for p in all_fraud_paths:
    if p.name not in seen_fraud:
        seen_fraud.add(p.name)
        fraud_unique.append(p)

# A seeded shuffle is only reproducible if its input order is: sort first so
# the result does not depend on filesystem enumeration order. The genuine
# list is shuffled as a *copy* so that re-running this cell does not shuffle
# an already-shuffled list and silently pick a different subset.
fraud_unique.sort()
genuine_pool = sorted(all_genuine_paths)

random.seed(42)
random.shuffle(fraud_unique)
random.shuffle(genuine_pool)

n_fraud   = len(fraud_unique)
# Keep at most two genuine images per fraud image.
n_genuine_cap = min(n_fraud * 2, len(genuine_pool))

genuine_selected  = genuine_pool[:n_genuine_cap]
genuine_extended  = genuine_pool[n_genuine_cap:]  # overflow beyond the cap

print(f"âœ… Fraud (deduped):     {n_fraud}")
print(f"âœ… Genuine (selected):  {n_genuine_cap}")
print(f"ðŸ“¦ Genuine (extended):  {len(genuine_extended)} â†’ genuine_extended/")
print(f"ðŸ“Š TOTAL in dataset:    {n_fraud + n_genuine_cap}")

In [None]:
# ── STEP 4: Copy to raw/ and genuine_extended/ ─────────────────────────

def safe_copy(src: Path, dest_dir: Path, prefix: str = "") -> Path:
    """Copy *src* into *dest_dir* without overwriting a different file.

    Candidate names are tried in this order:

    1. ``prefix + src.name``
    2. ``prefix + <parent folder name> + "_" + src.name``
    3. the same as (2) with ``_1``, ``_2``, ... appended to the stem

    If a candidate already exists with byte-identical content, nothing is
    copied and that existing path is returned. This keeps a re-run of the
    notebook from adding a second, renamed copy of every file.

    Args:
        src: File to copy.
        dest_dir: Existing directory to copy into.
        prefix: Optional text prepended to the destination file name.

    Returns:
        The path inside *dest_dir* that holds the content of *src*.
    """
    import filecmp  # local import: only this helper needs it

    fallback_stem = f"{prefix}{src.parent.name}_{src.stem}"
    candidate = prefix + src.name
    attempt = 0
    while True:
        dest = dest_dir / candidate
        if not dest.exists():
            shutil.copy2(str(src), str(dest))
            return dest
        # shallow=False compares bytes rather than size and mtime.
        if dest.is_file() and filecmp.cmp(str(src), str(dest), shallow=False):
            return dest
        # Name taken by different content: move on to the next candidate
        # instead of overwriting it.
        tag = f"_{attempt}" if attempt else ""
        candidate = f"{fallback_stem}{tag}{src.suffix}"
        attempt += 1

# Each job is (progress message, files to copy, destination folder); the
# jobs run in the order listed.
copy_jobs = (
    ("Copying fraud â†’ dataset/raw/fraud/ ...",
     fraud_unique, DATASET_DIR / "raw" / "fraud"),
    ("Copying genuine â†’ dataset/raw/genuine/ ...",
     genuine_selected, DATASET_DIR / "raw" / "genuine"),
    ("Copying remaining genuine â†’ dataset/genuine_extended/ ...",
     genuine_extended, DATASET_DIR / "genuine_extended"),
)

for banner, sources, target_dir in copy_jobs:
    print(banner)
    for item in sources:
        safe_copy(item, target_dir)

print("âœ… Copy complete")

In [None]:
# ── STEP 5: 60/20/20 Stratified Split ──────────────────────────────────

# Only real image files take part in the split. A bare iterdir() would also
# pick up sub-folders and OS artefacts such as Thumbs.db / desktop.ini and
# label them as samples.
_split_exts = {".jpg", ".jpeg", ".png"}

fraud_files = sorted(
    f for f in (DATASET_DIR / "raw" / "fraud").iterdir()
    if f.is_file() and f.suffix.lower() in _split_exts
)
genuine_files = sorted(
    f for f in (DATASET_DIR / "raw" / "genuine").iterdir()
    if f.is_file() and f.suffix.lower() in _split_exts
)

all_files  = fraud_files + genuine_files
all_labels = [0] * len(fraud_files) + [1] * len(genuine_files)  # 0=fraud, 1=genuine

# First split: 60% train, 40% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    all_files, all_labels,
    test_size=0.40, stratify=all_labels, random_state=42
)

# Second split: 50% of temp → val, 50% → test  (= 20%/20% of total)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50, stratify=y_temp, random_state=42
)

print(f"Train: {len(X_train)} ({sum(1 for y in y_train if y==0)} fraud, {sum(1 for y in y_train if y==1)} genuine)")
print(f"Val:   {len(X_val)} ({sum(1 for y in y_val if y==0)} fraud, {sum(1 for y in y_val if y==1)} genuine)")
print(f"Test:  {len(X_test)} ({sum(1 for y in y_test if y==0)} fraud, {sum(1 for y in y_test if y==1)} genuine)")

In [None]:
# ── STEP 6: Copy split files to train/val/test ─────────────────────────

# Integer label -> class folder name.
CLASS_MAP = {0: "fraud", 1: "genuine"}

def copy_split(file_list, label_list, split_name):
    """Copy each file into ``DATASET_DIR/<split_name>/<class folder>/``.

    ``file_list`` and ``label_list`` are walked in lockstep; each label is
    looked up in ``CLASS_MAP`` to choose the class folder. Files keep their
    original names.
    """
    split_root = DATASET_DIR / split_name
    for source, label in zip(file_list, label_list):
        target = split_root / CLASS_MAP[label] / source.name
        shutil.copy2(str(source), str(target))

# Write the three splits in train, val, test order, announcing each one
# before its files are copied.
for banner, files, labels, split_name in (
    ("Writing train split...", X_train, y_train, "train"),
    ("Writing val split...", X_val, y_val, "val"),
    ("Writing test split...", X_test, y_test, "test"),
):
    print(banner)
    copy_split(files, labels, split_name)

print("\nâœ… All splits written!")

In [None]:
# ── STEP 7: Verification ───────────────────────────────────────────────
# Reports the number of entries in every split/class folder and checks that
# no file name of a given class sits in more than one split.

print("=" * 50)
print("FINAL DATASET VERIFICATION")
print("=" * 50)

total = 0
first_seen = {}   # (class, file name) -> split it was first found in
overlaps = []     # (class, file name, first split, second split)
for split in ["train", "val", "test"]:
    for cls in ["fraud", "genuine"]:
        p = DATASET_DIR / split / cls
        entries = list(p.iterdir())
        n = len(entries)
        total += n
        print(f"  dataset/{split}/{cls}: {n} images")
        for entry in entries:
            key = (cls, entry.name)
            if key in first_seen:
                overlaps.append((cls, entry.name, first_seen[key], split))
            else:
                first_seen[key] = split

ext_count = len(list((DATASET_DIR / "genuine_extended").iterdir()))
print(f"\n  dataset/genuine_extended/: {ext_count} images (OCR benchmark only)")
print(f"\n  TOTAL in train+val+test: {total}")

# The split folders are never emptied before being written, so files left
# over from an earlier run can put the same image into two splits. Surface
# that here instead of reporting success over a leaking dataset.
if overlaps:
    print(f"\n  WARNING: {len(overlaps)} file(s) appear in more than one split, e.g.:")
    for cls, name, split_a, split_b in overlaps[:5]:
        print(f"    {cls}/{name}: {split_a} and {split_b}")

print("=" * 50)
print("âœ… Dataset consolidation complete!")
print("Old folders (Main_Dataset/, Geniune Document DS/) kept as archive.")