In [1]:
#om gan ganapathaye namah om namah shivaya

In [9]:
import os
import csv
from pathlib import Path
import re 

# ====== HELPER: PARSE FILENAME (FINAL CORRECTION) ======
def parse_filename(fname_stem, real_or_altered):
    """
    Parse SOCOFing filename stem (without extension).

    Handles:
    1. Multiple underscores (e.g., '100__M' -> '100_M').
    2. Finger name split into two parts (e.g., 'index_finger') for both real and altered files.
    """
    # 1. Clean the stem by replacing multiple underscores with a single underscore
    cleaned_stem = re.sub(r"__+", "_", fname_stem)
    parts = cleaned_stem.split("_")

    if real_or_altered == "real":
        # Expected correct parts: 4 (e.g., 100_M_Left_little)
        # Observed inconsistent parts: 5 (e.g., 100_M_Left_index_finger)
        
        if len(parts) == 4:
            subject_id, gender, hand, finger = parts
        elif len(parts) == 5:
            # Reconstruct the 'finger' name by joining the last two parts
            subject_id, gender, hand = parts[0], parts[1], parts[2]
            finger = "_".join(parts[3:]) # Joins 'index' and 'finger' back to 'index_finger'
        else:
            raise ValueError(f"Unexpected real filename format after cleanup: {fname_stem} -> {cleaned_stem}. Parts: {parts}")
            
        alteration_type = "none"
        
    else:
        # Expected correct parts: 5 (e.g., 100_M_Left_little_CR)
        # Observed inconsistent parts: 6 (e.g., 100_M_Left_index_finger_CR)
        
        if len(parts) == 5:
            subject_id, gender, hand, finger, alteration_type = parts
        elif len(parts) == 6:
            # Reconstruct the 'finger' name by joining the 4th and 5th parts (index 3 and 4)
            subject_id, gender, hand = parts[0], parts[1], parts[2]
            finger = "_".join(parts[3:5]) # Joins 'index' and 'finger'
            alteration_type = parts[5] # The last part is the alteration type
        else:
             raise ValueError(f"Unexpected altered filename format after cleanup: {fname_stem} -> {cleaned_stem}. Parts: {parts}")

    return {
        "subject_id": subject_id,
        "gender": gender,
        "hand": hand,
        "finger": finger,
        "alteration_type": alteration_type,
    }

# ... (The rest of the code for build_index() and __main__ should be placed below) ...

# ====== CONFIGURE THIS ======
# Change this to the root folder where your SOCOFing data is stored
# Example structure:
# Â  DATA_ROOT/
# Â  Â  Â  Real/
# Â  Â  Â  Altered/
# Â  Â  Â  Â  Â  Altered-Easy/
# Â  Â  Â  Â  Â  Altered-Medium/
# Â  Â  Â  Â  Â  Altered-Hard/
DATA_ROOT = r"D:\5th sem\mini_project\dataset\SOCOFing" 

# Output CSV path
OUTPUT_CSV = r"D:\5th sem\mini_project\dataset\SOCOFing\socofing_index.csv"

# Subfolders (relative to DATA_ROOT) and their corresponding alter_level labels
SUBFOLDERS = [
    ("Real", "none"),
    ("Altered/Altered-Easy", "easy"),
    ("Altered/Altered-Medium", "medium"),
    ("Altered/Altered-Hard", "hard"),
]


# ====== MAIN SCRIPT ======
def build_index():
    rows = []

    for rel_subfolder, alter_level in SUBFOLDERS:
        folder_path = Path(DATA_ROOT) / rel_subfolder

        if not folder_path.exists():
            print(f"Warning: folder not found -> {folder_path}")
            continue

        # Decide real_or_altered flag
        real_or_altered = "real" if alter_level == "none" else "altered"

        # Iterate over all common image extensions
        for ext in ("*.bmp", "*.BMP", "*.png", "*.jpg", "*.jpeg"):
            for img_path in folder_path.glob(ext):
                fname_stem = img_path.stem

                info = parse_filename(fname_stem, real_or_altered)

                row = {
                    "image_path": str(img_path.resolve()),
                    "subject_id": info["subject_id"],
                    "finger": info["finger"],
                    "real_or_altered": real_or_altered,
                    "alter_level": alter_level,  # none / easy / medium / hard
                    # Extra useful fields:
                    "gender": info["gender"],
                    "hand": info["hand"],
                    "alteration_type": info["alteration_type"],
                }
                rows.append(row)

    # Write CSV
    fieldnames = [
        "image_path",
        "subject_id",
        "finger",
        "real_or_altered",
        "alter_level",
        "gender",
        "hand",
        "alteration_type",
    ]

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Saved {len(rows)} rows to {OUTPUT_CSV}")


if __name__ == "__main__":
    build_index()

Saved 110540 rows to D:\5th sem\mini_project\dataset\SOCOFing\socofing_index.csv


In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# --- CONFIGURATION ---
RAW_ROOT = r"D:\5th sem\mini_project\dataset\SOCOFing"
OUT_ROOT = r"D:\5th sem\mini_project\dataset\NPYSF"
IMG_SIZE = (512, 512)

# --- FOLDER DEFINITIONS ---
folders = [
    ("Real", "Real_npys"),
    (r"Altered\Altered-Easy", "Altered_Easy_npys"),
    (r"Altered\Altered-Medium", "Altered_Medium_npys"),
    (r"Altered\Altered-Hard", "Altered_Hard_npys")
]

# Create output base directory once
os.makedirs(OUT_ROOT, exist_ok=True)

# SIFT object is created once, using default settings (no nfeatures limit)
sift = cv2.SIFT_create() 

# --- CORE FEATURE EXTRACTION LOGIC (UNCHANGED) ---
def extract_aggregate_vector(img_path):
    """
    Extracts, enhances, detects SIFT features, aggregates them,
    and returns a centered, normalized vector.
    """
    img = cv2.imread(img_path, 0) # Read as grayscale
    if img is None:
        return None

    # Image Preprocessing: Resize and Histogram Equalization
    img = cv2.resize(img, IMG_SIZE)
    img = cv2.equalizeHist(img)
    
    # SIFT Detection and Computation
    kp, des = sift.detectAndCompute(img, None)
    
    if des is None or len(des) == 0:
        return None
    
    # Aggregation (Mean Vector)
    v = np.mean(des.astype(np.float32), axis=0)
    
    # Centering the Vector (v = v - mean(v)) - LOGIC PRESERVED
    v = v - np.mean(v)
    
    # Normalization (L2 Norm)
    n = np.linalg.norm(v)
    
    if n < 1e-12:
        return None
        
    return (v / n).astype(np.float32)

# --- MAIN PROCESSING FUNCTION ---
def process_dataset_folder(input_folder, output_folder):
    """
    Walks through input_folder, extracts features from all images,
    and saves them to the corresponding path in output_folder.
    """
    print(f"\nScanning: {input_folder}")
    
    # 1. Collect all image file paths first
    all_files = []
    for root, _, files in os.walk(input_folder):
        for fn in files:
            if fn.lower().endswith((".bmp", ".png", ".jpg", ".jpeg")):
                # Store (full path, relative path from input_folder)
                full_path = os.path.join(root, fn)
                rel_path = os.path.relpath(full_path, input_folder)
                all_files.append((full_path, rel_path))

    print(f"Total images found: {len(all_files)}")
    
    # 2. Process files with a clean progress bar
    for img_path, rel_path in tqdm(all_files, desc=f"Processing {os.path.basename(input_folder)}", colour="green"):
        
        # Determine output path, maintaining subfolder structure
        base_name, _ = os.path.splitext(rel_path)
        out_path = os.path.join(output_folder, base_name + ".npy")
        
        # Create necessary output subdirectories
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        
        # Skip if the NPY file already exists
        if os.path.exists(out_path):
            continue
            
        # Extract features and save
        agg_vector = extract_aggregate_vector(img_path)
        
        if agg_vector is not None:
            np.save(out_path, agg_vector)

# --- EXECUTION ---
print("================= STARTING FEATURE EXTRACTION =================\n")

for in_rel, out_rel in folders:
    input_dir = os.path.join(RAW_ROOT, in_rel)
    output_dir = os.path.join(OUT_ROOT, out_rel)
    
    process_dataset_folder(input_dir, output_dir)

print("\nðŸŽ‰ ALL DONE. NPYSF regenerated successfully!")

In [11]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# --- CONFIGURATION ---
RAW_ROOT = r"D:\5th sem\mini_project\dataset\SOCOFing"
DES_ROOT = r"D:\5th sem\mini_project\dataset\NPYSF_full_des"  # NEW folder for full descriptors
IMG_SIZE = (512, 512)

# --- FOLDER DEFINITIONS (same logical splits as before) ---
folders = [
    ("Real", "Real"),
    (r"Altered\Altered-Easy", "Altered_Easy"),
    (r"Altered\Altered-Medium", "Altered_Medium"),
    (r"Altered\Altered-Hard", "Altered_Hard")
]

os.makedirs(DES_ROOT, exist_ok=True)

# SIFT object
sift = cv2.SIFT_create()

def preprocess_image(img_path):
    """Read, grayscale, resize, and enhance fingerprint image."""
    img = cv2.imread(img_path, 0)  # grayscale
    if img is None:
        return None

    img = cv2.resize(img, IMG_SIZE)
    img = cv2.equalizeHist(img)
    return img

def extract_descriptors(img_path):
    """Return full SIFT descriptor array [N, 128] or None."""
    img = preprocess_image(img_path)
    if img is None:
        return None

    kp, des = sift.detectAndCompute(img, None)
    if des is None or len(des) == 0:
        return None

    return des.astype(np.float32)

def process_dataset_folder(input_folder, tag):
    """Save full descriptors for all images in a given folder."""
    print(f"\nScanning: {input_folder}")

    all_files = []
    for root, _, files in os.walk(input_folder):
        for fn in files:
            if fn.lower().endswith((".bmp", ".png", ".jpg", ".jpeg")):
                full_path = os.path.join(root, fn)
                rel_path = os.path.relpath(full_path, input_folder)
                all_files.append((full_path, rel_path))

    print(f"Total images found: {len(all_files)}")

    for img_path, rel_path in tqdm(all_files,
                                   desc=f"Full SIFT {tag}",
                                   colour="blue"):
        base_name, _ = os.path.splitext(rel_path)

        # Output path for descriptors (mirror subfolders)
        des_out = os.path.join(DES_ROOT, tag, base_name + ".npy")
        os.makedirs(os.path.dirname(des_out), exist_ok=True)

        # Skip if already processed
        if os.path.exists(des_out):
            continue

        des = extract_descriptors(img_path)
        if des is not None:
            np.save(des_out, des)

print("================= STARTING FULL DESCRIPTOR EXTRACTION =================\n")

for in_rel, tag in folders:
    input_dir = os.path.join(RAW_ROOT, in_rel)
    process_dataset_folder(input_dir, tag)

print("\nALL DONE. Full SIFT descriptors saved successfully!")




Scanning: D:\5th sem\mini_project\dataset\SOCOFing\Real
Total images found: 6000


Full SIFT Real: 100%|[34mâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ[0m| 6000/6000 [12:14<00:00,  8.17it/s][0m



Scanning: D:\5th sem\mini_project\dataset\SOCOFing\Altered\Altered-Easy
Total images found: 17931


Full SIFT Altered_Easy: 100%|[34mâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ[0m| 17931/17931 [30:21<00:00,  9.84it/s][0m



Scanning: D:\5th sem\mini_project\dataset\SOCOFing\Altered\Altered-Medium
Total images found: 17067


Full SIFT Altered_Medium: 100%|[34mâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ[0m| 17067/17067 [28:07<00:00, 10.11it/s][0m



Scanning: D:\5th sem\mini_project\dataset\SOCOFing\Altered\Altered-Hard
Total images found: 14272


Full SIFT Altered_Hard: 100%|[34mâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ[0m| 14272/14272 [23:09<00:00, 10.27it/s][0m


ALL DONE. Full SIFT descriptors saved successfully!





In [13]:
import os
import cv2
import numpy as np
import random
from collections import defaultdict
from pathlib import Path
import re # <-- Added import for regular expressions

# ========= CONFIGURE THESE =========
# Folder where you saved full descriptors (from NPYSF_full_des script)
DES_ROOT = r"D:\5th sem\mini_project\dataset\NPYSF_full_des"

# Subfolders inside DES_ROOT as used earlier
TAG_FOLDERS = {
    "real": "Real",
    "easy": "Altered_Easy",
    "medium": "Altered_Medium",
    "hard": "Altered_Hard",
}

# For matching tests
NUM_GENUINE_TESTS = 5    # how many genuine pairs to test
NUM_IMPOSTOR_TESTS = 5 # how many impostor pairs to test

# ========= HELPER: PARSE FILENAME (CORRECTED) =========
def parse_filename_from_stem(stem):
    """
    Parse filename stem to extract subject_id, finger, gender, and hand.
    Handles: 
      1. Multiple underscores (e.g., '100__M' -> '100_M').
      2. Finger name split into two parts (e.g., 'index_finger' leading to 5 or 6 parts total).
    """
    # 1. Clean the stem by replacing multiple underscores with a single underscore
    cleaned_stem = re.sub(r"__+", "_", stem)
    parts = cleaned_stem.split("_")
    
    # Analyze the length to determine the structure:
    # 4 parts: [subj, gender, hand, finger] (e.g., 100_M_Left_little)
    # 5 parts: [subj, gender, hand, finger_part1, finger_part2] OR [subj, gender, hand, finger, alter_type]
    # 6 parts: [subj, gender, hand, finger_part1, finger_part2, alter_type]

    if len(parts) < 4:
        raise ValueError(f"Unexpected filename format: {stem}")
    
    subject_id = parts[0]
    gender = parts[1]
    hand = parts[2]
    
    # Check if the finger name is split (parts[3] usually 'index' or 'middle', parts[4] usually 'finger')
    # If the file is altered (5 or 6 parts) or real (5 parts), the finger name is likely split
    if len(parts) >= 5 and parts[4].lower() in ['finger', 'thumb', 'index', 'middle', 'ring', 'little']:
        # If the 5th part is a finger name, we assume parts[3] and parts[4] must be joined.
        finger = "_".join(parts[3:5]) # e.g., 'index' + 'finger' -> 'index_finger'
    else:
        # Otherwise, the finger name is the single part[3] (e.g., 'little', or if only 4 parts)
        finger = parts[3]

    return subject_id, finger, gender, hand

# ========= STEP 5A: STATS PER CATEGORY =========
def compute_keypoint_stats():
# ... (function body is unchanged) ...
    stats = {} 

    for tag, subdir in TAG_FOLDERS.items():
        folder = Path(DES_ROOT) / subdir
        if not folder.exists():
            print(f"[WARN] Folder not found for tag '{tag}': {folder}")
            continue

        counts = []
        for npy_path in folder.rglob("*.npy"):
            des = np.load(str(npy_path))
            if des is None:
                continue
            # des is [N, 128]
            num_kp = des.shape[0]
            counts.append(num_kp)

        if not counts:
            print(f"[INFO] No descriptors found for tag '{tag}'.")
            continue

        counts = np.array(counts, dtype=np.int32)
        stats[tag] = counts

        print(f"\n=== {tag.upper()} ===")
        print(f"  Files: {len(counts)}")
        print(f"  num_keypoints min: {counts.min()}")
        print(f"  num_keypoints max: {counts.max()}")
        print(f"  num_keypoints mean: {counts.mean():.2f}")

    return stats

# ========= BUILD INDEX FOR MATCHING TESTS =========
def build_descriptor_index():
# ... (function body is unchanged) ...
    all_items = []
    groups = defaultdict(list)

    for tag, subdir in TAG_FOLDERS.items():
        folder = Path(DES_ROOT) / subdir
        if not folder.exists():
            continue

        for npy_path in folder.rglob("*.npy"):
            stem = npy_path.stem  # filename without .npy
            try:
                # This will use the corrected parsing function
                subject_id, finger, gender, hand = parse_filename_from_stem(stem) 
            except ValueError as e:
                print(f"[WARN] Skipping {npy_path}: {e}")
                continue

            idx = len(all_items)
            all_items.append({
                "path": str(npy_path),
                "tag": tag,
                "subject_id": subject_id,
                "finger": finger,
                "gender": gender,
                "hand": hand,
            })
            groups[(subject_id, finger)].append(idx)

    print(f"\nIndexed {len(all_items)} descriptor files in total.")
    print(f"Number of (subject_id, finger) groups: {len(groups)}")
    return all_items, groups

# ========= HELPER: BF + RATIO-TEST MATCHING =========
def count_good_matches(des1, des2, ratio_thresh=0.75):
# ... (function body is unchanged) ...
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=False)
    # des1, des2: float32 arrays [N1, 128], [N2, 128]
    matches = bf.knnMatch(des1, des2, k=2)
    good = []
    for m, n in matches:
        if m.distance < ratio_thresh * n.distance:
            good.append(m)
    return len(good)

# ========= STEP 5B: GENUINE VS IMPOSTOR MATCHING TESTS =========
def test_matching(all_items, groups):
# ... (function body is unchanged) ...
    rng = random.Random(42)

    # Build a list of group keys with at least 2 descriptors (for genuine)
    genuine_groups = [k for k, idxs in groups.items() if len(idxs) >= 2]
    if not genuine_groups:
        print("[ERROR] No (subject, finger) group has >= 2 descriptors.")
        return

    # Build a list of all indices by subject_id for impostors
    subject_to_indices = defaultdict(list)
    for i, item in enumerate(all_items):
        subject_to_indices[item["subject_id"]].append(i)
    subjects = list(subject_to_indices.keys())

    print("\n===== GENUINE MATCHES (same subject, same finger) =====")
    for _ in range(NUM_GENUINE_TESTS):
        # Pick a group and then two different indices from that group
        key = rng.choice(genuine_groups)
        idxs = groups[key]
        if len(idxs) < 2:
            continue
        i1, i2 = rng.sample(idxs, 2)

        item1 = all_items[i1]
        item2 = all_items[i2]

        des1 = np.load(item1["path"])
        des2 = np.load(item2["path"])

        good = count_good_matches(des1, des2)

        print(f"  Genuine pair: subj={item1['subject_id']} finger={item1['finger']} "
              f"({item1['tag']} vs {item2['tag']}) -> good_matches={good}")

    print("\n===== IMPOSTOR MATCHES (different subjects) =====")
    for _ in range(NUM_IMPOSTOR_TESTS):
        # Pick two different subjects
        subj1, subj2 = rng.sample(subjects, 2)
        i1 = rng.choice(subject_to_indices[subj1])
        i2 = rng.choice(subject_to_indices[subj2])

        item1 = all_items[i1]
        item2 = all_items[i2]

        des1 = np.load(item1["path"])
        des2 = np.load(item2["path"])

        good = count_good_matches(des1, des2)

        print(f"  Impostor pair: subj1={item1['subject_id']} subj2={item2['subject_id']} "
              f"({item1['tag']} vs {item2['tag']}) -> good_matches={good}")

# ========= MAIN =========
if __name__ == "__main__":
    print("===== STEP 5A: KEYPOINT STATS =====")
    stats = compute_keypoint_stats()

    print("\n===== BUILDING INDEX FOR MATCHING TESTS =====")
    all_items, groups = build_descriptor_index()

    print("\n===== STEP 5B: GENUINE VS IMPOSTOR MATCHING =====")
    test_matching(all_items, groups)

===== STEP 5A: KEYPOINT STATS =====

=== REAL ===
  Files: 6000
  num_keypoints min: 685
  num_keypoints max: 4425
  num_keypoints mean: 1772.72

=== EASY ===
  Files: 17931
  num_keypoints min: 504
  num_keypoints max: 4443
  num_keypoints mean: 1728.86

=== MEDIUM ===
  Files: 17067
  num_keypoints min: 434
  num_keypoints max: 4437
  num_keypoints mean: 1689.70

=== HARD ===
  Files: 14272
  num_keypoints min: 386
  num_keypoints max: 4418
  num_keypoints mean: 1649.13

===== BUILDING INDEX FOR MATCHING TESTS =====

Indexed 55270 descriptor files in total.
Number of (subject_id, finger) groups: 3000

===== STEP 5B: GENUINE VS IMPOSTOR MATCHING =====

===== GENUINE MATCHES (same subject, same finger) =====
  Genuine pair: subj=572 finger=thumb_finger (easy vs real) -> good_matches=1472
  Genuine pair: subj=303 finger=little_finger (easy vs hard) -> good_matches=680
  Genuine pair: subj=203 finger=little_finger (easy vs easy) -> good_matches=899
  Genuine pair: subj=536 finger=ring_fi

In [15]:
import os
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
CSV_PATH = r"D:\5th sem\mini_project\dataset\socofing_index.csv"

# Roots where you saved features
AGG_ROOT = r"D:\5th sem\mini_project\dataset\NPYSF"
DES_ROOT = r"D:\5th sem\mini_project\dataset\NPYSF_full_des"

# Output CSV with feature paths
OUT_CSV = r"D:\5th sem\mini_project\dataset\socofing_index_features.csv"

# Mapping (real_or_altered, alter_level) -> subfolders
AGG_SUBDIRS = {
    ("real", "none"):   "Real_npys",
    ("altered", "easy"):   "Altered_Easy_npys",
    ("altered", "medium"): "Altered_Medium_npys",
    ("altered", "hard"):   "Altered_Hard_npys",
}

DES_SUBDIRS = {
    ("real", "none"):   "Real",
    ("altered", "easy"):   "Altered_Easy",
    ("altered", "medium"): "Altered_Medium",
    ("altered", "hard"):   "Altered_Hard",
}

# ========= MAIN =========
def main():
    df = pd.read_csv(CSV_PATH)

    agg_paths = []
    des_paths = []

    for idx, row in df.iterrows():
        image_path = row["image_path"]
        real_or_altered = str(row["real_or_altered"]).lower()
        alter_level = str(row["alter_level"]).lower()

        key = (real_or_altered, alter_level)
        if key not in AGG_SUBDIRS or key not in DES_SUBDIRS:
            raise ValueError(f"Unexpected RA/alter_level combo at row {idx}: {key}")

        # filename stem from image_path
        img_stem = Path(image_path).stem  # e.g., "001_M_Left_little_finger"

        # Build aggregated vector path
        agg_sub = AGG_SUBDIRS[key]
        agg_path = Path(AGG_ROOT) / agg_sub / (img_stem + ".npy")

        # Build full descriptor path
        des_sub = DES_SUBDIRS[key]
        des_path = Path(DES_ROOT) / des_sub / (img_stem + ".npy")

        agg_paths.append(str(agg_path))
        des_paths.append(str(des_path))

    df["agg_path"] = agg_paths
    df["des_path"] = des_paths

    df.to_csv(OUT_CSV, index=False)
    print(f"Saved extended CSV to: {OUT_CSV}")
    print(f"Rows: {len(df)}")

if __name__ == "__main__":
    main()


Saved extended CSV to: D:\5th sem\mini_project\dataset\socofing_index_features.csv
Rows: 110540


In [17]:
import os
import pandas as pd

CSV_PATH = r"D:\5th sem\mini_project\dataset\socofing_index_features.csv"

df = pd.read_csv(CSV_PATH)

missing_agg = []
missing_des = []

for idx, row in df.iterrows():
    agg_path = row["agg_path"]
    des_path = row["des_path"]

    if not os.path.exists(agg_path):
        missing_agg.append(agg_path)
    if not os.path.exists(des_path):
        missing_des.append(des_path)

print(f"Total rows: {len(df)}")
print(f"Missing agg_path files: {len(missing_agg)}")
print(f"Missing des_path files: {len(missing_des)}")

if missing_agg:
    print("\nExamples of missing agg_path:")
    print("\n".join(missing_agg[:5]))

if missing_des:
    print("\nExamples of missing des_path:")
    print("\n".join(missing_des[:5]))


Total rows: 110540
Missing agg_path files: 0
Missing des_path files: 0
