# Section 1: Hash Matching & Image Fingerprinting

**Goal**: Image ingest → PDQ hashing → near-duplicate matching → hashlist import → quick API surface doc.

**Scope**: Days 1-4 of MVP build. Core hashing infrastructure for blocking known harmful content.

This section builds an in-memory perceptual-hash database, implements Hamming-distance matching against it, and supports importing external hashlist feeds from CSV.

## Sources & References

**PDQ Hashing**:
- https://github.com/facebook/ThreatExchange
- https://github.com/darwinium-com/pdqhash

**Hasher-Matcher-Actioner (HMA)**:
- https://github.com/facebook/ThreatExchange/tree/main/hasher-matcher-actioner

In [None]:
# Environment setup cell: install dependencies, import them, and print their versions.
# NOTE(review): `platform` is not used in any of the visible cells.
import platform
# `get_ipython`, `!cmd` and `%pip` are IPython/Jupyter syntax, so this cell only runs inside a notebook kernel.
if 'google.colab' in str(get_ipython()):
    # On Colab, install a C/C++ toolchain and CMake first
    # (presumably so pdqhash can be built from source — TODO confirm).
    !apt-get update -y && apt-get install -y build-essential cmake

%pip install --quiet pdqhash opencv-python-headless pillow numpy

import pdqhash
import cv2
import numpy as np
from PIL import Image
import os
import json
import time
import csv
from pathlib import Path

# Print library versions so a run of the notebook records its environment.
print(f"PDQ version: {pdqhash.__version__}")
print(f"OpenCV version: {cv2.__version__}")
print(f"PIL version: {Image.__version__}")
print(f"NumPy version: {np.__version__}")

In [None]:
# File suffixes (lower-case, including the leading dot) that ingest_folder
# accepts as images; the suffix of each file is lower-cased before the lookup.
IMG_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
# Label names; the sample hashlist written by this notebook uses these values.
# NOTE(review): no visible function validates labels against this set — confirm intended use.
LABELS = {"CSAM", "NCII", "TERROR"}

def now_ts():
    """Return the current Unix time truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)

def ensure_dirs(*dirs):
    """Create every directory named in *dirs* if it does not already exist.

    Missing parent directories are created too, so nested paths such as
    ``"data/cache/hashes"`` work on a clean checkout. Directories that
    already exist are left untouched.

    Args:
        *dirs: Directory paths (str or path-like).

    Raises:
        FileExistsError: If a path exists but is not a directory.
    """
    for directory in dirs:
        # parents=True: without it, mkdir raises FileNotFoundError for any
        # path whose parent has not been created yet.
        Path(directory).mkdir(parents=True, exist_ok=True)

def to_json(obj):
    """Serialize *obj* to an indented (2-space) JSON string.

    Types that ``json`` cannot encode natively are handled as follows:

    * ``set`` / ``frozenset`` become JSON arrays. Elements are sorted when
      they are mutually comparable so the output is reproducible between
      runs (plain set iteration order is not); otherwise the iteration
      order is used.
    * Anything else is converted with ``str()``.

    Args:
        obj: The object to serialize.

    Returns:
        The JSON text.
    """
    def json_serializer(o):
        if isinstance(o, (set, frozenset)):
            try:
                return sorted(o)
            except TypeError:
                # Mixed, non-comparable element types: fall back to raw order.
                return list(o)
        return str(o)

    return json.dumps(obj, default=json_serializer, indent=2)

In [None]:
def compute_pdq(image_path):
    """Compute a perceptual hash for the image at *image_path*.

    The function is best-effort and never raises. It tries, in order:

    1. PDQ via ``pdqhash.compute`` on the image decoded with PIL.
    2. An 8x8 average hash (aHash) computed with OpenCV.
    3. An all-zero sentinel when the file cannot be decoded at all.

    Args:
        image_path: Path (str or path-like) of the image file.

    Returns:
        A ``(hash_hex, quality)`` tuple where ``hash_hex`` is always 64 hex
        characters:

        * PDQ: the 256-bit hash and the quality reported by ``pdqhash``.
        * aHash fallback: the 64-bit hash left-padded with zeros, quality 50.
        * Failure: ``("0" * 64, 0)``.
    """
    zero_result = ("0" * 64, 0)

    try:
        with Image.open(image_path) as img:
            rgb = img if img.mode == 'RGB' else img.convert('RGB')
            # pdqhash.compute takes an RGB uint8 ndarray, not a PIL image.
            # np.array copies, so the pixels stay valid (and writable) after
            # the file handle is closed.
            pixels = np.array(rgb, dtype=np.uint8)
        hash_bits, quality = pdqhash.compute(pixels)
        if isinstance(hash_bits, int):
            # Keep the original integer formatting in case the installed
            # binding hands back an int instead of a bit vector.
            hash_hex = f"{hash_bits:064x}"
        else:
            # The binding returns a 256-element 0/1 vector; pack 8 bits per
            # byte -> 32 bytes -> 64 hex characters.
            # NOTE(review): assumes the vector is MSB-first — confirm the hex
            # matches reference PDQ hashes before exchanging with other feeds.
            packed = np.packbits(np.asarray(hash_bits).astype(np.uint8).ravel())
            hash_hex = packed.tobytes().hex()
        if len(hash_hex) != 64:
            raise ValueError(f"unexpected PDQ hash length: {len(hash_hex)}")
        return hash_hex, int(quality)
    except Exception:
        # Deliberate best-effort: any PDQ failure drops to the aHash fallback.
        pass

    try:
        # str(): older OpenCV builds do not accept path-like objects.
        bgr = cv2.imread(str(image_path))
        if bgr is None:
            # imread reports an unreadable file by returning None, not raising.
            return zero_result
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (8, 8))
        bits = (resized > resized.mean()).astype(np.uint8)
        hash_int = int(''.join(bits.flatten().astype(str)), 2)
        # 64-bit aHash occupies the low 16 hex digits of the 64-digit string.
        return f"{hash_int:016x}".zfill(64), 50
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return zero_result

def hex_to_int(hex_str):
    """Parse a hexadecimal string and return its integer value."""
    value = int(hex_str, base=16)
    return value

def hamming_distance_hex(hex1, hex2):
    """Return the number of bit positions in which two hex-encoded values differ."""
    # XOR leaves a 1 exactly where the two values disagree.
    differing = hex_to_int(hex1) ^ hex_to_int(hex2)
    return format(differing, 'b').count('1')

In [None]:
# In-memory hash store: media_id -> {"hash", "quality", "source", "labels"}.
HASH_DB = {}


def add_hash(media_id, hash_hex, quality, source="user", labels=None):
    """Insert (or overwrite) the HASH_DB entry for *media_id*.

    Args:
        media_id: Key under which the entry is stored.
        hash_hex: Hex-encoded hash string.
        quality: Quality score associated with the hash.
        source: Origin tag for the entry (default ``"user"``).
        labels: ``None``, a single label string, or any iterable of labels.

    The stored ``"labels"`` value is always a fresh ``set``: the caller's
    collection is copied, so later in-place updates of the entry (for example
    adding a label during a hashlist import) never leak back into an object
    the caller still holds. A bare string is treated as one label rather
    than being stored as-is or split into characters.
    """
    if labels is None:
        label_set = set()
    elif isinstance(labels, str):
        label_set = {labels} if labels else set()
    else:
        label_set = set(labels)

    HASH_DB[media_id] = {
        "hash": hash_hex,
        "quality": quality,
        "source": source,
        "labels": label_set,
    }

def get_hash_count():
    """Return the number of entries currently stored in HASH_DB."""
    total = len(HASH_DB)
    return total

def get_hash_info(media_id):
    """Return the HASH_DB entry stored under *media_id*, or None if there is none."""
    try:
        return HASH_DB[media_id]
    except KeyError:
        return None

def clear_hash_db():
    """Remove every entry from HASH_DB in place; the dict object itself is kept."""
    HASH_DB.clear()
    return None

In [None]:
def ingest_folder(folder_path, source="user", prefix=""):
    """Hash every image file below *folder_path* and store it via add_hash.

    The folder is walked recursively with ``os.walk``. A file counts as an
    image when its lower-cased suffix is in ``IMG_EXTS``. Each stored entry
    gets the id ``"{prefix}{stem}_{timestamp}_{n}"``, where ``n`` is the
    number of files ingested so far in this call.

    Args:
        folder_path: Root directory to scan.
        source: Source tag passed through to add_hash.
        prefix: String prepended to every generated media id.

    Returns:
        ``(count, errors)``: files stored, and files whose processing raised
        (those are reported with ``print`` and skipped).
    """
    ingested = 0
    failed = 0
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            name = Path(filename)
            if name.suffix.lower() not in IMG_EXTS:
                continue

            image_path = os.path.join(current_dir, filename)
            media_id = f"{prefix}{name.stem}_{now_ts()}_{ingested}"
            try:
                hash_hex, quality = compute_pdq(image_path)
                add_hash(media_id, hash_hex, quality, source)
            except Exception as exc:
                print(f"Error processing {image_path}: {exc}")
                failed += 1
            else:
                ingested += 1
    return ingested, failed

def match_hash(query_hash, max_distance=30, topk=50):
    """Brute-force search of HASH_DB for hashes close to *query_hash*.

    Args:
        query_hash: Hex-encoded hash to look up.
        max_distance: Largest Hamming distance (inclusive) that still matches.
        topk: Slice bound applied to the sorted result list.

    Returns:
        A list of ``(media_id, distance)`` tuples sorted by ascending
        distance, cut to ``[:topk]``.
    """
    scored = (
        (media_id, hamming_distance_hex(query_hash, entry["hash"]))
        for media_id, entry in HASH_DB.items()
    )
    within_range = [pair for pair in scored if pair[1] <= max_distance]
    # list.sort is stable, so equal distances keep HASH_DB insertion order.
    within_range.sort(key=lambda pair: pair[1])
    return within_range[:topk]

def match_hash_optimized(query_hash, max_distance=30, topk=50):
    """Search HASH_DB for hashes close to *query_hash*, pruning as it goes.

    The query is parsed to an int once; the store is then scanned in chunks
    and, whenever more than ``2 * topk`` candidates have accumulated, the
    candidate list is cut back to the ``topk`` closest so it stays small.

    Args:
        query_hash: Hex-encoded hash to look up.
        max_distance: Largest Hamming distance (inclusive) that still matches.
        topk: Slice bound applied to the sorted result list.

    Returns:
        A list of ``(media_id, distance)`` tuples sorted by ascending
        distance, cut to ``[:topk]``. An empty HASH_DB yields ``[]``.
    """
    query_int = hex_to_int(query_hash)
    matches = []

    items = list(HASH_DB.items())
    # Fixed chunk size. Deriving it from len(items) gave 0 for an empty store
    # and range() raises ValueError on a zero step; a store smaller than one
    # chunk is still processed as a single chunk.
    chunk_size = 1000

    for start in range(0, len(items), chunk_size):
        for media_id, data in items[start:start + chunk_size]:
            distance = bin(query_int ^ hex_to_int(data["hash"])).count('1')
            if distance <= max_distance:
                matches.append((media_id, distance))

        # Prune between chunks; the stable sort keeps earlier entries first
        # among equal distances.
        if len(matches) > topk * 2:
            matches.sort(key=lambda x: x[1])
            matches = matches[:topk]

    matches.sort(key=lambda x: x[1])
    return matches[:topk]

def benchmark_matching(num_hashes=1000, num_queries=100):
    """Time match_hash against match_hash_optimized on synthetic hashes.

    HASH_DB is temporarily replaced by *num_hashes* synthetic entries
    (``bench_<i>``); every tenth entry carries the label ``"TEST"``. Both
    matchers are then run over *num_queries* synthetic query hashes with
    ``max_distance=20`` and ``topk=10``. The previous contents of HASH_DB
    are restored afterwards, even if a matcher raises.

    Args:
        num_hashes: Number of synthetic hashes to load.
        num_queries: Number of query hashes to run through each matcher.

    Returns:
        ``{"brute_force_ms": float, "optimized_ms": float, "speedup": float}``
        where the times are totals in milliseconds.
    """
    print(f"\nBenchmarking with {num_hashes} hashes and {num_queries} queries...")

    original_db = HASH_DB.copy()
    clear_hash_db()
    try:
        for i in range(num_hashes):
            hash_hex = f"{i:016x}" + "0" * 48
            add_hash(f"bench_{i}", hash_hex, 90, "benchmark", {"TEST"} if i % 10 == 0 else set())

        test_hashes = [f"{i*17:016x}" + "0" * 48 for i in range(num_queries)]

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments and is too coarse for short runs.
        start_time = time.perf_counter()
        for query_hash in test_hashes:
            match_hash(query_hash, max_distance=20, topk=10)
        brute_time = (time.perf_counter() - start_time) * 1000

        start_time = time.perf_counter()
        for query_hash in test_hashes:
            match_hash_optimized(query_hash, max_distance=20, topk=10)
        optimized_time = (time.perf_counter() - start_time) * 1000
    finally:
        # Always put the caller's data back, otherwise a failure mid-benchmark
        # would leave HASH_DB holding only synthetic entries.
        HASH_DB.clear()
        HASH_DB.update(original_db)

    speedup = brute_time / optimized_time if optimized_time > 0 else float('inf')
    # Avoid ZeroDivisionError in the per-query figures when num_queries is 0.
    per_query_divisor = max(num_queries, 1)

    print(f"  Brute force: {brute_time:.1f}ms ({brute_time/per_query_divisor:.2f}ms per query)")
    print(f"  Optimized: {optimized_time:.1f}ms ({optimized_time/per_query_divisor:.2f}ms per query)")
    print(f"  Speedup: {speedup:.1f}x")

    return {"brute_force_ms": brute_time, "optimized_ms": optimized_time, "speedup": speedup}

In [None]:
def import_csv_hashlist(csv_path, source):
    """Import a CSV hashlist with ``hash_hex`` and ``label`` columns into HASH_DB.

    Rows whose ``hash_hex`` is not exactly 64 hexadecimal characters are
    skipped. Hashes are stored lower-cased with quality 100. The entry id is
    ``"{source}:{first 12 hex chars}"``; if that id is already taken by a
    *different* hash, the full hash is used as the suffix instead so the new
    hash is not silently merged into the other entry. A row that repeats an
    already-imported hash only adds its (non-empty) label to that entry.

    Args:
        csv_path: Path of the CSV file (header row required).
        source: Source tag stored on each entry and used as the id prefix.

    Returns:
        Number of rows accepted (newly stored or merged).
    """
    hex_digits = set("0123456789abcdef")
    count = 0
    # utf-8-sig drops a leading BOM, which would otherwise become part of the
    # first header name and make every 'hash_hex' lookup miss.
    with open(csv_path, 'r', newline='', encoding='utf-8-sig') as f:
        for row in csv.DictReader(f):
            # DictReader fills columns missing from a short row with None,
            # so coalesce before calling strip().
            hash_hex = (row.get('hash_hex') or '').strip().lower()
            label = (row.get('label') or '').strip()

            # Reject non-hex strings here; they would only fail later when a
            # matcher tries to parse them.
            if len(hash_hex) != 64 or not hex_digits.issuperset(hash_hex):
                continue

            media_id = f"{source}:{hash_hex[:12]}"
            entry = HASH_DB.get(media_id)
            if entry is not None and str(entry["hash"]).lower() != hash_hex:
                # Prefix collision with a different hash: use the full hash.
                media_id = f"{source}:{hash_hex}"
                entry = HASH_DB.get(media_id)

            if entry is None:
                add_hash(media_id, hash_hex, 100, source, {label} if label else set())
            elif label:
                # Only real labels are merged; an empty cell adds nothing.
                entry["labels"].add(label)
            count += 1
    return count

def export_hashlist_csv(output_path):
    """Write every HASH_DB entry to a CSV file at *output_path*.

    Columns: ``media_id, hash_hex, quality, source, labels``. The labels of
    an entry are joined with commas in sorted order so repeated exports of
    the same data produce identical files.

    Args:
        output_path: Destination file; overwritten if it exists.

    Returns:
        Number of entries written.
    """
    # Explicit UTF-8 so the export does not depend on the platform's default
    # encoding.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['media_id', 'hash_hex', 'quality', 'source', 'labels'])
        for media_id, data in HASH_DB.items():
            labels = data['labels'] or ()
            if isinstance(labels, str):
                # A bare string is one label, not a sequence of characters.
                labels = (labels,)
            labels_str = ','.join(sorted(str(label) for label in labels))
            writer.writerow([media_id, data['hash'], data['quality'], data['source'], labels_str])
    return len(HASH_DB)

def create_sample_hashlist(output_path):
    """Write a small demo hashlist CSV to *output_path*.

    The file has a ``hash_hex,label`` header followed by three rows, each a
    16-character hex prefix padded with 48 zeros plus a label.

    Returns:
        The number of data rows written (header excluded).
    """
    padding = '0' * 48
    entries = (
        ('1234567890abcdef', 'CSAM'),
        ('fedcba0987654321', 'TERROR'),
        ('abcdef1234567890', 'NCII'),
    )

    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['hash_hex', 'label'])
        for hex_prefix, label in entries:
            writer.writerow([hex_prefix + padding, label])
    return len(entries)

In [None]:
# Make sure the output directory for the demo assets exists.
ensure_dirs('samples')


def create_sample_image(path, color, size=(100, 100)):
    """Save a solid-colour RGB image of *size* pixels to *path*."""
    Image.new('RGB', size, color).save(path)

# End-to-end demo: create two solid-colour images, ingest them, look one up,
# import a sample hashlist, then run a small benchmark.
sample_paths = [
    'samples/red_square.jpg',
    'samples/blue_square.jpg'
]

create_sample_image(sample_paths[0], (255, 0, 0))
create_sample_image(sample_paths[1], (0, 0, 255))

# The second positional argument is `source`, so entries are tagged "demo"
# and the media-id prefix stays empty.
ingested, errors = ingest_folder('samples', 'demo')
print(f"Ingested {ingested} sample images ({errors} errors)")

if sample_paths[0] and os.path.exists(sample_paths[0]):
    demo_hash, demo_quality = compute_pdq(sample_paths[0])
    print(f"Demo PDQ hash: {demo_hash}")
    print(f"Demo quality: {demo_quality}")

    matches = match_hash(demo_hash, max_distance=10)
    print(f"Matches (distance ≤ 10): {matches}")

sample_csv = 'samples/sample_hashes.csv'
created_hashes = create_sample_hashlist(sample_csv)
imported_hashes = import_csv_hashlist(sample_csv, 'threat_feed')
print(f"Created and imported {imported_hashes} threat hashes")

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\nTotal hashes in DB: {get_hash_count()}")
for media_id, data in list(HASH_DB.items())[:3]:
    print(f"  {media_id}: {data['hash'][:16]}... (labels: {data['labels']})")

bench_result = benchmark_matching(100, 20)
print(f"\nPerformance benchmark complete: {bench_result['speedup']:.1f}x speedup")

## API Surface (For Integration)

### POST /hash
Compute the perceptual hash and quality score of an uploaded, base64-encoded image.
```json
{
  "image": "<base64_encoded_image>",
  "media_id": "optional_id"
}
```
Response:
```json
{
  "hash_hex": "abc123...",
  "quality": 85,
  "media_id": "img_1234567890"
}
```

### POST /match
Find stored hashes within `max_distance` bits (Hamming distance) of `hash_hex`, returning at most `topk` matches.
```json
{
  "hash_hex": "abc123...",
  "max_distance": 30,
  "topk": 50
}
```
Response:
```json
{
  "matches": [
    {"media_id": "known_bad_123", "distance": 5, "labels": ["CSAM"]},
    {"media_id": "similar_456", "distance": 12, "labels": []}
  ]
}
```

## Day-by-Day Checklist (Days 1-4)

### Day 1: Core Hashing
- [x] Set up PDQ hashing with fallback to aHash
- [x] Implement Hamming distance calculation
- [x] Create in-memory hash database
- [x] Test with sample images

### Day 2: Matching & Performance
- [x] Implement brute-force hash matching
- [x] Add optimized matching with chunked processing
- [x] Test matching accuracy and performance
- [ ] Benchmark with larger datasets (1000+ hashes)

### Day 3: Import/Export
- [x] CSV hashlist import functionality
- [x] CSV export for backup
- [x] Sample threat hash creation
- [ ] Integration with external hash feeds (HMA/ThreatExchange)

### Day 4: Integration Prep
- [x] API specification documentation
- [ ] Performance optimization for production scale
- [ ] Error handling improvements
- [ ] Integration testing with Section 2