### Data Consistency Checking

In [None]:
import os
import pandas as pd

# Root of the face-identification dataset (a Kaggle input mount, judging by the path).
BASE_DIR = '/kaggle/input/surveillance-for-retail-stores/face_identification/face_identification'
# Image folders for the training and test splits.
TRAIN_DIR = os.path.join(BASE_DIR, 'train')
TEST_DIR = os.path.join(BASE_DIR, 'test')

# CSV manifests that list the images belonging to each split.
train_csv_path = os.path.join(BASE_DIR, 'trainset.csv')
eval_csv_path = os.path.join(BASE_DIR, 'eval_set.csv')

# Both frames are read through their 'image_path' column by the checks that follow.
train_df = pd.read_csv(train_csv_path)
eval_df = pd.read_csv(eval_csv_path)

In [None]:
def _find_missing(relative_paths, root_dir):
    """Return the full paths (root_dir joined with each entry) that do not exist on disk.

    The result keeps the order of `relative_paths`.
    """
    candidates = (os.path.join(root_dir, rel) for rel in relative_paths)
    return [full for full in candidates if not os.path.exists(full)]


def _print_file_listing(paths, header, all_good_message):
    """Print `header` and one bullet per path, or `all_good_message` when `paths` is empty."""
    if not paths:
        print(all_good_message)
        return
    print(header)
    for entry in paths:
        print(f"  - {entry}")


def _to_csv_style(path):
    """Convert an OS-specific relative path to the forward-slash form used in the CSVs."""
    return path.replace(os.sep, '/')


print("=== Checking Training Set Consistency ===")
# For training images, CSV paths are like "train/person_100/85.jpg"
missing_train_files = _find_missing(train_df['image_path'], BASE_DIR)
_print_file_listing(
    missing_train_files,
    "Missing training files (from CSV):",
    "All files listed in trainset.csv exist.",
)

# Check for extra files in TRAIN_DIR not in CSV
csv_train_files = {
    path[len("train/"):] if path.startswith("train/") else path
    for path in train_df['image_path']
}

# Paths are normalised to '/' so they compare equal to CSV entries on any OS.
actual_train_files = [
    _to_csv_style(os.path.relpath(os.path.join(root, file), TRAIN_DIR))
    for root, _dirs, files in os.walk(TRAIN_DIR)
    for file in files
]

extra_train_files = set(actual_train_files) - csv_train_files

# Sorted so the listing is stable from run to run (set order is arbitrary).
_print_file_listing(
    sorted(extra_train_files),
    "\nExtra training files in the train directory (not listed in CSV):",
    "No extra training files found; CSV matches the train directory.",
)

# === Checking Evaluation Set Consistency ===
print("\n=== Checking Evaluation Set Consistency ===")
# For evaluation, CSV image names (e.g., "9198.jpg") should be in TEST_DIR
missing_eval_files = _find_missing(eval_df['image_path'], TEST_DIR)
_print_file_listing(
    missing_eval_files,
    "Missing evaluation files (from CSV):",
    "All files listed in eval_set.csv exist in the test directory.",
)

# Check for extra files in TEST_DIR not in CSV
csv_eval_files = set(eval_df['image_path'].tolist())
actual_eval_files = [f for f in os.listdir(TEST_DIR) if os.path.isfile(os.path.join(TEST_DIR, f))]
extra_eval_files = set(actual_eval_files) - csv_eval_files

_print_file_listing(
    sorted(extra_eval_files),
    "\nExtra evaluation files in the test directory (not listed in CSV):",
    "No extra evaluation files found; CSV matches the test directory.",
)

# === Numerical Report ===
print("\n=== Numerical Report ===")

# Training Set Statistics
print("\n--- Training Set Statistics ---")
total_train_images = len(train_df)
print(f"Total images in CSV: {total_train_images}")

# Extract person_id (e.g., "train/person_100/85.jpg" -> "person_100")
train_df['person_id'] = train_df['image_path'].apply(lambda x: x.split('/')[1])
unique_persons_train = train_df['person_id'].nunique()
print(f"Unique persons: {unique_persons_train}")

# Distribution of images per person
images_per_person = train_df.groupby('person_id').size()
print("Images per person:")
print(f"  Min: {images_per_person.min()}")
print(f"  Max: {images_per_person.max()}")
print(f"  Mean: {images_per_person.mean():.2f}")
print(f"  Median: {images_per_person.median()}")
print(f"  Std Dev: {images_per_person.std():.2f}")

print(f"Missing files: {len(missing_train_files)}")
print(f"Extra files: {len(extra_train_files)}")

# Test Set Statistics
print("\n--- Test Set Statistics ---")
total_test_images = len(eval_df)
print(f"Total images in CSV: {total_test_images}")

# Identity statistics are only available when the evaluation CSV carries labels.
eval_has_labels = 'label' in eval_df.columns
if eval_has_labels:
    unique_persons_test = eval_df['label'].nunique()
    print(f"Unique persons: {unique_persons_test}")
    train_persons = set(train_df['person_id'])
    test_persons = set(eval_df['label'])
    unseen_persons = test_persons - train_persons
    print(f"Unseen persons: {len(unseen_persons)}")
else:
    print("Unique persons: N/A (no 'label' column)")

print(f"Missing files: {len(missing_eval_files)}")
print(f"Extra files: {len(extra_eval_files)}")

# Summary
print("\n--- Summary ---")
print(f"Training images: {total_train_images} (unique persons: {unique_persons_train})")
print(f"Test images: {total_test_images}")
if eval_has_labels:
    print(f"Test unique persons: {unique_persons_test} (unseen: {len(unseen_persons)})")
else:
    print("Test unique persons: N/A")
print(f"Total missing files: {len(missing_train_files) + len(missing_eval_files)}")
print(f"Total extra files: {len(extra_train_files) + len(extra_eval_files)}")

<br><br><br><br>

### Data Is Consistent, We Can Move Forward Now

In [None]:
%%capture
# Install DeepFace for the embedding cells below; %%capture hides this cell's output.
!pip install deepface

#### Creating Validation Set

In [None]:
import os
import random
import statistics

# Organize dataset into training and validation sets
train_files, val_files = {}, {}

print("Organizing dataset...")

# A dedicated seeded generator plus sorted directory listings make the 80/20
# split identical on every run. Embedding caches written to disk later are
# keyed only by model and split name, so a split that changes between runs
# would silently be evaluated against stale cached embeddings.
split_rng = random.Random(42)

person_dirs = sorted(d for d in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, d)))
for person in person_dirs:
    person_path = os.path.join(TRAIN_DIR, person)
    images = sorted(
        os.path.join(person_path, f)
        for f in os.listdir(person_path)
        if os.path.isfile(os.path.join(person_path, f))
    )
    if not images:
        continue
    split_rng.shuffle(images)
    # Keep at least one training image: with a single image int(0.8 * 1) is 0,
    # which would leave the person validation-only and impossible to recognise.
    split_idx = max(1, int(0.8 * len(images)))
    train_files[person], val_files[person] = images[:split_idx], images[split_idx:]

# Function to generate report
def generate_report(file_dict, dataset_name):
    """Print image-count statistics for a split.

    Args:
        file_dict: Mapping of person name to that person's list of image paths.
        dataset_name: Label used in the report heading (e.g. "Training").

    The total and the mean cover every person in `file_dict`; minimum,
    maximum and median only consider persons that have at least one image.
    """
    counts = [len(paths) for paths in file_dict.values()]
    total_images = sum(counts)
    num_persons = len(counts)
    populated = [count for count in counts if count > 0]

    # Defaults apply when no person has any image at all.
    min_images = max_images = mean_images = median_images = 0
    if populated:
        min_images, max_images = min(populated), max(populated)
        mean_images = total_images / num_persons
        median_images = statistics.median(populated)

    report_lines = (
        f"\n{dataset_name} Dataset Report:",
        f"Total number of images: {total_images}",
        f"Number of unique persons: {num_persons}",
        "Images per person:",
        f"  Minimum: {min_images}",
        f"  Maximum: {max_images}",
        f"  Mean: {mean_images:.2f}",
        f"  Median: {median_images}",
    )
    for line in report_lines:
        print(line)

# Report the training split first, then the validation split
for split_name, split_files in (("Training", train_files), ("Validation", val_files)):
    generate_report(split_files, split_name)

<br><br><br><br>

### DeepFace Embedding Similarity Trial

In [None]:
import numpy as np
import pandas as pd
import os
import random
import time
from tqdm import tqdm
from deepface import DeepFace
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

# Configuration
# Model names passed one at a time to DeepFace.represent(model_name=...).
MODELS = ['Facenet', 'Facenet512', 'VGG-Face', 'Dlib']
# Root folder for cached embeddings and per-model results (one sub-folder per model).
EMBEDDINGS_DIR = 'embeddings'
# Passed to DeepFace.represent as detector_backend.
BACKEND = 'retinaface'
# Passed to DeepFace.represent as align.
USE_ALIGNMENT = True
# Default cosine-similarity cut-off in predict_person; below it the prediction is "unknown".
SIMILARITY_THRESHOLD = 0.3
RANDOM_SEED = 42

# Ensure embeddings directory exists
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
for model_name in MODELS:
    os.makedirs(os.path.join(EMBEDDINGS_DIR, model_name), exist_ok=True)

# Seeds the global `random` module from this point on; code that already ran is unaffected.
random.seed(RANDOM_SEED)

def get_embedding_deepface(image_path, model_name='Facenet512'):
    """Return the embedding of the first face DeepFace reports for an image.

    Args:
        image_path: Path of the image handed to DeepFace.represent.
        model_name: DeepFace model used to compute the embedding.

    Returns:
        A numpy array built from the first result's 'embedding' entry, or
        None when DeepFace returns nothing usable or raises. Failures are
        printed, never propagated, so one bad image cannot stop a batch.
    """
    embedding = None
    try:
        faces = DeepFace.represent(
            img_path=image_path,
            model_name=model_name,
            detector_backend=BACKEND,
            align=USE_ALIGNMENT,
            enforce_detection=False
        )
        if faces:
            first_face = faces[0]
            if 'embedding' in first_face:
                embedding = np.array(first_face['embedding'])
    except Exception as e:
        print(f"Error processing {image_path} with {model_name}: {e}")
    return embedding

def compute_and_save_embeddings(file_dict, dataset_type, model_name):
    """Return the embeddings archive for a split, computing it on first use.

    Args:
        file_dict: Mapping of person name to that person's image paths.
        dataset_type: Split name used in the cache file name (e.g. 'train').
        model_name: DeepFace model; also the cache sub-folder name.

    Returns:
        The object returned by np.load for the cache file. It holds three
        pickled dicts: 'embeddings' (person -> mean embedding, only for
        persons with at least one usable image), 'person_embeddings'
        (person -> list of per-image embeddings) and 'image_paths'
        (person -> paths of the images that produced those embeddings).

    NOTE(review): the cache is looked up by model and split name only; it is
    reused even when `file_dict` differs from the one it was built from.
    """
    cache_file = os.path.join(EMBEDDINGS_DIR, model_name, f"{dataset_type}_embeddings.npz")

    if not os.path.exists(cache_file):
        print(f"Computing {model_name} embeddings for {dataset_type} dataset...")
        mean_by_person, vectors_by_person, paths_by_person = {}, {}, {}

        progress = tqdm(file_dict, desc=f"Processing {dataset_type} embeddings with {model_name}")
        for person in progress:
            kept_vectors, kept_paths = [], []
            for img_path in file_dict[person]:
                vector = get_embedding_deepface(img_path, model_name=model_name)
                if vector is None:
                    continue
                kept_vectors.append(vector)
                kept_paths.append(img_path)

            vectors_by_person[person] = kept_vectors
            paths_by_person[person] = kept_paths
            if kept_vectors:
                # One representative vector per person: the mean of its image embeddings
                mean_by_person[person] = np.mean(kept_vectors, axis=0)

        np.savez(
            cache_file,
            embeddings=mean_by_person,
            person_embeddings=vectors_by_person,
            image_paths=paths_by_person
        )
        print(f"Saved {model_name} embeddings for {dataset_type} dataset to {cache_file}")
    else:
        print(f"Loading existing {model_name} embeddings for {dataset_type} dataset")

    # Always hand back the on-disk archive so fresh and cached runs look alike.
    return np.load(cache_file, allow_pickle=True)

def predict_person(test_emb, train_embeddings, threshold=SIMILARITY_THRESHOLD):
    """Return the most similar known person, or "unknown".

    Args:
        test_emb: Embedding of the query image.
        train_embeddings: Mapping of person name to reference embedding.
        threshold: Minimum cosine similarity required to accept the match.

    Returns:
        The person whose reference embedding has the highest cosine
        similarity to `test_emb` when that similarity is at least
        `threshold`; otherwise "unknown" (also when there are no references).
    """
    if not train_embeddings:
        return "unknown"

    query = test_emb.reshape(1, -1)
    best_person, best_score = None, None
    for candidate, reference in train_embeddings.items():
        score = cosine_similarity(query, reference.reshape(1, -1))[0][0]
        # Strict '>' keeps the earliest candidate on ties, as max() does.
        if best_score is None or score > best_score:
            best_person, best_score = candidate, score

    if best_score is None:
        return "unknown"
    if best_score >= threshold:
        return best_person
    return "unknown"

def _predict_stored_embeddings(person_embeddings, reference_embeddings, desc):
    """Classify every stored per-image embedding against the reference identities.

    Returns two parallel lists: the true person of each embedding and the
    label `predict_person` assigns to it.
    """
    true_labels, predicted_labels = [], []
    for person in tqdm(person_embeddings, desc=desc):
        for emb in person_embeddings[person]:
            true_labels.append(person)
            predicted_labels.append(predict_person(emb, reference_embeddings))
    return true_labels, predicted_labels


def _weighted_metrics(y_true, y_pred):
    """Return accuracy plus weighted-average F1, recall and precision as a dict."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
    }


def evaluate_model(train_files, val_files, model_name):
    """Evaluate one DeepFace model on the training and validation splits.

    Every image embedding is classified against the per-person mean
    embeddings of the training split. The per-image embeddings stored by
    `compute_and_save_embeddings` are reused here, so each image goes
    through DeepFace only once instead of once for the archive and again
    for the evaluation.

    Args:
        train_files: Mapping of person name to training image paths.
        val_files: Mapping of person name to validation image paths.
        model_name: DeepFace model to evaluate.

    Returns:
        A dict with 'model_name', the true/predicted label lists for both
        splits ('train_true', 'train_pred', 'val_true', 'val_pred') and the
        metric dicts 'train_metrics' and 'val_metrics'. The same dict is
        saved to <EMBEDDINGS_DIR>/<model_name>/results.npz.
    """
    # Load or compute embeddings
    train_data = compute_and_save_embeddings(train_files, 'train', model_name)
    val_data = compute_and_save_embeddings(val_files, 'val', model_name)

    # Per-person mean vectors act as the gallery; per-image vectors are the queries.
    train_embeddings = train_data['embeddings'].item()
    train_image_embeddings = train_data['person_embeddings'].item()
    val_image_embeddings = val_data['person_embeddings'].item()

    # Evaluate on training set
    print(f"\nEvaluating {model_name} on training set...")
    train_true, train_pred = _predict_stored_embeddings(
        train_image_embeddings, train_embeddings, "Processing training evaluation"
    )

    # Evaluate on validation set
    print(f"\nEvaluating {model_name} on validation set...")
    val_true, val_pred = _predict_stored_embeddings(
        val_image_embeddings, train_embeddings, "Processing validation evaluation"
    )

    # Calculate metrics
    train_metrics = _weighted_metrics(train_true, train_pred)
    val_metrics = _weighted_metrics(val_true, val_pred)

    # Print metrics
    print(f"\n{model_name} Results:")
    print(f"Training set - Accuracy: {train_metrics['accuracy']:.4f}, F1: {train_metrics['f1']:.4f}, Recall: {train_metrics['recall']:.4f}, Precision: {train_metrics['precision']:.4f}")
    print(f"Validation set - Accuracy: {val_metrics['accuracy']:.4f}, F1: {val_metrics['f1']:.4f}, Recall: {val_metrics['recall']:.4f}, Precision: {val_metrics['precision']:.4f}")

    # Print detailed classification report for validation set
    print(f"\nDetailed {model_name} Classification Report for Validation Set:")
    print(classification_report(val_true, val_pred, zero_division=0))

    # Store results
    results = {
        'model_name': model_name,
        'train_true': train_true,
        'train_pred': train_pred,
        'val_true': val_true,
        'val_pred': val_pred,
        'train_metrics': train_metrics,
        'val_metrics': val_metrics,
    }

    # Save results to file
    results_path = os.path.join(EMBEDDINGS_DIR, model_name, 'results.npz')
    np.savez(results_path, **results)
    print(f"Saved {model_name} results to {results_path}")

    return results

def run_full_pipeline(train_files, val_files):
    """Evaluate every model in MODELS and write a side-by-side comparison.

    Args:
        train_files: Mapping of person name to training image paths.
        val_files: Mapping of person name to validation image paths.

    Returns:
        Dict mapping each model name to the result dict returned by
        `evaluate_model`. The comparison table is also printed and saved to
        <EMBEDDINGS_DIR>/model_comparison_summary.csv.
    """
    all_results = {}
    banner = '=' * 50

    for model_name in MODELS:
        print(f"\n{banner}")
        print(f"Processing model: {model_name}")
        print(banner)

        start_time = time.time()
        all_results[model_name] = evaluate_model(train_files, val_files, model_name)
        elapsed_time = time.time() - start_time
        print(f"Time taken for {model_name}: {elapsed_time:.2f} seconds")

    # One row per model with its training and validation metrics.
    summary_df = pd.DataFrame([
        {
            'Model': model_name,
            'Train Accuracy': results['train_metrics']['accuracy'],
            'Train F1': results['train_metrics']['f1'],
            'Train Recall': results['train_metrics']['recall'],
            'Train Precision': results['train_metrics']['precision'],
            'Val Accuracy': results['val_metrics']['accuracy'],
            'Val F1': results['val_metrics']['f1'],
            'Val Recall': results['val_metrics']['recall'],
            'Val Precision': results['val_metrics']['precision']
        }
        for model_name, results in all_results.items()
    ])

    # Save summary to CSV
    summary_path = os.path.join(EMBEDDINGS_DIR, 'model_comparison_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"\nSaved model comparison summary to {summary_path}")

    # Print summary table
    print("\nModel Comparison Summary:")
    print(summary_df.to_string(index=False))

    return all_results

# Entry point: evaluate all configured models on the prepared split
if __name__ == "__main__":
    for banner_line in ("Initializing face recognition pipeline...",
                        "Starting multi-model evaluation..."):
        print(banner_line)
    all_results = run_full_pipeline(train_files, val_files)
    print("Process completed successfully!")

### Results Visualizations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score
from collections import Counter
import os
import argparse

def _one_vs_rest_scores(true_labels, pred_labels, person):
    """Return (precision, recall, f1) for `person` treated as the positive class."""
    is_true = (true_labels == person).astype(int)
    is_pred = (pred_labels == person).astype(int)
    return (
        precision_score(is_true, is_pred, zero_division=0),
        recall_score(is_true, is_pred, zero_division=0),
        f1_score(is_true, is_pred, zero_division=0),
    )


def generate_person_classification_summary(train_true, train_pred, val_true, val_pred):
    """Build a per-person table of frequency, precision, recall and F1.

    Args:
        train_true, train_pred: True and predicted labels of the training split.
        val_true, val_pred: True and predicted labels of the validation split.

    Returns:
        A DataFrame sorted by 'person_id' with one row per true label that
        contains the text 'person'; other labels are skipped. 'person_id' is
        the integer after the last underscore of the label. When no such
        label exists the frame is empty but still has all columns.
    """
    columns = [
        'person', 'person_id',
        'train_frequency', 'train_precision', 'train_recall', 'train_f1',
        'val_frequency', 'val_precision', 'val_recall', 'val_f1',
    ]

    train_freq = Counter(train_true)
    val_freq = Counter(val_true)

    # Convert once up front instead of once per person inside the loop.
    train_true_arr, train_pred_arr = np.asarray(train_true), np.asarray(train_pred)
    val_true_arr, val_pred_arr = np.asarray(val_true), np.asarray(val_pred)

    # np.unique already returns sorted, de-duplicated labels.
    all_labels = np.unique(np.concatenate([train_true_arr, val_true_arr]))
    persons = [label for label in all_labels if 'person' in str(label)]

    results = []
    for person in persons:
        train_precision, train_recall, train_f1 = _one_vs_rest_scores(
            train_true_arr, train_pred_arr, person
        )
        val_precision, val_recall, val_f1 = _one_vs_rest_scores(
            val_true_arr, val_pred_arr, person
        )

        results.append({
            'person': person,
            'person_id': int(str(person).split('_')[-1]),
            'train_frequency': train_freq.get(person, 0),
            'train_precision': train_precision,
            'train_recall': train_recall,
            'train_f1': train_f1,
            'val_frequency': val_freq.get(person, 0),
            'val_precision': val_precision,
            'val_recall': val_recall,
            'val_f1': val_f1
        })

    # Explicit columns keep 'person_id' present even with zero rows, so the
    # sort below cannot raise KeyError on empty input.
    results_df = pd.DataFrame(results, columns=columns).sort_values('person_id')

    return results_df

def _annotate_lowest(ax, person_ids, values, freqs, count, short_label, colors, stagger):
    """Annotate the `count` smallest entries of `values` with value, person and frequency.

    `stagger` lowers each successive label a little further so neighbouring
    annotations overlap less; 0 puts every label at the same offset.
    """
    for rank, idx in enumerate(np.argsort(values)[:count]):
        y_offset = -0.15 - rank * stagger
        ax.annotate(
            f'{short_label}: {values[idx]:.2f}\nPerson {person_ids[idx]}\nFreq: {freqs[idx]}',
            xy=(person_ids[idx], values[idx]),
            xytext=(person_ids[idx], values[idx] + y_offset),
            arrowprops=dict(facecolor=colors['arrow'], shrink=0.05, width=1.5, headwidth=8),
            fontsize=9, color=colors['text'], ha='center',
            bbox=dict(boxstyle="round,pad=0.3", fc="white", ec=colors['edge'], alpha=0.8),
        )


def _draw_person_metric_panel(ax, person_ids, train_values, val_values,
                              train_freqs, val_freqs, model_name, panel,
                              lowest_train=2, lowest_val=3):
    """Draw one training-vs-validation metric panel on `ax`.

    `panel` supplies the label texts ('series', 'title', 'short', 'avg') and
    the colour sets for both curves ('train', 'val').
    """
    train_colors, val_colors = panel['train'], panel['val']

    # Colour is passed only through `color=`; also encoding it in the format
    # string makes matplotlib warn about a redundant definition.
    ax.plot(person_ids, train_values, '-', color=train_colors['line'], marker='o',
            label=f"Training {panel['series']}", alpha=0.7)
    ax.plot(person_ids, val_values, '-', color=val_colors['line'], marker='o',
            label=f"Validation {panel['series']}", alpha=0.7)

    _annotate_lowest(ax, person_ids, train_values, train_freqs,
                     lowest_train, panel['short'], train_colors, stagger=0)
    _annotate_lowest(ax, person_ids, val_values, val_freqs,
                     lowest_val, panel['short'], val_colors, stagger=0.05)

    ax.set_xlabel('Person ID')
    ax.set_ylabel(panel['series'])
    ax.set_title(f"{panel['title']} by Person (Training vs Validation) - {model_name}")
    ax.set_xticks(person_ids[::5])
    ax.grid(True, alpha=0.3)
    ax.legend()
    # Extra room below and above the data for the annotations and the text box.
    ax.set_ylim(min(min(train_values), min(val_values)) - 0.3,
                max(max(train_values), max(val_values)) + 0.25)

    textstr = (f"Training Avg {panel['avg']}: {np.mean(train_values):.3f}\n"
               f"Validation Avg {panel['avg']}: {np.mean(val_values):.3f}\n"
               f'Annotated points:\n'
               f'  Training: {lowest_train} lowest\n'
               f'  Validation: {lowest_val} lowest')
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))


def plot_metrics_by_person(train_true, train_pred, val_true, val_pred, model_name, save_dir):
    """Plot per-person F1, precision and recall for one model and save the outputs.

    Args:
        train_true, train_pred: True and predicted labels of the training split.
        val_true, val_pred: True and predicted labels of the validation split.
        model_name: Name used in titles and output file names.
        save_dir: Folder (created if needed) for the PNG and CSV outputs.

    Returns:
        (fig, summary_df): the matplotlib figure with three stacked panels
        and the per-person table from `generate_person_classification_summary`.
        The figure is saved as '<model_name>_person_metrics.png' and the
        table as '<model_name>_person_metrics.csv' inside `save_dir`.
    """
    train_true = np.array(train_true)
    train_pred = np.array(train_pred)
    val_true = np.array(val_true)
    val_pred = np.array(val_pred)

    summary_df = generate_person_classification_summary(train_true, train_pred, val_true, val_pred)

    person_ids = summary_df['person_id'].values
    train_freqs = summary_df['train_frequency'].values
    val_freqs = summary_df['val_frequency'].values

    # One entry per panel, top to bottom. Single-letter line colours are the
    # matplotlib shorthands the curves have always used.
    panels = [
        {
            'column': 'f1', 'series': 'F1 Score', 'title': 'F1 Scores', 'short': 'F1', 'avg': 'F1',
            'train': {'line': 'b', 'arrow': 'blue', 'text': 'blue', 'edge': 'blue'},
            'val': {'line': 'r', 'arrow': 'red', 'text': 'red', 'edge': 'red'},
        },
        {
            'column': 'precision', 'series': 'Precision', 'title': 'Precision',
            'short': 'Prec', 'avg': 'Precision',
            'train': {'line': 'orange', 'arrow': 'orange', 'text': 'darkorange', 'edge': 'orange'},
            'val': {'line': 'b', 'arrow': 'blue', 'text': 'blue', 'edge': 'blue'},
        },
        {
            'column': 'recall', 'series': 'Recall', 'title': 'Recall', 'short': 'Recall', 'avg': 'Recall',
            'train': {'line': 'k', 'arrow': 'black', 'text': 'black', 'edge': 'black'},
            'val': {'line': 'g', 'arrow': 'green', 'text': 'green', 'edge': 'green'},
        },
    ]

    fig, axes = plt.subplots(3, 1, figsize=(14, 24))
    for ax, panel in zip(axes, panels):
        _draw_person_metric_panel(
            ax,
            person_ids,
            summary_df[f"train_{panel['column']}"].values,
            summary_df[f"val_{panel['column']}"].values,
            train_freqs,
            val_freqs,
            model_name,
            panel,
        )

    plt.tight_layout()

    # Create directory for saving plots
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f'{model_name}_person_metrics.png')

    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Plots for {model_name} saved to {save_path}")

    # Save summary data to CSV
    summary_path = os.path.join(save_dir, f'{model_name}_person_metrics.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Summary data for {model_name} saved to {summary_path}")

    # Print lowest performers
    train_lowest = summary_df.nsmallest(2, 'train_f1')
    val_lowest = summary_df.nsmallest(3, 'val_f1')

    print(f"\n2 lowest performers in {model_name} training set:")
    print(train_lowest[['person_id', 'train_frequency', 'train_f1']])

    print(f"\n3 lowest performers in {model_name} validation set:")
    print(val_lowest[['person_id', 'val_frequency', 'val_f1']])

    return fig, summary_df

def plot_model_comparison(models_results, save_dir):
    """Draw grouped bar charts comparing models on F1, precision and recall.

    Args:
        models_results: Mapping of model name to a dict holding
            'train_metrics' and 'val_metrics' (each with 'f1', 'precision'
            and 'recall').
        save_dir: Folder (created if needed) for the PNG and CSV outputs.

    Returns:
        (fig, summary_df): the figure with three stacked panels and a table
        with one row per model. The figure is saved as
        'model_comparison.png' and the table as
        'model_comparison_metrics.csv' inside `save_dir`.
    """
    models = list(models_results.keys())

    def metric_values(split_key, metric):
        # One value per model, in the same order as `models`.
        return [models_results[name][split_key][metric] for name in models]

    train_f1 = metric_values('train_metrics', 'f1')
    val_f1 = metric_values('val_metrics', 'f1')
    train_precision = metric_values('train_metrics', 'precision')
    val_precision = metric_values('val_metrics', 'precision')
    train_recall = metric_values('train_metrics', 'recall')
    val_recall = metric_values('val_metrics', 'recall')

    fig, axes = plt.subplots(3, 1, figsize=(12, 18))

    x = np.arange(len(models))
    width = 0.35

    # (axis, training values, validation values, legend name, axis/title name,
    #  training colour, validation colour)
    panels = (
        (axes[0], train_f1, val_f1, 'F1', 'F1 Score', 'blue', 'red'),
        (axes[1], train_precision, val_precision, 'Precision', 'Precision', 'orange', 'blue'),
        (axes[2], train_recall, val_recall, 'Recall', 'Recall', 'black', 'green'),
    )

    for ax, train_vals, val_vals, legend_name, axis_name, train_color, val_color in panels:
        ax.bar(x - width/2, train_vals, width, label=f'Training {legend_name}',
               color=train_color, alpha=0.7)
        ax.bar(x + width/2, val_vals, width, label=f'Validation {legend_name}',
               color=val_color, alpha=0.7)
        ax.set_xlabel('Model')
        ax.set_ylabel(axis_name)
        ax.set_title(f'{axis_name} Comparison Across Models')
        ax.set_xticks(x)
        ax.set_xticklabels(models)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Value labels just above each bar
        for i, v in enumerate(train_vals):
            ax.text(i - width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=9)
        for i, v in enumerate(val_vals):
            ax.text(i + width/2, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()

    # Write the figure
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'model_comparison.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Model comparison plot saved to {save_path}")

    # Same numbers as a table
    summary_df = pd.DataFrame({
        'Model': models,
        'Training F1': train_f1,
        'Validation F1': val_f1,
        'Training Precision': train_precision,
        'Validation Precision': val_precision,
        'Training Recall': train_recall,
        'Validation Recall': val_recall
    })

    summary_path = os.path.join(save_dir, 'model_comparison_metrics.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Model comparison data saved to {summary_path}")

    return fig, summary_df

def analyze_results_from_file(embeddings_dir, models, plots_dir='plots'):
    """Load saved evaluation results and generate all visualizations.

    Args:
        embeddings_dir: Folder containing one sub-folder per model, each
            with a 'results.npz' file.
        models: Model names to analyze.
        plots_dir: Folder where plots and CSV summaries are written.

    Returns:
        Dict mapping each successfully processed model name to a dict with
        its 'train_metrics' and 'val_metrics'. Models whose results file is
        missing or fails to process are reported and left out.
    """
    all_results = {}

    for model_name in models:
        results_path = os.path.join(embeddings_dir, model_name, 'results.npz')

        if not os.path.exists(results_path):
            print(f"Results file for {model_name} not found at {results_path}")
            continue

        try:
            # The context manager closes the archive once its arrays are read.
            # NOTE: allow_pickle=True unpickles stored objects; only load
            # result files produced by this project.
            with np.load(results_path, allow_pickle=True) as data:
                results = {key: data[key] for key in data.files}

            # Plot individual model results
            plot_metrics_by_person(
                results['train_true'],
                results['train_pred'],
                results['val_true'],
                results['val_pred'],
                model_name,
                plots_dir
            )

            # The metric dicts are stored as 0-d object arrays; .item() unwraps them.
            all_results[model_name] = {
                'train_metrics': results['train_metrics'].item(),
                'val_metrics': results['val_metrics'].item()
            }

        except Exception as e:
            # Best effort: one broken results file must not stop the other models.
            print(f"Error processing results for {model_name}: {e}")

    # Generate model comparison plot
    if all_results:
        plot_model_comparison(all_results, plots_dir)

    return all_results

if __name__ == "__main__":
    # Where the evaluation results live, where plots go, and which models to plot
    embeddings_dir = 'embeddings'
    plots_dir = 'plots'
    models = ['Facenet', 'Facenet512', 'VGG-Face', 'Dlib']

    print(f"Analyzing results for models: {models}")
    results = analyze_results_from_file(embeddings_dir, models, plots_dir)

    # An empty result dict means no model had a usable results file.
    print(
        "Analysis completed successfully!"
        if results
        else "No results were found. Please ensure the model evaluation has been run first."
    )