# Logo Similarity Analysis using Cosine Similarity
## Load trained models and test logo similarity with comprehensive visualization

This notebook performs comprehensive analysis of logo similarity using trained models:
- Loads pre-trained models with proper error handling
- Calculates cosine similarity between logo embeddings
- Provides detailed visualization and analysis
- Tests model performance on logo verification tasks

In [None]:
# Fix environment setup and imports
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Fix numpy compatibility issues
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Import libraries step by step to avoid conflicts
import numpy as np
print(f"NumPy version: {np.__version__}")

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from torch import optim
from tqdm import tqdm
from torchvision import models
import random
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.cuda import amp
from torch.optim import lr_scheduler
import torch.nn.functional as F

# Import sklearn with error handling
try:
    from sklearn.metrics import accuracy_score
    from sklearn.metrics.pairwise import cosine_similarity
    print("✅ Sklearn imported successfully")
except ImportError as e:
    print(f"Sklearn import error: {e}")
    print("Installing scikit-learn...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])
    from sklearn.metrics import accuracy_score
    from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import pandas as pd
from datetime import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Manual cosine similarity implementation as backup
def cosine_similarity_manual(X, Y=None):
    """Compute pairwise cosine similarity between the rows of X and the rows of Y.

    Pure-NumPy backup for sklearn's ``cosine_similarity``.

    Args:
        X: array-like of shape (n_x, d). A single 1-D vector of length d is
            accepted and treated as one row.
        Y: array-like of shape (n_y, d) or a 1-D vector of length d.
            Defaults to X (self-similarity).

    Returns:
        ndarray of shape (n_x, n_y) where entry (i, j) is the cosine of the
        angle between X[i] and Y[j]. Rows with zero norm yield 0 similarity.
    """
    # Promote 1-D vectors to a single row so the axis=1 norm below is valid.
    X = np.atleast_2d(np.asarray(X))
    Y = X if Y is None else np.atleast_2d(np.asarray(Y))

    # Clamp the norm rather than adding epsilon to it: non-zero vectors are
    # normalised exactly, and zero vectors still avoid a division by zero.
    eps = 1e-8
    X_norm = X / np.maximum(np.linalg.norm(X, axis=1, keepdims=True), eps)
    Y_norm = Y / np.maximum(np.linalg.norm(Y, axis=1, keepdims=True), eps)

    # Dot products of unit vectors are the cosine similarities.
    return np.dot(X_norm, Y_norm.T)

# Fall back to the manual implementation if sklearn's version is unusable
try:
    # Smoke-test sklearn's cosine_similarity on a tiny random input
    test_a = np.random.randn(2, 3)
    test_b = np.random.randn(2, 3)
    _ = cosine_similarity(test_a, test_b)
    print("✅ Using sklearn cosine_similarity")
except Exception:
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    cosine_similarity = cosine_similarity_manual
    print("⚠️ Using manual cosine_similarity implementation")

In [None]:
# Define model architecture - flexible for different backbones
class Network(nn.Module):
    """Embedding network: a torchvision backbone, pooling and a projection head.

    Args:
        emb_dim: size of the output embedding vector.
        backbone: one of 'mobilenet_v2', 'vgg16', 'resnet18', 'resnet50'.
            Any other value falls back to MobileNetV2.

    The backbone is built with ``pretrained=True``; callers normally
    overwrite those weights afterwards via ``load_state_dict``.
    """

    def __init__(self, emb_dim=256, backbone='mobilenet_v2'):
        super(Network, self).__init__()

        # Spatial size the feature map is pooled to before flattening.
        pool_size = (1, 1)

        if backbone == 'vgg16':
            base_model = models.vgg16(pretrained=True)
            self.backbone = base_model.features
            in_features = 512 * 7 * 7  # VGG16 output
            # The head expects 512*7*7 inputs, so the feature map must be
            # pooled to 7x7; a 1x1 pool would flatten to only 512 values and
            # the first Linear layer would fail with a shape mismatch.
            pool_size = (7, 7)
        elif backbone == 'resnet18':
            base_model = models.resnet18(pretrained=True)
            # Drop the final classification layer, keep everything before it.
            self.backbone = nn.Sequential(*list(base_model.children())[:-1])
            in_features = 512
        elif backbone == 'resnet50':
            base_model = models.resnet50(pretrained=True)
            self.backbone = nn.Sequential(*list(base_model.children())[:-1])
            in_features = 2048
        else:
            # 'mobilenet_v2' and any unrecognised name use MobileNetV2.
            base_model = models.mobilenet_v2(pretrained=True)
            self.backbone = base_model.features
            in_features = 1280

        self.pool = nn.AdaptiveAvgPool2d(pool_size)
        self.fc = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.PReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, emb_dim)
        )

    def forward(self, x):
        """Map a batch of images to embeddings of size ``emb_dim``."""
        x = self.backbone(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        # The original forward fell off the end and returned None.
        return x

In [None]:
# Load trained model with comprehensive error handling
def load_trained_model(model_paths, backbone='mobilenet_v2', emb_dim=256):
    """
    Load a trained model, trying several candidate checkpoint paths in order.

    The first path that exists and loads without error wins. A checkpoint may
    be a dict with a 'model_state_dict' or 'state_dict' entry, a bare state
    dict, or a whole pickled model object.

    Args:
        model_paths: list of potential model file paths
        backbone: backbone architecture type
        emb_dim: embedding dimension

    Returns:
        Tuple ``(model, model_loaded, loaded_info)``: the model in eval mode,
        whether a checkpoint was loaded, and a dict with 'path', 'accuracy'
        and 'epoch'. When nothing loads, the model keeps its freshly
        constructed weights and ``loaded_info['path']`` is 'random_init'.
    """
    model = Network(emb_dim=emb_dim, backbone=backbone).to(device)
    model_loaded = False
    loaded_info = {}

    # Try loading from multiple paths
    for model_path in model_paths:
        if not os.path.exists(model_path):
            print(f"📁 Path not found: {os.path.basename(model_path)}")
            continue

        try:
            # SECURITY NOTE: weights_only=False performs full unpickling and
            # can execute arbitrary code from the file. Only load checkpoints
            # from trusted sources.
            try:
                checkpoint = torch.load(model_path, map_location=device, weights_only=False)
            except TypeError:
                # Older PyTorch releases do not accept `weights_only`;
                # retry with the legacy call signature.
                checkpoint = torch.load(model_path, map_location=device)

            info = {'path': model_path, 'accuracy': 'N/A', 'epoch': 'N/A'}

            if isinstance(checkpoint, dict):
                if 'model_state_dict' in checkpoint:
                    model.load_state_dict(checkpoint['model_state_dict'])
                    # Only this checkpoint layout carries training metadata.
                    info['accuracy'] = checkpoint.get('accuracy', 'N/A')
                    info['epoch'] = checkpoint.get('epoch', 'N/A')
                elif 'state_dict' in checkpoint:
                    model.load_state_dict(checkpoint['state_dict'])
                else:
                    # Assume checkpoint is direct state_dict
                    model.load_state_dict(checkpoint)
            else:
                # Checkpoint is the model itself
                model = checkpoint.to(device)

            loaded_info = info
            print(f"✅ Model loaded successfully from: {os.path.basename(model_path)}")
            print(f"   Accuracy: {loaded_info.get('accuracy', 'N/A')}")
            print(f"   Epoch: {loaded_info.get('epoch', 'N/A')}")
            model_loaded = True
            break

        except Exception as e:
            # Best effort: report the failure and move on to the next path.
            print(f"❌ Failed to load {os.path.basename(model_path)}: {str(e)[:100]}...")
            continue

    if not model_loaded:
        print("⚠️ No model loaded successfully. Using randomly initialized model.")
        print("   Results will not be meaningful!")
        loaded_info = {'path': 'random_init', 'accuracy': 0.0, 'epoch': 0}

    model.eval()
    return model, model_loaded, loaded_info

# Candidate checkpoint locations, tried in the order listed
model_paths = [
    "SupConLoss_BBMobileNetV2.pth",
    "Model2/SupConLoss_BBMobileNetV2.pth",
    "SupConLoss_BBModel2/Model2/SupConLoss_BBMobileNetV2.pth",
    "../SupConLoss_BBMobileNetV2.pth",
    "outputs/model_best.pth",
    "outputs/SupConLoss_BBMobileNetV2.pth",
]

# Load the first usable checkpoint and report whether one was found
model, model_loaded, model_info = load_trained_model(
    model_paths,
    backbone='mobilenet_v2',
    emb_dim=256,
)
if model_loaded:
    print("\nModel Status: Loaded")
else:
    print("\nModel Status: Random Init")
print("=" * 50)

In [None]:
def calculate_cosine_similarity_matrix_enhanced(folder_path, model, transforms, key="000000", max_samples=100):
    """
    Build the test-vs-reference cosine similarity matrix for a folder of images.

    Each sub-directory of ``folder_path`` is one class. Within a class, files
    whose name contains ``key`` are reference images and all other image files
    are test images. Sub-directories and files are visited in sorted order so
    that label ids and the capped sample selection are reproducible.

    Args:
        folder_path: path to test data folder
        model: trained model, passed through to ``extract_embeddings_batch``
        transforms: preprocessing transforms, passed through to
            ``extract_embeddings_batch``
        key: substring that marks a file as a reference image
        max_samples: per class, at most ``max_samples // 2`` reference images
            and ``max_samples // 2`` test images are used

    Returns:
        dict with 'similarity_matrix' (test rows x reference columns),
        'embeddings_org', 'embeddings_test', 'label_org', 'label_test',
        'paths_org', 'paths_test' and 'refer_dict' (label id -> folder name),
        or None when the folder is missing, no usable images are found, or
        any error occurs.
    """
    if not os.path.exists(folder_path):
        print(f"❌ Folder not found: {folder_path}")
        return None

    refer_dict = {}
    image_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    per_role_cap = max_samples // 2

    # path -> label id. Dicts keep insertion order, so they serve both as the
    # ordered path list and as an O(1) label lookup after extraction.
    ref_label_by_path = {}
    test_label_by_path = {}

    print("🔍 Scanning for images...")

    try:
        subfolders = sorted(
            f for f in os.listdir(folder_path)
            if os.path.isdir(os.path.join(folder_path, f))
        )

        for label_index, subfolder_name in enumerate(subfolders):
            refer_dict[label_index] = subfolder_name
            subfolder_path = os.path.join(folder_path, subfolder_name)

            image_files = sorted(
                f for f in os.listdir(subfolder_path)
                if f.lower().endswith(image_exts)
            )

            ref_count = 0
            test_count = 0

            for image_file in image_files:
                image_path = os.path.join(subfolder_path, image_file)

                if key in image_file:
                    # Reference images
                    if ref_count < per_role_cap:
                        ref_label_by_path[image_path] = label_index
                        ref_count += 1
                elif test_count < per_role_cap:
                    # Test images
                    test_label_by_path[image_path] = label_index
                    test_count += 1

        all_ref_paths = list(ref_label_by_path)
        all_test_paths = list(test_label_by_path)

        print(f"📊 Found {len(all_ref_paths)} reference images and {len(all_test_paths)} test images")

        if not all_ref_paths or not all_test_paths:
            print(f"⚠️ No images found with key '{key}' for reference or no test images found")
            return None

        # Extraction may drop unreadable files, so labels are rebuilt from
        # the paths that actually produced an embedding.
        print("🔮 Extracting reference embeddings...")
        embeddings_org, paths_org = extract_embeddings_batch(all_ref_paths, model, transforms)
        label_org = [ref_label_by_path[path] for path in paths_org]

        print("🔮 Extracting test embeddings...")
        embeddings_test, paths_test = extract_embeddings_batch(all_test_paths, model, transforms)
        label_test = [test_label_by_path[path] for path in paths_test]

        if not embeddings_org or not embeddings_test:
            print("❌ No valid embeddings extracted!")
            return None

        # Convert to numpy arrays
        embeddings_org = np.vstack(embeddings_org)
        embeddings_test = np.vstack(embeddings_test)

        print(f"✅ Successfully extracted embeddings:")
        print(f"   Reference: {embeddings_org.shape}")
        print(f"   Test: {embeddings_test.shape}")

        # Rows index test images, columns index reference images.
        print("📐 Calculating cosine similarity matrix...")
        similarity_matrix = cosine_similarity(embeddings_test, embeddings_org)

        print(f"   Similarity matrix shape: {similarity_matrix.shape}")

        return {
            'similarity_matrix': similarity_matrix,
            'embeddings_org': embeddings_org,
            'embeddings_test': embeddings_test,
            'label_org': label_org,
            'label_test': label_test,
            'paths_org': paths_org,
            'paths_test': paths_test,
            'refer_dict': refer_dict
        }

    except Exception as e:
        # Best effort: callers treat None as "no data" and fall back.
        print(f"❌ Error in similarity calculation: {e}")
        return None

In [None]:
def visualize_similarity_distribution(similarity_data):
    """Plot and summarise same-class vs different-class cosine similarities.

    Draws a 2x3 figure (histogram, box plot, heatmap sample, statistics
    table and two accuracy-vs-threshold panels), prints a text summary, and
    returns the numbers behind them.

    Args:
        similarity_data: dict with 'similarity_matrix' (test rows x reference
            columns), 'label_test' and 'label_org'.

    Returns:
        dict with 'positive_similarities' and 'negative_similarities' (lists
        of scores), 'best_threshold', 'best_accuracy' and 'stats_df'
        (pandas DataFrame of summary statistics).
    """
    similarity_matrix = np.asarray(similarity_data['similarity_matrix'])
    label_test = similarity_data['label_test']
    label_org = similarity_data['label_org']

    test_labels = np.asarray(label_test)
    ref_labels = np.asarray(label_org)

    # Boolean (test x reference) mask of same-class pairs; boolean indexing
    # keeps the row-major order of the original nested loops.
    same_class = test_labels[:, None] == ref_labels[None, :]
    positive_similarities = similarity_matrix[same_class].tolist()  # Same class
    negative_similarities = similarity_matrix[~same_class].tolist()  # Different class

    def _summary(values):
        """Return [mean, std, min, max, count]; NaN statistics when empty."""
        if not values:
            return [float('nan')] * 4 + [0]
        return [np.mean(values), np.std(values), np.min(values), np.max(values), len(values)]

    pos_stats = _summary(positive_similarities)
    neg_stats = _summary(negative_similarities)

    # Create visualization
    plt.figure(figsize=(15, 12))

    # 1. Histogram of similarity scores (empty groups are skipped)
    plt.subplot(2, 3, 1)
    if positive_similarities:
        plt.hist(positive_similarities, bins=50, alpha=0.7, label='Same Class (Positive)', color='green', density=True)
    if negative_similarities:
        plt.hist(negative_similarities, bins=50, alpha=0.7, label='Different Class (Negative)', color='red', density=True)
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Density')
    plt.title('Distribution of Cosine Similarity Scores')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 2. Box plot
    plt.subplot(2, 3, 2)
    data_for_box = [positive_similarities, negative_similarities]
    labels_for_box = ['Same Class', 'Different Class']
    plt.boxplot(data_for_box)
    # Tick labels are set separately because boxplot's `labels=` keyword is
    # deprecated in recent Matplotlib; xticks works on old and new versions.
    plt.xticks([1, 2], labels_for_box)
    plt.ylabel('Cosine Similarity')
    plt.title('Similarity Score Distribution')
    plt.grid(True, alpha=0.3)

    # 3. Heatmap of similarity matrix (top-left sample)
    plt.subplot(2, 3, 3)
    sample_size = min(20, similarity_matrix.shape[0], similarity_matrix.shape[1])
    if sample_size > 0:
        sample_matrix = similarity_matrix[:sample_size, :sample_size]
        sns.heatmap(sample_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title(f'Similarity Matrix Heatmap (Sample {sample_size}x{sample_size})')
    plt.xlabel('Reference Images')
    plt.ylabel('Test Images')

    # 4. Statistics summary
    plt.subplot(2, 3, 4)
    stats_data = {
        'Metric': ['Mean', 'Std', 'Min', 'Max', 'Count'],
        'Same Class': pos_stats,
        'Different Class': neg_stats
    }

    stats_df = pd.DataFrame(stats_data)
    plt.axis('tight')
    plt.axis('off')
    table = plt.table(cellText=stats_df.values, colLabels=stats_df.columns,
                     cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    plt.title('Similarity Statistics Summary')

    # 5. Threshold analysis
    # The nearest reference per test image does not depend on the threshold,
    # so it is computed once instead of once per threshold.
    thresholds = np.linspace(0, 1, 100)
    if similarity_matrix.size:
        nearest_ref = np.argmax(similarity_matrix, axis=1)
        max_sims = similarity_matrix[np.arange(similarity_matrix.shape[0]), nearest_ref]
        is_correct = ref_labels[nearest_ref] == test_labels
    else:
        max_sims = np.empty(0)
        is_correct = np.empty(0, dtype=bool)

    # accepted[t, i]: test image i has a best match at or above threshold t.
    accepted = max_sims[None, :] >= thresholds[:, None]
    totals = accepted.sum(axis=1).tolist()
    corrects = (accepted & is_correct[None, :]).sum(axis=1).tolist()
    # Accuracy among accepted samples; 0 when nothing passes the threshold.
    accuracies = [c / t if t > 0 else 0 for c, t in zip(corrects, totals)]

    plt.subplot(2, 3, 5)
    plt.plot(thresholds, accuracies)
    plt.xlabel('Similarity Threshold')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Similarity Threshold')
    plt.grid(True, alpha=0.3)

    # 6. Same accuracy curve with the best threshold marked
    plt.subplot(2, 3, 6)

    # Calculate optimal threshold (first threshold reaching the max accuracy)
    best_threshold = thresholds[np.argmax(accuracies)]
    best_accuracy = np.max(accuracies)

    plt.axvline(x=best_threshold, color='red', linestyle='--',
                label=f'Best Threshold: {best_threshold:.3f}\nAccuracy: {best_accuracy:.3f}')
    plt.plot(thresholds, accuracies, 'b-', linewidth=2)
    plt.xlabel('Similarity Threshold')
    plt.ylabel('Accuracy')
    plt.title('Optimal Threshold Analysis')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\n" + "="*60)
    print("COSINE SIMILARITY ANALYSIS SUMMARY")
    print("="*60)
    print(f"Total test samples: {len(label_test)}")
    print(f"Total reference samples: {len(label_org)}")
    print(f"Number of classes: {len(set(label_test))}")
    print(f"\nSame Class Similarities:")
    print(f"  Mean: {pos_stats[0]:.4f}")
    print(f"  Std:  {pos_stats[1]:.4f}")
    print(f"  Range: [{pos_stats[2]:.4f}, {pos_stats[3]:.4f}]")
    print(f"\nDifferent Class Similarities:")
    print(f"  Mean: {neg_stats[0]:.4f}")
    print(f"  Std:  {neg_stats[1]:.4f}")
    print(f"  Range: [{neg_stats[2]:.4f}, {neg_stats[3]:.4f}]")
    print(f"\nOptimal Threshold: {best_threshold:.4f}")
    print(f"Best Accuracy: {best_accuracy:.4f}")
    print("="*60)

    return {
        'positive_similarities': positive_similarities,
        'negative_similarities': negative_similarities,
        'best_threshold': best_threshold,
        'best_accuracy': best_accuracy,
        'stats_df': stats_df
    }

In [None]:
def detailed_similarity_analysis(similarity_data, num_examples=5):
    """Print the three closest reference images for the first few test images.

    Args:
        similarity_data: dict with 'similarity_matrix', 'label_test',
            'label_org', 'paths_test', 'paths_org' and 'refer_dict'.
        num_examples: maximum number of test images to report on.
    """
    sim_rows = similarity_data['similarity_matrix']
    test_labels = similarity_data['label_test']
    ref_labels = similarity_data['label_org']
    test_files = similarity_data['paths_test']
    ref_files = similarity_data['paths_org']
    class_names = similarity_data['refer_dict']

    print("DETAILED SIMILARITY ANALYSIS")
    print("="*50)

    shown = min(num_examples, len(test_labels))
    for number in range(1, shown + 1):
        row = number - 1
        true_label = test_labels[row]
        scores = sim_rows[row]

        # Reference indices ordered from most to least similar, top three kept
        closest = np.argsort(scores)[::-1][:3]

        print(f"\nTest Image {number}: {os.path.basename(test_files[row])}")
        print(f"True Class: {class_names[true_label]}")
        print("Top 3 Most Similar Reference Images:")

        for position, ref_idx in enumerate(closest, start=1):
            candidate_label = ref_labels[ref_idx]
            if candidate_label == true_label:
                verdict = "✓ MATCH"
            else:
                verdict = "✗ NO MATCH"

            print(f"  {position}. {os.path.basename(ref_files[ref_idx])}")
            print(f"     Class: {class_names[candidate_label]}")
            print(f"     Similarity: {scores[ref_idx]:.4f} {verdict}")

        print("-" * 50)

In [None]:
def save_similarity_results(similarity_data, analysis_results, output_dir="similarity_analysis"):
    """Write the similarity matrix, a JSON summary and a statistics CSV to disk.

    Args:
        similarity_data: dict with 'similarity_matrix', 'label_test' and
            'label_org'.
        analysis_results: dict with 'best_threshold', 'best_accuracy',
            'positive_similarities', 'negative_similarities' and 'stats_df'.
        output_dir: directory to write into; created if it does not exist.

    Returns:
        The summary dict that was written to the JSON file.
    """
    import json

    os.makedirs(output_dir, exist_ok=True)

    # Raw similarity matrix in NumPy's binary format
    matrix_file = os.path.join(output_dir, 'similarity_matrix.npy')
    np.save(matrix_file, similarity_data['similarity_matrix'])

    # Scalar summary; values are cast to plain floats so json can encode them
    results = {}
    results['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    results['num_test_samples'] = len(similarity_data['label_test'])
    results['num_reference_samples'] = len(similarity_data['label_org'])
    results['num_classes'] = len(set(similarity_data['label_test']))
    results['best_threshold'] = float(analysis_results['best_threshold'])
    results['best_accuracy'] = float(analysis_results['best_accuracy'])
    same_scores = analysis_results['positive_similarities']
    results['positive_sim_mean'] = float(np.mean(same_scores))
    results['positive_sim_std'] = float(np.std(same_scores))
    other_scores = analysis_results['negative_similarities']
    results['negative_sim_mean'] = float(np.mean(other_scores))
    results['negative_sim_std'] = float(np.std(other_scores))

    summary_file = os.path.join(output_dir, 'similarity_analysis_results.json')
    with open(summary_file, 'w') as f:
        json.dump(results, f, indent=2)

    # Statistics table as CSV, without the index column
    stats_file = os.path.join(output_dir, 'similarity_statistics.csv')
    analysis_results['stats_df'].to_csv(stats_file, index=False)

    print(f"Results saved to: {output_dir}")

    return results

In [None]:
# Main execution
def run_similarity_analysis(test_path, model, transforms, key="000000"):
    """Run complete similarity analysis on a single test folder.

    Args:
        test_path: folder with one sub-directory per class.
        model: embedding model used to encode the images.
        transforms: preprocessing applied to each image before the model.
        key: filename substring that marks reference images.

    Returns:
        dict with 'similarity_data', 'analysis_results' and 'saved_results',
        or None when no similarity data could be computed.
    """

    print("Starting Cosine Similarity Analysis...")
    print("="*60)

    # Step 1: Calculate similarity matrix
    # The original called `calculate_cosine_similarity_matrix`, which is not
    # defined in this notebook; the `_enhanced` function is the one that
    # exists and takes the same arguments.
    print("Step 1: Calculating similarity matrix...")
    similarity_data = calculate_cosine_similarity_matrix_enhanced(
        test_path, model, transforms, key=key, max_samples=100
    )

    if similarity_data is None:
        print("Failed to calculate similarities!")
        return None

    # Step 2: Visualize results
    print("\nStep 2: Generating visualizations...")
    analysis_results = visualize_similarity_distribution(similarity_data)

    # Step 3: Detailed analysis
    print("\nStep 3: Detailed similarity analysis...")
    detailed_similarity_analysis(similarity_data, num_examples=5)

    # Step 4: Save results
    print("\nStep 4: Saving results...")
    saved_results = save_similarity_results(similarity_data, analysis_results)

    return {
        'similarity_data': similarity_data,
        'analysis_results': analysis_results,
        'saved_results': saved_results
    }

# Execute analysis
# NOTE(review): `preprocess` is assigned in a later cell of this notebook, so
# the real-data branch raises NameError unless that cell has already been run.
test_path = "logo_verify_test"  # Change this path to match your data

if os.path.exists(test_path):
    results = run_similarity_analysis(test_path, model, preprocess, key="000000")
else:
    print(f"Test path not found: {test_path}")
    print("Please update the path to your test data.")
    
    # Demo with synthetic data when no real data is available
    print("\nCreating demo with synthetic data...")
    
    # Create fake embeddings for the demo
    num_classes = 5
    samples_per_class = 10
    embedding_dim = 256
    
    fake_embeddings_ref = []
    fake_embeddings_test = []
    fake_labels_ref = []
    fake_labels_test = []
    
    for class_id in range(num_classes):
        # Create a center embedding for each class
        center = np.random.randn(embedding_dim)
        
        # Reference embeddings (close to the center)
        for _ in range(samples_per_class//2):
            noise = np.random.randn(embedding_dim) * 0.1
            fake_embeddings_ref.append(center + noise)
            fake_labels_ref.append(class_id)
        
        # Test embeddings (slightly farther from the center)
        for _ in range(samples_per_class//2):
            noise = np.random.randn(embedding_dim) * 0.2
            fake_embeddings_test.append(center + noise)
            fake_labels_test.append(class_id)
    
    fake_embeddings_ref = np.vstack(fake_embeddings_ref)
    fake_embeddings_test = np.vstack(fake_embeddings_test)
    
    # Normalize embeddings to unit length (row-wise L2 norm)
    fake_embeddings_ref = fake_embeddings_ref / np.linalg.norm(fake_embeddings_ref, axis=1, keepdims=True)
    fake_embeddings_test = fake_embeddings_test / np.linalg.norm(fake_embeddings_test, axis=1, keepdims=True)
    
    # Calculate similarity (rows: test samples, columns: reference samples)
    fake_similarity_matrix = cosine_similarity(fake_embeddings_test, fake_embeddings_ref)
    
    # Create fake data structure with the same keys the analysis functions read
    fake_similarity_data = {
        'similarity_matrix': fake_similarity_matrix,
        'embeddings_org': fake_embeddings_ref,
        'embeddings_test': fake_embeddings_test,
        'label_org': fake_labels_ref,
        'label_test': fake_labels_test,
        'paths_org': [f"ref_class_{l}_{i}.jpg" for i, l in enumerate(fake_labels_ref)],
        'paths_test': [f"test_class_{l}_{i}.jpg" for i, l in enumerate(fake_labels_test)],
        'refer_dict': {i: f"Logo_Class_{i}" for i in range(num_classes)}
    }
    
    print("Analyzing synthetic data...")
    fake_analysis_results = visualize_similarity_distribution(fake_similarity_data)
    detailed_similarity_analysis(fake_similarity_data, num_examples=3)

In [None]:
# Enhanced preprocessing and embedding extraction
# Albumentations pipeline applied to every image before it reaches the model:
# resize to 224x224, normalise each channel with the mean/std below (these
# match the commonly used ImageNet statistics), then convert to a tensor.
preprocess = A.Compose([
    A.Resize(224, 224),
    A.Normalize(
        mean=[0.485, 0.456, 0.406], 
        std=[0.229, 0.224, 0.225], 
        max_pixel_value=255.0, 
        p=1.0
    ),
    ToTensorV2()
], p=1.)

def extract_embedding(image_path, model, transforms):
    """Extract an L2-normalised embedding vector from a single image.

    Args:
        image_path: path of the image file to encode.
        model: embedding model; switched to eval mode, as in
            ``extract_embeddings_batch``, so the result is deterministic.
        transforms: callable invoked as ``transforms(image=array)["image"]``.

    Returns:
        NumPy array with one row holding the embedding, or None if the image
        could not be loaded or processed (the error is printed).
    """
    try:
        # The context manager closes the file handle deterministically;
        # convert() reads the pixel data before the file is closed.
        with Image.open(image_path) as img:
            image_array = np.array(img.convert('RGB'))

        # Apply transforms
        transformed = transforms(image=image_array)["image"]

        # Extract embedding with dropout disabled and no gradient tracking
        model.eval()
        with torch.no_grad():
            embedding = model(transformed.unsqueeze(0).to(device))
            # Normalize embedding to unit L2 norm
            embedding = F.normalize(embedding, p=2, dim=1)

        return embedding.cpu().numpy()

    except Exception as e:
        # Best effort: a bad file is reported and skipped, not fatal.
        print(f"Error processing {os.path.basename(image_path)}: {e}")
        return None

def extract_embeddings_batch(image_paths, model, transforms, batch_size=32):
    """Extract L2-normalised embeddings for many images, batch by batch.

    Images that fail to load or transform are reported and skipped, so the
    returned paths may be a subset of ``image_paths`` (in the same order).

    Args:
        image_paths: list of image file paths.
        model: embedding model; switched to eval mode before inference.
        transforms: callable invoked as ``transforms(image=array)["image"]``.
        batch_size: number of images sent through the model at once.

    Returns:
        Tuple ``(embeddings, valid_paths)``: a list of 1-D NumPy embedding
        vectors and the list of paths they belong to, aligned by index.
    """
    embeddings = []
    valid_paths = []

    model.eval()
    with torch.no_grad():
        for i in range(0, len(image_paths), batch_size):
            batch_images = []
            batch_valid_paths = []

            # Load batch of images
            for path in image_paths[i:i + batch_size]:
                try:
                    # The context manager closes the file handle
                    # deterministically; convert() reads the pixels first.
                    with Image.open(path) as img:
                        image_array = np.array(img.convert('RGB'))
                    batch_images.append(transforms(image=image_array)["image"])
                    batch_valid_paths.append(path)
                except Exception as e:
                    print(f"Error loading {os.path.basename(path)}: {e}")
                    continue

            if not batch_images:
                # Every image in this slice failed to load.
                continue

            # Process batch
            batch_tensor = torch.stack(batch_images).to(device)
            batch_embeddings = model(batch_tensor)
            # Normalize embeddings to unit L2 norm
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

            embeddings.extend(batch_embeddings.cpu().numpy())
            valid_paths.extend(batch_valid_paths)

    return embeddings, valid_paths

In [None]:
# Main execution with comprehensive testing
def run_complete_similarity_analysis(test_paths, model, transforms, key="000000"):
    """Analyse the first usable test folder, or synthetic data if none works.

    Args:
        test_paths: candidate data folders, tried in order.
        model: embedding model used to encode the images.
        transforms: preprocessing applied to each image before the model.
        key: filename substring that marks reference images.

    Returns:
        dict with 'similarity_data', 'analysis_results', 'saved_results' and
        'model_info' on success; a dict with only 'similarity_data' and
        'model_info' if the analysis stage fails; None if no data at all
        could be produced. 'model_info' is read from the module-level
        variable of that name.
    """

    print("🚀 Starting Logo Similarity Analysis")
    print("="*60)

    # Walk the candidate folders until one yields similarity data
    similarity_data = None
    for candidate in test_paths:
        if not os.path.exists(candidate):
            print(f"📁 Path not found: {candidate}")
            continue

        print(f"🔍 Testing with: {candidate}")
        try:
            similarity_data = calculate_cosine_similarity_matrix_enhanced(
                candidate, model, transforms, key=key, max_samples=100
            )
            if similarity_data is None:
                continue
            print(f"✅ Successfully processed data from: {candidate}")
            break
        except Exception as e:
            print(f"❌ Failed to process {candidate}: {e}")

    # No folder produced data: fall back to the synthetic demo
    if similarity_data is None:
        print("⚠️ No real data found. Creating synthetic demo...")
        similarity_data = create_synthetic_demo()

    if similarity_data is None:
        print("❌ Failed to create any data for analysis")
        return None

    # Run analysis
    print("\n📊 Generating visualizations...")
    try:
        analysis_results = visualize_similarity_distribution(similarity_data)

        print("\n🔍 Detailed analysis...")
        detailed_similarity_analysis(similarity_data, num_examples=5)

        print("\n💾 Saving results...")
        saved_results = save_similarity_results(similarity_data, analysis_results)
    except Exception as e:
        # Partial result: the data is still returned for inspection
        print(f"❌ Error in analysis: {e}")
        return {'similarity_data': similarity_data, 'model_info': model_info}

    return {
        'similarity_data': similarity_data,
        'analysis_results': analysis_results,
        'saved_results': saved_results,
        'model_info': model_info
    }

def create_synthetic_demo():
    """Build a random, clustered embedding set shaped like real similarity data.

    Ten named logo classes each get a random unit-length centre. Ten
    reference vectors (noise scale 0.05) and ten test vectors (noise scale
    0.15) are drawn around each centre and re-normalised.

    Returns:
        dict with the same keys as the real similarity pipeline:
        'similarity_matrix', 'embeddings_org', 'embeddings_test',
        'label_org', 'label_test', 'paths_org', 'paths_test', 'refer_dict'.
    """
    print("🎭 Creating synthetic logo similarity demo...")

    # Realistic logo class names, one per class id
    logo_names = ['Amazon', 'Apple', 'Google', 'Microsoft', 'Nike',
                  'Adidas', 'Coca-Cola', 'Pepsi', 'McDonald', 'Starbucks']

    num_classes = 10
    samples_per_class = 20
    embedding_dim = 256
    per_split = samples_per_class // 2

    def _unit(vector):
        # Scale a vector to unit L2 norm
        return vector / np.linalg.norm(vector)

    def _draw_around(center, scale, count):
        # Draw `count` noisy, re-normalised copies of `center`, one at a time
        drawn = []
        for _ in range(count):
            noise = np.random.randn(embedding_dim) * scale
            drawn.append(_unit(center + noise))
        return drawn

    ref_vectors, ref_ids = [], []
    test_vectors, test_ids = [], []

    for class_id in range(num_classes):
        center = _unit(np.random.randn(embedding_dim))

        # References stay close to the centre
        ref_vectors.extend(_draw_around(center, 0.05, per_split))
        ref_ids.extend([class_id] * per_split)

        # Test samples vary more
        test_vectors.extend(_draw_around(center, 0.15, per_split))
        test_ids.extend([class_id] * per_split)

    ref_matrix = np.vstack(ref_vectors)
    test_matrix = np.vstack(test_vectors)

    # Rows are test samples, columns are reference samples
    similarity = cosine_similarity(test_matrix, ref_matrix)

    ref_files = [f"ref_{logo_names[l]}_{i:03d}.jpg" for i, l in enumerate(ref_ids)]
    test_files = [f"test_{logo_names[l]}_{i:03d}.jpg" for i, l in enumerate(test_ids)]
    class_lookup = {i: logo_names[i] for i in range(num_classes)}

    fake_similarity_data = {
        'similarity_matrix': similarity,
        'embeddings_org': ref_matrix,
        'embeddings_test': test_matrix,
        'label_org': ref_ids,
        'label_test': test_ids,
        'paths_org': ref_files,
        'paths_test': test_files,
        'refer_dict': class_lookup
    }

    print(f"✅ Created synthetic dataset:")
    print(f"   Classes: {num_classes}")
    print(f"   Reference samples: {len(ref_ids)}")
    print(f"   Test samples: {len(test_ids)}")

    return fake_similarity_data

# Test paths to try, in order
test_paths = [
    "logo_verify_test",
    "../logo_verify_test", 
    "../../logo_verify_test",
    "data/logo_verify_test",
    "../data/logo_verify_test"
]

print(f"🏁 Starting analysis with {'trained' if model_loaded else 'random'} model")
print(f"Model info: {model_info}")

# Run the complete analysis
results = run_complete_similarity_analysis(test_paths, model, preprocess, key="000000")

if results and 'analysis_results' in results:
    # Full result: the visualisation stage produced metrics.
    analysis = results['analysis_results']
    print("\n" + "="*60)
    print("🎉 ANALYSIS COMPLETED SUCCESSFULLY!")
    print("="*60)
    # The metrics are read with .get() and formatted only when present:
    # applying ':.4f' to an 'N/A' fallback string would raise ValueError.
    best_accuracy = analysis.get('best_accuracy')
    best_threshold = analysis.get('best_threshold')
    print("📈 Best Accuracy: " + ("N/A" if best_accuracy is None else f"{best_accuracy:.4f}"))
    print("🎯 Optimal Threshold: " + ("N/A" if best_threshold is None else f"{best_threshold:.4f}"))
    print(f"🤖 Model Status: {'Trained' if model_loaded else 'Random Initialization'}")
    print("="*60)
elif results:
    # Partial result: similarity data exists but the analysis stage raised,
    # so reporting "completed successfully" here would be misleading.
    print("\n" + "="*60)
    print("⚠️ ANALYSIS FINISHED WITH ERRORS (no analysis results available)")
    print("="*60)
    print(f"🤖 Model Status: {'Trained' if model_loaded else 'Random Initialization'}")
    print("="*60)
else:
    print("❌ Analysis failed completely")

In [None]:
# Test with different model architectures
def test_multiple_models():
    """Check each known backbone checkpoint and print a comparison table.

    For every backbone in the built-in table, ``load_trained_model`` is given
    two candidate paths for its checkpoint (the ``Model2/``-prefixed path
    followed by the bare file name). When loading succeeds, a synthetic
    dataset from ``create_synthetic_demo`` is run through
    ``visualize_similarity_distribution`` and the reported best accuracy and
    threshold are recorded.

    NOTE(review): the loaded model object is not passed to the synthetic
    test in this function, so the recorded accuracy may not depend on the
    backbone -- confirm against ``create_synthetic_demo``.

    Returns:
        dict: maps backbone name to a result dict. Each result has a
        ``'loaded'`` flag and, depending on the outcome, either
        ``'accuracy'``/``'threshold'``/``'model_info'``, an ``'error'``
        message, or only ``'model_info'`` when loading failed.
    """
    embedding_size = 256
    # (backbone name, checkpoint file name) pairs to evaluate, in order.
    checkpoint_table = (
        ('mobilenet_v2', 'SupConLoss_BBMobileNetV2.pth'),
        ('vgg16', 'SupConLoss_BBVGG16.pth'),
        ('resnet18', 'SupConLoss_BBResNet18.pth'),
        ('resnet50', 'SupConLoss_BBResNet50.pth'),
    )
    rule = '=' * 60
    summary = {}

    for arch, checkpoint in checkpoint_table:
        print(f"\n{rule}")
        print(f"🧠 Testing {arch.upper()} Architecture")
        print(f"{rule}")

        # Candidate checkpoint locations: Model2/ sub-folder, then bare name.
        candidates = [f"Model2/{checkpoint}", checkpoint]
        _model, was_loaded, load_info = load_trained_model(
            candidates,
            backbone=arch,
            emb_dim=embedding_size,
        )

        if not was_loaded:
            summary[arch] = {'loaded': False, 'model_info': load_info}
            continue

        # Quick test with synthetic data
        print("🎭 Quick test with synthetic data...")
        demo_data = create_synthetic_demo()
        if not demo_data:
            summary[arch] = {'loaded': True, 'error': 'Failed synthetic test'}
            continue

        stats = visualize_similarity_distribution(demo_data)
        summary[arch] = {
            'loaded': True,
            'accuracy': stats.get('best_accuracy', 0.0),
            'threshold': stats.get('best_threshold', 0.0),
            'model_info': load_info,
        }

    # Summary comparison
    print(f"\n{rule}")
    print("📊 MODEL COMPARISON SUMMARY")
    print(f"{rule}")

    for arch, outcome in summary.items():
        if outcome.get('loaded', False):
            status = "✅ Loaded"
        else:
            status = "❌ Failed"
        accuracy = outcome.get('accuracy', 'N/A')
        print(f"{arch.upper():<15} | {status:<10} | Accuracy: {accuracy}")

    return summary

# Uncomment to test multiple models
# model_comparison = test_multiple_models()

## Usage Instructions

### 1. **Model Loading**
The notebook automatically tries to load trained models from multiple locations:
- `SupConLoss_BBMobileNetV2.pth`
- `Model2/SupConLoss_BBMobileNetV2.pth`
- `outputs/model_best.pth`

### 2. **Data Paths**
Update the `test_paths` list with your actual data locations:
```python
test_paths = [
    "your/actual/path/to/logo_verify_test",
    "another/possible/path"
]
```

### 3. **Key Parameter**
The `key="000000"` parameter distinguishes reference images from test images:
- Images containing "000000" = Reference images
- Images NOT containing "000000" = Test images

### 4. **Output**
The analysis generates:
- Similarity distribution visualizations
- Accuracy vs threshold plots
- Detailed similarity statistics
- Saved results in `similarity_analysis/` folder

### 5. **Troubleshooting**
- If no real data is found, synthetic demo data is created
- If sklearn fails, manual cosine similarity is used
- Model loading has multiple fallback options