# Painting Similarity Detection Demo

This notebook demonstrates how to use the painting similarity detection model in the ArtExtract project. We'll cover:

1. Feature extraction from paintings
2. Building a similarity model
3. Finding similar paintings
4. Visualizing the results
5. Saving the similarity system
6. Evaluating the similarity model

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from PIL import Image
import pickle
from tqdm.notebook import tqdm
from pathlib import Path

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from models.similarity_detection.feature_extraction import FeatureExtractor
from models.similarity_detection.similarity_model import (
    create_similarity_model,
    PaintingSimilaritySystem,
    CosineSimilarityModel,
    FaissIndexModel
)
from evaluation.similarity_metrics import SimilarityEvaluator

## 1. Setup and Configuration

First, let's set up our configuration for the demo. You'll need to adjust the paths to point to your dataset.

In [None]:
# Configuration
# All paths are relative to the notebook's working directory.
DATA_DIR = "../data/national_gallery"  # Change this to your dataset directory
# Everything this notebook writes (feature cache, figures, saved system,
# evaluation results) goes under OUTPUT_DIR.
OUTPUT_DIR = "../output/similarity_demo"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok: re-running the cell is not an error

# Feature extraction configuration
# FEATURE_EXTRACTOR_TYPE also names the feature cache file, so each extractor
# type gets its own cache.
FEATURE_EXTRACTOR_TYPE = "resnet50"  # Options: "resnet50", "efficientnet", "clip"
BATCH_SIZE = 32  # NOTE(review): not referenced by any cell visible in this notebook

# Similarity model configuration
SIMILARITY_MODEL_TYPE = "faiss"  # Options: "cosine", "faiss"
INDEX_TYPE = "L2"  # Options: "L2", "IP", "Cosine"; only passed to the "faiss" model
USE_GPU = torch.cuda.is_available()  # only passed to the "faiss" model

# Number of similar paintings to retrieve
K = 5

## 2. Load or Create Dataset

We need a dataset of paintings to work with. For this demo, we'll load images from a directory and create a simple metadata DataFrame.

In [None]:
# Function to load images from a directory
def load_images_from_directory(directory):
    """Collect image files under a directory and describe them in a DataFrame.

    The directory is walked recursively and every file ending in .png, .jpg
    or .jpeg (case-insensitive) is kept. Directories and files are visited in
    sorted order so repeated runs return the paths in the same order, which
    keeps a feature cache written by one run aligned with the next.

    Args:
        directory: Root directory to scan.

    Returns:
        A tuple ``(image_paths, metadata_df)``. ``image_paths`` is a list of
        file paths. ``metadata_df`` has one row per path with the columns
        'filename', 'category' (name of the file's parent directory) and
        'image_path'. The columns are present even when no image is found.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    image_paths = []

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(image_extensions):
                image_paths.append(os.path.join(root, file))

    # Explicit columns keep the schema stable for an empty result, so callers
    # can still select metadata_df['image_path'] without a KeyError.
    metadata_df = pd.DataFrame(
        {
            'filename': [os.path.basename(path) for path in image_paths],
            'category': [
                os.path.basename(os.path.dirname(path)) for path in image_paths
            ],
            'image_path': image_paths,
        },
        columns=['filename', 'category', 'image_path'],
    )

    print(f"Loaded {len(image_paths)} images from {directory}")

    return image_paths, metadata_df

# Load images, falling back to a small placeholder dataset when none are found
try:
    image_paths, metadata_df = load_images_from_directory(DATA_DIR)

    # os.walk() silently yields nothing for a missing or unreadable directory,
    # so an empty result must be turned into an error explicitly; otherwise
    # the fallback below would never run.
    if not image_paths:
        raise FileNotFoundError(f"No images found in {DATA_DIR}")
except OSError as e:
    print(f"Error loading images: {e}")
    print("For this demo, let's create a dummy dataset with some sample images.")

    # Placeholder paths for demonstration purposes; replace them with real
    # image files. The feature extraction cell skips paths it cannot open.
    sample_images = [
        "../data/sample/portrait1.jpg",
        "../data/sample/portrait2.jpg",
        "../data/sample/landscape1.jpg",
        "../data/sample/landscape2.jpg",
        "../data/sample/abstract1.jpg"
    ]

    # The category is the first known keyword found in the path; anything
    # else is labelled "abstract".
    known_categories = ("portrait", "landscape")

    image_paths = []
    metadata = []
    for image_path in sample_images:
        category = next(
            (name for name in known_categories if name in image_path),
            "abstract",
        )

        image_paths.append(image_path)
        metadata.append({
            'filename': os.path.basename(image_path),
            'category': category,
            'image_path': image_path
        })

    metadata_df = pd.DataFrame(metadata)
    print(f"Created dummy dataset with {len(image_paths)} sample images")
    display(metadata_df)
else:
    # Display first few rows of metadata
    display(metadata_df.head())

## 3. Feature Extraction

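Now we'll extract features from the paintings using a pre-trained model (ResNet-50 by default; see `FEATURE_EXTRACTOR_TYPE` above). The extracted features are cached in the output directory so that later runs can load them instead of recomputing them.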

In [None]:
# Create feature extractor
# The model is selected by FEATURE_EXTRACTOR_TYPE. This one instance is reused
# below for the dataset images and for new query images.
feature_extractor = FeatureExtractor(model_type=FEATURE_EXTRACTOR_TYPE)
print(f"Created {FEATURE_EXTRACTOR_TYPE} feature extractor")

In [None]:
# Extract features from images, reusing the cached copy when it is still valid
features_file = os.path.join(OUTPUT_DIR, f"{FEATURE_EXTRACTOR_TYPE}_features.pkl")

extract_features = True
if os.path.exists(features_file):
    print(f"Loading features from {features_file}")
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files written by this notebook, never files from an untrusted source.
    with open(features_file, 'rb') as f:
        features_data = pickle.load(f)

    features = features_data['features']
    feature_image_paths = list(features_data['image_paths'])

    # The cache is reusable only if it describes exactly the current images.
    # A matching count alone would accept a cache built from different files.
    cache_is_current = (
        len(features) == len(feature_image_paths)
        and sorted(feature_image_paths) == sorted(image_paths)
    )
    if cache_is_current:
        extract_features = False
    else:
        print(
            f"Warning: Cached features cover {len(feature_image_paths)} images "
            f"that do not match the {len(image_paths)} images in the current dataset"
        )
        print("Extracting features again...")
else:
    print(f"Features file not found, extracting features...")

if extract_features:
    features = []
    valid_image_paths = []

    for image_path in tqdm(image_paths, desc="Extracting features"):
        try:
            # convert() returns a fully loaded copy, so the file can be closed
            # as soon as the with-block ends.
            with Image.open(image_path) as img:
                image = img.convert('RGB')

            feature = feature_extractor.extract_features_from_image(image)
        except Exception as e:
            # Best effort: report the unreadable image and keep going.
            print(f"Error extracting features from {image_path}: {e}")
            continue

        features.append(feature)
        valid_image_paths.append(image_path)

    # np.vstack([]) fails with an unhelpful message; report the real problem.
    if not features:
        raise RuntimeError(
            "No features could be extracted; check DATA_DIR and the image files"
        )

    # Stack the per-image features into a single array, one row per image
    features = np.vstack(features)
    feature_image_paths = valid_image_paths

    # Save features together with the paths they belong to
    with open(features_file, 'wb') as f:
        pickle.dump({'features': features, 'image_paths': feature_image_paths}, f)

    print(f"Extracted features for {len(feature_image_paths)} images")
else:
    print(f"Loaded features for {len(feature_image_paths)} images")

print(f"Features shape: {features.shape}")

# Update image paths and metadata. Rows are selected in the order of the
# feature paths so that row i of metadata_df describes row i of features,
# even when the cached order differs from the current scan order.
image_paths = feature_image_paths
metadata_df = (
    metadata_df.set_index('image_path', drop=False)
    .loc[image_paths]
    .reset_index(drop=True)
)

## 4. Build Similarity Model

Now we'll build a similarity model using the extracted features.

In [None]:
# Build the similarity model selected by SIMILARITY_MODEL_TYPE
if SIMILARITY_MODEL_TYPE != 'faiss':
    # Any value other than 'faiss' selects the cosine similarity model
    similarity_model = create_similarity_model('cosine')
else:
    # The Faiss model is given the feature dimensionality, taken from the
    # second axis of the feature array
    feature_dim = features.shape[1]

    similarity_model = create_similarity_model(
        'faiss',
        feature_dim=feature_dim,
        index_type=INDEX_TYPE,
        use_gpu=USE_GPU,
    )

print(f"Created {SIMILARITY_MODEL_TYPE} similarity model")

In [None]:
# Create painting similarity system
# Bundles the similarity model with the features, image paths and metadata.
# NOTE(review): presumably row i of `features`, `image_paths[i]` and row i of
# `metadata_df` must describe the same painting; confirm against
# PaintingSimilaritySystem.
similarity_system = PaintingSimilaritySystem(
    similarity_model=similarity_model,
    features=features,
    image_paths=image_paths,
    metadata=metadata_df
)

print(f"Created painting similarity system with {len(image_paths)} paintings")

## 5. Find Similar Paintings

Let's find paintings similar to a query painting.

In [None]:
# Select a random query painting
# No random seed is set in this notebook, so the selection is not pinned
# between runs. query_idx is reused by the following cells.
query_idx = np.random.randint(len(image_paths))
print(f"Selected query painting: {image_paths[query_idx]}")

# Display query painting
query_img = Image.open(image_paths[query_idx]).convert('RGB')
plt.figure(figsize=(5, 5))
plt.imshow(query_img)
plt.title("Query Painting")
plt.axis('off')  # hide ticks and frame; only the image matters here
plt.show()

In [None]:
# Query the system for the K nearest paintings to the selected one
result = similarity_system.find_similar_paintings(query_idx=query_idx, k=K)

# Report each match with its score and, when the result carries metadata,
# its category
print("Similar Paintings:")
ranked_matches = zip(result['similar_paths'], result['similarities'])
for i, (path, sim) in enumerate(ranked_matches):
    print(f"{i+1}. {path} (Similarity: {sim:.3f})")

    # Metadata is optional in the result
    if 'similar_metadata' not in result:
        continue
    metadata = result['similar_metadata'][i]
    print(f"   Category: {metadata.get('category', 'N/A')}")

In [None]:
# Visualize similar paintings
# Uses the same query_idx and K as the cell above; save_path points to a PNG
# under OUTPUT_DIR.
similarity_system.visualize_similar_paintings(
    query_idx=query_idx,
    k=K,
    figsize=(15, 5),
    save_path=os.path.join(OUTPUT_DIR, 'similar_paintings.png')
)

## 6. Find Similar Paintings for a New Image

Now let's try finding similar paintings for a new image that's not in our database.

In [None]:
# Function to find similar paintings for a new image
def find_similar_for_new_image(image_path):
    """Find, print and plot the paintings most similar to a new image.

    The image is loaded as RGB, its features are extracted with the
    notebook-level ``feature_extractor``, and the K nearest paintings are
    requested from the notebook-level ``similarity_system``. The query is
    shown on its own, the matches are printed, and a second figure shows the
    query next to the matches.

    Args:
        image_path: Path to the query image file.

    Returns:
        The result of ``similarity_system.find_similar_to_new_painting``.
        This function reads its 'similar_paths' and 'similarities' entries
        and, when present, 'similar_metadata'.
    """
    # convert() returns a fully loaded copy, so the file can be closed as soon
    # as the with-block ends.
    with Image.open(image_path) as img:
        image = img.convert('RGB')

    # Extract features
    feature = feature_extractor.extract_features_from_image(image)

    # Find similar paintings
    result = similarity_system.find_similar_to_new_painting(
        query_feature=feature,
        query_path=image_path,
        k=K
    )
    similar_paths = result['similar_paths']
    similarities = result['similarities']
    has_metadata = 'similar_metadata' in result

    # Display query image
    plt.figure(figsize=(5, 5))
    plt.imshow(image)
    plt.title("Query Painting")
    plt.axis('off')
    plt.show()

    # Print similar paintings information
    print("Similar Paintings:")
    for i, (path, sim) in enumerate(zip(similar_paths, similarities)):
        print(f"{i+1}. {path} (Similarity: {sim:.3f})")

        # Print metadata if available
        if has_metadata:
            metadata = result['similar_metadata'][i]
            print(f"   Category: {metadata.get('category', 'N/A')}")

    # Load the matched images for the side-by-side figure
    similar_imgs = []
    for path in similar_paths:
        with Image.open(path) as img:
            similar_imgs.append(img.convert('RGB'))

    # squeeze=False always returns a 2-D array of axes. Without it a single
    # column (no matches) yields a bare Axes object and axes[0] fails.
    fig, axes = plt.subplots(
        1, len(similar_imgs) + 1, figsize=(15, 5), squeeze=False
    )
    axes = axes[0]

    # Plot query image
    axes[0].imshow(image)
    axes[0].set_title('Query')
    axes[0].axis('off')

    # Plot similar images
    for ax, img, sim in zip(axes[1:], similar_imgs, similarities):
        ax.imshow(img)
        ax.set_title(f'Similarity: {sim:.3f}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    return result

In [None]:
# Try with a new image
# Replace this with the path to your new image
new_image_path = "../data/sample/new_painting.jpg"

# Fall back to a painting from the database when the file is missing
if not os.path.exists(new_image_path):
    print(f"File not found: {new_image_path}")
    print("Using a random image from our database instead")

    num_paintings = len(image_paths)
    if num_paintings > 1:
        # Draw from the n-1 other slots and step over query_idx. This is
        # uniform over every index except the previous query and needs no
        # retry loop.
        random_idx = np.random.randint(num_paintings - 1)
        if random_idx >= query_idx:
            random_idx += 1
    else:
        # With a single painting there is no other choice. A retry loop
        # would never terminate here.
        random_idx = query_idx

    new_image_path = image_paths[random_idx]

result = find_similar_for_new_image(new_image_path)

## 7. Save the Similarity Model

Let's save the painting similarity system built above for future use.

In [None]:
# Save the similarity system
# The file is written under OUTPUT_DIR via the system's own save_system().
# NOTE(review): the .pkl extension suggests pickle serialization; confirm
# against save_system before loading such files from other sources.
model_path = os.path.join(OUTPUT_DIR, 'painting_similarity_system.pkl')
similarity_system.save_system(model_path)
print(f"Saved painting similarity system to {model_path}")

## 8. Evaluate the Similarity Model

Let's evaluate the performance of our similarity model using some basic metrics.

In [None]:
# Create evaluator
evaluator = SimilarityEvaluator()

# Cut-offs for precision@k. They are defined before the retrieval loop because
# the loop must retrieve as many neighbours as the largest cut-off needs.
k_values = [5, 10, 20]
max_k = max(k_values)
# NOTE(review): with fewer than max_k paintings the system is asked for more
# neighbours than exist; confirm find_similar_paintings tolerates that.

# For demonstration purposes, let's create a simple evaluation scenario
# We'll use the category information as ground truth
# Paintings in the same category are considered relevant to each other

# Get unique categories
categories = metadata_df['category'].unique()
print(f"Found {len(categories)} unique categories: {categories}")

# Create mapping from category to the row indices of its paintings
category_to_indices = {
    category: metadata_df.index[metadata_df['category'] == category].tolist()
    for category in categories
}

# Prepare evaluation data: one list of relevant items and one list of
# recommended items per evaluated query
all_relevant_items = []
all_recommended_items = []

# For each painting, find similar paintings and check if they are in the same category
num_eval_samples = min(100, len(image_paths))  # Limit to 100 samples for speed
eval_indices = np.random.choice(len(image_paths), num_eval_samples, replace=False)

for idx in tqdm(eval_indices, desc="Evaluating"):
    # Get category of the query painting
    query_category = metadata_df.loc[idx, 'category']

    # Relevant items are the other paintings of the same category; the query
    # itself is excluded
    relevant_indices = [i for i in category_to_indices[query_category] if i != idx]

    # Find similar paintings
    result = similarity_system.find_similar_paintings(query_idx=idx, k=max_k)
    recommended_indices = result['similar_indices']

    # Add to evaluation data
    all_relevant_items.append(relevant_indices)
    all_recommended_items.append(recommended_indices)

# Evaluate the model
results = evaluator.evaluate_similarity_model(
    all_relevant_items=all_relevant_items,
    all_recommended_items=all_recommended_items,
    k_values=k_values
)

# Print results
print(f"Evaluation results:")
print(f"MAP: {results['map']:.4f}")
print(f"MRR: {results['mrr']:.4f}")
for k, precision in results['precision_at_k'].items():
    print(f"{k}: {precision:.4f}")

# Plot precision@k
evaluator.plot_precision_at_k(
    results=results,
    save_path=os.path.join(OUTPUT_DIR, 'precision_at_k.png')
)

# Save results to JSON
evaluator.save_results_to_json(
    results=results,
    output_path=os.path.join(OUTPUT_DIR, 'evaluation_results.json')
)

## 9. Conclusion

In this notebook, we've demonstrated how to use the painting similarity detection model in the ArtExtract project. We've covered:

1. Feature extraction from paintings using a pre-trained model
2. Building a similarity model using the extracted features
3. Finding and visualizing similar paintings for a query painting, including a new image that is not in the database
4. Saving the similarity system for future use
5. Evaluating the performance of the similarity model

This approach can be used to find similar paintings based on visual features, which is useful for art exploration, recommendation systems, and more.