In [None]:
# Satoshi Poster Binary Extractor - Validation and Tuning

This notebook provides tools to validate the binary extraction pipeline and tune its parameters to achieve optimal results.


In [None]:
# Prints a fixed greeting; presumably a leftover kernel smoke test - TODO confirm it can be removed.
print("Hello, world!")


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import yaml

# Add the project root to the path so we can import our modules.
# project_root is taken to be two directory levels above the kernel's current
# working directory, so the project imports below depend on where the kernel
# was started.
project_root = Path(os.getcwd()).parent.parent
# Append only when missing so re-running this cell does not add duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import our modules
from binary_extractor.extractor.pipeline import Pipeline
from binary_extractor.extractor.grid import GridDetector
from binary_extractor.extractor.classify import CellClassifier
from binary_extractor.extractor.utils import load_config, setup_logger

# Set up plotting: default figure size and seaborn's 'whitegrid' style
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_style('whitegrid')

# Configure warnings
# NOTE: an 'ignore' filter with no category/module restriction silences every
# warning for the rest of the session, including deprecation notices.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Reference CSVs are looked up directly under the project root
reference_digits_path = Path(project_root).joinpath("recognized_digits.csv")
reference_overlay_path = Path(project_root).joinpath("overlay_unknown_cells.csv")

# Digits: load and preview when the file is present, otherwise warn and use None
if reference_digits_path.exists():
    reference_digits = pd.read_csv(reference_digits_path)
    print(f"Loaded reference digits: {len(reference_digits)} entries")
    display(reference_digits.head())
else:
    print(f"Warning: Reference digits file not found at {reference_digits_path}")
    reference_digits = None

# Overlay cells: same handling as the digits file
if reference_overlay_path.exists():
    reference_overlay = pd.read_csv(reference_overlay_path)
    print(f"Loaded reference overlay data: {len(reference_overlay)} entries")
    display(reference_overlay.head())
else:
    print(f"Warning: Reference overlay file not found at {reference_overlay_path}")
    reference_overlay = None


In [None]:
# Function to load pipeline output from an output directory
def load_pipeline_output(base_dir=None):
    """Load the pipeline's CSV results from an output directory.

    Args:
        base_dir: Directory holding the pipeline output, as a ``str`` or
            ``Path``. When ``None``, a fixed list of candidate directories
            under ``project_root`` is scanned in order of preference and the
            first one containing ``recognized_digits.csv`` or
            ``overlay_unknown_cells.csv`` is used. No modification times are
            compared, so "first match" does not necessarily mean "newest".

    Returns:
        A ``(base_dir, pipeline_digits, pipeline_overlay)`` tuple. ``base_dir``
        is a ``Path``; each of the two frames is a ``DataFrame`` or ``None``
        when its CSV is absent. ``(None, None, None)`` is returned when no
        directory was given and none of the candidates holds results.
    """
    if base_dir is None:
        # Candidate output directories, in order of preference
        possible_dirs = [
            Path(project_root) / "binary_extractor" / "output",
            Path(project_root) / "binary_extractor" / "output2",
            Path(project_root) / "output3",
        ]

        # Take the first candidate that actually contains a result file
        for dir_path in possible_dirs:
            has_digits = (dir_path / "recognized_digits.csv").exists()
            has_overlay = (dir_path / "overlay_unknown_cells.csv").exists()
            if has_digits or has_overlay:
                base_dir = dir_path
                break

    if base_dir is None:
        print("No output directory found with pipeline results")
        return None, None, None

    # Accept plain strings too: the "/" joins below need a Path on the left.
    base_dir = Path(base_dir)

    # Load digits file if it exists
    digits_file = base_dir / "recognized_digits.csv"
    if digits_file.exists():
        pipeline_digits = pd.read_csv(digits_file)
        print(f"Loaded pipeline digits from {digits_file}: {len(pipeline_digits)} entries")
        display(pipeline_digits.head())
    else:
        pipeline_digits = None
        print(f"No recognized digits file found at {digits_file}")

    # Load overlay file if it exists
    overlay_file = base_dir / "overlay_unknown_cells.csv"
    if overlay_file.exists():
        pipeline_overlay = pd.read_csv(overlay_file)
        print(f"Loaded pipeline overlay data from {overlay_file}: {len(pipeline_overlay)} entries")
        display(pipeline_overlay.head())
    else:
        pipeline_overlay = None
        print(f"No overlay data file found at {overlay_file}")

    return base_dir, pipeline_digits, pipeline_overlay

# Load pipeline output, letting the loader pick the output directory (no base_dir given).
# All three names are None when no candidate directory holds results.
output_dir, pipeline_digits, pipeline_overlay = load_pipeline_output()


In [None]:
# Summarise and plot how often each digit value occurs in a dataset
def analyze_digit_distribution(df, title="Digit Distribution"):
    """Count the values in ``df['digit']``, print a summary and draw a bar chart.

    Args:
        df: DataFrame with a ``digit`` column, or ``None``.
        title: Heading used in the printed summary and as the plot title.

    Returns:
        A DataFrame indexed by digit value with ``Count`` and ``Percentage``
        columns (percentage of all rows in ``df``), or ``None`` when ``df`` is
        ``None``.
    """
    if df is None:
        print(f"Cannot analyze distribution for {title}: No data available")
        return None

    # Occurrences per digit value, ordered by digit, and their share of all rows
    total = len(df)
    digit_counts = df['digit'].value_counts().sort_index()
    percentages = digit_counts / total * 100

    distribution_df = pd.DataFrame({'Count': digit_counts, 'Percentage': percentages})

    # Text summary
    print(f"\n{title}:")
    print(f"Total digits: {total}")
    print(f"Distribution: {distribution_df.to_dict()}")

    # Bar chart, one bar per digit value
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=digit_counts.index, y=digit_counts.values)

    # Label each bar at half its height with the count and percentage
    for position in range(len(digit_counts)):
        count = digit_counts.iloc[position]
        pct = percentages.iloc[position]
        ax.text(position, count/2, f"{count}\n({pct:.1f}%)",
                ha='center', va='center', fontweight='bold')

    plt.title(title)
    plt.xlabel('Digit')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    return distribution_df

# Analyze reference data distribution
# (the result is None, after a printed message, when the reference CSV was not loaded)
ref_distribution = analyze_digit_distribution(reference_digits, "Reference Data Distribution")

# Analyze pipeline output distribution (None when no pipeline digits were loaded)
pipeline_distribution = analyze_digit_distribution(pipeline_digits, "Pipeline Output Distribution")


In [None]:
# Compare pipeline output with reference data
def compare_datasets(pipeline_df, reference_df, title="Comparison with Reference Data"):
    """Compare recognised digits against a reference, cell by cell.

    The two frames are outer-merged on ``row``/``col`` and every merged row
    receives a ``match_status``:

    * ``'Match'`` / ``'Mismatch'``: both sides hold a 0 or a 1;
    * ``'Extra in Pipeline'``: the pipeline holds a 0 or 1, the reference has no digit;
    * ``'Missing in Pipeline'``: the reference holds a 0 or 1, the pipeline has no digit;
    * ``'Unknown'``: anything else (for example a digit value other than 0/1).

    Counts and metrics are printed and a bar chart of the statuses is shown.
    As computed here: accuracy = matches / reference rows,
    precision = matches / (matches + mismatches + extras),
    recall = matches / (matches + mismatches + missing).

    Args:
        pipeline_df: DataFrame with ``row``, ``col`` and ``digit`` columns, or ``None``.
        reference_df: DataFrame with the same three columns, or ``None``.
        title: Heading for the printed report and the plot.

    Returns:
        The merged DataFrame (``digit_pipeline`` / ``digit_reference`` as ints
        with -1 marking a missing digit, plus ``match_status``), or ``None``
        when either input is ``None``.
    """
    if pipeline_df is None or reference_df is None:
        print(f"Cannot perform {title}: Missing data")
        return None

    # Merge datasets on row and col; the shared 'digit' column gets suffixed
    merged = pd.merge(
        pipeline_df,
        reference_df,
        on=['row', 'col'],
        how='outer',
        suffixes=('_pipeline', '_reference')
    )

    # -1 marks a cell that one side does not have a digit for
    merged = merged.fillna({
        'digit_pipeline': -1,
        'digit_reference': -1
    })

    # Convert digits to integers for comparison (NaN forced the columns to float)
    merged['digit_pipeline'] = merged['digit_pipeline'].astype(int)
    merged['digit_reference'] = merged['digit_reference'].astype(int)

    # Calculate match status
    merged['match_status'] = 'Unknown'

    pipeline_valid = merged['digit_pipeline'].isin([0, 1])
    reference_valid = merged['digit_reference'].isin([0, 1])
    same_digit = merged['digit_pipeline'] == merged['digit_reference']

    # Both have valid digits (0 or 1)
    merged.loc[pipeline_valid & reference_valid & same_digit, 'match_status'] = 'Match'
    merged.loc[pipeline_valid & reference_valid & ~same_digit, 'match_status'] = 'Mismatch'

    # One has a digit, other doesn't
    merged.loc[pipeline_valid & (merged['digit_reference'] == -1), 'match_status'] = 'Extra in Pipeline'
    merged.loc[(merged['digit_pipeline'] == -1) & reference_valid, 'match_status'] = 'Missing in Pipeline'

    # Calculate statistics
    total_reference = len(reference_df)
    total_pipeline = len(pipeline_df)

    match_counts = merged['match_status'].value_counts()

    matches = match_counts.get('Match', 0)
    mismatches = match_counts.get('Mismatch', 0)
    extras = match_counts.get('Extra in Pipeline', 0)
    missing = match_counts.get('Missing in Pipeline', 0)

    def pct_of_reference(count):
        # Guarded like the metrics below: an empty reference must not raise
        # ZeroDivisionError while printing the report.
        return count / total_reference * 100 if total_reference > 0 else 0.0

    accuracy = matches / total_reference if total_reference > 0 else 0
    precision = matches / (matches + mismatches + extras) if (matches + mismatches + extras) > 0 else 0
    recall = matches / (matches + mismatches + missing) if (matches + mismatches + missing) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Display results
    print(f"\n{title}:")
    print(f"Total in reference: {total_reference}")
    print(f"Total in pipeline: {total_pipeline}")
    print(f"Matches: {matches} ({pct_of_reference(matches):.1f}% of reference)")
    print(f"Mismatches: {mismatches} ({pct_of_reference(mismatches):.1f}% of reference)")
    print(f"Extra in pipeline: {extras}")
    print(f"Missing in pipeline: {missing} ({pct_of_reference(missing):.1f}% of reference)")
    print(f"\nMetrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Visualize match status
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=match_counts.index, y=match_counts.values)

    # Add count labels on bars
    for i, count in enumerate(match_counts.values):
        ax.text(i, count/2, str(count), ha='center', va='center', fontweight='bold')

    plt.title(f"{title} - Match Status")
    plt.xlabel('Status')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Return the merged dataframe for further analysis
    return merged

# Compare digits; comparison_df is None when either input frame was not loaded
comparison_df = compare_datasets(pipeline_digits, reference_digits, "Digit Recognition Comparison")


In [None]:
# Compare overlay detection
def compare_overlay_detection(pipeline_df, reference_df, title="Overlay Detection Comparison"):
    """Compare the overlay cells reported by the pipeline with a reference list.

    The frames are outer-merged on ``row``/``col``. Which side each merged row
    came from is taken from the merge indicator and stored as ``match_status``:
    ``'Match'`` (in both), ``'Extra in Pipeline'`` (pipeline only) or
    ``'Missing in Pipeline'`` (reference only). Because ``row`` and ``col`` are
    the join keys they are never suffixed, so presence cannot be read from
    ``row_pipeline`` / ``row_reference`` columns - those do not exist.

    Counts, precision (matches / pipeline rows), recall (matches / reference
    rows) and F1 are printed, and a bar chart of the statuses is shown.

    Args:
        pipeline_df: DataFrame with ``row`` and ``col`` columns, or ``None``.
        reference_df: DataFrame with ``row`` and ``col`` columns, or ``None``.
        title: Heading for the printed report and the plot.

    Returns:
        The merged DataFrame with a ``match_status`` column, or ``None`` when
        either input is ``None``.
    """
    if pipeline_df is None or reference_df is None:
        print(f"Cannot perform {title}: Missing data")
        return None

    # Merge datasets on row and col, recording the origin of every row
    merged = pd.merge(
        pipeline_df,
        reference_df,
        on=['row', 'col'],
        how='outer',
        suffixes=('_pipeline', '_reference'),
        indicator='_merge_source'
    )

    # left = pipeline, right = reference
    status_by_source = {
        'both': 'Match',
        'left_only': 'Extra in Pipeline',
        'right_only': 'Missing in Pipeline',
    }
    # The indicator is categorical; go through str so the result is plain text
    merged['match_status'] = merged['_merge_source'].astype(str).map(status_by_source)
    merged = merged.drop(columns='_merge_source')

    # Calculate statistics
    total_reference = len(reference_df)
    total_pipeline = len(pipeline_df)

    match_counts = merged['match_status'].value_counts()

    matches = match_counts.get('Match', 0)
    extras = match_counts.get('Extra in Pipeline', 0)
    missing = match_counts.get('Missing in Pipeline', 0)

    def pct_of_reference(count):
        # Guarded like the metrics below so an empty reference does not raise
        return count / total_reference * 100 if total_reference > 0 else 0.0

    precision = matches / total_pipeline if total_pipeline > 0 else 0
    recall = matches / total_reference if total_reference > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Display results
    print(f"\n{title}:")
    print(f"Total in reference: {total_reference}")
    print(f"Total in pipeline: {total_pipeline}")
    print(f"Matches: {matches} ({pct_of_reference(matches):.1f}% of reference)")
    print(f"Extra in pipeline: {extras}")
    print(f"Missing in pipeline: {missing} ({pct_of_reference(missing):.1f}% of reference)")
    print(f"\nMetrics:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Visualize match status
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=match_counts.index, y=match_counts.values)

    # Add count labels on bars
    for i, count in enumerate(match_counts.values):
        ax.text(i, count/2, str(count), ha='center', va='center', fontweight='bold')

    plt.title(f"{title} - Match Status")
    plt.xlabel('Status')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Return the merged dataframe for further analysis
    return merged

# Compare overlay detection; skipped with a printed message (result None) when either frame was not loaded
overlay_comparison_df = compare_overlay_detection(pipeline_overlay, reference_overlay, "Overlay Detection Comparison")


In [None]:
# Visualize debug artifacts
def load_and_display_image(file_path, title=None):
    """Read an image file with OpenCV, show it with matplotlib and return it.

    Args:
        file_path: Location of the image (``str`` or ``Path``).
        title: Optional plot title; omitted when falsy.

    Returns:
        The image converted from BGR to RGB, or ``None`` when the file does
        not exist, OpenCV cannot read it, or any exception occurs while
        loading/plotting (a message is printed in each case).
    """
    image_path = Path(file_path)
    if not image_path.exists():
        print(f"Image not found: {file_path}")
        return None

    try:
        bgr = cv2.imread(str(file_path))
        if bgr is None:
            # imread signals an unreadable file by returning None
            print(f"Failed to load image: {file_path}")
            return None

        # OpenCV loads BGR; matplotlib expects RGB
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(12, 10))
        plt.imshow(rgb)
        if title:
            plt.title(title)
        plt.axis('off')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Error loading image {file_path}: {e}")
        return None

    return rgb

# Find and display debug artifacts if they exist
def display_debug_artifacts(output_dir):
    """Show the known debug images found in a pipeline output directory.

    For each expected artifact file name the image is displayed via
    ``load_and_display_image`` when the file exists; otherwise a
    "not found" message is printed.

    Args:
        output_dir: Output directory as a ``str`` or ``Path``; ``None`` prints
            a message and returns without doing anything.

    Returns:
        None.
    """
    if output_dir is None:
        print("No output directory provided")
        return

    # Accept plain strings too: the "/" join below needs a Path on the left.
    output_dir = Path(output_dir)

    # (file name, human-readable title) of the common debug artifacts
    debug_files = [
        ("grid_overlay.png", "Grid Detection Overlay"),
        ("bw_mask.png", "Binary Mask"),
        ("cells_color.png", "Classified Cells"),
        ("cells_with_digits.png", "Cells with Detected Digits"),
        ("overlay_mask.png", "Overlay Detection Mask")
    ]

    for filename, title in debug_files:
        file_path = output_dir / filename
        if file_path.exists():
            print(f"\nDisplaying {title}:")
            load_and_display_image(file_path, title)
        else:
            print(f"\n{title} not found at {file_path}")

# Display debug artifacts if output directory was found
# (output_dir is None when load_pipeline_output found no results, so the call is skipped)
if output_dir:
    display_debug_artifacts(output_dir)


In [None]:
# Parameter tuning and grid search
def load_config():
    """Read ``binary_extractor/cfg.yaml`` under ``project_root`` and return the parsed YAML.

    Returns ``None`` (after printing a message) when the file does not exist.

    NOTE: this definition rebinds the name ``load_config`` that the import
    cell brought in from ``binary_extractor.extractor.utils``; once this cell
    has run, the imported helper is no longer reachable under that name.
    """
    cfg_file = Path(project_root) / "binary_extractor" / "cfg.yaml"

    if cfg_file.exists():
        with open(cfg_file, 'r') as handle:
            parsed = yaml.safe_load(handle)
        return parsed

    print(f"Config file not found at {cfg_file}")
    return None

def save_config(config, suffix=None):
    """Write ``config`` as YAML into the ``binary_extractor`` directory.

    Args:
        config: Configuration object to dump; ``None`` prints a message and
            nothing is written.
        suffix: When truthy the file is named ``cfg_<suffix>.yaml``; otherwise
            ``cfg.yaml`` itself is overwritten.

    Returns:
        The ``Path`` that was written, or ``None`` when ``config`` is ``None``.
    """
    if config is None:
        print("No config to save")
        return None

    file_name = f"cfg_{suffix}.yaml" if suffix else "cfg.yaml"
    config_path = Path(project_root) / "binary_extractor" / file_name

    with open(config_path, 'w') as out_file:
        yaml.dump(config, out_file, default_flow_style=False)

    print(f"Config saved to {config_path}")
    return config_path

# Read the configuration currently on disk
current_config = load_config()

if not current_config:
    # Covers both a missing file (None) and an empty configuration
    print("Failed to load configuration")
else:
    # Print every section; dict sections are expanded one key per line
    print("Current Configuration:")
    for section, params in current_config.items():
        print(f"\n{section}:")
        if not isinstance(params, dict):
            print(f"  {params}")
            continue
        for key, value in params.items():
            print(f"  {key}: {value}")


In [None]:
# Run the pipeline with a specific configuration
def run_pipeline_with_config(config=None, output_dir=None, suffix=None):
    """Write ``config`` to disk and run the extraction pipeline with it.

    Args:
        config: Configuration to run with; ``None`` prints a message and
            returns ``None``.
        output_dir: Where results go, as a ``str`` or ``Path``. Defaults to
            ``binary_extractor/output_<suffix>`` (or ``binary_extractor/output``
            without a suffix) under ``project_root``. Created if missing.
        suffix: When truthy the config is saved as ``cfg_<suffix>.yaml`` via
            ``save_config``. When falsy the main ``cfg.yaml`` is overwritten
            in place and NOT restored afterwards.

    Returns:
        The output directory as a ``Path`` on success, or ``None`` when no
        config was given or the pipeline raised (the error is printed).
    """
    if config is None:
        print("No configuration provided")
        return None

    package_dir = Path(project_root) / "binary_extractor"

    # Persist the configuration so the pipeline can read it from a file
    if suffix:
        config_path = save_config(config, suffix)
    else:
        config_path = package_dir / "cfg.yaml"
        # NOTE: this replaces the main cfg.yaml; nothing puts the previous
        # contents back after the run.
        with open(config_path, 'w') as f:
            yaml.dump(config, f, default_flow_style=False)

    # Set up the output directory
    if output_dir is None:
        output_dir = package_dir / (f"output_{suffix}" if suffix else "output")
    else:
        # Accept plain strings too: .mkdir() below needs a Path.
        output_dir = Path(output_dir)

    # Make sure the output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Pipeline is handed both locations as strings
        pipeline = Pipeline(
            config_path=str(config_path),
            output_dir=str(output_dir)
        )

        print(f"Running pipeline with configuration{' ' + suffix if suffix else ''}...")
        pipeline.run()

        print(f"Pipeline completed. Results saved to {output_dir}")
        return output_dir
    except Exception as e:
        # Best-effort: report the failure and let the notebook continue
        print(f"Error running pipeline: {e}")
        return None

# Example function to modify a configuration for tuning
def create_tuned_config(base_config, changes, suffix):
    """Create a tuned configuration by applying changes to a copy of a base config.

    Args:
        base_config: Configuration dict to start from; it is deep-copied and
            never modified. ``None`` prints a message and returns ``None``.
        changes: Mapping of section name to either a dict of ``key: value``
            overrides (merged into that section) or any other value (which
            replaces the section outright).
        suffix: Passed to ``save_config`` so the result is written out under
            that suffix.

    Returns:
        The tuned configuration dict, or ``None`` when ``base_config`` is ``None``.
    """
    if base_config is None:
        print("No base configuration provided")
        return None

    # Work on a deep copy so the caller's config stays untouched
    import copy
    tuned_config = copy.deepcopy(base_config)

    for section, params in changes.items():
        if isinstance(params, dict):
            section_values = tuned_config.get(section)
            if not isinstance(section_values, dict):
                # Section is missing or holds a non-mapping (e.g. an empty
                # YAML section parsed as None): start it as a fresh dict
                # instead of failing on item assignment.
                section_values = {}
                tuned_config[section] = section_values
            section_values.update(params)
        else:
            tuned_config[section] = params

    # Save the tuned config
    save_config(tuned_config, suffix)

    return tuned_config

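# Example: create a tuned configuration with a different threshold value.
# The example is disabled by wrapping it in a triple-quoted string; remove the
# surrounding quotes and adjust the values as needed to run it.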
"""
if current_config:
    # Create a tuned configuration with lower threshold
    tuned_config_1 = create_tuned_config(
        current_config,
        {
            'preprocessing': {
                'threshold_value': 120  # Lower threshold value
            }
        },
        'lower_threshold'
    )
    
    # Run the pipeline with the tuned configuration
    output_dir_1 = run_pipeline_with_config(tuned_config_1, suffix='lower_threshold')
    
    # Load and analyze the results
    if output_dir_1:
        _, pipeline_digits_1, pipeline_overlay_1 = load_pipeline_output(output_dir_1)
        analyze_digit_distribution(pipeline_digits_1, "Lower Threshold - Digit Distribution")
        compare_datasets(pipeline_digits_1, reference_digits, "Lower Threshold - Comparison")
"""


In [None]:
# Conclusion and Recommendations

Based on the analysis of the pipeline output compared to the reference data, we can make the following observations and recommendations:

1. **Distribution of 0s and 1s**: 
   - The reference data should have a relatively even distribution of 0s and 1s
   - If the pipeline output shows a significant skew, consider adjusting the classification parameters

2. **Overlay Detection**:
   - Accurate overlay detection is crucial for avoiding false positives
   - If overlay detection is missing cells, consider adjusting the overlay detection parameters

3. **Parameter Tuning Recommendations**:
   - **Preprocessing**: Adjust threshold values, blur kernel size, and morphological operations
   - **Grid Detection**: Fine-tune grid origin and cell size
   - **Classification**: Adjust template matching threshold or classification algorithm

4. **Next Steps**:
   - Run a grid search over key parameters to find the optimal configuration
   - Validate results against reference data
   - Document the optimal configuration and results
