# In [None]:
# Test Predictions for "To bee or not to bee"
# Generate predictions for images 251-347

import os
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
from PIL import Image
import cv2
from skimage import measure, feature
from scipy.optimize import minimize
from skimage.transform import rotate

# The feature extraction functions must be defined in this file before it is run.
# Copy them from your feature_extraction notebook into the section below.

# ==================== COPY YOUR FEATURE EXTRACTION FUNCTIONS HERE ====================
# Copy these functions from your feature_extraction.ipynb:
# - load_image_and_mask
# - compute_best_inscribed_circle
# - compute_best_symmetry_plane
# - extract_shape_features
# - calculate_bug_ratio
# - extract_color_features
# - extract_additional_features
# - create_default_features

# Template wrapper: it only works once the functions listed above are defined in this file.
def extract_all_features_test(image_id, test_dir="../test"):
    """
    Build the complete feature dictionary for one test image.

    Shape, bug-ratio, colour and additional features are computed first,
    followed by the best inscribed circle and, when the circle radius is
    larger than 5, the best symmetry plane. The extraction is meant to match
    the one used for the training data.

    Args:
        image_id: Identifier of the image. It is used to build the file names
            ``images/<id>.jpg`` and ``masks/binary_<id>.tif`` under
            ``test_dir`` and is stored as ``int(image_id)`` in the result.
        test_dir: Directory in which the presence of the image and mask
            files is checked.

    Returns:
        A dict mapping feature names to values. When one of the files is
        missing, or when any step raises, a message is printed and the
        result of ``create_default_features(image_id)`` is returned instead.
    """
    try:
        image_file = os.path.join(test_dir, "images", f"{image_id}.jpg")
        mask_file = os.path.join(test_dir, "masks", f"binary_{image_id}.tif")

        # Both files are required; without them fall back to the defaults.
        if not (os.path.exists(image_file) and os.path.exists(mask_file)):
            print(f"Missing files for image {image_id}")
            return create_default_features(image_id)

        # NOTE(review): test_dir is not forwarded to the loader, so the loader
        # presumably resolves its own directory -- confirm it reads the test set.
        img, bug_mask = load_image_and_mask(image_id, visualize=False)

        # Per-image feature groups.
        shape_part = extract_shape_features(bug_mask)
        ratio = calculate_bug_ratio(bug_mask)
        color_part = extract_color_features(img, bug_mask)
        extra_part = extract_additional_features(img, bug_mask)

        # Symmetry is only evaluated when the inscribed circle is large
        # enough; otherwise both symmetry values stay at zero.
        radius, center = compute_best_inscribed_circle(bug_mask)
        sym_score, sym_angle = 0.0, 0.0
        if radius > 5:
            sym_score, sym_angle = compute_best_symmetry_plane(
                img, bug_mask, center, radius
            )

        # Scalar features first, then the grouped features in a fixed order.
        base = {
            'image_id': int(image_id),
            'bug_ratio': ratio,
            'max_inscribed_radius': radius,
            'symmetry_score': sym_score,
            'symmetry_angle': sym_angle,
        }
        return {**base, **shape_part, **color_part, **extra_part}

    except Exception as err:
        # Best effort: a failing image still yields a (default) feature row.
        print(f"Error processing image {image_id}: {err}")
        return create_default_features(image_id)

# ==================== MAIN PREDICTION PIPELINE ====================

def main():
    """
    Run the full test-set prediction pipeline and write the submission CSV.

    Steps:
        1. Load the trained model, the feature scaler and the label encoder.
        2. Read the training feature column names (every column except
           ``image_id``, ``bug_type`` and ``species``).
        3. Extract features for image ids 251..347 with
           ``extract_all_features_test``.
        4. Align the test columns with the training columns (missing columns
           are filled with 0.0), scale, predict and decode the labels.
        5. Write ``../predictions_test.csv`` with the columns ``ID`` and
           ``bug type`` and print a summary.

    Returns:
        None. The function returns early, after printing a message, when the
        model artifacts or the training feature file cannot be loaded, or
        when no test features were extracted.
    """
    print("=== TEST SET PREDICTIONS (Images 251-347) ===\n")

    # 1. Load saved models and preprocessors
    print("1. Loading models and preprocessors...")
    try:
        # joblib.load unpickles its input: only load artifacts you created yourself.
        best_model = joblib.load('models/best_model.pkl')
        scaler = joblib.load('../scaler.pkl')  # From feature extraction
        label_encoder = joblib.load('models/label_encoder.pkl')
        print("✓ Models loaded successfully")
    except Exception as e:
        print(f"❌ Error loading models: {e}")
        print("Make sure you have run the ML notebook and feature extraction first!")
        return

    # 2. Get feature columns from training data
    feature_file = '../features_normalized.csv'
    try:
        # Only the header is needed, so skip parsing the data rows.
        train_columns = pd.read_csv(feature_file, nrows=0).columns
    except (OSError, ValueError) as e:
        # Missing file -> OSError; empty or malformed CSV -> ValueError subclasses.
        print(f"❌ Error reading training features from {feature_file}: {e}")
        print("Make sure you have run the ML notebook and feature extraction first!")
        return
    feature_cols = [col for col in train_columns
                    if col not in ['image_id', 'bug_type', 'species']]
    print(f"✓ Using {len(feature_cols)} features")

    # 3. Extract features for test images
    print("\n2. Extracting features for test images...")
    test_features = []
    failed_images = []

    for image_id in tqdm(range(251, 348), desc="Processing images"):
        features = extract_all_features_test(image_id)
        # The extractor in this file falls back to default features on
        # failure, so None is only seen if that helper is changed to return it.
        if features is not None:
            test_features.append(features)
        else:
            failed_images.append(image_id)

    print(f"✓ Successfully processed {len(test_features)} images")
    if failed_images:
        print(f"⚠️  Failed to process {len(failed_images)} images: {failed_images}")

    # Without any feature rows the DataFrame below would have no 'image_id'
    # column and the pipeline would fail with an opaque KeyError.
    if not test_features:
        print("❌ No test features were extracted; nothing to predict.")
        return

    # 4. Create DataFrame and ensure column consistency
    test_df = pd.DataFrame(test_features)

    # Ensure all required columns are present
    missing_cols = set(feature_cols) - set(test_df.columns)
    if missing_cols:
        print(f"\n⚠️  Adding missing columns with default values: {missing_cols}")
        for col in missing_cols:
            test_df[col] = 0.0

    # Select features in the same order as training
    X_test = test_df[feature_cols].values

    # 5. Normalize features
    print("\n3. Normalizing features...")
    X_test_normalized = scaler.transform(X_test)

    # 6. Generate predictions
    print("\n4. Generating predictions...")
    predictions_encoded = best_model.predict(X_test_normalized)
    predictions = label_encoder.inverse_transform(predictions_encoded)

    # 7. Create submission DataFrame
    submission_df = pd.DataFrame({
        'ID': test_df['image_id'].astype(int),
        'bug type': predictions
    })

    # Sort by ID to ensure correct order
    submission_df = submission_df.sort_values('ID')

    # 8. Display prediction summary
    print("\n5. Prediction Summary:")
    print("-" * 40)
    print(submission_df['bug type'].value_counts())
    print("-" * 40)

    # Check for any suspicious predictions
    if 'Bee & Bumblebee' in submission_df['bug type'].values:
        count = (submission_df['bug type'] == 'Bee & Bumblebee').sum()
        print(f"\n⚠️  Warning: {count} images predicted as 'Bee & Bumblebee'")
        print("This rare class had only 1 training sample - predictions may be unreliable")

    # 9. Save submission file
    output_path = '../predictions_test.csv'
    submission_df.to_csv(output_path, index=False)
    print(f"\n✓ Predictions saved to: {output_path}")
    print(f"✓ File contains {len(submission_df)} predictions")

    # 10. Verify submission format
    print("\n6. Submission file verification:")
    print(f"  - Columns: {list(submission_df.columns)}")
    print(f"  - ID range: {submission_df['ID'].min()} to {submission_df['ID'].max()}")
    print(f"  - Unique bug types: {sorted(submission_df['bug type'].unique())}")
    print(f"  - Shape: {submission_df.shape}")

    # Display first few predictions
    print("\nFirst 10 predictions:")
    print(submission_df.head(10))

    print("\n✅ TEST PREDICTIONS COMPLETE!")
    print(f"📁 Submit the file: {output_path}")

# ==================== ALTERNATIVE: BATCH PREDICTION ====================

def predict_from_existing_features(test_features_path, output_path='../predictions_test.csv'):
    """
    Predict bug types from a CSV of already-extracted test features.

    The handling mirrors ``main``: feature columns are taken from the
    training feature file, columns missing from the test CSV are filled with
    0.0 (with a warning), and the submission is sorted by ID before saving.

    Args:
        test_features_path: Path of the CSV holding the test features. It
            must contain an ``image_id`` column.
        output_path: Where the submission CSV is written. Defaults to
            ``'../predictions_test.csv'``.

    Returns:
        pandas.DataFrame: The submission with the columns ``ID`` and
        ``bug type``, sorted by ``ID``.

    Raises:
        KeyError: If the test CSV has no ``image_id`` column.
        Any exception raised while loading the model artifacts or reading
        the CSV files is propagated unchanged.
    """
    print("=== PREDICTIONS FROM EXISTING FEATURES ===\n")

    # Load models
    # joblib.load unpickles its input: only load artifacts you created yourself.
    best_model = joblib.load('models/best_model.pkl')
    scaler = joblib.load('../scaler.pkl')
    label_encoder = joblib.load('models/label_encoder.pkl')

    # Load test features
    test_df = pd.read_csv(test_features_path)

    # Get feature columns; only the header of the training file is needed.
    train_columns = pd.read_csv('../features_normalized.csv', nrows=0).columns
    feature_cols = [col for col in train_columns
                    if col not in ['image_id', 'bug_type', 'species']]

    # Fill columns the test CSV lacks, the same way main() does, so the
    # feature matrix always has the layout the scaler and model expect.
    missing_cols = [col for col in feature_cols if col not in test_df.columns]
    if missing_cols:
        print(f"\n⚠️  Adding missing columns with default values: {missing_cols}")
        for col in missing_cols:
            test_df[col] = 0.0

    # Prepare data (columns in training order)
    X_test = test_df[feature_cols].values
    X_test_normalized = scaler.transform(X_test)

    # Predict
    predictions_encoded = best_model.predict(X_test_normalized)
    predictions = label_encoder.inverse_transform(predictions_encoded)

    # Create submission, sorted by ID like the one produced by main()
    submission_df = pd.DataFrame({
        'ID': test_df['image_id'].astype(int),
        'bug type': predictions
    }).sort_values('ID')

    submission_df.to_csv(output_path, index=False)
    print(f"✓ Predictions saved to: {output_path}")

    return submission_df

# ==================== RUN THE PREDICTION ====================

# Script entry point: the pipeline runs only when this file is executed directly.
if __name__ == "__main__":
    # Option 1 (default): extract features from the test images, then predict.
    main()

    # Option 2: predict from a CSV of already-extracted test features.
    # To use it, uncomment the lines below (and comment out main() above):
    # test_features_path = '../test_features.csv'
    # if os.path.exists(test_features_path):
    #     predict_from_existing_features(test_features_path)