In [None]:
import zipfile
import os

# Upload your dataset.zip to Colab first
zip_path = '/content/binary_medical_classifier_dataset.zip'

# Extract to a known directory
extract_dir = 'extracted_data'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Print an indented tree of what was extracted, listing at most three files
# per directory so large class folders do not flood the output.
print("Actual directory structure:")
for root, dirs, files in os.walk(extract_dir):
    # Sorting in place makes os.walk visit sub-directories in a stable order.
    dirs.sort()
    files.sort()

    # Depth is taken from the path relative to extract_dir. A plain
    # str.replace would also strip the directory name if it re-appeared
    # deeper in the path and miscount the nesting level.
    rel_root = os.path.relpath(root, extract_dir)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1

    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files[:3]:
        print(f'{subindent}{file}')
    if len(files) > 3:
        print(f'{subindent}... and {len(files)-3} more files')


In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image
import os
import matplotlib.pyplot as plt
from collections import Counter

# ==================== STEP 1: FIX YOUR DATASET ====================
def _flatten_to_rgb(img):
    """Return *img* as an RGB image, compositing RGBA input onto white."""
    if img.mode == 'RGB':
        return img
    if img.mode == 'RGBA':
        # JPEG cannot store alpha; paste onto a white canvas using the alpha
        # band as the mask so transparent areas become white.
        background = Image.new('RGB', img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[3])
        return background
    # Grayscale, palette and every other mode go through a plain conversion.
    return img.convert('RGB')


def fix_dataset_completely(input_dir, output_dir, target_size=(224, 224)):
    """
    Standardize every image under input_dir and write the result to output_dir.

    Each image is converted to RGB, resized to target_size and saved as a
    quality-95 JPEG. The sub-directory layout of input_dir is mirrored under
    output_dir. Output files are always named ``<stem>.jpg`` so that the file
    extension matches the JPEG content; when two sources in one folder share
    a stem (e.g. ``a.png`` and ``a.bmp``) later ones get a numeric suffix
    (``a_1.jpg``) instead of overwriting each other.

    Args:
        input_dir: Root directory that is walked recursively for images.
        output_dir: Root directory for the standardized copies (created if
            missing).
        target_size: (width, height) every output image is resized to.

    Returns:
        Tuple ``(fixed_count, error_count)``: number of images written and
        number of images that could not be processed.
    """
    print("🔧 FIXING DATASET - STANDARDIZING ALL IMAGES")
    print("="*60)

    os.makedirs(output_dir, exist_ok=True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    fixed_count = 0
    error_count = 0
    claimed_paths = set()  # output paths already assigned during this run

    for root, dirs, files in os.walk(input_dir):
        # Mirror the input sub-directory under output_dir.
        rel_path = os.path.relpath(root, input_dir)
        output_folder = os.path.join(output_dir, rel_path)

        # Sorted so that collision suffixes are assigned deterministically.
        for file in sorted(files):
            if not file.lower().endswith(image_extensions):
                continue

            input_path = os.path.join(root, file)
            os.makedirs(output_folder, exist_ok=True)

            # The data is written as JPEG, so name it .jpg regardless of the
            # source extension; pick a free name if the stem is taken.
            stem = os.path.splitext(file)[0]
            output_path = os.path.join(output_folder, f'{stem}.jpg')
            suffix = 1
            while output_path in claimed_paths:
                output_path = os.path.join(output_folder, f'{stem}_{suffix}.jpg')
                suffix += 1
            claimed_paths.add(output_path)

            # Best effort: one unreadable file is reported and counted, but
            # must not abort the whole conversion run.
            try:
                with Image.open(input_path) as img:
                    img = _flatten_to_rgb(img)

                    # Resize to target size
                    if img.size != target_size:
                        img = img.resize(target_size, Image.Resampling.LANCZOS)

                    # Save as high quality JPEG
                    img.save(output_path, 'JPEG', quality=95)
                    fixed_count += 1
            except Exception as e:
                print(f"❌ Failed to process {input_path}: {e}")
                error_count += 1

    print(f"\n✅ DATASET FIX COMPLETE:")
    print(f"   Fixed images: {fixed_count:,}")
    print(f"   Errors: {error_count}")
    print(f"   All images now: {target_size}")

    return fixed_count, error_count

# ==================== STEP 2: VERIFY DATASET ====================
def verify_fixed_dataset(dataset_dir, target_size=(224, 224)):
    """
    Check that every image under dataset_dir is RGB and has the expected size.

    Walks dataset_dir recursively, opens each image file and prints a report
    of the sizes, colour modes and read errors that were found.

    Args:
        dataset_dir: Root directory of the dataset to inspect.
        target_size: (width, height) every image is expected to have.

    Returns:
        True only when at least one image was found, all images share
        target_size, all are in RGB mode and none failed to open; False
        otherwise.
    """
    print(f"\n🔍 VERIFYING FIXED DATASET: {dataset_dir}")
    print("="*60)

    expected_size = tuple(target_size)
    # Same extension set the fixer accepts, so no converted file is skipped.
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')

    total_images = 0
    size_distribution = Counter()
    mode_distribution = Counter()
    errors = []

    for root, dirs, files in os.walk(dataset_dir):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue
            filepath = os.path.join(root, file)
            try:
                with Image.open(filepath) as img:
                    size_distribution[img.size] += 1
                    mode_distribution[img.mode] += 1
                    total_images += 1
            except Exception as e:
                # Record and keep going so the report covers the whole tree.
                errors.append(f"{filepath}: {e}")

    print(f"📊 VERIFICATION RESULTS:")
    print(f"   Total images: {total_images:,}")
    print(f"   Unique sizes: {len(size_distribution)}")
    print(f"   Errors: {len(errors)}")

    # Show size distribution
    for size, count in size_distribution.most_common():
        status = "✅" if size == expected_size else "❌"
        percentage = (count / total_images * 100) if total_images > 0 else 0
        print(f"   {status} Size {size}: {count:,} images ({percentage:.1f}%)")

    # Show colour-mode distribution; anything other than RGB fails the check.
    for mode, count in mode_distribution.most_common():
        status = "✅" if mode == 'RGB' else "❌"
        print(f"   {status} Mode {mode}: {count:,} images")

    # List the first few unreadable files so the failure is actionable.
    max_listed = 5
    for message in errors[:max_listed]:
        print(f"   ❌ {message}")
    if len(errors) > max_listed:
        print(f"   ... and {len(errors) - max_listed} more errors")

    all_correct_size = len(size_distribution) == 1 and expected_size in size_distribution
    all_rgb = len(mode_distribution) == 1 and 'RGB' in mode_distribution

    if all_correct_size and all_rgb and not errors:
        print("✅ DATASET IS PROPERLY FORMATTED")
        return True

    print("❌ DATASET STILL HAS ISSUES")
    return False

# ==================== STEP 3: WORKING TRAINING PIPELINE ====================
def create_simple_dataset(data_dir, batch_size=32, validation_split=0.2,
                          image_size=(224, 224)):
    """
    Build normalized training and validation datasets from a class-per-folder
    image directory.

    Args:
        data_dir: Directory passed to
            ``tf.keras.utils.image_dataset_from_directory``.
        batch_size: Number of images per batch.
        validation_split: Fraction of the data reserved for validation.
        image_size: (height, width) images are resized to on load.

    Returns:
        Tuple ``(train_ds, val_ds, class_names)``. Both datasets yield
        ``(images, labels)`` batches with pixel values rescaled by 1/255 and
        binary labels; class_names is read from the training dataset.
    """
    # Both subsets must use the same seed and split so they do not overlap.
    shared_kwargs = dict(
        validation_split=validation_split,
        seed=123,
        image_size=image_size,
        batch_size=batch_size,
        label_mode='binary',
    )
    train_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset="training", **shared_kwargs
    )
    val_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir, subset="validation", **shared_kwargs
    )

    # CRITICAL: read class_names now; the datasets returned by map() below
    # no longer carry this attribute.
    class_names = train_ds.class_names

    AUTOTUNE = tf.data.AUTOTUNE
    normalization_layer = tf.keras.layers.Rescaling(1./255)

    def normalize(images, labels):
        return normalization_layer(images), labels

    train_ds = train_ds.map(normalize, num_parallel_calls=AUTOTUNE)
    val_ds = val_ds.map(normalize, num_parallel_calls=AUTOTUNE)

    # cache() replays the first epoch's batch order on every later epoch, so
    # the training set is shuffled again after the cache. Validation order
    # does not matter and is left as is.
    train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

    return train_ds, val_ds, class_names

def create_simple_model(input_shape=(224, 224, 3)):
    """
    Build a small CNN for binary image classification.

    The model applies random flip/rotation/zoom augmentation, three
    Conv2D + MaxPooling2D stages, global average pooling and a dense head
    ending in a single sigmoid unit.

    Args:
        input_shape: (height, width, channels) of one input image.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # Declaring the input shape builds the model immediately, so
        # model.summary() can be called before the first fit/predict.
        tf.keras.Input(shape=input_shape),

        # Data augmentation
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomRotation(0.1),
        tf.keras.layers.RandomZoom(0.1),

        # CNN layers
        tf.keras.layers.Conv2D(32, 3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(64, 3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(128, 3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),

        # Classification head
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    return model

def _plot_training_history(history):
    """Plot training vs. validation accuracy and loss curves side by side."""
    panels = (
        ('accuracy', 'Accuracy', 'Model Accuracy'),
        ('loss', 'Loss', 'Model Loss'),
    )

    plt.figure(figsize=(12, 4))
    for position, (metric, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Training {label}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()

    plt.tight_layout()
    plt.show()


def train_medical_classifier(dataset_path, epochs=20, batch_size=32,
                             model_path='medical_classifier.keras'):
    """
    Train, evaluate, plot and save the binary image classifier.

    Training and validation data come from ``<dataset_path>/train``. If
    ``<dataset_path>/test`` exists, the trained model is also evaluated on it.

    Args:
        dataset_path: Dataset root containing a ``train`` folder and,
            optionally, a ``test`` folder.
        epochs: Maximum number of epochs (early stopping may end sooner).
        batch_size: Batch size for all datasets.
        model_path: File the trained model is saved to.

    Returns:
        Tuple ``(model, history)``: the trained Keras model and the History
        object returned by ``model.fit``.
    """
    print("🏥 MEDICAL IMAGE CLASSIFIER TRAINING")
    print("="*60)

    train_ds, val_ds, class_names = create_simple_dataset(
        os.path.join(dataset_path, 'train'),
        batch_size=batch_size
    )

    # Handle test dataset if exists
    test_ds = None
    test_path = os.path.join(dataset_path, 'test')
    if os.path.exists(test_path):
        test_ds = tf.keras.utils.image_dataset_from_directory(
            test_path,
            image_size=(224, 224),
            batch_size=batch_size,
            label_mode='binary',
            shuffle=False  # order is irrelevant for evaluation
        )
        # Labels are indices into the class list, so test metrics are only
        # meaningful if the test folders map to the same indices as training.
        if list(test_ds.class_names) != list(class_names):
            print(f"⚠️ Test classes {test_ds.class_names} do not match "
                  f"training classes {class_names}; test metrics may be invalid.")
        normalization = tf.keras.layers.Rescaling(1./255)
        test_ds = test_ds.map(lambda x, y: (normalization(x), y))
        test_ds = test_ds.cache().prefetch(tf.data.AUTOTUNE)

    print(f"📊 Classes found: {class_names}")

    # Create and compile model
    model = create_simple_model()
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # summary() needs a built model; build explicitly when the factory did
    # not declare an input shape.
    if not model.built:
        model.build(input_shape=(None, 224, 224, 3))

    print("\n🏗️ MODEL ARCHITECTURE:")
    model.summary()

    # Stop when validation loss stalls and lower the learning rate on plateaus.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3
        )
    ]

    # Train model
    print(f"\n🚀 Starting training for {epochs} epochs...")
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=callbacks
    )

    # Evaluate on test set if available
    if test_ds is not None:
        print("\n🧪 EVALUATING ON TEST SET:")
        _, test_accuracy = model.evaluate(test_ds)
        print(f"Test Accuracy: {test_accuracy:.4f}")

    _plot_training_history(history)

    # Save model
    model.save(model_path)
    print(f"\n💾 Model saved as '{model_path}'")

    return model, history

# ==================== STEP 4: TEST SINGLE IMAGES ====================
def test_single_image(model_path, image_path, class_names=('medical', 'non_medical')):
    """
    Classify a single image with a saved model and print the result.

    Args:
        model_path: Path of the saved Keras model to load.
        image_path: Path of the image to classify.
        class_names: Two class labels; index 0 is reported when the sigmoid
            output is <= 0.5, index 1 when it is > 0.5.

    Returns:
        Tuple ``(predicted_class, confidence)`` where confidence is the
        probability of the predicted class as a Python float.

    Raises:
        ValueError: If class_names does not contain exactly two entries.
    """
    if len(class_names) != 2:
        raise ValueError(
            f"class_names must contain exactly 2 entries, got {len(class_names)}"
        )

    model = tf.keras.models.load_model(model_path)

    # Load the image at the model's input size, add a batch dimension and
    # scale pixels by 1/255.
    img = tf.keras.utils.load_img(image_path, target_size=(224, 224))
    img_array = tf.keras.utils.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0)  # Create batch dimension
    img_array = img_array / 255.0  # Normalize

    # Single sigmoid output; float() keeps numpy scalars out of the return value.
    prediction = float(model.predict(img_array)[0][0])
    is_positive = prediction > 0.5
    predicted_class = class_names[int(is_positive)]
    confidence = prediction if is_positive else (1 - prediction)

    print(f"Image: {image_path}")
    print(f"Prediction: {predicted_class}")
    print(f"Confidence: {confidence:.3f}")

    return predicted_class, confidence

# ==================== MAIN EXECUTION ====================
def main():
    """
    Run the complete pipeline: fix the dataset, verify it, train the model.

    Stops early (returning None) when no images could be processed or when
    the fixed dataset fails verification.
    """
    original_dataset = "binary_medical_classifier"
    fixed_dataset = "binary_medical_classifier_fixed"

    print("🚀 COMPLETE MEDICAL IMAGE CLASSIFIER PIPELINE")
    print("="*80)

    # Step 1: Fix the dataset
    # Reuse the fixed dataset only if it actually contains files. A previous
    # run that found no images still leaves the output folder behind, and
    # treating that empty folder as "already fixed" would skip Step 1 forever.
    has_fixed_files = os.path.isdir(fixed_dataset) and any(
        files for _, _, files in os.walk(fixed_dataset)
    )
    if has_fixed_files:
        print(f"Step 1: Using existing fixed dataset at {fixed_dataset}")
    else:
        print("Step 1: Fixing dataset...")
        fixed_count, _ = fix_dataset_completely(original_dataset, fixed_dataset)

        if fixed_count == 0:
            print("❌ No images found or processed. Check your dataset path.")
            return

    # Step 2: Verify the dataset
    print("\nStep 2: Verifying dataset...")
    if not verify_fixed_dataset(fixed_dataset):
        print("❌ Dataset verification failed. Check the issues above.")
        return

    # Step 3: Train the model
    print("\nStep 3: Training model...")
    train_medical_classifier(fixed_dataset, epochs=15, batch_size=32)

    print("\n✅ PIPELINE COMPLETE!")
    print("Your medical image classifier is ready to use.")

# ==================== EXECUTE EVERYTHING ====================
# Run the whole pipeline only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
    
    # Optional: classify a single image with the saved model after training.
    # Replace the image path with a real file before uncommenting.
    # test_single_image('medical_classifier.keras', 'path/to/test/image.jpg')
