### Kaggle-compatible training script for Chest X-Ray Images (Pneumonia) dataset


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Seed both NumPy and TensorFlow for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# Kaggle specific setup: report framework versions and visible GPUs
print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

# Dataset locations in Kaggle: one sub-folder per split under BASE_PATH
BASE_PATH = '/kaggle/input/chest-xray-pneumonia/chest_xray'
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, split_name)
    for split_name in ('train', 'val', 'test')
)

# Every artifact written by this script goes under OUTPUT_DIR
OUTPUT_DIR = '/kaggle/working/models'
os.makedirs(OUTPUT_DIR, exist_ok=True)

### Data Analysis Function

In [None]:
def load_and_analyze_data(base_path=None):
    """Count the image files in each split/class folder of the dataset.

    Args:
        base_path: Dataset root that holds the ``train``, ``val`` and
            ``test`` folders. When omitted, the module-level ``BASE_PATH``
            is used.

    Returns:
        dict: Maps ``"<split>_<class>"`` (for example ``"train_NORMAL"``)
        to the number of image files found in that folder. Folders that
        do not exist are left out of the result.
    """
    print("Analyzing dataset structure  ")

    root = BASE_PATH if base_path is None else base_path

    # Only files with these extensions are counted, so stray entries such as
    # hidden files or nested folders no longer inflate the totals.
    # NOTE(review): list chosen to mirror the formats Keras' directory
    # iterator accepts - confirm against the installed Keras version.
    image_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')

    data_info = {}
    for split in ('train', 'val', 'test'):
        for class_name in ('NORMAL', 'PNEUMONIA'):
            class_path = os.path.join(root, split, class_name)
            if not os.path.isdir(class_path):
                continue

            with os.scandir(class_path) as entries:
                num_images = sum(
                    1
                    for entry in entries
                    if entry.is_file()
                    and entry.name.lower().endswith(image_exts)
                )

            data_info[f'{split}_{class_name}'] = num_images
            print(f"{split}/{class_name}: {num_images} images")

    return data_info

### Data Generators Function

In [None]:
def create_data_generators():
    """Create the train, validation and test data generators.

    The training generator applies augmentation; the validation and test
    generators only rescale pixel values. When ``VAL_PATH`` is missing or
    empty, 20% of ``TRAIN_PATH`` is held out for validation instead and the
    training generator is restricted to the remaining 80%, so the two sets
    do not share images.

    Returns:
        tuple: ``(train_generator, validation_generator, test_generator)``.
    """
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    # Decide up front whether a hold-out split is needed: the training
    # generator has to know about it too, otherwise it would still iterate
    # over the images reserved for validation.
    has_val_dir = os.path.isdir(VAL_PATH) and len(os.listdir(VAL_PATH)) > 0
    holdout_fraction = 0.0 if has_val_dir else 0.2

    # Settings shared by every directory iterator.
    flow_kwargs = dict(
        target_size=(224, 224),
        batch_size=32,
        class_mode='binary',
        color_mode='rgb',
    )

    # Training data generator with augmentation
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.15,
        zoom_range=0.15,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2],
        validation_split=holdout_fraction,
    )

    # Validation and test data generator (only rescaling)
    val_test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        TRAIN_PATH,
        shuffle=True,
        seed=42,
        # subset=None means "use every image", which is what we want when a
        # dedicated validation directory exists.
        subset=None if has_val_dir else 'training',
        **flow_kwargs
    )

    if has_val_dir:
        validation_generator = val_test_datagen.flow_from_directory(
            VAL_PATH,
            shuffle=False,
            **flow_kwargs
        )
    else:
        print("No validation directory found. Splitting training data ")
        # `subset` only works on a generator built with `validation_split`.
        # Using the same fraction as the training generator selects the
        # complementary slice of files, without augmentation.
        holdout_datagen = ImageDataGenerator(
            rescale=1./255,
            validation_split=holdout_fraction,
        )
        validation_generator = holdout_datagen.flow_from_directory(
            TRAIN_PATH,
            shuffle=False,
            subset='validation',
            **flow_kwargs
        )

    test_generator = val_test_datagen.flow_from_directory(
        TEST_PATH,
        shuffle=False,
        **flow_kwargs
    )

    return train_generator, validation_generator, test_generator

### Model Building Function

In [None]:
def build_advanced_model():
    """Assemble the CNN used for binary pneumonia classification.

    The network is four convolutional stages (64, 128, 256 and 512 filters),
    each made of two 3x3 same-padded ReLU convolutions with batch
    normalisation, followed by 2x2 max-pooling and 25% dropout. A flattened
    head of two dense layers (512 and 256 units, batch-normalised, 50%
    dropout) feeds a single sigmoid output unit.

    Returns:
        An uncompiled ``keras.Sequential`` model expecting 224x224x3 inputs.
    """
    stack = []

    # Convolutional stages: only the very first layer declares the input shape
    for stage_index, n_filters in enumerate((64, 128, 256, 512)):
        first_layer_kwargs = (
            {'input_shape': (224, 224, 3)} if stage_index == 0 else {}
        )
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu',
                                   padding='same', **first_layer_kwargs))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu',
                                   padding='same'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.MaxPooling2D((2, 2)))
        stack.append(layers.Dropout(0.25))

    # Flatten and dense layers
    stack.append(layers.Flatten())
    for n_units in (512, 256):
        stack.append(layers.Dense(n_units, activation='relu'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(0.5))

    # Output layer
    stack.append(layers.Dense(1, activation='sigmoid'))

    return keras.Sequential(stack)

### Model Training Function

In [None]:
def train_model():
    """Build, compile and fit the CNN on the generated data.

    Class weights are computed with scikit-learn's ``'balanced'`` mode from
    the training labels and passed to ``fit``. Training runs for up to 50
    epochs with early stopping, best-model checkpointing, learning-rate
    reduction on plateau and TensorBoard logging, all writing under
    ``OUTPUT_DIR``.

    Returns:
        tuple: ``(model, history, test_gen)`` - the fitted model, the object
        returned by ``model.fit`` and the test generator.
    """
    # Load data
    print("Loading data...")
    train_gen, val_gen, test_gen = create_data_generators()

    # Calculate class weights
    from sklearn.utils.class_weight import compute_class_weight
    train_labels = train_gen.classes
    balanced_weights = compute_class_weight(
        'balanced', classes=np.unique(train_labels), y=train_labels
    )
    class_weight_dict = dict(enumerate(balanced_weights))
    print(f"Class weights: {class_weight_dict}")

    # Build model
    print("Building model...")
    model = build_advanced_model()

    # Compile model
    tracked_metrics = [
        'accuracy',
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )

    print("Model summary:")
    model.summary()

    # Callbacks, all driven by the validation loss
    stop_early = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=15, restore_best_weights=True, verbose=1
    )
    keep_best = keras.callbacks.ModelCheckpoint(
        os.path.join(OUTPUT_DIR, 'best_model_kaggle.h5'),
        monitor='val_loss', save_best_only=True, verbose=1
    )
    shrink_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6, verbose=1
    )
    board = keras.callbacks.TensorBoard(
        log_dir=os.path.join(OUTPUT_DIR, 'logs'), histogram_freq=1
    )

    # Train model
    print("Training model...")
    history = model.fit(
        train_gen,
        epochs=50,
        validation_data=val_gen,
        callbacks=[stop_early, keep_best, shrink_lr, board],
        class_weight=class_weight_dict,
        verbose=1,
    )

    return model, history, test_gen

### Model Evaluation Function

In [None]:
def evaluate_model(model, test_gen):
    """Score the model on the test generator and report the results.

    Prints a classification report, the confusion matrix and the ROC-AUC,
    then draws the confusion matrix as a heatmap saved to
    ``OUTPUT_DIR/confusion_matrix.png``.

    Args:
        model: Model exposing ``predict``.
        test_gen: Test generator exposing ``reset()`` and ``classes``.

    Returns:
        tuple: ``(y_true, y_pred, y_pred_proba)`` - the generator's labels,
        the predictions thresholded at 0.5, and the raw model outputs.
    """
    from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

    label_names = ['NORMAL', 'PNEUMONIA']
    rule = "=" * 50

    print("\nEvaluating on test set...")

    # Rewind the generator before predicting, then threshold at 0.5
    test_gen.reset()
    y_pred_proba = model.predict(test_gen, verbose=1)
    y_pred = (y_pred_proba > 0.5).astype(int)
    y_true = test_gen.classes

    print("\n" + rule)
    print("TEST SET EVALUATION")
    print(rule)

    # Classification report
    print("\nClassification Report:")
    report = classification_report(y_true, y_pred, target_names=label_names)
    print(report)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # ROC-AUC
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    print(f"\nROC-AUC Score: {roc_auc:.4f}")

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=label_names,
        yticklabels=label_names,
    )
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    heatmap_path = os.path.join(OUTPUT_DIR, 'confusion_matrix.png')
    plt.savefig(heatmap_path, dpi=100)
    plt.show()

    return y_true, y_pred, y_pred_proba

### Training History Plotting Function

In [1]:
def plot_training_history(history):
    """Plot training and validation curves for each tracked metric.

    One subplot is drawn per metric in ``loss``, ``accuracy``, ``precision``,
    ``recall`` and ``auc``. A curve is drawn only when its key is present in
    ``history.history``, so a run without validation data (or without one of
    the metrics) still produces a figure instead of raising ``KeyError``.
    The figure is saved to ``OUTPUT_DIR/training_history.png`` and shown.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``model.fit``).
    """
    metrics = ['loss', 'accuracy', 'precision', 'recall', 'auc']
    recorded = history.history

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    flat_axes = axes.ravel()

    for ax, metric in zip(flat_axes, metrics):
        curves = (
            (metric, f'Training {metric}'),
            (f'val_{metric}', f'Validation {metric}'),
        )
        drawn = False
        for key, label in curves:
            if key in recorded:
                ax.plot(recorded[key], label=label)
                drawn = True

        ax.set_title(f'Model {metric}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric)
        # A legend with no entries only triggers a warning, so skip it.
        if drawn:
            ax.legend()
        ax.grid(True)

    # The grid has more cells than metrics; hide the leftover axes rather
    # than leaving an empty frame in the saved image.
    for unused_ax in flat_axes[len(metrics):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'training_history.png'), dpi=100)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

### Model Saving Function

In [None]:
def save_model_for_production(model, test_gen):
    """Write the trained model and its metadata under ``OUTPUT_DIR``.

    Three files are written: the Keras model as HDF5, a joblib pickle that
    bundles the model with preprocessing metadata, and the architecture as
    JSON. A summary of the expected output files is printed afterwards.

    Args:
        model: Trained Keras model.
        test_gen: Generator whose ``class_indices`` is stored in the bundle.
    """
    print("\nSaving model for production...")

    def artifact_path(filename):
        # All artifacts live side by side in the output directory
        return os.path.join(OUTPUT_DIR, filename)

    # Save Keras model
    model.save(artifact_path('pneumonia_model_final.h5'))

    # Save with joblib (including preprocessing info)
    pipeline = {
        'model': model,
        'img_size': (224, 224),
        'class_names': ['NORMAL', 'PNEUMONIA'],
        'class_indices': test_gen.class_indices,
        'threshold': 0.5,
        'version': '2.0.0',
        'trained_on': 'kaggle',
    }
    joblib.dump(pipeline, artifact_path('pneumonia_pipeline.pkl'))

    # Save model architecture as JSON (serialise first, then open the file)
    architecture = model.to_json()
    with open(artifact_path('model_architecture.json'), 'w') as json_file:
        json_file.write(architecture)

    print(f"Models saved to {OUTPUT_DIR}")
    print(f"Files created:")
    listed_files = (
        'pneumonia_model_final.h5',
        'pneumonia_pipeline.pkl',
        'model_architecture.json',
        'training_history.png',
        'confusion_matrix.png',
    )
    for listed in listed_files:
        print(f"  - {listed}")

### Submission File Creation Function

In [None]:
def create_submission_file(model, test_gen, num_samples=5,
                           submission_dir='/kaggle/working/submission'):
    """Write a small CSV of sample predictions for manual verification.

    Despite its name, this does not build a competition submission: it takes
    the first batch from ``test_gen``, predicts on up to ``num_samples``
    images and saves the results to ``sample_predictions.csv`` inside
    ``submission_dir``.

    Args:
        model: Model exposing ``predict``.
        test_gen: Generator exposing ``reset()`` and yielding
            ``(images, labels)`` batches via ``next()``.
        num_samples: Maximum number of images to include. Fewer rows are
            written when the first batch is smaller than this.
        submission_dir: Directory the CSV is written to; created if missing.
    """
    os.makedirs(submission_dir, exist_ok=True)

    print("\nCreating sample predictions for verification...")

    # Get a few sample predictions from the first batch
    test_gen.reset()
    sample_images, sample_labels = next(test_gen)

    # Clamp to the batch size: a batch smaller than `num_samples` would
    # otherwise give columns of different lengths and fail in DataFrame().
    count = max(0, min(num_samples, len(sample_images)))

    predicted_probs = model.predict(sample_images[:count]).flatten()[:count]

    # Create a simple verification file
    verification_df = pd.DataFrame({
        'image_index': range(count),
        'true_label': sample_labels[:count],
        'predicted_prob': predicted_probs,
        'predicted_label': (predicted_probs > 0.5).astype(int)
    })

    verification_df.to_csv(os.path.join(submission_dir, 'sample_predictions.csv'), index=False)
    print("Sample predictions saved for verification")

### Main Execution Function and Entry Point

In [None]:
def main():
    """Run the whole pipeline: analyse, train, evaluate, plot and save."""
    rule = "=" * 60

    print("Starting Kaggle training pipeline...")
    print(rule)

    # Step 1: Analyze data (the counts are printed; the result is not reused)
    load_and_analyze_data()

    # Step 2: Train model
    model, history, test_gen = train_model()

    # Step 3: Evaluate model
    evaluate_model(model, test_gen)

    # Step 4: Plot training history
    plot_training_history(history)

    # Steps 5 and 6: save the model, then write the sample-prediction file
    for export_step in (save_model_for_production, create_submission_file):
        export_step(model, test_gen)

    print("\n" + rule)
    print("Training completed successfully!")
    print(f"Model files are saved in: {OUTPUT_DIR}")
    print(rule)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()