# Brain Tumor Classification with VGG16

This notebook demonstrates brain tumor classification using VGG16 transfer learning.

## Overview

1. Mount Google Drive and set up environment
2. Update repository and install dependencies
3. Set up paths and configuration
4. Preprocess the dataset
5. Load and explore dataset
6. Train VGG16 model
7. Evaluate model performance
8. Display results and make predictions

## 1. Mount Google Drive and Setup Environment

In [None]:
# Attach Google Drive so files under /content/drive become reachable.
from google.colab import drive

drive.mount('/content/drive')

# Report the TensorFlow / Python versions and any GPUs TensorFlow can see.
import tensorflow as tf
import platform

print(f"TensorFlow version: {tf.__version__}")
print(f"Python version: {platform.python_version()}")
print(f"GPUs available: {tf.config.list_physical_devices('GPU')}")

# Seed every random number generator used by the notebook.
import os
import random
import numpy as np

SEED = 42
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# presumably only affects processes launched after this cell — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

## 2. Update Repository and Install Dependencies

In [None]:
# Navigate to the main directory and install required packages
%cd /content/drive/MyDrive/BrainTumor

# Install required packages
!pip install -q opencv-python
!pip install -q scikit-learn
!pip install -q matplotlib
!pip install -q seaborn

print("✅ Dependencies installed successfully")

## 3. Import Required Libraries

In [None]:
# Import all required libraries
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
import json
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Import custom modules - assuming code is in BrainTumor directory
import sys

# Register the project folder only once: notebook cells are often re-run, and an
# unconditional append would add a duplicate sys.path entry on every run.
_project_root = '/content/drive/MyDrive/BrainTumor'
if _project_root not in sys.path:
    sys.path.append(_project_root)

# If you have the src folder structure in BrainTumor, uncomment these:
# from src.common.dataset_utils import create_datasets
# from src.common.preprocessing import get_augmentation_pipeline, verify_dataset, split_and_copy
# from src.common.gradcam import generate_gradcam

print("✅ All libraries imported successfully")

## 3.1. Import Custom Modules

Import the required preprocessing and utility functions from the repository:

In [None]:
# Import custom modules from the repository
# NOTE(review): the first import reads from `src.common.preprocess` while the
# second reads from `src.common.preprocessing` — confirm both modules exist in
# the repository; a typo in either name would fail here with ModuleNotFoundError.
from src.common.preprocess import create_dirs, split_and_copy, verify_dataset
from src.common.preprocessing import count_samples_in_directory
# NOTE(review): in the cells visible in this notebook, training and evaluation
# are launched with `!python -m ...`, and build_vgg16_model / train_model /
# evaluate_model / create_dirs are never called directly — confirm whether
# these imports are still needed.
from src.models.vgg16.build_vgg16 import build_vgg16_model
from src.models.vgg16.train_vgg16 import train_model
from src.models.vgg16.evaluate_vgg16 import evaluate_model

print("✅ Custom modules imported successfully")

## 4. Setup Paths and Configuration

In [None]:
# --- Project locations on Google Drive ---
BASE_DIR = "/content/drive/MyDrive/BrainTumor"
# Raw images live in BrainTumor/data/archive/
RAW_DATA_DIR = os.path.join(BASE_DIR, "data", "archive")
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
RESULTS_DIR = os.path.join(BASE_DIR, "Result", "vgg16")

# --- Model / training hyperparameters (tuned for speed) ---
# 224x224 RGB input.
INPUT_SHAPE = (224, 224, 3)
# Raised from 32 for faster training (if GPU memory allows).
BATCH_SIZE = 64
# Lowered from 20 - with early stopping, 15 is usually enough.
EPOCHS = 15
# Slightly higher LR for faster convergence.
LEARNING_RATE = 0.001

# Make sure the output folders exist before anything writes to them.
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Echo the effective configuration in one go.
print(
    "✅ OPTIMIZED Configuration:\n"
    f"   Base directory: {BASE_DIR}\n"
    f"   Raw data: {RAW_DATA_DIR}\n"
    f"   Processed data: {PROCESSED_DATA_DIR}\n"
    f"   Results: {RESULTS_DIR}\n"
    f"   Input shape: {INPUT_SHAPE}\n"
    f"   Batch size: {BATCH_SIZE} (INCREASED for speed)\n"
    f"   Epochs: {EPOCHS} (REDUCED - early stopping will handle)\n"
    f"   Learning rate: {LEARNING_RATE} (OPTIMIZED)"
)

# Count the raw images per class and estimate how many steps one epoch takes.
yes_dir, no_dir = (os.path.join(RAW_DATA_DIR, label) for label in ("yes", "no"))

if not (os.path.exists(yes_dir) and os.path.exists(no_dir)):
    print(f"⚠️ Could not find raw data. Please check your Google Drive structure.")
else:
    # Only files with a recognised image extension are counted (case-insensitive).
    yes_count = sum(
        1 for entry in os.listdir(yes_dir)
        if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    no_count = sum(
        1 for entry in os.listdir(no_dir)
        if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    total_images = yes_count + no_count

    print(f"\n📊 Dataset Info:")
    print(f"   Total images: {total_images}")
    print(f"   Yes: {yes_count}, No: {no_count}")

    # Rough number of batches per epoch, assuming 70% of the images are used for training.
    train_images = int(total_images * 0.7)
    steps_per_epoch = train_images // BATCH_SIZE
    print(f"   Estimated steps per epoch: {steps_per_epoch}")

    if steps_per_epoch <= 100:
        print(f"   ✅ Good dataset size for efficient training.")
    else:
        print(f"   ⚠️ Large dataset detected. Consider reducing epochs or increasing batch size.")

In [None]:
# Performance optimizations for faster training
import tensorflow as tf

# NOTE(review): these settings configure the notebook kernel process. The
# training cell in this notebook launches `python -m src.models.vgg16.train_vgg16`
# as a separate process, so confirm the training script applies its own equivalents.

# Enable mixed precision training (faster on modern GPUs)
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Configure GPU memory growth (prevents memory allocation issues)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✅ GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        # Presumably raised when the GPUs were already initialised before this
        # cell ran; report it and carry on with the default allocation.
        print(f"⚠️ GPU configuration error: {e}")

# Enable XLA compilation for faster training
tf.config.optimizer.set_jit(True)

# The earlier version assigned False to `tf.data.experimental.enable_debug_mode`.
# That name is a function, so the assignment only overwrote it and configured
# nothing; the statement has been removed.

print("🚀 Performance optimizations applied:")
print(f"   Mixed precision policy: {tf.keras.mixed_precision.global_policy().name}")
print("   XLA JIT compilation: requested")

### ⚡ Speed Optimization Tips

**If training is still slow (>5 min/epoch), try these:**

1. **Reduce Batch Size (memory errors only)**: If you get out-of-memory errors with batch_size=64, try 32 or 16 — this fixes memory errors but does not make training faster
2. **Reduce Image Size**: Change INPUT_SHAPE to (128, 128, 3) for faster training
3. **Use Colab Pro**: For faster GPU (T4/V100 instead of basic GPU)
4. **Reduce Dataset**: Use a subset for initial testing
5. **Check GPU Usage**: Run `!nvidia-smi` to verify GPU is being used

**Expected Times:**
- **Colab Free (K80)**: 5-10 min/epoch (normal)
- **Colab Pro (T4/V100)**: 1-3 min/epoch (fast)
- **CPU Only**: 20+ min/epoch (very slow - avoid)

In [None]:
# Quick GPU and performance check
print("🔍 Performance Check:")
print(f"   TensorFlow version: {tf.__version__}")

# Check GPU availability and type
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"   ✅ GPU available: {len(gpus)} device(s)")

    # Ask TensorFlow for the device names instead of shelling out to nvidia-smi:
    # a `!` shell escape is not expected to raise when the command fails, so the
    # previous bare try/except fallback would not have fired.
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        print(f"   GPU name: {details.get('device_name', 'unknown')}")

    # Check if mixed precision is enabled
    print(f"   Mixed precision: {tf.keras.mixed_precision.global_policy().name}")
else:
    print("   ❌ No GPU available - training will be VERY slow!")
    print("   💡 Enable GPU in Colab: Runtime > Change runtime type > Hardware accelerator > GPU")

# Memory check: `.total` is the machine's total RAM, so it is labelled as such.
import psutil
ram_gb = psutil.virtual_memory().total / (1024**3)
print(f"   Total RAM: {ram_gb:.1f} GB")

print(f"\n⏱️ Expected training time per epoch:")
if gpus:
    print(f"   • With GPU: 2-8 minutes (normal)")
    print(f"   • With optimization: 1-3 minutes (fast)")
else:
    print(f"   • Without GPU: 20+ minutes (too slow!)")

# A smaller batch is the remedy for memory errors, not for slow epochs.
print(f"\n🎯 If you hit GPU out-of-memory errors, reduce BATCH_SIZE to 32 or 16")

## 5. Data Preprocessing

Check if preprocessing is needed and perform data preprocessing if required.

In [None]:
# Paths of the three splits inside the processed-data folder.
train_dir, val_dir, test_dir = (
    os.path.join(PROCESSED_DATA_DIR, split) for split in ("train", "val", "test")
)

# verify_dataset's result is unpacked into two flags; preprocessing is skipped
# only when both are truthy.
processed_exists, class_folders_valid = verify_dataset(PROCESSED_DATA_DIR)

if not (processed_exists and class_folders_valid):
    print("🔄 Starting preprocessing...")

    if not os.path.exists(RAW_DATA_DIR):
        print("⚠️ Raw data directory not found. Please check your Google Drive structure.")
    else:
        # Hand the raw "yes"/"no" folders to split_and_copy, which writes into
        # the processed-data folder; its return value is reported as the image count.
        total_files = split_and_copy(RAW_DATA_DIR, PROCESSED_DATA_DIR, ["yes", "no"])
        print(f"✅ Preprocessing completed! Processed {total_files} images.")
else:
    print("✅ Processed data already exists. Skipping preprocessing.")

## 6. Load and Explore Dataset

In [None]:
# Re-check the processed dataset and pick the directory the training step will read.
processed_exists, class_folders_valid = verify_dataset(PROCESSED_DATA_DIR)

if processed_exists and class_folders_valid:
    TRAIN_DATA_DIR = PROCESSED_DATA_DIR
    print(f"✅ Using processed data: {TRAIN_DATA_DIR}")

    # Sample counts for each split; the results are used as mappings below
    # (their values are summed and the keys of the training mapping are listed as the classes).
    train_counts, val_counts, test_counts = (
        count_samples_in_directory(os.path.join(PROCESSED_DATA_DIR, split))
        for split in ("train", "val", "test")
    )

    print(
        "\n📊 Dataset Statistics:\n"
        f"   Training: {sum(train_counts.values())} images\n"
        f"   Validation: {sum(val_counts.values())} images\n"
        f"   Test: {sum(test_counts.values())} images\n"
        f"   Classes: {list(train_counts.keys())}"
    )
else:
    # No usable processed split: fall back to the raw folder layout.
    TRAIN_DATA_DIR = RAW_DATA_DIR
    print(f"⚠️ Using raw data: {TRAIN_DATA_DIR}")
    print(f"\n📊 Using raw data structure with 'yes' and 'no' folders")

## 7. Build VGG16 Model Variants

Create multiple VGG16 model variants for comparison:

In [None]:
# 7. Train VGG16 Model
#
# Train the VGG16 model using the optimized training script:
# (This text is markdown that ended up in a code cell; it is kept as comments so
# the cell no longer raises a SyntaxError when executed.)

## 8. Train VGG16 Model

Train the VGG16 model using the training script:

In [None]:
# Train the VGG16 model
print("Training VGG16 model...")

# Use the same approach as your friend's notebook - command line execution
!python -m src.models.vgg16.train_vgg16 \
    --data_dir {TRAIN_DATA_DIR} \
    --results_dir {RESULTS_DIR} \
    --epochs 20 \
    --batch_size 32 \
    --img_size 224 224 \
    --use_processed {1 if processed_exists and class_folders_valid else 0}

## 9. Evaluate Model

Evaluate the VGG16 model on the test set:

In [None]:
# Evaluate the model
print("Evaluating VGG16 model...")

!python -m src.models.vgg16.evaluate_vgg16 \
    --data_dir {TRAIN_DATA_DIR} \
    --results_dir {RESULTS_DIR} \
    --batch_size 32 \
    --img_size 224 224 \
    --use_processed {1 if processed_exists and class_folders_valid else 0}

## 10. Display Results

Show the training results and model performance:

In [None]:
# Display training history plot
import matplotlib.pyplot as plt
from PIL import Image

try:
    # Open the saved plot through a context manager so the underlying file is
    # closed as soon as the figure has been drawn.
    with Image.open(f"{RESULTS_DIR}/training_plot.png") as history_img:
        plt.figure(figsize=(10, 6))
        plt.imshow(history_img)
        plt.axis('off')
        plt.title('VGG16 Training History')
        plt.show()
except Exception as e:
    # Best effort: a missing or unreadable plot is reported, not fatal.
    print(f"Error displaying training history: {e}")

## 11. Display Confusion Matrix

In [None]:
# Display confusion matrix
# (`Image` and `plt` are the names imported by the training-history cell.)
try:
    # Open the saved image through a context manager so the underlying file is
    # closed as soon as the figure has been drawn.
    with Image.open(f"{RESULTS_DIR}/confusion_matrix.png") as cm_img:
        plt.figure(figsize=(8, 8))
        plt.imshow(cm_img)
        plt.axis('off')
        plt.title('VGG16 Confusion Matrix')
        plt.show()
except Exception as e:
    # Best effort: a missing or unreadable image is reported, not fatal.
    print(f"Error displaying confusion matrix: {e}")

## 12. Display Classification Report and Metrics

In [None]:
# Display classification report
try:
    # Explicit encoding so the result does not depend on the platform default.
    with open(f"{RESULTS_DIR}/classification_report.txt", 'r', encoding='utf-8') as f:
        report = f.read()
    print("Classification Report:")
    print(report)
except Exception as e:
    # Best effort: report the problem and continue with the metrics below.
    print(f"Error reading classification report: {e}")

# Display metrics
import json

try:
    with open(f"{RESULTS_DIR}/metrics.json", 'r', encoding='utf-8') as f:
        metrics = json.load(f)
    print("\nModel Metrics:")
    for metric, value in metrics.items():
        # bool is a subclass of int; exclude it so True/False are not printed
        # as 1.0000/0.0000.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")
except Exception as e:
    print(f"Error reading metrics: {e}")

## 13. Make Predictions

Load the model and make predictions on sample images:

In [None]:
import tensorflow as tf
import numpy as np
import glob

# Load the best model
try:
    model_path = f"{RESULTS_DIR}/best_model.h5"
    # Load without the saved training configuration, then compile with a plain
    # binary-classification setup.
    model = tf.keras.models.load_model(model_path, compile=False)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    print(f"✅ Successfully loaded model from {model_path}")

    # Load class names
    try:
        with open(f"{RESULTS_DIR}/class_names.json", 'r', encoding='utf-8') as f:
            class_names = json.load(f)
    except (OSError, ValueError) as names_error:
        # OSError: file missing/unreadable. ValueError: invalid JSON or bad
        # encoding (json.JSONDecodeError is a ValueError subclass).
        # NOTE(review): the fallback assumes index 0 = "no" and 1 = "yes" —
        # confirm against the order used by the training script.
        print(f"⚠️ Could not read class_names.json ({names_error}); using default class names")
        class_names = ["no", "yes"]

    print(f"Classes: {class_names}")
except Exception as e:
    # Best effort: report the failure; the prediction code below needs `model`
    # and will not work until loading succeeds.
    print(f"Error loading model: {e}")

# Function to make predictions
def predict_and_display(image_path, target_size=(224, 224), threshold=0.5):
    """Classify one image with the loaded model and show it with its predicted label.

    Uses the notebook globals ``model`` and ``class_names`` set by the
    model-loading code; both must exist before this is called.

    Args:
        image_path: Path of the image file to classify.
        target_size: ``(height, width)`` the image is resized to before
            inference. Defaults to ``(224, 224)``.
        threshold: Score above which ``class_names[1]`` is predicted;
            otherwise ``class_names[0]`` is. Defaults to ``0.5``.

    Returns:
        A tuple ``(predicted_class, confidence)`` where ``confidence`` is a
        Python float: the model score when ``class_names[1]`` is predicted,
        otherwise ``1 - score``.
    """
    # Load and preprocess image
    raw_bytes = tf.io.read_file(image_path)
    img = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
    img = tf.image.resize(img, target_size)
    # Keep an 8-bit copy for plotting before the pixel values are rescaled.
    img_display = img.numpy().astype(np.uint8)
    # NOTE(review): assumes the model was trained on pixels scaled to [0, 1] —
    # confirm against the training script.
    batch = tf.expand_dims(tf.cast(img, tf.float32) / 255.0, axis=0)

    # Make prediction: the first value of the first output row is read as the
    # score; it is converted to a plain float so callers do not get a numpy scalar.
    pred = float(model.predict(batch, verbose=0)[0][0])
    is_second_class = pred > threshold
    predicted_class = class_names[1] if is_second_class else class_names[0]
    confidence = pred if is_second_class else 1 - pred

    # Display results
    plt.figure(figsize=(6, 6))
    plt.imshow(img_display)
    plt.title(f"Prediction: {predicted_class} ({confidence:.2f})")
    plt.axis('off')
    plt.show()

    return predicted_class, confidence

# Find sample images and make predictions
def _pick_images(class_dir, limit=2):
    """Return up to `limit` image paths from class_dir, in sorted (repeatable) order."""
    if not os.path.isdir(class_dir):
        return []
    # Keep image files only, so stray non-image files are never sent to the model.
    image_names = sorted(
        name for name in os.listdir(class_dir)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    return [os.path.join(class_dir, name) for name in image_names[:limit]]


test_yes_dir = os.path.join(PROCESSED_DATA_DIR, "test", "yes")
test_no_dir = os.path.join(PROCESSED_DATA_DIR, "test", "no")

# Get sample images from the processed test split
sample_images = _pick_images(test_yes_dir) + _pick_images(test_no_dir)

# Fallback to raw data if processed not available
if not sample_images:
    sample_images = (
        _pick_images(os.path.join(RAW_DATA_DIR, "yes"))
        + _pick_images(os.path.join(RAW_DATA_DIR, "no"))
    )

if sample_images:
    print(f"Making predictions on {len(sample_images)} sample images:")
    for img_path in sample_images:
        # The true label is the name of the folder holding the image. A
        # substring test on the whole path would mislabel any "no" image whose
        # file name happens to contain "yes".
        true_class = os.path.basename(os.path.dirname(img_path))
        pred_class, conf = predict_and_display(img_path)
        print(f"True: {true_class} | Predicted: {pred_class} | Confidence: {conf:.2f}")
else:
    print("No sample images found for prediction.")

## 14. Conclusion

This notebook has successfully demonstrated VGG16 transfer learning for brain tumor classification with:

- **VGG16 Model**: Pre-trained ImageNet weights with fine-tuned layers
- **Efficient Training**: Using existing preprocessing and training scripts
- **Comprehensive Evaluation**: Accuracy, precision, recall, F1-score metrics
- **Minimal Code**: Clean, streamlined approach for easy comparison

The model is now ready for comparison with other architectures like CNN, ResNet50, etc.