# Brain Tumor Detection - Exploratory Data Analysis

This notebook explores the MRI brain tumor dataset to understand:
- Data distribution
- Image characteristics
- Class balance
- Sample visualizations

In [None]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from collections import Counter

from config import RAW_DATA_DIR, CLASS_NAMES, IMAGE_SIZE

# Plot appearance: apply the matplotlib style sheet first, then layer the
# seaborn 'husl' palette on top (order kept as-is so the palette is set last).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Echo the values imported from config so the reader can confirm which
# dataset location and class list the rest of the notebook operates on.
print(
    f"Data directory: {RAW_DATA_DIR}",
    f"Classes: {CLASS_NAMES}",
    sep="\n",
)

## 1. Dataset Overview

In [None]:
# Count images per class.
# Produces `class_counts` (class name -> number of image files) and `total`,
# both of which are reused by later cells.
image_patterns = ('*.jpg', '*.jpeg', '*.png')
class_counts = {}

for class_name in CLASS_NAMES:
    class_dir = RAW_DATA_DIR / class_name
    if class_dir.exists():
        # One glob per supported extension; the counts are simply summed.
        class_counts[class_name] = sum(
            len(list(class_dir.glob(pattern))) for pattern in image_patterns
        )
    else:
        # Keep the class in the table with a zero count so downstream plots
        # still show it, and tell the reader why it is empty.
        class_counts[class_name] = 0
        print(f"⚠️ Directory not found: {class_dir}")

# Display counts
total = sum(class_counts.values())
print(f"\nTotal images: {total}")
print("\nImages per class:")
for cls, count in class_counts.items():
    # Guard the percentage: when the dataset is missing or empty `total` is 0
    # and `count / total` would raise ZeroDivisionError.
    share = count / total * 100 if total else 0.0
    print(f"  {cls}: {count} ({share:.1f}%)")

In [None]:
# Visualize class distribution as a bar chart (one bar per class, taken from
# the `class_counts` dict built in the counting cell).
fig, ax = plt.subplots(figsize=(10, 6))

colors = sns.color_palette('husl', len(class_counts))
bars = ax.bar(list(class_counts.keys()), list(class_counts.values()), color=colors)

# Add count labels.
# bar_label positions the text with a padding measured in points, so the
# labels sit just above each bar whatever the y-axis scale is. (A fixed
# offset in data units only looks right for one particular range of counts.)
ax.bar_label(
    bars,
    labels=[str(count) for count in class_counts.values()],
    padding=3,
    fontsize=12,
)
# Leave headroom so the label above the tallest bar is not clipped.
ax.margins(y=0.1)

ax.set_xlabel('Tumor Type', fontsize=12)
ax.set_ylabel('Number of Images', fontsize=12)
ax.set_title('Brain Tumor Dataset - Class Distribution', fontsize=14)

plt.tight_layout()
plt.show()

## 2. Sample Images

In [None]:
# Display sample images from each class: one row per class, up to
# `samples_per_class` images per row.
samples_per_class = 5
n_rows = max(len(CLASS_NAMES), 1)  # size the grid from the class list
fig, axes = plt.subplots(n_rows, samples_per_class,
                         figsize=(15, 3 * n_rows), squeeze=False)

# Strip ticks and frames from every panel up front. axis('off') is avoided on
# purpose: it hides the whole axis, including the y-label used below to name
# each row. Doing this for all panels also keeps unused panels blank when a
# class has fewer images than columns.
for ax in axes.ravel():
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

for row, class_name in enumerate(CLASS_NAMES):
    # Row label lives on the first panel of the row.
    axes[row, 0].set_ylabel(class_name, fontsize=12)

    class_dir = RAW_DATA_DIR / class_name
    if not class_dir.exists():
        continue

    # Same extensions as the counting cell; sorted so the samples shown are
    # the same on every run.
    images = sorted(
        path
        for pattern in ('*.jpg', '*.jpeg', '*.png')
        for path in class_dir.glob(pattern)
    )[:samples_per_class]

    for col, img_path in enumerate(images):
        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread returns None for unreadable files; leave the panel
            # blank instead of crashing in cvtColor.
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        axes[row, col].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

plt.suptitle('Sample MRI Images by Tumor Type', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 3. Image Properties Analysis

In [None]:
# Analyze image sizes and properties on a sample of up to 50 images per class.
# Produces `image_sizes` (list of (width, height)), `image_channels` and
# `sizes_array`, which the plotting cell that follows reads.
image_sizes = []
image_channels = []

for class_name in CLASS_NAMES:
    class_dir = RAW_DATA_DIR / class_name
    if not class_dir.exists():
        continue

    # Same extensions as the counting cell; sorted for a reproducible sample.
    images = sorted(
        path
        for pattern in ('*.jpg', '*.jpeg', '*.png')
        for path in class_dir.glob(pattern)
    )[:50]  # Sample 50 images

    for img_path in images:
        # IMREAD_UNCHANGED keeps the channel layout stored in the file. The
        # default flag always converts to 3-channel BGR, which would make the
        # channel distribution below report 3 for every image.
        # NOTE(review): IMREAD_UNCHANGED also skips EXIF auto-rotation, so
        # width/height are the stored dimensions - confirm that is acceptable.
        img = cv2.imread(str(img_path), cv2.IMREAD_UNCHANGED)
        if img is None:
            continue  # unreadable file, skip it
        h, w = img.shape[:2]
        c = img.shape[2] if img.ndim > 2 else 1  # 2-D array means single channel
        image_sizes.append((w, h))
        image_channels.append(c)

# Statistics
# reshape(-1, 2) keeps the array two-dimensional even when no image was read,
# so column indexing here and in the next cell cannot raise IndexError.
sizes_array = np.array(image_sizes, dtype=int).reshape(-1, 2)
print("Image Size Statistics:")
if len(sizes_array):
    print(f"  Width  - Min: {sizes_array[:,0].min()}, Max: {sizes_array[:,0].max()}, Mean: {sizes_array[:,0].mean():.0f}")
    print(f"  Height - Min: {sizes_array[:,1].min()}, Max: {sizes_array[:,1].max()}, Mean: {sizes_array[:,1].mean():.0f}")
else:
    # min()/max() on an empty array would raise; report the situation instead.
    print("  No readable images found - check the data directory and class names")
print(f"\nChannel distribution: {Counter(image_channels)}")

In [None]:
# Plot size distribution: side-by-side histograms of the sampled image
# widths and heights held in `sizes_array` (column 0 = width, column 1 = height).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (column in sizes_array, x-axis label, panel title) for each panel
panels = (
    (0, 'Width (pixels)', 'Image Width Distribution'),
    (1, 'Height (pixels)', 'Image Height Distribution'),
)

for panel, (column, x_label, panel_title) in zip(axes, panels):
    panel.hist(sizes_array[:, column], bins=30, edgecolor='black', alpha=0.7)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## 4. Pixel Intensity Analysis

In [None]:
# Analyze pixel intensity distributions per class: one histogram per class,
# built from the grayscale pixels of up to 20 images.
n_cols = 2
# Enough rows for every class (ceil division); 4 classes give the 2x2 grid.
n_rows = max((len(CLASS_NAMES) + n_cols - 1) // n_cols, 1)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, class_name in enumerate(CLASS_NAMES):
    class_dir = RAW_DATA_DIR / class_name
    if class_dir.exists():
        # Same extensions as the counting cell; sorted for a reproducible sample.
        images = sorted(
            path
            for pattern in ('*.jpg', '*.jpeg', '*.png')
            for path in class_dir.glob(pattern)
        )[:20]

        # Collect flat pixel arrays and join them once. Extending a Python
        # list pixel by pixel would create one Python object per pixel.
        pixel_chunks = []
        for img_path in images:
            img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
            if img is not None:
                pixel_chunks.append(img.ravel())

        if pixel_chunks:
            all_pixels = np.concatenate(pixel_chunks)
            axes[idx].hist(all_pixels, bins=50, edgecolor='black', alpha=0.7, density=True)
        else:
            # A density histogram of no data is undefined; say so on the panel.
            axes[idx].text(0.5, 0.5, 'No readable images', ha='center',
                           va='center', transform=axes[idx].transAxes)
        axes[idx].set_xlabel('Pixel Intensity')
        axes[idx].set_ylabel('Density')
        axes[idx].set_title(f'{class_name.title()} - Pixel Distribution')

# Blank out any grid cells left over when the class count is odd.
for unused_ax in axes[len(CLASS_NAMES):]:
    unused_ax.axis('off')

plt.suptitle('Pixel Intensity Distribution by Tumor Type', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. Data Preprocessing Preview

In [None]:
from src.data.preprocessing import MRISpecificPreprocessor

# Preview of the preprocessing pipeline on a single sample image.
# NOTE(review): apply_denoising / apply_clahe / normalize are project methods
# whose implementation is not visible in this notebook; the panel titles below
# simply mirror the method names.
preprocessor = MRISpecificPreprocessor()

# Get a sample image (first .jpg found in the 'glioma' class folder)
sample_dir = RAW_DATA_DIR / 'glioma'
sample_path = next(sample_dir.glob('*.jpg'), None)
if sample_path is None:
    # Fail with an actionable message rather than a bare IndexError.
    raise FileNotFoundError(f"No .jpg sample image found in {sample_dir}")

original = cv2.imread(str(sample_path))
if original is None:
    # cv2.imread signals an unreadable file by returning None.
    raise ValueError(f"Could not read image: {sample_path}")
original = cv2.cvtColor(original, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR

# Apply preprocessing steps
resized = cv2.resize(original, IMAGE_SIZE)
gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
denoised = preprocessor.apply_denoising(gray)
clahe = preprocessor.apply_clahe(denoised)
normalized = preprocessor.normalize(clahe)

# Display preprocessing steps
# (image, title, colormap) per panel; colormap None keeps the RGB rendering.
# IMAGE_SIZE is a two-element size (it is passed to cv2.resize), so it is
# formatted as "WxH" instead of nesting its own brackets inside the title's.
stages = (
    (original, 'Original', None),
    (resized, f'Resized ({IMAGE_SIZE[0]}x{IMAGE_SIZE[1]})', None),
    (denoised, 'Denoised', 'gray'),
    (clahe, 'CLAHE Enhanced', 'gray'),
    (normalized, 'Normalized', 'gray'),
)

fig, axes = plt.subplots(1, len(stages), figsize=(20, 4))

for stage_ax, (stage_img, stage_title, stage_cmap) in zip(axes, stages):
    stage_ax.imshow(stage_img, cmap=stage_cmap)
    stage_ax.set_title(stage_title)
    stage_ax.axis('off')

plt.suptitle('Preprocessing Pipeline', fontsize=14)
plt.tight_layout()
plt.show()

## 6. Summary

### Key Findings:
- Dataset contains MRI brain scans categorized into 4 classes
- Image sizes vary - will need resizing for model input
- Class distribution may require attention (check for imbalance)
- Preprocessing steps help enhance image quality

### Next Steps:
1. Preprocess all images using the pipeline
2. Apply data augmentation to balance classes
3. Train the X-Farmer model
4. Evaluate on test set