# Chest X-Ray Dataset Exploration

Este notebook explora o dataset de raio-X torácico usado para classificação entre casos normais e pneumonia.

## Objetivos:
1. Carregar e explorar a estrutura do dataset
2. Visualizar amostras de imagens
3. Analisar distribuição das classes
4. Calcular estatísticas do dataset
5. Visualizar exemplos de data augmentation *(pendente: este notebook ainda não inclui esta seção)*

In [None]:
# Imports
import sys
sys.path.append('..')

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

from src.utils import load_config, set_seed
from src.data_loader import get_dataloaders, load_data_from_directory, create_data_splits
from src.visualization import plot_sample_images, plot_class_distribution

# Plot defaults used by the figures below: seaborn's 'whitegrid' style and a
# (12, 6) default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import in this cell resolved.
print("Libraries imported successfully!")

## 1. Configuração

In [None]:
# Read the project configuration from the YAML file one level up.
config = load_config('../configs/config.yaml')

# Seed for reproducibility; 42 is used when the config has no 'seed' entry.
seed = config.get('seed', 42)
set_seed(seed)

print("Configuration loaded successfully!")

# Every value reported below lives under the 'data' section of the config.
data_cfg = config['data']
print(f"\nDataset: {data_cfg['dataset_name']}")
print(f"Image size: {data_cfg['image_size']}")
print(f"Batch size: {data_cfg['batch_size']}")
print(
    "Train/Val/Test split: "
    f"{data_cfg['train_split']}/{data_cfg['val_split']}/{data_cfg['test_split']}"
)

## 2. Exploração da Estrutura do Dataset

In [None]:
# File suffixes (lower-case) that are counted as images.
IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')


def count_images(directory, extensions=IMAGE_EXTENSIONS):
    """Count the image files directly inside *directory*.

    Args:
        directory: Folder to scan (not recursive).
        extensions: Lower-case file suffixes treated as images.

    Returns:
        Number of regular files whose suffix, compared case-insensitively,
        is in *extensions*. Returns 0 when *directory* is not a directory.
    """
    directory = Path(directory)
    if not directory.is_dir():
        return 0
    return sum(
        1
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in extensions
    )


# Root folder of the raw dataset, relative to the notebook location.
data_dir = Path('../data/raw/chest_xray')

if not data_dir.exists():
    print("❌ Dataset não encontrado!")
    print("\nPor favor, execute: python ../prepare_data.py")
else:
    print("✓ Dataset encontrado!\n")

    # Walk <data_dir>/<split>/<class> and report how many images each holds.
    # Splits or classes that are absent on disk are skipped silently.
    for split in ['train', 'val', 'test']:
        split_path = data_dir / split
        if not split_path.exists():
            continue

        print(f"\n{split.upper()} set:")
        for class_name in ['NORMAL', 'PNEUMONIA']:
            class_path = split_path / class_name
            if not class_path.exists():
                continue

            num_images = count_images(class_path)
            print(f"  {class_name}: {num_images} images")

## 3. Carregar Dataset

In [None]:
# Gather image paths and their labels from the dataset directory.
image_paths, labels = load_data_from_directory(str(data_dir), config)

print(f"Total de imagens: {len(image_paths)}")
print("\nDistribuição das classes:")

# Per-class sample counts; label 0 is reported as Normal, any other label
# as Pneumonia.
unique, counts = np.unique(labels, return_counts=True)
for label_id, n_samples in zip(unique, counts):
    if label_id == 0:
        label_text = 'Normal'
    else:
        label_text = 'Pneumonia'
    share = n_samples / len(labels) * 100
    print(f"  {label_text}: {n_samples} ({share:.1f}%)")

# Size of the largest class relative to the smallest one.
largest, smallest = max(counts), min(counts)
imbalance_ratio = largest / smallest
print(f"\nRazão de desbalanceamento: {imbalance_ratio:.2f}:1")

## 4. Visualizar Distribuição das Classes

In [None]:
# Split the full dataset into train / validation / test subsets.
train_paths, val_paths, test_paths, train_labels, val_labels, test_labels = \
    create_data_splits(image_paths, labels, config)

class_names = ['Normal', 'Pneumonia']
# One fixed colour per class index, so a class keeps its colour even when a
# split happens to contain only one of the classes.
class_colors = ['steelblue', 'coral']

# One bar chart per split, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
split_panels = [
    (train_labels, 'Train Set'),
    (val_labels, 'Validation Set'),
    (test_labels, 'Test Set'),
]

for ax, (split_labels, title) in zip(axes, split_panels):
    unique, counts = np.unique(split_labels, return_counts=True)
    bars = ax.bar(
        [class_names[j] for j in unique],
        counts,
        color=[class_colors[j] for j in unique],
        alpha=0.7,
    )
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.3)

    # Write the exact count on top of each bar.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{int(height)}',
                ha='center', va='bottom', fontweight='bold')

# savefig does not create missing folders, so make sure the target exists.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(str(figures_dir / 'class_distribution.png'), dpi=300, bbox_inches='tight')
plt.show()

## 5. Visualizar Amostras de Imagens

In [None]:
# Up to four training images per class (label 0 = Normal, label 1 = Pneumonia).
normal_samples = [path for path, label in zip(train_paths, train_labels) if label == 0][:4]
pneumonia_samples = [path for path, label in zip(train_paths, train_labels) if label == 1][:4]

fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Hide every axis up front so a cell stays blank (instead of showing an
# empty frame with ticks) when a class has fewer than four samples.
for ax in axes.flat:
    ax.axis('off')

# (samples, title, extra title styling) for each row of the grid.
grid_rows = [
    (normal_samples, 'Normal', {}),
    (pneumonia_samples, 'Pneumonia', {'color': 'red'}),
]

for row_axes, (samples, title, title_style) in zip(axes, grid_rows):
    for ax, img_path in zip(row_axes, samples):
        # Close the file as soon as the converted copy has been made.
        with Image.open(img_path) as img_file:
            img = img_file.convert('RGB')
        ax.imshow(img, cmap='gray')
        ax.set_title(title, fontsize=12, fontweight='bold', **title_style)

# savefig does not create missing folders, so make sure the target exists.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(str(figures_dir / 'sample_images.png'), dpi=300, bbox_inches='tight')
plt.show()

## 6. Estatísticas das Imagens

In [None]:
# Measure width, height and aspect ratio on a random sample of at most 100
# training images (drawn without replacement).
sample_paths = np.random.choice(train_paths, min(100, len(train_paths)), replace=False)
widths = []
heights = []
aspects = []

for img_path in sample_paths:
    # Only the size is needed; the context manager closes the file as soon
    # as it has been read, so handles do not pile up across the loop.
    with Image.open(img_path) as img:
        w, h = img.size
    widths.append(w)
    heights.append(h)
    aspects.append(w / h)

# Summary table of the sampled dimensions.
stats_data = {
    'Metric': ['Mean Width', 'Mean Height', 'Mean Aspect Ratio',
               'Min Width', 'Max Width', 'Min Height', 'Max Height'],
    'Value': [
        f"{np.mean(widths):.1f} px",
        f"{np.mean(heights):.1f} px",
        f"{np.mean(aspects):.2f}",
        f"{np.min(widths)} px",
        f"{np.max(widths)} px",
        f"{np.min(heights)} px",
        f"{np.max(heights)} px"
    ]
}

stats_df = pd.DataFrame(stats_data)
print("\nImage Statistics:")
print(stats_df.to_string(index=False))

# One histogram per measurement: (values, bar colour, x-axis label, title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
histogram_specs = [
    (widths, 'steelblue', 'Width (pixels)', 'Width Distribution'),
    (heights, 'coral', 'Height (pixels)', 'Height Distribution'),
    (aspects, 'green', 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution'),
]

for ax, (values, color, xlabel, title) in zip(axes, histogram_specs):
    ax.hist(values, bins=30, color=color, alpha=0.7, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.grid(alpha=0.3)

# savefig does not create missing folders, so make sure the target exists.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(str(figures_dir / 'image_statistics.png'), dpi=300, bbox_inches='tight')
plt.show()

## 7. Resumo do Dataset

In [None]:
# (row name, labels counted for that row); the last row is the full dataset.
label_sets = [
    ('Train', train_labels),
    ('Validation', val_labels),
    ('Test', test_labels),
    ('Total', labels),
]

# Count label 0 as Normal and label 1 as Pneumonia for every row.
summary_data = {
    'Split': [row_name for row_name, _ in label_sets],
    'Normal': [sum(1 for y in ys if y == 0) for _, ys in label_sets],
    'Pneumonia': [sum(1 for y in ys if y == 1) for _, ys in label_sets],
}

summary_df = pd.DataFrame(summary_data)
summary_df['Total'] = summary_df['Normal'] + summary_df['Pneumonia']

# Share of each class per row, as a percentage rounded to one decimal.
for count_col, pct_col in (('Normal', '% Normal'), ('Pneumonia', '% Pneumonia')):
    summary_df[pct_col] = (summary_df[count_col] / summary_df['Total'] * 100).round(1)

rule = "=" * 80
print("\n" + rule)
print("DATASET SUMMARY")
print(rule)
print(summary_df.to_string(index=False))
print(rule)

## Conclusões

Este notebook forneceu uma visão geral completa do dataset de raio-X torácico:

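1. **Balanceamento das classes**: Verificamos a distribuição das classes e a razão de desbalanceamento
2. **Qualidade das imagens**: Analisamos as dimensões (largura e altura) e a proporção (aspect ratio) de uma amostra das imagens de treino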
3. **Splits apropriados**: Confirmamos as proporções de treino/validação/teste
4. **Variabilidade**: Observamos exemplos de ambas as classes

### Próximos Passos:
- Notebook 02: Treinamento de modelos individuais
- Notebook 03: Avaliação do ensemble
- Notebook 04: Interpretabilidade com Grad-CAM