# Exploratory Data Analysis (EDA)
This notebook provides an exploratory analysis of the brain MRI dataset for metastasis segmentation.

In [1]:
# Importing necessary libraries
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply the white-grid theme to every figure in the notebook.
# `sns.set` is a legacy alias of `set_theme`; `set_theme` is the preferred
# interface and is available in every seaborn release that has `histplot`,
# which this notebook already uses.
sns.set_theme(style='whitegrid')

## Dataset Overview
Let's check the structure of our dataset.

In [2]:
# Path to the dataset
raw_images_dir = 'data/raw/images'
raw_masks_dir = 'data/raw/masks'

# List the files.
# os.listdir returns entries in arbitrary order, but later cells pair
# image_files[i] with mask_files[i]; sorting both lists makes that pairing
# deterministic across runs and platforms.
# NOTE(review): this still assumes an image and its mask sort to the same
# position (e.g. matching filenames) -- confirm against the dataset layout.
image_files = sorted(os.listdir(raw_images_dir))
mask_files = sorted(os.listdir(raw_masks_dir))

print(f'Number of images: {len(image_files)}')
print(f'Number of masks: {len(mask_files)}')

## Visualizing Sample Images and Masks
Let's visualize some sample images along with their corresponding masks.

In [3]:
# Function to visualize images and masks
def visualize_samples(image_files, mask_files, num_samples=5):
    """Plot image/mask pairs side by side, one pair per row.

    The i-th entry of ``image_files`` is paired with the i-th entry of
    ``mask_files``; both are loaded as grayscale from ``raw_images_dir`` and
    ``raw_masks_dir`` respectively.

    Args:
        image_files: Image filenames relative to ``raw_images_dir``.
        mask_files: Mask filenames relative to ``raw_masks_dir``.
        num_samples: Maximum number of pairs to show. Clamped to the number
            of pairs actually available.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Never index past the end of the shorter list.
    num_samples = min(num_samples, len(image_files), len(mask_files))
    if num_samples <= 0:
        print('No image/mask pairs available to visualize.')
        return

    plt.figure(figsize=(15, 10))
    for i in range(num_samples):
        img_path = os.path.join(raw_images_dir, image_files[i])
        mask_path = os.path.join(raw_masks_dir, mask_files[i])
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread reports failure by returning None instead of raising;
        # passing None to imshow would fail with an unrelated-looking error.
        if image is None or mask is None:
            print(f'Warning: could not read pair ({img_path}, {mask_path}); skipping.')
            continue

        plt.subplot(num_samples, 2, 2*i + 1)
        plt.imshow(image, cmap='gray')
        plt.title('MRI Image')
        plt.axis('off')
        plt.subplot(num_samples, 2, 2*i + 2)
        plt.imshow(mask, cmap='gray')
        plt.title('Segmentation Mask')
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Render the first five image/mask pairs from the listings above
visualize_samples(image_files, mask_files, 5)

## Checking Image and Mask Sizes
Understanding the dimensions of the images and masks is crucial for model training. Note that the cell below measures only the images; mask dimensions are not checked here.

In [4]:
# Function to get image sizes
def get_image_sizes(image_files):
    """Return the array shape of every readable image in ``raw_images_dir``.

    Files are read with ``cv2.imread`` using its default flag.
    NOTE(review): per the OpenCV docs the default flag decodes to a 3-channel
    colour image, so the channel entry is expected to be 3 even for files
    stored as grayscale -- confirm if the channel count matters downstream.

    Args:
        image_files: Image filenames relative to ``raw_images_dir``.

    Returns:
        A list of ``img.shape`` tuples, one per file that could be decoded.
        Files that cannot be read are skipped with a printed warning, so the
        list may be shorter than ``image_files``.
    """
    sizes = []
    for img_name in image_files:
        img_path = os.path.join(raw_images_dir, img_name)
        img = cv2.imread(img_path)
        # cv2.imread returns None (it does not raise) for missing, corrupt or
        # non-image files; without this guard `img.shape` raises AttributeError.
        if img is None:
            print(f'Warning: could not read {img_path}; skipping.')
            continue
        sizes.append(img.shape)
    return sizes

# Collect one shape tuple per image, then tabulate them with named columns
size_columns = ['Height', 'Width', 'Channels']
image_sizes = get_image_sizes(image_files)
sizes_df = pd.DataFrame(data=image_sizes, columns=size_columns)

# Summary statistics are the cell's displayed output
sizes_df.describe()

## Visualizing Size Distribution
We can plot the distribution of image sizes to see if they are consistent.

In [5]:
# Overlay the height and width histograms (with KDE curves) on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
for column, colour in (('Height', 'blue'), ('Width', 'red')):
    sns.histplot(sizes_df[column], bins=30, kde=True, color=colour, label=column, ax=ax)
ax.set_title('Distribution of Image Sizes')
ax.set_xlabel('Size (pixels)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

## Conclusion
This notebook provided an exploratory analysis of the brain MRI dataset for metastasis segmentation. We visualized samples of images and their corresponding masks, checked the number of images and masks, and analyzed the size distribution of the images.