# Skin Cancer Detection - Data Exploration

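This notebook performs exploratory data analysis on the skin cancer dataset: image counts per split (training, validation, test) and the class distribution within each split. Sample-image visualization and image statistics are planned but not yet implemented.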

In [None]:
import sys

# Make the project code importable from this notebook.
# The imports below use the package form (`from src.config import ...`), which
# requires the directory that CONTAINS `src/` to be on sys.path; '../src' alone
# only exposes its modules as top-level names. '../src' is kept so anything that
# relied on it keeps working.
# NOTE(review): both entries are relative, so this assumes the kernel's working
# directory is the notebook's own folder, one level below the project root.
for _project_path in ('../src', '..'):
    # Skip entries already present so re-running the cell does not add duplicates.
    if _project_path not in sys.path:
        sys.path.insert(0, _project_path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.config import TRAIN_DIR, VAL_DIR, TEST_DIR
from src.utils import count_dataset_images

%matplotlib inline

## 1. Dataset Overview

In [None]:
# Tally the images found under each split directory using the project helper.
train_counts = count_dataset_images(TRAIN_DIR)
val_counts = count_dataset_images(VAL_DIR)
test_counts = count_dataset_images(TEST_DIR)

# Report every split the same way: heading, the counts object, then the total.
split_reports = (
    ("Training set:", train_counts),
    ("Validation set:", val_counts),
    ("Test set:", test_counts),
)
for position, (heading, split_counts) in enumerate(split_reports):
    if position > 0:
        print()  # blank separator between splits; none after the last one
    print(heading)
    print(split_counts)
    print(f"Total: {sum(split_counts.values())}")

## 2. Visualize Class Distribution

In [None]:
# Draw one bar chart per dataset split, side by side in a single row.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
panels = (
    ('Training', train_counts),
    ('Validation', val_counts),
    ('Test', test_counts),
)

for panel_ax, (split_name, split_counts) in zip(axes, panels):
    class_names = list(split_counts.keys())
    image_totals = list(split_counts.values())

    panel_ax.bar(class_names, image_totals)
    panel_ax.set_title(f'{split_name} Set Distribution')
    panel_ax.set_ylabel('Number of Images')

    # Write each count as text anchored at the bar's index and height,
    # centered horizontally with the text's bottom edge at the bar top.
    for bar_index, total in enumerate(image_totals):
        panel_ax.text(bar_index, total, str(total), ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 3. Sample Images Visualization

In [None]:
# TODO: Add code to visualize sample images from each class
# This will be implemented once the dataset is added

## 4. Image Statistics

In [None]:
# TODO: Add code to analyze image dimensions, color distributions, etc.
# This will be implemented once the dataset is added