In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm # Used for progress bars, though not directly in the plotting section you provided
import torch # Also not directly used in the plotting EDA, but part of your imports
from torch.utils.data import Dataset, DataLoader # Also not directly used in the plotting EDA
from sklearn.metrics import classification_report, confusion_matrix # Not directly used in the plotting EDA
from torchvision import models, transforms # Not directly used in the plotting EDA
from sklearn.model_selection import train_test_split # Not directly used in the plotting EDA
import torch.nn as nn # Not directly used in the plotting EDA
from torchvision.models import vit_b_16, ViT_B_16_Weights # Not directly used in the plotting EDA

# --- Dataset locations ---
# All three are relative paths, so they resolve against the current working
# directory of the kernel.

# CSV with the ground-truth diagnosis columns.
labels_csv = 'ISIC_2019_TFM/data/ISIC_2019_Training_GroundTruth.csv'
# CSV with the per-image metadata.
metadata_csv = 'ISIC_2019_TFM/data/ISIC_2019_Training_Metadata.csv'
# Directory holding the training input files.
input_dir = 'ISIC_2019_TFM/data/ISIC_2019_Training_Input/ISIC_2019_Training_Input'

print("Libraries imported and paths defined.")

In [None]:
# --- Load DataFrames ---
# Read the ground-truth labels and the per-image metadata from the CSV paths
# configured in the setup cell.
df_labels = pd.read_csv(labels_csv)
df_meta = pd.read_csv(metadata_csv)

# Join labels and metadata on the 'image' key. pd.merge defaults to an inner
# join, so images present in only one of the two files are dropped.
# validate='one_to_one' makes pandas raise a MergeError if 'image' is
# duplicated on either side, instead of silently multiplying rows (which
# would inflate every count computed from `df`).
df = pd.merge(df_labels, df_meta, on='image', validate='one_to_one')

print("DataFrames loaded and merged. First 5 rows of the merged DataFrame:")
display(df.head())

# Diagnosis columns (assumed to be the one-hot encoded label columns).
diagnosis_cols = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC']

# The 'label' column below relies on every row having exactly one positive
# diagnosis. idxmax returns the FIRST column holding the row maximum, so a
# row with no positive diagnosis would silently become 'MEL', and a row with
# several positives would keep only the first one. Report such rows rather
# than mislabelling them without notice.
positives_per_row = df[diagnosis_cols].sum(axis=1)
n_not_one_hot = int((positives_per_row != 1).sum())
if n_not_one_hot:
    print(f"\nWARNING: {n_not_one_hot} rows do not have exactly one positive "
          "diagnosis; their 'label' value is unreliable.")

# Primary diagnosis per image: name of the column holding the row maximum.
df['label'] = df[diagnosis_cols].idxmax(axis=1)

print("\n'label' column created based on primary diagnosis.")
print("\nDataFrame Info:")
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

In [None]:
# Number of positive labels per diagnosis column, sorted ascending so the
# most frequent class is drawn last (the top bar of the horizontal chart).
label_counts = df[diagnosis_cols].sum().sort_values(ascending=True)
total_images = label_counts.sum()  # Total number of positive labels across all diagnosis columns

plt.figure(figsize=(12, 7))
ax = label_counts.plot(kind='barh', color=sns.color_palette("viridis", len(label_counts)))  # One palette colour per bar
plt.title('Distribución de Clases (Diagnósticos)', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Número de Imágenes', fontsize=14, labelpad=15)
plt.ylabel('Diagnóstico', fontsize=14, labelpad=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Share of each class, in percent of all positive labels.
label_percentages = label_counts / total_images * 100

# Horizontal gap between the end of a bar and its annotation: 2% of the
# current x-axis extent. It does not depend on the bar, so compute it once.
label_offset = ax.get_xlim()[1] * 0.02

# Annotate every bar with its exact count and percentage.
for i, (count, percentage) in enumerate(zip(label_counts.values, label_percentages.values)):
    ax.text(count + label_offset,  # Slightly to the right of the bar end
            i,  # Y-position (index of the bar)
            # int() keeps the count from rendering as e.g. "4522.0" when the
            # one-hot columns were parsed as floats; it is a no-op for ints.
            f'{int(count)} ({percentage:.1f}%)',
            va='center',
            ha='left',
            fontsize=11,
            color='black')

plt.xlim(0, ax.get_xlim()[1] * 1.15)  # Widen the x-axis so the annotations fit
plt.grid(axis='x', linestyle='--', alpha=0.7)  # Subtle vertical guide lines
plt.tight_layout()  # Prevent labels from being cut off
plt.show()

print("\nDiagnosis class distribution plot generated.")

In [None]:
# Age distribution: histogram of the approximate age with the mean marked.
age_values = df['age_approx'].dropna()  # The histogram is drawn over known ages only
mean_age = df['age_approx'].mean()  # pandas skips missing values by default

fig_age, ax_age = plt.subplots(figsize=(10, 6))
sns.histplot(age_values, kde=False, bins=30, color='#1f77b4', edgecolor='black', ax=ax_age)
ax_age.set_title("Distribución de Edad Aproximada", fontsize=16, fontweight='bold')
ax_age.set_xlabel("Edad", fontsize=12)
ax_age.set_ylabel("Número de Casos", fontsize=12)

# Dashed vertical line at the mean age; its label feeds the legend.
ax_age.axvline(mean_age, color='red', linestyle='--', linewidth=2, label=f'Media: {mean_age:.1f} años')
ax_age.legend(fontsize=10)
ax_age.grid(axis='y', linestyle='--', alpha=0.7)  # Light horizontal guide lines
fig_age.tight_layout()  # Avoid clipped labels
plt.show()

print("\nAge distribution plot generated.")

In [None]:
# Sex distribution: one bar per category, most frequent first.
sex_counts = df['sex'].value_counts()  # Missing values are excluded by value_counts
n_with_sex = int(df['sex'].notna().sum())  # Denominator for the percentages: rows with a known sex

plt.figure(figsize=(7, 5))
ax = sns.countplot(data=df, x='sex', order=sex_counts.index, palette='viridis')
ax.set_title("Distribución por Sexo", fontsize=16, fontweight='bold')
ax.set_xlabel("Sexo", fontsize=12)
ax.set_ylabel("Número de Casos", fontsize=12)

# Write the count and its percentage just above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    share = f'{100 * bar_height / n_with_sex:.1f}%'
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{int(bar_height)}\n({share})',
        (bar_center, bar_height),
        xytext=(0, 5),  # 5 points above the bar top
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=10,
        color='black',
    )

ax.set_ylim(0, sex_counts.max() * 1.15)  # Headroom so the two-line annotations fit
plt.tight_layout()
plt.show()

print("\nSex distribution plot generated.")

In [None]:
# Anatomical site distribution: horizontal bars, most frequent site first.

# Cases per site (missing values excluded) and each site's share in percent.
anatom_counts = df['anatom_site_general'].value_counts()
anatom_percentages = (anatom_counts / anatom_counts.sum()) * 100

# Tidy frame with a default 0..n-1 index, in the same order as anatom_counts.
plot_df = pd.DataFrame({
    'Anatomical Location': anatom_counts.index,
    'Number of Cases': anatom_counts.to_numpy(),
    'Percentage': anatom_percentages.to_numpy(),
})

plt.figure(figsize=(10, 7))  # Extra height to fit all categories
ax = sns.barplot(
    data=plot_df,
    x='Number of Cases',
    y='Anatomical Location',
    order=anatom_counts.index,
    palette='crest',
)
ax.set_title("Distribution by Anatomical Location", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Cases", fontsize=12)
ax.set_ylabel("Anatomical Location", fontsize=12)

# Annotate each bar with its count and percentage. The bar position equals
# the row position because plot_df and `order` share the same ordering.
for bar_pos, (n_cases, pct) in enumerate(zip(plot_df['Number of Cases'], plot_df['Percentage'])):
    ax.text(
        n_cases + 20,  # Fixed 20-unit gap to the right of the bar end
        bar_pos,
        f"{int(n_cases)} ({pct:.1f}%)",
        color='black',
        ha="left",
        va='center',
        fontsize=10,
    )

ax.set_xlim(0, plot_df['Number of Cases'].max() * 1.2)  # Room for the annotations
plt.tight_layout()
plt.show()

print("\nAnatomical site distribution plot generated.")
print("\nEDA completed for ISIC 2019 dataset.")