# Exploratory Data Analysis (EDA) on the HAM10000 Skin Cancer Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set the global plotting style. `sns.set` is only an alias for
# `sns.set_theme`, which is the preferred interface.
sns.set_theme(style="whitegrid")


In [None]:
# Read the metadata table from disk into a DataFrame
csv_path = "HAM10000_metadata.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows (rendered as the cell output)
df.head(n=5)


## Dataset Overview

In [None]:
# Structural overview: df.info() prints its report directly to stdout
print("Dataset Info:")
df.info()

# Descriptive statistics for every column (include="all" also covers the
# non-numeric ones); left as the last expression so the notebook renders it
print("\nSummary Statistics:")
df.describe(include="all")


## Missing Values

In [None]:
# Count missing (null) entries in each column
missing = df.isnull().sum()

# Print the heading and the Series separately: passing both to a single
# print() puts the default separator (a space) after the "\n", which indents
# the first row of the Series and misaligns it with the remaining rows.
print("Missing Values Per Column:")
print(missing)


## Class Distributions

In [None]:
# Diagnosis counts, bars ordered by descending class frequency
plt.figure(figsize=(8, 5))
dx_order = df['dx'].value_counts().index
sns.countplot(data=df, x='dx', order=dx_order)
plt.title("Diagnosis Distribution")
plt.xticks(rotation=45)
plt.show()

# Counts per value of the `sex` column
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='sex')
plt.title("Gender Distribution")
plt.show()


## Age Distribution

In [None]:
# Histogram of the non-missing ages (30 bins) with a KDE curve overlaid
plt.figure(figsize=(8, 5))
known_ages = df['age'].dropna()
sns.histplot(data=known_ages, bins=30, kde=True)
plt.title("Age Distribution of Patients")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()


## Correlations

In [None]:
# Correlation matrix for numerical features.
# Columns are selected by dtype rather than hard-coding ['age'], so any other
# numeric feature present in the metadata is included automatically. If `age`
# is the only numeric column, the result is the same 1x1 matrix as before.
numeric_df = df.select_dtypes(include="number")
corr = numeric_df.corr()

# Explicit figure, consistent with the other plotting cells.
plt.figure(figsize=(6, 5))
# Pin the colour scale to the full correlation range [-1, 1] so colours have a
# fixed meaning instead of being stretched to whatever values are present.
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()


## Possible Biases in Data

In [None]:
# Share of each diagnosis within every `sex` group: normalize='index' makes
# each row sum to 1, and the multiplication turns the fractions into percentages
bias_df = pd.crosstab(index=df['sex'], columns=df['dx'], normalize='index').mul(100)

# One stacked bar per `sex` value
bias_df.plot.bar(stacked=True, figsize=(10, 6), colormap='tab20')
plt.title("Diagnosis Distribution by Gender (%)")
plt.ylabel("Percentage")
# Anchor the legend outside the axes, to the right of the plot area
plt.legend(title="Diagnosis", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
