In [None]:
import os
import pandas as pd
import numpy as np

import cv2
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns

import albumentations as A
import missingno as msno
from collections import Counter

from scipy.stats import chi2_contingency

np.random.seed(5)

In [None]:
# Load the ISIC 2019 ground-truth table from the local datasets directory.
isic_train_dataset = pd.read_csv("./datasets/ISIC_2019_Training_GroundTruth.csv")

In [None]:
# Column dtypes and non-null counts of the ground-truth table.
isic_train_dataset.info()

In [None]:
# Preview the first rows of the ground-truth table.
isic_train_dataset.head()

In [None]:
# Per-class image counts: sum every class indicator column.
# Non-class columns are excluded by name rather than by position (iloc[:, 1:]),
# so this cell still works when re-run after the string "label" column has been
# added to the same frame further down.
class_columns = isic_train_dataset.columns.drop(["image", "label"], errors="ignore")
bins = isic_train_dataset[class_columns].sum(axis=0)
bins

In [None]:
# Bar chart of the per-class counts in `bins`; the figure is also saved as a PNG.
fig, ax = plt.subplots()
ax.bar(bins.index, bins.values)
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Class Distribution")
fig.savefig("class_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Show one randomly chosen training image.
# The row is drawn from the whole table; the previous hard-coded 0..999 range
# ignored every later row and would raise IndexError on a table shorter than 1000.
image = isic_train_dataset.iloc[np.random.randint(0, len(isic_train_dataset)), 0]
# Decode the pixels inside the context manager so the file handle is released.
with Image.open(f"./datasets/ISIC_2019_Training_Input/{image}.jpg") as img_file:
    img = np.asarray(img_file)

plt.axis('off')
plt.imshow(img)
plt.show()

In [None]:
# Derive one string label per image: the name of the class column holding the
# row-wise maximum.
# Class columns are selected by name instead of iloc[:, 1:], so re-running this
# cell after "label" already exists does not feed that string column to idxmax.
class_columns = isic_train_dataset.columns.drop(["image", "label"], errors="ignore")
isic_train_dataset["label"] = isic_train_dataset[class_columns].idxmax(axis=1)
isic_train_dataset.head()

In [None]:
# Draw one random (image, label) row from every label group.
random_imgs = []
grouped_by_label = isic_train_dataset.groupby("label")
for _, members in grouped_by_label:
    pick = np.random.randint(0, len(members))
    random_imgs.append(members[['image', 'label']].iloc[pick])

random_imgs

In [None]:
# Plot the sampled image of each label on a 2x4 grid, titled with the label.
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(12, 6))

for panel, sample in zip(axs.ravel(), random_imgs):
    img_data = Image.open(f"./datasets/ISIC_2019_Training_Input/{sample['image']}.jpg")

    panel.imshow(img_data)
    panel.set_title(sample['label'])
    panel.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Attach the per-image metadata to the labelled table. validate="one_to_one"
# makes pandas raise if the "image" key is duplicated on either side.
isic_train_dataset_metadata = pd.read_csv("./datasets/ISIC_2019_Training_Metadata.csv")
isic_full_dataset = isic_train_dataset.merge(
    isic_train_dataset_metadata,
    how="left",
    on="image",
    validate="one_to_one",
)

In [None]:
# Preview the merged labels + metadata table.
isic_full_dataset.head()

In [None]:
# Column dtypes and non-null counts after the merge.
isic_full_dataset.info()

In [None]:
# Number of missing values per column.
isic_full_dataset.isna().sum()

In [None]:
# Frequency of each value in "sex"; dropna=False keeps missing values as their own row.
isic_full_dataset['sex'].value_counts(dropna=False)

In [None]:
# Same counts as a bar chart (missing values included), with horizontal tick labels.
isic_full_dataset['sex'].value_counts(dropna=False).plot.bar(rot=0, title="Sex Distribution")

In [None]:
# One value-count bar chart (missing values included) for each column from
# position 11 onward, except lesion_id.
# NOTE(review): position 11 presumably marks the first metadata column — it
# depends on the exact column layout of the merged frame; verify.
trailing_columns = isic_full_dataset.iloc[:, 11:].drop(columns=['lesion_id'])
for column_name in trailing_columns.columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    counts = isic_full_dataset[column_name].value_counts(dropna=False).sort_index()
    counts.plot.bar(ax=ax, title=column_name)
    ax.tick_params(axis="x", rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# For every label, draw three side-by-side bar charts with the value counts
# (missing values included) of sex, age_approx and anatom_site_general.
demographic_columns = ['sex', 'age_approx', 'anatom_site_general']
for label_name, label_rows in isic_full_dataset.groupby(by='label'):
    fig, panels = plt.subplots(figsize=(15, 7), nrows=1, ncols=3)
    for panel, col in zip(panels.ravel(), demographic_columns):
        counts = label_rows[col].value_counts(dropna=False).sort_index()
        counts.plot.bar(ax=panel, title=f"{label_name} - {col}")
        panel.tick_params(axis="x", rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# For each sex, the percentage of that sex's images that fall into every label.
#
# Counts are looked up by name. The previous version unpacked value_counts()
# positionally; value_counts() sorts by count (descending), so "male"/"female"
# were silently swapped in any label where females outnumber males, and the
# unpacking raised if a label contained only one sex.
# Assumes the "sex" column uses the values "male" and "female" (the result
# keys below already rely on that) — TODO confirm against the metadata file.
sex_order = ["male", "female"]
sex_totals = isic_full_dataset['sex'].value_counts().reindex(sex_order, fill_value=0)
male_count, female_count = sex_totals["male"], sex_totals["female"]

# crosstab skips rows with a missing sex and returns the labels in sorted order.
sex_by_label = pd.crosstab(
    isic_full_dataset['label'], isic_full_dataset['sex']
).reindex(columns=sex_order, fill_value=0)

# DataFrame / Series aligns the Series index with the DataFrame columns, so
# each sex column is divided by that sex's overall total.
sex_share_pct = (sex_by_label / sex_totals * 100).round(2)

sex_distribution_by_label = {
    "male": sex_share_pct["male"].tolist(),
    "female": sex_share_pct["female"].tolist(),
    "labels_order": sex_share_pct.index.tolist(),
}

sex_distribution_by_label

In [None]:
def survey(results, category_names):
    """Draw a horizontal stacked bar chart with one bar per key of ``results``.

    Parameters
    ----------
    results : dict
        Maps a bar label to a sequence of segment widths. Segment ``i`` of
        every bar belongs to ``category_names[i]``.
    category_names : sequence of str
        Legend entries, one per segment position.

    Returns
    -------
    tuple
        ``(fig, ax)`` — the figure and axes that were created.
    """
    row_labels = list(results)
    values = np.array([results[key] for key in row_labels])
    right_edges = np.cumsum(values, axis=1)
    # One colour per segment position, sampled from the middle of RdYlGn.
    palette = plt.colormaps['RdYlGn'](np.linspace(0.15, 0.85, values.shape[1]))

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, values.sum(axis=1).max())

    for position, (segment_name, rgba) in enumerate(zip(category_names, palette)):
        segment_widths = values[:, position]
        # Each segment starts where the previous segments of the same bar end.
        segment_starts = right_edges[:, position] - segment_widths
        bars = ax.barh(row_labels, segment_widths, left=segment_starts,
                       height=0.5, label=segment_name, color=rgba)

        # Pick the annotation colour from the product of the RGB components.
        red, green, blue = rgba[:3]
        if red * green * blue < 0.5:
            label_color = 'white'
        else:
            label_color = 'darkgrey'
        ax.bar_label(bars, label_type='center', color=label_color)

    ax.legend(ncols=len(category_names), bbox_to_anchor=(0, 1),
              loc='lower left', fontsize='small')

    return fig, ax


# One stacked bar per sex, segmented by label.
survey(
    {sex: sex_distribution_by_label[sex] for sex in ('male', 'female')},
    sex_distribution_by_label['labels_order'],
)
plt.tight_layout()
plt.show()

In [None]:
# Number of rows whose "image" value repeats an earlier row.
isic_train_dataset['image'].duplicated().sum()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# NOTE(review): UNK is dropped, presumably because it is constant — confirm.
numeric_columns = isic_full_dataset.select_dtypes(include=[np.number])
corr = numeric_columns.drop(columns=['UNK']).corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Visual overview of where values are missing in the merged table.
msno.matrix(isic_full_dataset)

In [None]:
# Distribution of approximate age within each label.
ax = sns.boxplot(data=isic_full_dataset, x='label', y='age_approx')
ax.set_title("Age by Disease")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))

# Per-label density of approximate age; common_norm=False normalises each
# label's curve on its own.
# Only age is plotted, so only rows without an age are dropped. The previous
# subset also required 'sex', which discarded rows with a usable age.
sns.kdeplot(
    data=isic_full_dataset.dropna(subset=['age_approx']),
    x='age_approx',
    hue='label',
    fill=True,
    common_norm=False,
    alpha=0.5
)

plt.title("Age Approximation Density by Label")
plt.show()

In [None]:
# Absolute and relative (percent of all rows) size of every label.
class_counts = isic_full_dataset['label'].value_counts()
class_pct = class_counts / len(isic_full_dataset) * 100

class_summary = pd.DataFrame({"count": class_counts, "percent": class_pct.round(2)})
display(class_summary)

In [None]:
# Class sizes on a log10 scale. The bars hold transformed values, so the axes
# are labelled explicitly — without that the chart reads as raw counts.
fig, ax = plt.subplots(figsize=(8, 4))
np.log10(class_counts).plot(kind='bar', ax=ax)
ax.set_title("Class Counts (log10)")
ax.set_xlabel("label")
ax.set_ylabel("log10(count)")
plt.show()

In [None]:
# Chi-square test of independence between each attribute and membership in
# each label (one label vs. all other rows).
classes = isic_full_dataset['label'].unique()

for attr in ['sex', 'anatom_site_general']:
    print(f"\nAttr: {attr}")

    # Keep every row that has a value for the attribute under test. A blanket
    # dropna() also discarded rows whose only gap was in an unrelated column,
    # shrinking the sample for no reason. Computed once per attribute rather
    # than once per class.
    attr_rows = isic_full_dataset.dropna(subset=[attr])

    for c in classes:
        is_class = (attr_rows['label'] == c).astype(int)

        # crosstab already fills absent combinations with 0.
        ct = pd.crosstab(attr_rows[attr], is_class)
        chi2, p, dof, exp = chi2_contingency(ct)

        print(f"    {c}: chi2={chi2:.2f}, p={p:.2e}")

In [None]:
# Chi-square test of each attribute against label membership, restricted to
# rows labelled MEL or one other label at a time.
classes = isic_full_dataset['label'].unique()
other_classes = [c for c in classes if c != 'MEL']

for attr in ['sex', 'anatom_site_general']:
    print(f"\nAttr: {attr}")

    for c in other_classes:
        in_pair = isic_full_dataset['label'].isin(['MEL', c])
        df = isic_full_dataset[in_pair].copy()
        df['is_c'] = (df['label'] == c).astype(int)

        ct = pd.crosstab(df[attr], df['is_c'])
        chi2, p, dof, exp = chi2_contingency(ct)

        print(f"    MEL vs {c}: chi2={chi2}, p={p:.3e}")

In [None]:
# Contingency table of label vs. anatomical site and its chi-square test.
ct = pd.crosstab(isic_full_dataset['label'], isic_full_dataset['anatom_site_general'])
chi2, p, dof, expected = chi2_contingency(ct)
# Pearson residuals: (observed - expected) / sqrt(expected) for every cell.
# The previous numerator used the total row count instead of the observed
# cell counts, which does not measure deviation from independence.
residuals = (ct - expected) / np.sqrt(expected)

residuals

In [None]:
def analyze_image_sizes(image_dir, sample_size=1000):
    """Report the most common image resolutions found in a directory.

    Prints the ten most frequent ``width x height`` sizes and the min / max /
    mean of the widths and heights of the inspected files.

    Parameters
    ----------
    image_dir : str or path-like
        Directory scanned (non-recursively) for ``.jpg`` / ``.png`` files;
        the extension check is case-insensitive.
    sample_size : int, default 1000
        Maximum number of files inspected. Files are taken in sorted filename
        order so the same subset is used on every run.

    Returns
    -------
    labels : list of str
        ``"WxH"`` strings for up to ten most common sizes, most common first.
    counts : list of int
        Number of inspected files with the matching size.
        Both lists are empty when no image file was found.
    """
    # os.listdir returns entries in arbitrary order; sort for a reproducible subset.
    files = sorted(
        f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.png'))
    )
    files = files[:sample_size]

    sizes = []
    for f in files:
        # The context manager releases the file handle after the size is read.
        with Image.open(os.path.join(image_dir, f)) as img:
            sizes.append(img.size)

    # min()/max() below would raise on an empty list; report and bail out instead.
    if not sizes:
        print(f"No .jpg/.png images found in {image_dir}")
        return [], []

    top_sizes = Counter(sizes).most_common(10)
    print("Image Sizes Rank::")
    for (w, h), count in top_sizes:
        print(f"  {w}x{h}: {count}")

    widths, heights = zip(*sizes)

    print(f"\nwidth: min={min(widths)}, max={max(widths)}, mean={np.mean(widths):.0f}")
    print(f"height: min={min(heights)}, max={max(heights)}, mean={np.mean(heights):.0f}")

    labels = [f"{w}x{h}" for (w, h), _ in top_sizes]
    counts = [c for _, c in top_sizes]

    return labels, counts

In [None]:
# Resolution statistics for the training images (at most the default 1000 files).
labels, counts = analyze_image_sizes("datasets/ISIC_2019_Training_Input")

In [None]:
# Bar chart of the most common resolutions returned by analyze_image_sizes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, counts, color="steelblue")
ax.set(
    title="Top 10 Image Resolutions",
    xlabel="Resolution (W x H)",
    ylabel="Count",
)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
class HairRemover:
    """Callable that masks thin dark structures in an image and inpaints them.

    Calling an instance runs: grayscale -> black-hat morphology -> binary
    threshold -> dilation -> Telea inpainting.

    Parameters
    ----------
    kernel_size : int, default 17
        Side length of the cross-shaped structuring element used for both the
        black-hat transform and the mask dilation.
    threshold : int, default 10
        Black-hat response above which a pixel is put into the mask.
    inpaint_radius : int, default 5
        Radius passed to ``cv2.inpaint``.
    """

    def __init__(self, kernel_size=17, threshold=10, inpaint_radius=5):
        self.kernel_size, self.threshold, self.inpaint_radius = (
            kernel_size,
            threshold,
            inpaint_radius,
        )

    def __call__(self, image):
        """Return ``(inpainted_image, mask)`` for ``image``.

        Assumes ``image`` is an RGB array accepted by ``cv2.cvtColor`` with
        ``COLOR_RGB2GRAY`` — TODO confirm for non-RGB inputs.
        """
        side = self.kernel_size
        cross = cv2.getStructuringElement(cv2.MORPH_CROSS, (side, side))

        grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        dark_details = cv2.morphologyEx(grayscale, cv2.MORPH_BLACKHAT, cross)

        # cv2.threshold returns (retval, image); only the binary image is needed.
        hair_mask = cv2.threshold(dark_details, self.threshold, 255, cv2.THRESH_BINARY)[1]
        hair_mask = cv2.dilate(hair_mask, cross, iterations=1)

        cleaned = cv2.inpaint(image, hair_mask, self.inpaint_radius, cv2.INPAINT_TELEA)
        return cleaned, hair_mask

In [None]:
# Hair remover with its default kernel size, threshold and inpaint radius.
remover = HairRemover()

In [None]:
# Before / mask / after comparison of the hair remover on one image.
img = np.array(Image.open("datasets/ISIC_2019_Training_Input/ISIC_0000095_downsampled.jpg"))

result, mask = remover(img)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (title, picture, colormap) per panel; cmap=None is imshow's default.
panels = [
    ("Before", img, None),
    ("Mask", mask, 'gray'),
    ("After", result, None),
]
for panel, (title, picture, colormap) in zip(axes, panels):
    panel.imshow(picture, cmap=colormap)
    panel.set_title(title, fontsize=14)
    panel.axis('off')

plt.tight_layout()
plt.savefig('hair_removal_demo.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
def shades_of_gray(img, power=6):
    """Rescale the channels of ``img`` by gains derived from their power mean.

    For every channel the power mean ``(mean(x ** power)) ** (1 / power)`` is
    computed over axes 0 and 1. The resulting vector is normalised to unit
    length, scaled by ``sqrt(3)`` and inverted to obtain per-channel gains.

    Parameters
    ----------
    img : array-like
        Image data; assumes a (height, width, 3) layout with values in
        0..255 — TODO confirm for other inputs.
    power : int or float, default 6
        Exponent of the power mean.

    Returns
    -------
    numpy.ndarray
        The rescaled image, clipped to 0..255 and cast to ``uint8``.
    """
    pixels = np.float32(img)

    # Small offsets keep the power and the divisions away from exact zeros.
    powered = np.power(pixels + 1e-6, power)
    illuminant = np.power(np.mean(powered, axis=(0, 1)), 1.0 / power)

    length = np.sqrt(np.sum(np.power(illuminant, 2.0)))
    illuminant = illuminant / (length + 1e-6)
    gains = 1.0 / (illuminant * np.sqrt(3) + 1e-6)

    balanced = np.multiply(pixels, gains)
    return np.clip(balanced, 0, 255).astype(np.uint8)

In [None]:
# Load four consecutive sample images (ISIC_0000091 .. ISIC_0000094, downsampled) as arrays.
images = [np.array(Image.open(f"datasets/ISIC_2019_Training_Input/ISIC_000009{i}_downsampled.jpg")) for i in range(1, 5)]

In [None]:
# Summarise the loaded arrays instead of echoing their raw pixel contents.
[(image.shape, image.dtype) for image in images]

In [None]:
# Top row: the loaded images; bottom row: the same images after shades_of_gray.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for i, original in enumerate(images):
    stages = (("Before", original), ("After", shades_of_gray(original)))
    for row, (stage_name, picture) in enumerate(stages):
        panel = axes[row, i]
        panel.imshow(picture)
        panel.set_title(f"{stage_name} {i+1}", fontsize=12)
        panel.axis('off')

# NOTE(review): these y-labels are set on axes that were switched off above,
# so they are presumably not drawn — verify.
axes[0, 0].set_ylabel("Before", fontsize=14)
axes[1, 0].set_ylabel("After", fontsize=14)

plt.suptitle("Shades Of Gray Algorithm", fontsize=16)
plt.tight_layout()
plt.savefig('color_constancy_demo.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Augmentation pipeline used only for the visual examples below.
aug_visualize = A.Compose([
    # Geometric: random crop resized to 384x384, then rotation and flips.
    A.RandomResizedCrop(size=(384, 384), scale=(0.7, 1.0), ratio=(0.9, 1.1)),
    A.Rotate(limit=180, p=0.8),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    # Photometric: one of colour jitter / gamma / CLAHE, with probability 0.6.
    A.OneOf([
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
        A.RandomGamma(gamma_limit=(80, 120)),
        A.CLAHE(clip_limit=2.0),
    ], p=0.6),
    # Degradation: either blur or noise, with probability 0.3.
    A.OneOf([
        A.GaussianBlur(blur_limit=(3, 5)),
        A.GaussNoise(std_range=(0.1, 0.5)),
    ], p=0.3),
    # Occlusion: 2-8 zero-filled holes of 5-20 px per side, with probability 0.4.
    A.CoarseDropout(num_holes_range=(2, 8), hole_height_range=(5, 20), hole_width_range=(5, 20), fill=0, p=0.4),
])

In [None]:
# Image used as the input of the augmentation examples.
img = np.array(Image.open("datasets/ISIC_2019_Training_Input/ISIC_0000096_downsampled.jpg"))

In [None]:
# 3x4 grid: the original image in the first cell, followed by 11 random
# augmentations of it.
fig, axes = plt.subplots(3, 4, figsize=(16, 12))

original_panel = axes[0, 0]
original_panel.imshow(img)
original_panel.set_title("Original", fontsize=12, fontweight='bold')
original_panel.axis('off')

# NOTE(review): this only makes the examples reproducible if albumentations
# draws from NumPy's global RNG — verify for the installed version.
np.random.seed(42)
for slot in range(1, 12):
    # Grid cell `slot` in row-major order; cell 0 holds the original.
    row, col = divmod(slot, 4)

    augmented = aug_visualize(image=img)['image']

    panel = axes[row, col]
    panel.imshow(augmented)
    panel.set_title(f"Augmentation nr. {slot}", fontsize=10)
    panel.axis('off')

plt.suptitle("Some Augmentations Examples", fontsize=16)
plt.tight_layout()
plt.savefig('augmentation_examples.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
def visualize_pipeline(img_path, save_path='complete_pipeline.png'):
    """Show the preprocessing stages of one image side by side.

    Stages, left to right: the original image, the output of ``HairRemover``,
    the output of ``shades_of_gray``, and one random augmentation of that.
    The figure is displayed and, unless disabled, written to ``save_path``.

    Parameters
    ----------
    img_path : str or path-like
        Image file to load; it is converted to RGB.
    save_path : str, path-like or None, default 'complete_pipeline.png'
        Destination of the saved figure. Pass a distinct path per image when
        calling this in a loop, otherwise every call overwrites the same file.
        ``None`` skips saving.
    """
    # Release the file handle as soon as the pixels are in memory.
    with Image.open(img_path) as source:
        img_original = np.array(source.convert('RGB'))

    remover = HairRemover()
    # The mask is not shown in this figure.
    img_no_hair, _ = remover(img_original)

    img_color_norm = shades_of_gray(img_no_hair)

    # Every transform runs with p=1.0 so the last panel always differs visibly.
    aug = A.Compose([
        A.RandomResizedCrop(size=(384, 384), scale=(0.8, 1.0)),
        A.Rotate(limit=45, p=1.0),
        A.ColorJitter(brightness=0.1, contrast=0.1, p=1.0),
    ])
    img_augmented = aug(image=img_color_norm)['image']

    fig, axes = plt.subplots(1, 4, figsize=(20, 5))

    titles = ['Original', '=> Hair Remover =>',
              '=> ShadesOfGray/ColorConstancy =>', '=> Augmentation Final ']
    images = [img_original, img_no_hair, img_color_norm, img_augmented]

    for ax, img, title in zip(axes, images, titles):
        ax.imshow(img)
        ax.set_title(title, fontsize=14)
        ax.axis('off')

    plt.suptitle('Example full Preprocessing Step', fontsize=16)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Run the full preprocessing visualisation on four consecutive images.
# NOTE(review): every call saves its figure under the same file name, so only
# the last figure remains on disk.
pipeline_inputs = [
    f"datasets/ISIC_2019_Training_Input/ISIC_000994{i}.jpg" for i in range(4)
]
for input_path in pipeline_inputs:
    visualize_pipeline(input_path)