# Exploratory Data Analysis and Preprocessing


In [None]:
"""import kagglehub
kagglehub.login()"""

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [None]:
"""isic_2024_challenge_path = kagglehub.competition_download('isic-2024-challenge')"""

Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/isic-2024-challenge...


100%|██████████| 2.00G/2.00G [01:44<00:00, 20.5MB/s]

Extracting files...





In [None]:
##############
# LIBRARIES
##############

import os

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Bare pyplot calls therefore print their return values unless suppressed with `;`.
InteractiveShell.ast_node_interactivity = "all"

# Row Column Settings
# Table-driven form: each entry is a pandas display option and the value to apply.
# `None` removes the row/column truncation limit; floats render with three decimals.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': lambda value: '%.3f' % value,
    'display.width': 500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
##############
# LOAD DATA
##############

# Resolve the dataset root before building any paths.
# `isic_2024_challenge_path` is only assigned by the kagglehub download statement,
# which is currently disabled, so on a fresh kernel this cell used to fail with a
# NameError. Fall back to the ISIC_2024_CHALLENGE_PATH environment variable and
# fail with an actionable message when neither source is available.
if 'isic_2024_challenge_path' not in globals():
    isic_2024_challenge_path = os.environ.get('ISIC_2024_CHALLENGE_PATH')
    if not isic_2024_challenge_path:
        raise FileNotFoundError(
            "Dataset location unknown: run the kagglehub download cell or set the "
            "ISIC_2024_CHALLENGE_PATH environment variable to the extracted "
            "competition folder."
        )

train_metadata_path = f"{isic_2024_challenge_path}/train-metadata.csv"
train_image_dir = f"{isic_2024_challenge_path}/train-image"

# Read Data
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
train_metadata = pd.read_csv(train_metadata_path, low_memory=False)

NameError: name 'isic_2024_challenge_path' is not defined

In [None]:
#######
# DROP
#######

# Variables with a high number of missing values or that do not carry significant information have been removed.

drop_list = [
    'patient_id',
    'attribution',
    'copyright_license',
    'image_type',
    'iddx_1',
    'iddx_2',
    'iddx_3',
    'iddx_4',
    'iddx_5',
    'iddx_full',
    'lesion_id',
    'mel_mitotic_index',
    'mel_thick_mm',
    'tbp_lv_dnn_lesion_confidence'
]

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
train_metadata = train_metadata.drop(columns=drop_list, errors='ignore')

## Data Analysis

In [None]:
# Summary statistics at a dense set of percentiles, one row per numeric column,
# with each statistic column shaded independently (axis=0).
percentiles = [0.10, 0.25, 0.30, 0.40, 0.60, 0.70, 0.80, 0.85, 0.90, 0.95, 0.99]
summary_stats = train_metadata.describe(percentiles=percentiles).T
summary_stats.style.background_gradient(axis=0, cmap='coolwarm')

In [None]:
#############
## CORR MATRIX
#############

# Pairwise correlations, restricted to the numeric columns.
num_cols = list(train_metadata.select_dtypes(include=[np.number]).columns)
corr = train_metadata[num_cols].corr()

# Boolean mask that hides the diagonal and the upper triangle of the matrix.
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(25, 10))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar_kws={"shrink": .8},
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
ax = sns.heatmap(corr, **heatmap_options)

axis_label_style = dict(fontsize=14, fontweight='bold', color='black')
plt.title('Correlation Matrix', fontsize=20, fontweight='bold', color='darkblue')
plt.xticks(fontsize=12, rotation=45, ha='right')
plt.yticks(fontsize=12, rotation=0)
plt.xlabel('Variables', **axis_label_style)
plt.ylabel('Variables', **axis_label_style)

# The heatmap's first collection carries the colorbar; enlarge its tick labels.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)

plt.tight_layout()
plt.show();

In [None]:
# High Correlations

threshold = 0.9

# ANSI escape sequences used to colour each printed line and then reset the terminal colour.
color = "\033[93m"
reset_color = "\033[0m"

# Visit each unordered pair once: (i, j) with j < i, in row-major order.
for i, j in zip(*np.tril_indices(len(corr.columns), k=-1)):
    correlation_value = corr.iloc[i, j]
    if abs(correlation_value) > threshold:
        var1 = corr.columns[i]
        var2 = corr.columns[j]
        print(f"{color}{var1} ve {var2}: {correlation_value:.2f}{reset_color}")

In [None]:
#######
# DROP
#######

# Variables with high correlation or those that are derived from each other have been dropped.

unnec_list = [
    'tbp_lv_Bext',
    'tbp_lv_Cext',
    'tbp_lv_Lext',
    'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm',
    'tbp_lv_stdL',
    'tbp_lv_norm_color',
    'tbp_lv_location'
]

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
train_metadata = train_metadata.drop(columns=unnec_list, errors='ignore')

In [None]:
#############
# COUNTPLOT
#############

variables_to_plot = ['target', 'sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location_simple']

num_columns = 3
num_rows = (len(variables_to_plot) + num_columns - 1) // num_columns  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when the grid has a single row,
# so the layout still works if `variables_to_plot` shrinks to three entries or fewer.
fig, axes = plt.subplots(num_rows, num_columns, figsize=(18, 5 * num_rows), squeeze=False)
palette = sns.color_palette("deep")

# Denominator for the percentage labels; constant across panels, so computed once.
total = len(train_metadata)

# axes.flat walks the grid row by row, matching the order of `variables_to_plot`.
for ax, column in zip(axes.flat, variables_to_plot):
    sns.countplot(data=train_metadata, x=column, ax=ax, palette=palette)

    ax.set_title(column, fontsize=12, fontweight='bold')
    ax.set_ylabel('Count', fontsize=8)
    ax.set_xlabel('Categories', fontsize=8)

    ax.tick_params(axis='x', rotation=45)

    # Label every bar with its share of all rows.
    for p in ax.patches:
        count = p.get_height()
        percentage = count / total * 100
        ax.annotate(f'({percentage:.1f}%)',
                    (p.get_x() + p.get_width() / 2., count),
                    ha='center', va='bottom', fontsize=10, fontweight='bold')

    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Remove the unused trailing panels without relying on a leaked loop index.
for unused_ax in axes.flat[len(variables_to_plot):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle('Count of Selected Variables', fontsize=16, fontweight='bold', y=1.02)
plt.show();

## Visualization

In [None]:
##################
# TRAIN IMG PATHS
##################

def get_images_path(image_dir):
    """Map image ids to file paths for every JPEG found under ``image_dir``.

    The directory tree is walked recursively. The id is the file name without
    its extension; the ``.jpg`` extension is matched case-insensitively, so
    ``ISIC_1.JPG`` is picked up as well.

    Args:
        image_dir: Root directory to search.

    Returns:
        dict mapping image id (str) to the path of the file (str). If two
        files share the same stem, the one visited last wins. A missing
        directory yields an empty dict because ``os.walk`` yields nothing.
    """
    images_path = {}
    for root, _, files in os.walk(image_dir):
        for file in files:
            image_id, extension = os.path.splitext(file)
            if extension.lower() == '.jpg':
                images_path[image_id] = os.path.join(root, file)
    return images_path

# Index the image folder once, then attach each row's file path via its ISIC id.
# Ids without a matching file end up as NaN in `img_paths`.
images_path = get_images_path(train_image_dir)
isic_ids = train_metadata['isic_id']
train_metadata['img_paths'] = isic_ids.map(images_path)

In [None]:
################################
# IMAGE TRANSFORMATION TECHNIQUES
################################

# Aim: improve the robustness and performance of the model by providing varied representations of the input images.

# Hand-picked lesion ids used to demonstrate the transformations.
selected_ids = ['ISIC_4851249', 'ISIC_3646371', 'ISIC_3149970', 'ISIC_6286734']
is_selected = train_metadata['isic_id'].isin(selected_ids)
# Rows come back in the frame's own order, not the order of `selected_ids`.
selected_img_path = train_metadata.loc[is_selected, 'img_paths']


def apply_transformations(image_paths):
    """Show a grid of colour-space / blur variants for each given image.

    One row per transformation and one column per image. Every image is read
    from disk once, converted, resized to 200x200 and drawn with its axes
    hidden. The converted arrays are passed to ``imshow`` as-is, so the
    HSV / HLS / LUV rows are displayed with their channels interpreted as RGB.

    Args:
        image_paths: Iterable of image file paths (e.g. a list or a pandas
            Series). Any number of paths is accepted; the grid width follows
            the input. Missing values and unreadable files are reported and
            leave an empty panel.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    target_size = (200, 200)

    def _convert_and_resize(image, conversion_code):
        # Shared first step of every variant: colour conversion, then resize.
        return cv2.resize(cv2.cvtColor(image, conversion_code), target_size)

    def _hsv_contrast(image):
        # 4 * img - 4 * blurred + 128: subtracts the heavily blurred image
        # to emphasise local detail around a mid-grey level.
        hsv = _convert_and_resize(image, cv2.COLOR_BGR2HSV)
        return cv2.addWeighted(hsv, 4, cv2.GaussianBlur(hsv, (0, 0), 256/10), -4, 128)

    def _grey_blur(image):
        grey = _convert_and_resize(image, cv2.COLOR_BGR2GRAY)
        return cv2.GaussianBlur(grey, (5, 5), 0)

    # Row title -> transformation; insertion order defines the row order.
    methods = {
        "Without Gaussian Blur": lambda image: _convert_and_resize(image, cv2.COLOR_BGR2HSV),
        "With Gaussian Blur": _hsv_contrast,
        "Hue, Saturation, Brightness": lambda image: _convert_and_resize(image, cv2.COLOR_BGR2HLS),
        "LUV Color Space": lambda image: _convert_and_resize(image, cv2.COLOR_BGR2LUV),
        "Greyscale + Gaussian Blur": _grey_blur,
    }

    paths = list(image_paths)
    if not paths:
        print("No image paths were provided.")
        return

    # Read each file once instead of once per transformation.
    images = []
    for path in paths:
        # A missing value (e.g. an id with no matching file) cannot be passed to cv2.imread.
        image = None if pd.isna(path) else cv2.imread(path)
        if image is None:
            print(f"Image at {path} could not be loaded.")
        images.append(image)

    # squeeze=False keeps `axes` two-dimensional for any number of images;
    # 2.5 inches per column reproduces the original 10-inch width for four images.
    fig, axes = plt.subplots(nrows=len(methods), ncols=len(paths),
                             figsize=(2.5 * len(paths), 15), squeeze=False)

    for index, (method, transform) in enumerate(methods.items()):
        for i, image in enumerate(images):
            ax = axes[index, i]
            ax.axis('off')  # also hides the frame of panels whose image failed to load

            if image is None:
                continue

            transformed_image = transform(image)
            # Single-channel results need an explicit colormap; 3-channel ones do not.
            ax.imshow(transformed_image, cmap=plt.cm.bone if transformed_image.ndim == 2 else None)
            ax.set_title(method, fontsize=8)

    plt.tight_layout()
    plt.show()


# Render the transformation grid for the sample images selected by id.
apply_transformations(selected_img_path)

In [None]:
#############
# HAIR REMOVE
#############

# Pick a handful of sample images by row position.
# NOTE(review): position 13 is listed twice, so the first and last panels show the same image — confirm this is intended.
all_image_paths = train_metadata['img_paths'].tolist()
sample_positions = [13, 9473, 267, 167, 87, 13]
image_hair = np.array(all_image_paths)[sample_positions]


plt.figure(figsize=(16,3))
plt.suptitle("Original Images", fontsize = 16)

# One row of six panels showing the untouched images, resized to 300x300.
for k, path in enumerate(image_hair[:6], start=1):
    image = cv2.resize(mpimg.imread(path), (300, 300))

    plt.subplot(1, 6, k)
    plt.imshow(image)
    plt.axis('off')


#####################################


def hair_remove(image, kernel_size=17, hair_threshold=10, inpaint_radius=1):
    """Remove thin dark structures such as hairs from an image by inpainting.

    Steps: convert to greyscale, apply a black-hat morphological filter with a
    cross-shaped kernel, threshold the response into a binary mask, and inpaint
    the masked pixels from their surroundings. The defaults reproduce the
    previously hard-coded values.

    Args:
        image: 3-channel image array; it is converted with ``COLOR_RGB2GRAY``,
            so RGB channel order is expected.
        kernel_size: Side length in pixels of the square that bounds the
            cross-shaped structuring element.
        hair_threshold: Black-hat responses above this value are marked as hair.
        inpaint_radius: Neighbourhood radius passed to ``cv2.inpaint``.

    Returns:
        The inpainted image as returned by ``cv2.inpaint``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # cv2.MORPH_CROSS is the named constant for the shape code 1 used before.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size, kernel_size))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, hair_mask = cv2.threshold(blackhat, hair_threshold, 255, cv2.THRESH_BINARY)
    final_image = cv2.inpaint(image, hair_mask, inpaint_radius, cv2.INPAINT_TELEA)

    return final_image


plt.figure(figsize=(16,3))
plt.suptitle("Non Hairy Images", fontsize = 16)

# Same six samples as the strip above, resized to 300x300 and passed through hair_remove.
for k, path in enumerate(image_hair, start=1):
    resized = cv2.resize(mpimg.imread(path), (300, 300))
    image = hair_remove(resized)

    plt.subplot(1, 6, k)
    plt.imshow(image)
    plt.axis('off')

In [None]:
########
# DROP
########

final_drop = ["isic_id", "tbp_lv_location_simple"]
# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
train_metadata = train_metadata.drop(columns=final_drop, errors='ignore')

In [None]:
##############
# EXPORT DATA
##############

# Write the preprocessed metadata to CSV without the index column.
# NOTE(review): the target looks like a Colab Google Drive mount; no mount call is visible in this notebook — confirm the folder exists before running.
# NOTE(review): `img_paths` (machine-specific file paths) is not in the final drop list, so it is exported too — confirm intended.
file_path = '/content/drive/MyDrive/Skin Cancer Detection/exploratory_data_analysis.csv'
train_metadata.to_csv(path_or_buf=file_path, index=False)