Import Libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import hashlib
import shutil


Load Dataset

In [2]:
data_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"

categories = os.listdir(data_dir)   # Each folder = one class
print("Classes:", categories)

data = []
img_size = 128   # Resize all images to 128x128

for category in categories:
    path = os.path.join(data_dir, category)
    class_num = categories.index(category)
    for img in tqdm(os.listdir(path)):
        try:
            img_array = cv2.imread(os.path.join(path, img))
            img_array = cv2.resize(img_array, (img_size, img_size))
            data.append([img_array, class_num])
        except Exception as e:
            pass


Classes: ['actinic keratosis', 'basal cell carcinoma', 'benign', 'blurry_images', 'blurry_images_edges', 'dermatofibroma', 'malignant', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


  0%|          | 0/982 [00:00<?, ?it/s]

100%|██████████| 982/982 [00:22<00:00, 42.90it/s]
100%|██████████| 1095/1095 [00:28<00:00, 37.77it/s]
100%|██████████| 1593/1593 [00:46<00:00, 33.96it/s]
100%|██████████| 3491/3491 [01:50<00:00, 31.72it/s]
100%|██████████| 926/926 [00:34<00:00, 26.89it/s]
100%|██████████| 933/933 [00:19<00:00, 48.47it/s]
100%|██████████| 1255/1255 [00:38<00:00, 32.60it/s]
100%|██████████| 812/812 [00:21<00:00, 37.34it/s]
100%|██████████| 1068/1068 [00:33<00:00, 31.86it/s]
100%|██████████| 1004/1004 [00:29<00:00, 34.15it/s]
100%|██████████| 905/905 [00:32<00:00, 27.65it/s]
100%|██████████| 600/600 [00:22<00:00, 27.06it/s]


Label Cleaning

In [3]:
import pandas as pd
import os

# Paths to your dataset
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Test"

# Function to extract labels from folder names
def build_labels_from_folders(base_dir):
    records = []
    for class_name in os.listdir(base_dir):
        class_path = os.path.join(base_dir, class_name)
        if os.path.isdir(class_path):
            for img_name in os.listdir(class_path):
                image_id, ext = os.path.splitext(img_name)
                records.append([image_id, class_name.lower().strip()])
    return pd.DataFrame(records, columns=["image_id", "label"])

# Build label dataframe from Train + Test
train_labels = build_labels_from_folders(train_dir)
test_labels  = build_labels_from_folders(test_dir)

labels = pd.concat([train_labels, test_labels], ignore_index=True)

# Step 1: Standardize labels
labels["label"] = labels["label"].str.lower().str.strip()

# Step 2: Keep only valid classes
valid_classes = ["melanoma", "nevus", "bcc", "akiec", "bkl", "df", "vasc"]
labels = labels[labels["label"].isin(valid_classes)]

# Step 3: Drop duplicate image IDs
labels = labels.drop_duplicates(subset=["image_id"])

# Save cleaned labels
labels.to_csv("labels_cleaned.csv", index=False)
print("✅ Labels cleaned and saved as labels_cleaned.csv")


✅ Labels cleaned and saved as labels_cleaned.csv


Clean Data


Hashing Images to Detect Duplicates:

1. Converts image → grayscale → resized.

2. Computes a hash (dhash).

In [4]:



def dhash(image, hash_size=8):
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Resize the image
    resized = cv2.resize(gray, (hash_size + 1, hash_size))
    # Compute the difference
    diff = resized[:, 1:] > resized[:, :-1]
    # Convert to hex
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

def remove_duplicates(folder_path):
    hashes = {}
    duplicates = []

    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path):
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                h = dhash(img)
                if h in hashes:
                    duplicates.append(img_path)
                    os.remove(img_path)   # 🔥 remove duplicate
                else:
                    hashes[h] = img_path
            except:
                continue

    print(f"Removed {len(duplicates)} duplicate images.")

# Example usage:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_duplicates(train_dir)

test_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
remove_duplicates(test_dir)


Checking actinic keratosis:   3%|▎         | 28/982 [00:00<00:03, 265.04it/s]

Checking actinic keratosis: 100%|██████████| 982/982 [00:01<00:00, 573.89it/s]
Checking basal cell carcinoma: 100%|██████████| 1095/1095 [00:02<00:00, 519.18it/s]
Checking benign: 100%|██████████| 1593/1593 [00:04<00:00, 396.77it/s]
Checking blurry_images: 100%|██████████| 3491/3491 [00:04<00:00, 784.60it/s]
Checking blurry_images_edges: 100%|██████████| 926/926 [00:01<00:00, 740.89it/s]
Checking dermatofibroma: 100%|██████████| 933/933 [00:01<00:00, 745.23it/s]
Checking malignant: 100%|██████████| 1255/1255 [00:03<00:00, 348.94it/s]
Checking nevus: 100%|██████████| 812/812 [00:01<00:00, 672.15it/s]
Checking pigmented benign keratosis: 100%|██████████| 1068/1068 [00:03<00:00, 306.10it/s]
Checking seborrheic keratosis: 100%|██████████| 1004/1004 [00:02<00:00, 379.53it/s]
Checking squamous cell carcinoma: 100%|██████████| 905/905 [00:01<00:00, 691.21it/s]
Checking vascular lesion: 100%|██████████| 600/600 [00:01<00:00, 424.79it/s]


Removed 6 duplicate images.


Checking actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 32.61it/s]
Checking basal cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 30.38it/s]
Checking benign: 100%|██████████| 500/500 [00:16<00:00, 30.22it/s]
Checking blurry_images: 100%|██████████| 7/7 [00:00<00:00, 49.19it/s]
Checking dermatofibroma: 100%|██████████| 16/16 [00:00<00:00, 41.35it/s]
Checking malignant: 100%|██████████| 500/500 [00:19<00:00, 25.54it/s]
Checking nevus: 100%|██████████| 13/13 [00:00<00:00, 52.38it/s]
Checking pigmented benign keratosis: 100%|██████████| 15/15 [00:00<00:00, 52.74it/s]
Checking seborrheic keratosis: 100%|██████████| 3/3 [00:00<00:00, 50.16it/s]
Checking squamous cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 52.68it/s]
Checking vascular lesion: 100%|██████████| 2/2 [00:00<00:00, 50.98it/s]

Removed 1 duplicate images.





Blur Detection using Laplacian Variance

The idea:

1. Compute the Laplacian of the image.

2. Calculate the variance.    variance < threshold → image is blurry.

3. Low variance → blurry image

4. High variance → sharp image

threshold=100

In [5]:

def detect_blur(image_path, threshold=100):
    """Return True if the image is blurry"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return True   # Treat unreadable image as bad
    laplacian_var = cv2.Laplacian(img, cv2.CV_64F).var()
    return laplacian_var < threshold

def remove_blurry_images(folder_path, threshold=100, move_to_folder=True):
    blurry_count = 0
    blurry_folder = os.path.join(folder_path, "blurry_images")

    if move_to_folder and not os.path.exists(blurry_folder):
        os.makedirs(blurry_folder)

    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path) or category == "blurry_images":
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                if detect_blur(img_path, threshold):
                    blurry_count += 1
                    if move_to_folder:
                        shutil.move(img_path, os.path.join(blurry_folder, img_name))
                    else:
                        os.remove(img_path)
            except:
                continue

    print(f"Removed/flagged {blurry_count} blurry images.")

# Example usage:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_blurry_images(train_dir, threshold=100, move_to_folder=True)

test_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
remove_blurry_images(test_dir, threshold=100, move_to_folder=True)


Checking actinic keratosis:   0%|          | 0/982 [00:00<?, ?it/s]

Checking actinic keratosis: 100%|██████████| 982/982 [00:01<00:00, 522.94it/s]
Checking basal cell carcinoma: 100%|██████████| 1095/1095 [00:01<00:00, 560.41it/s]
Checking benign: 100%|██████████| 1589/1589 [00:07<00:00, 223.86it/s]
Checking blurry_images_edges: 100%|██████████| 926/926 [00:02<00:00, 372.37it/s]
Checking dermatofibroma: 100%|██████████| 933/933 [00:01<00:00, 524.91it/s]
Checking malignant: 100%|██████████| 1254/1254 [00:04<00:00, 304.17it/s]
Checking nevus: 100%|██████████| 812/812 [00:01<00:00, 753.13it/s]
Checking pigmented benign keratosis: 100%|██████████| 1068/1068 [00:01<00:00, 788.09it/s]
Checking seborrheic keratosis: 100%|██████████| 1003/1003 [00:01<00:00, 729.81it/s]
Checking squamous cell carcinoma: 100%|██████████| 905/905 [00:02<00:00, 311.41it/s]
Checking vascular lesion: 100%|██████████| 600/600 [00:01<00:00, 582.85it/s]


Removed/flagged 378 blurry images.


Checking actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 557.52it/s]
Checking basal cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 542.47it/s]
Checking benign: 100%|██████████| 499/499 [00:04<00:00, 113.14it/s]
Checking dermatofibroma: 100%|██████████| 16/16 [00:00<00:00, 248.85it/s]
Checking malignant: 100%|██████████| 500/500 [00:03<00:00, 126.47it/s]
Checking nevus: 100%|██████████| 13/13 [00:00<00:00, 565.26it/s]
Checking pigmented benign keratosis: 100%|██████████| 15/15 [00:00<00:00, 504.58it/s]
Checking seborrheic keratosis: 100%|██████████| 3/3 [00:00<00:00, 441.54it/s]
Checking squamous cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 470.93it/s]
Checking vascular lesion: 100%|██████████| 2/2 [00:00<00:00, 275.32it/s]

Removed/flagged 703 blurry images.





Canny edge detector:
Count number of edge pixels.

If edge count < threshold → image is blurry → move it to a blurry_images_edges folder

threshold=50


In [6]:
def is_blurry_edge(image_path, threshold=50):
    """Detect blur using edge detection method"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return True
    
    # Apply Canny Edge Detector
    edges = cv2.Canny(img, 100, 200)
    edge_strength = cv2.countNonZero(edges)  # how many edge pixels exist
    
    return edge_strength < threshold  # If too few edges → blurry

def remove_blurry_images_with_edges(folder_path, threshold=50):
    blurry_folder = os.path.join(folder_path, "blurry_images_edges")
    if not os.path.exists(blurry_folder):
        os.makedirs(blurry_folder)

    blurry_count = 0
    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path) or category == "blurry_images_edges":
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                if is_blurry_edge(img_path, threshold):
                    blurry_count += 1
                    shutil.move(img_path, os.path.join(blurry_folder, img_name))
            except:
                continue

    print(f"Moved {blurry_count} blurry images to {blurry_folder}")

# Example usage
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_blurry_images_with_edges(train_dir, threshold=50)


Checking actinic keratosis:   0%|          | 0/982 [00:00<?, ?it/s]

Checking actinic keratosis: 100%|██████████| 982/982 [00:01<00:00, 863.55it/s]
Checking basal cell carcinoma: 100%|██████████| 1095/1095 [00:01<00:00, 830.17it/s]
Checking benign: 100%|██████████| 1589/1589 [00:05<00:00, 285.90it/s]
Checking blurry_images: 100%|██████████| 3869/3869 [00:05<00:00, 766.80it/s]
Checking dermatofibroma: 100%|██████████| 933/933 [00:01<00:00, 858.39it/s]
Checking malignant: 100%|██████████| 1254/1254 [00:04<00:00, 307.58it/s]
Checking nevus: 100%|██████████| 812/812 [00:00<00:00, 928.17it/s]
Checking pigmented benign keratosis: 100%|██████████| 1068/1068 [00:01<00:00, 920.87it/s]
Checking seborrheic keratosis: 100%|██████████| 1003/1003 [00:01<00:00, 953.31it/s]
Checking squamous cell carcinoma: 100%|██████████| 905/905 [00:00<00:00, 949.77it/s]
Checking vascular lesion: 100%|██████████| 600/600 [00:00<00:00, 964.95it/s]

Moved 378 blurry images to C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train\blurry_images_edges





Normalize + Resize

Split Train/Test

In [None]:

# Paths
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Test"

img_size = 128  # or 224 for ResNet/EfficientNet

# Define the function
def load_and_preprocess_images(data_dir, img_size=128):
    X = []
    y = []
    categories = [cls for cls in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, cls))]

    for category in categories:
        class_path = os.path.join(data_dir, category)
        label = categories.index(category)

        for img_name in tqdm(os.listdir(class_path), desc=f"Loading {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                img = cv2.resize(img, (img_size, img_size))   # Resize
                img = img.astype("float32") / 255.0           # Normalize
                X.append(img)
                y.append(label)
            except:
                continue

    X = np.array(X, dtype="float32")
    y = np.array(y)
    return X, y, categories

# Load train and test
X_train_full, y_train_full, categories = load_and_preprocess_images(train_dir, img_size)
X_test, y_test, _ = load_and_preprocess_images(test_dir, img_size)

# Split train into Train and Validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)
print("Classes:", categories)


Loading actinic keratosis: 100%|██████████| 982/982 [00:03<00:00, 304.31it/s]
Loading basal cell carcinoma: 100%|██████████| 1095/1095 [00:03<00:00, 304.33it/s]
Loading benign: 100%|██████████| 1589/1589 [00:08<00:00, 184.28it/s]
Loading blurry_images: 100%|██████████| 3491/3491 [00:10<00:00, 342.39it/s]
Loading blurry_images_edges: 100%|██████████| 926/926 [00:05<00:00, 165.49it/s]
Loading dermatofibroma: 100%|██████████| 933/933 [00:03<00:00, 296.21it/s]
Loading malignant: 100%|██████████| 1254/1254 [00:04<00:00, 252.63it/s]
Loading nevus: 100%|██████████| 812/812 [00:02<00:00, 356.19it/s]
Loading pigmented benign keratosis: 100%|██████████| 1068/1068 [00:03<00:00, 325.41it/s]
Loading seborrheic keratosis: 100%|██████████| 1003/1003 [00:03<00:00, 296.38it/s]
Loading squamous cell carcinoma: 100%|██████████| 905/905 [00:02<00:00, 306.06it/s]
Loading vascular lesion: 100%|██████████| 600/600 [00:01<00:00, 314.65it/s]
Loading actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 156.0

Train shape: (11726, 128, 128, 3) (11726,)
Validation shape: (2932, 128, 128, 3) (2932,)
Test shape: (1103, 128, 128, 3) (1103,)
Classes: ['actinic keratosis', 'basal cell carcinoma', 'benign', 'blurry_images', 'blurry_images_edges', 'dermatofibroma', 'malignant', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


Balance the Dataset using:

Oversampling

Duplicate minority class samples → risk of overfitting.

In [12]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)

# Flatten images for oversampling
X_resampled, y_resampled = ros.fit_resample(
    X_train.reshape(len(X_train), -1), y_train
)

# Reshape back to image format
X_resampled = X_resampled.reshape(-1, img_size, img_size, 3)

print("Before oversampling:", X_train.shape, y_train.shape)
print("After oversampling:", X_resampled.shape, y_resampled.shape)


Before oversampling: (11726, 128, 128, 3) (11726,)
After oversampling: (33516, 128, 128, 3) (33516,)


Data Augmentation (Refinement)

Step 1: Count Images per Class

In [13]:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
class_counts = {}
for cls in os.listdir(train_dir):
    cls_path = os.path.join(train_dir, cls)
    if os.path.isdir(cls_path):
        class_counts[cls] = len(os.listdir(cls_path))

print("Class distribution before augmentation:")
print(class_counts)



Class distribution before augmentation:
{'actinic keratosis': 982, 'basal cell carcinoma': 1095, 'benign': 1589, 'blurry_images': 3491, 'blurry_images_edges': 926, 'dermatofibroma': 933, 'malignant': 1254, 'nevus': 812, 'pigmented benign keratosis': 1068, 'seborrheic keratosis': 1003, 'squamous cell carcinoma': 905, 'vascular lesion': 600}


Step 2: Identify Majority and Minority Classes

In [14]:
max_count = max(class_counts.values())
minority_classes = [cls for cls, count in class_counts.items() if count < max_count]

print("Majority class has:", max_count, "images")
print("Minority classes (to augment):", minority_classes)


Majority class has: 3491 images
Minority classes (to augment): ['actinic keratosis', 'basal cell carcinoma', 'benign', 'blurry_images_edges', 'dermatofibroma', 'malignant', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


Step 3: Apply Augmentation Only on Minority Classes

In [15]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define augmentation
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode="nearest"
)

img_size = 128  # resize for consistency

# Augment only minority classes
for cls in minority_classes:
    cls_path = os.path.join(train_dir, cls)
    images = os.listdir(cls_path)
    needed = max_count - len(images)   # how many extra images we need

    print(f"Augmenting {cls}: need {needed} more images")

    for img_name in images:
        img_path = os.path.join(cls_path, img_name)
        img = cv2.imread(img_path)
        if img is None:
            continue
        img = cv2.resize(img, (img_size, img_size))
        img = np.expand_dims(img, axis=0)  # add batch dimension

        i = 0
        for batch in datagen.flow(img, batch_size=1,
                                  save_to_dir=cls_path,
                                  save_prefix="aug",
                                  save_format="jpg"):
            i += 1
            if i >= needed // len(images) + 1:  # spread augmentations
                break


Augmenting actinic keratosis: need 2509 more images
Augmenting basal cell carcinoma: need 2396 more images
Augmenting benign: need 1902 more images
Augmenting blurry_images_edges: need 2565 more images
Augmenting dermatofibroma: need 2558 more images
Augmenting malignant: need 2237 more images
Augmenting nevus: need 2679 more images
Augmenting pigmented benign keratosis: need 2423 more images
Augmenting seborrheic keratosis: need 2488 more images
Augmenting squamous cell carcinoma: need 2586 more images
Augmenting vascular lesion: need 2891 more images


Normalize pixel values

In [16]:
data_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"

categories = os.listdir(data_dir)
img_size = 128   # resize to 128x128

X = []
y = []

for category in categories:
    class_path = os.path.join(data_dir, category)
    if not os.path.isdir(class_path):
        continue
    
    class_num = categories.index(category)  # assign label
    for img_name in tqdm(os.listdir(class_path), desc=f"Loading {category}"):
        try:
            img_path = os.path.join(class_path, img_name)
            img = cv2.imread(img_path)
            if img is None:
                continue
            img = cv2.resize(img, (img_size, img_size))
            X.append(img)
            y.append(class_num)
        except:
            continue

# Convert to NumPy arrays
X = np.array(X, dtype="float32")
y = np.array(y)

# Normalize pixel values
X = X / 255.0
print("Data shape:", X.shape)
print("Label shape:", y.shape)

Loading actinic keratosis:   0%|          | 0/3294 [00:00<?, ?it/s]

Loading actinic keratosis: 100%|██████████| 3294/3294 [01:11<00:00, 45.96it/s] 
Loading basal cell carcinoma: 100%|██████████| 3666/3666 [01:24<00:00, 43.52it/s] 
Loading benign: 100%|██████████| 4296/4296 [01:34<00:00, 45.68it/s] 
Loading blurry_images: 100%|██████████| 3491/3491 [00:14<00:00, 238.76it/s]
Loading blurry_images_edges: 100%|██████████| 3217/3217 [01:14<00:00, 42.97it/s] 
Loading dermatofibroma: 100%|██████████| 3159/3159 [01:14<00:00, 42.22it/s]
Loading malignant: 100%|██████████| 3456/3456 [01:07<00:00, 51.27it/s] 
Loading nevus: 100%|██████████| 3401/3401 [01:31<00:00, 37.35it/s] 
Loading pigmented benign keratosis: 100%|██████████| 3592/3592 [01:27<00:00, 41.27it/s] 
Loading seborrheic keratosis: 100%|██████████| 3345/3345 [01:29<00:00, 37.49it/s]
Loading squamous cell carcinoma: 100%|██████████| 3093/3093 [01:22<00:00, 37.69it/s] 
Loading vascular lesion: 100%|██████████| 3050/3050 [01:17<00:00, 39.18it/s] 


Data shape: (41060, 128, 128, 3)
Label shape: (41060,)


Handle Class Imbalance

In [17]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
print("Class Weights:", dict(enumerate(class_weights)))

Class Weights: {0: np.float64(1.243214588634436), 1: np.float64(1.1154870624048707), 2: np.float64(0.7688172043010753), 3: np.float64(0.34986275211839124), 4: np.float64(1.3187134502923976), 5: np.float64(1.3098748882931188), 6: np.float64(0.9742439348620804), 7: np.float64(1.5033333333333334), 8: np.float64(1.144223263075722), 9: np.float64(1.2184123025768911), 10: np.float64(1.3496777163904237), 11: np.float64(2.035763888888889)}


Save Refined Dataset

In [20]:
import pandas as pd


In [21]:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
output_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Refined_Dataset"

# Load cleaned labels
labels = pd.read_csv("labels_cleaned.csv")

# Parameters
target_size = (224, 224)  # resize all images to 224x224

# Check valid files
valid_records = []
missing = []

for _, row in labels.iterrows():
    image_id = row["image_id"]
    label = row["label"]

    found = False
    for ext in [".jpg", ".jpeg", ".png"]:
        train_path = os.path.join(train_dir, label, image_id + ext)
        test_path = os.path.join(test_dir, label, image_id + ext)

        if os.path.exists(train_path):
            file_path = train_path
            found = True
            break
        elif os.path.exists(test_path):
            file_path = test_path
            found = True
            break

    if found:
        # Prepare save path
        save_folder = os.path.join(output_dir, label)
        os.makedirs(save_folder, exist_ok=True)
        save_path = os.path.join(save_folder, image_id + ".jpg")

        try:
            # Open, resize, normalize (0-255 → 0-1 range), then save back
            img = Image.open(file_path).convert("RGB")
            img = img.resize(target_size)
            img.save(save_path, "JPEG")

            valid_records.append([image_id, label, save_path])
        except Exception as e:
            print(f"⚠️ Error processing {file_path}: {e}")
    else:
        missing.append(image_id)

# Save refined CSV
df_refined = pd.DataFrame(valid_records, columns=["image_id", "label", "path"])
df_refined.to_csv("refined_labels.csv", index=False)

print(f"✅ Refined dataset saved at: {output_dir}")
print(f"✅ Metadata saved as refined_labels.csv")
print(f"✅ Valid images: {len(valid_records)} | ❌ Missing images: {len(missing)}")


⚠️ Error processing C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train\nevus\aug_0_1003.jpg: name 'Image' is not defined
⚠️ Error processing C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train\nevus\aug_0_101.jpg: name 'Image' is not defined
⚠️ Error processing C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train\nevus\aug_0_1026.jpg: name 'Image' is not defined
⚠️ Error processing C:\Users\Administrator\Documents\FYP(skin cancer)\Skin-Cancer-Detection-Model-1\combined dataset\Skin cancer ISIC The International Skin Imaging Collaboration\Train\nevus\aug_0_1050.jpg: name 'Image' is not defined
⚠️ Error processing C:\Users\Administrator\Documents\FYP(skin