Import Libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import hashlib
import shutil


Load Dataset

In [2]:
data_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"

categories = os.listdir(data_dir)   # Each folder = one class
print("Classes:", categories)

data = []
img_size = 128   # Resize all images to 128x128

for category in categories:
    path = os.path.join(data_dir, category)
    class_num = categories.index(category)
    for img in tqdm(os.listdir(path)):
        try:
            img_array = cv2.imread(os.path.join(path, img))
            img_array = cv2.resize(img_array, (img_size, img_size))
            data.append([img_array, class_num])
        except Exception as e:
            pass


Classes: ['actinic keratosis', 'basal cell carcinoma', 'blurry_images', 'blurry_images_edges', 'dermatofibroma', 'melanoma', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


100%|██████████| 1152/1152 [00:02<00:00, 479.00it/s]
100%|██████████| 1438/1438 [00:03<00:00, 477.18it/s]
100%|██████████| 1157/1157 [00:01<00:00, 587.06it/s]
100%|██████████| 1361/1361 [00:04<00:00, 274.17it/s]
100%|██████████| 1358/1358 [00:05<00:00, 248.03it/s]
100%|██████████| 1401/1401 [00:04<00:00, 304.40it/s]
100%|██████████| 1169/1169 [00:04<00:00, 268.36it/s]
100%|██████████| 1287/1287 [00:04<00:00, 265.38it/s]
100%|██████████| 1101/1101 [00:02<00:00, 438.08it/s]
100%|██████████| 1204/1204 [00:02<00:00, 475.04it/s]
100%|██████████| 1169/1169 [00:02<00:00, 461.05it/s]


Label Cleaning

In [3]:
import pandas as pd
import os

# Paths to your dataset
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Test"

# Function to extract labels from folder names
def build_labels_from_folders(base_dir):
    records = []
    for class_name in os.listdir(base_dir):
        class_path = os.path.join(base_dir, class_name)
        if os.path.isdir(class_path):
            for img_name in os.listdir(class_path):
                image_id, ext = os.path.splitext(img_name)
                records.append([image_id, class_name.lower().strip()])
    return pd.DataFrame(records, columns=["image_id", "label"])

# Build label dataframe from Train + Test
train_labels = build_labels_from_folders(train_dir)
test_labels  = build_labels_from_folders(test_dir)

labels = pd.concat([train_labels, test_labels], ignore_index=True)

# Step 1: Standardize labels
labels["label"] = labels["label"].str.lower().str.strip()

# Step 2: Keep only valid classes
valid_classes = ["melanoma", "nevus", "bcc", "akiec", "bkl", "df", "vasc"]
labels = labels[labels["label"].isin(valid_classes)]

# Step 3: Drop duplicate image IDs
labels = labels.drop_duplicates(subset=["image_id"])

# Save cleaned labels
labels.to_csv("labels_cleaned.csv", index=False)
print("✅ Labels cleaned and saved as labels_cleaned.csv")


✅ Labels cleaned and saved as labels_cleaned.csv


Clean Data


Hashing Images to Detect Duplicates:

1. Converts image → grayscale → resized.

2. Computes a hash (dhash).

In [4]:



def dhash(image, hash_size=8):
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Resize the image
    resized = cv2.resize(gray, (hash_size + 1, hash_size))
    # Compute the difference
    diff = resized[:, 1:] > resized[:, :-1]
    # Convert to hex
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

def remove_duplicates(folder_path):
    hashes = {}
    duplicates = []

    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path):
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                h = dhash(img)
                if h in hashes:
                    duplicates.append(img_path)
                    os.remove(img_path)   # 🔥 remove duplicate
                else:
                    hashes[h] = img_path
            except:
                continue

    print(f"Removed {len(duplicates)} duplicate images.")

# Example usage:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_duplicates(train_dir)

test_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
remove_duplicates(test_dir)


Checking actinic keratosis:   4%|▎         | 41/1152 [00:00<00:02, 406.96it/s]

Checking actinic keratosis: 100%|██████████| 1152/1152 [00:01<00:00, 724.93it/s]
Checking basal cell carcinoma: 100%|██████████| 1438/1438 [00:01<00:00, 956.66it/s] 
Checking blurry_images: 100%|██████████| 1157/1157 [00:01<00:00, 680.30it/s]
Checking blurry_images_edges: 100%|██████████| 1361/1361 [00:02<00:00, 557.61it/s]
Checking dermatofibroma: 100%|██████████| 1358/1358 [00:02<00:00, 594.82it/s]
Checking melanoma: 100%|██████████| 1401/1401 [00:02<00:00, 590.77it/s]
Checking nevus: 100%|██████████| 1169/1169 [00:02<00:00, 538.58it/s]
Checking pigmented benign keratosis: 100%|██████████| 1287/1287 [00:02<00:00, 525.31it/s]
Checking seborrheic keratosis: 100%|██████████| 1101/1101 [00:01<00:00, 672.02it/s]
Checking squamous cell carcinoma: 100%|██████████| 1204/1204 [00:01<00:00, 649.01it/s]
Checking vascular lesion: 100%|██████████| 1169/1169 [00:01<00:00, 593.49it/s]


Removed 0 duplicate images.


Checking actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 157.43it/s]
Checking basal cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 313.40it/s]
Checking blurry_images: 100%|██████████| 7/7 [00:00<00:00, 191.76it/s]
Checking dermatofibroma: 100%|██████████| 16/16 [00:00<00:00, 240.14it/s]
Checking melanoma: 100%|██████████| 14/14 [00:00<00:00, 543.50it/s]
Checking nevus: 100%|██████████| 13/13 [00:00<00:00, 522.64it/s]
Checking pigmented benign keratosis: 100%|██████████| 15/15 [00:00<00:00, 430.87it/s]
Checking seborrheic keratosis: 100%|██████████| 3/3 [00:00<00:00, 187.07it/s]
Checking squamous cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 356.96it/s]
Checking vascular lesion: 100%|██████████| 2/2 [00:00<00:00, 297.74it/s]

Removed 0 duplicate images.





Blur Detection using Laplacian Variance

The idea:

1. Compute the Laplacian of the image.

2. Calculate the variance.    variance < threshold → image is blurry.

3. Low variance → blurry image

4. High variance → sharp image

threshold=100

In [5]:

def detect_blur(image_path, threshold=100):
    """Return True if the image is blurry"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return True   # Treat unreadable image as bad
    laplacian_var = cv2.Laplacian(img, cv2.CV_64F).var()
    return laplacian_var < threshold

def remove_blurry_images(folder_path, threshold=100, move_to_folder=True):
    blurry_count = 0
    blurry_folder = os.path.join(folder_path, "blurry_images")

    if move_to_folder and not os.path.exists(blurry_folder):
        os.makedirs(blurry_folder)

    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path) or category == "blurry_images":
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                if detect_blur(img_path, threshold):
                    blurry_count += 1
                    if move_to_folder:
                        shutil.move(img_path, os.path.join(blurry_folder, img_name))
                    else:
                        os.remove(img_path)
            except:
                continue

    print(f"Removed/flagged {blurry_count} blurry images.")

# Example usage:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_blurry_images(train_dir, threshold=100, move_to_folder=True)

test_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
remove_blurry_images(test_dir, threshold=100, move_to_folder=True)


Checking actinic keratosis:   0%|          | 0/1152 [00:00<?, ?it/s]

Checking actinic keratosis: 100%|██████████| 1152/1152 [00:02<00:00, 402.59it/s]
Checking basal cell carcinoma: 100%|██████████| 1438/1438 [00:03<00:00, 476.67it/s]
Checking blurry_images_edges: 100%|██████████| 1361/1361 [00:06<00:00, 213.72it/s]
Checking dermatofibroma: 100%|██████████| 1358/1358 [00:04<00:00, 335.98it/s]
Checking melanoma: 100%|██████████| 1401/1401 [00:04<00:00, 335.36it/s]
Checking nevus: 100%|██████████| 1169/1169 [00:04<00:00, 250.35it/s]
Checking pigmented benign keratosis: 100%|██████████| 1287/1287 [00:04<00:00, 293.77it/s]
Checking seborrheic keratosis: 100%|██████████| 1101/1101 [00:03<00:00, 353.40it/s]
Checking squamous cell carcinoma: 100%|██████████| 1204/1204 [00:04<00:00, 244.64it/s]
Checking vascular lesion: 100%|██████████| 1169/1169 [00:06<00:00, 173.91it/s]


Removed/flagged 3502 blurry images.


Checking actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 280.19it/s]
Checking basal cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 281.85it/s]
Checking dermatofibroma: 100%|██████████| 16/16 [00:00<00:00, 313.69it/s]
Checking melanoma: 100%|██████████| 14/14 [00:00<00:00, 619.32it/s]
Checking nevus: 100%|██████████| 13/13 [00:00<00:00, 358.17it/s]
Checking pigmented benign keratosis: 100%|██████████| 15/15 [00:00<00:00, 319.83it/s]
Checking seborrheic keratosis: 100%|██████████| 3/3 [00:00<00:00, 405.89it/s]
Checking squamous cell carcinoma: 100%|██████████| 16/16 [00:00<00:00, 374.70it/s]
Checking vascular lesion: 100%|██████████| 2/2 [00:00<00:00, 540.71it/s]

Removed/flagged 0 blurry images.





Canny edge detector:
Count number of edge pixels.

If edge count < threshold → image is blurry → move it to a blurry_images_edges folder

threshold=50


In [6]:
def is_blurry_edge(image_path, threshold=50):
    """Detect blur using edge detection method"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return True
    
    # Apply Canny Edge Detector
    edges = cv2.Canny(img, 100, 200)
    edge_strength = cv2.countNonZero(edges)  # how many edge pixels exist
    
    return edge_strength < threshold  # If too few edges → blurry

def remove_blurry_images_with_edges(folder_path, threshold=50):
    blurry_folder = os.path.join(folder_path, "blurry_images_edges")
    if not os.path.exists(blurry_folder):
        os.makedirs(blurry_folder)

    blurry_count = 0
    for category in os.listdir(folder_path):
        class_path = os.path.join(folder_path, category)
        if not os.path.isdir(class_path) or category == "blurry_images_edges":
            continue

        for img_name in tqdm(os.listdir(class_path), desc=f"Checking {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                if is_blurry_edge(img_path, threshold):
                    blurry_count += 1
                    shutil.move(img_path, os.path.join(blurry_folder, img_name))
            except:
                continue

    print(f"Moved {blurry_count} blurry images to {blurry_folder}")

# Example usage
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
remove_blurry_images_with_edges(train_dir, threshold=50)


Checking actinic keratosis:   2%|▏         | 21/982 [00:00<00:04, 203.60it/s]

Checking actinic keratosis: 100%|██████████| 982/982 [00:01<00:00, 784.51it/s]
Checking basal cell carcinoma: 100%|██████████| 1095/1095 [00:01<00:00, 837.07it/s]
Checking blurry_images: 100%|██████████| 3869/3869 [00:30<00:00, 127.25it/s]
Checking dermatofibroma: 100%|██████████| 934/934 [00:00<00:00, 1028.15it/s]
Checking melanoma: 100%|██████████| 1190/1190 [00:01<00:00, 947.42it/s]
Checking nevus: 100%|██████████| 812/812 [00:00<00:00, 995.36it/s] 
Checking pigmented benign keratosis: 100%|██████████| 1068/1068 [00:01<00:00, 954.65it/s]
Checking seborrheic keratosis: 100%|██████████| 1004/1004 [00:01<00:00, 924.50it/s]
Checking squamous cell carcinoma: 100%|██████████| 905/905 [00:00<00:00, 951.91it/s]
Checking vascular lesion: 100%|██████████| 600/600 [00:00<00:00, 971.00it/s] 

Moved 379 blurry images to C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train\blurry_images_edges





Normalize + Resize

In [7]:
# Path to your dataset (already cleaned and augmented)
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Test"

img_size = 128  # you can also use 224 for ResNet/EfficientNet

def load_and_preprocess_images(data_dir, img_size=128):
    X = []
    y = []
    categories = [cls for cls in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, cls))]

    for category in categories:
        class_path = os.path.join(data_dir, category)
        label = categories.index(category)

        for img_name in tqdm(os.listdir(class_path), desc=f"Loading {category}"):
            img_path = os.path.join(class_path, img_name)
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                img = cv2.resize(img, (img_size, img_size))   # Resize
                img = img.astype("float32") / 255.0           # Normalize
                X.append(img)
                y.append(label)
            except:
                continue

    X = np.array(X, dtype="float32")
    y = np.array(y)
    return X, y, categories

# Process Train and Test
X_train, y_train, categories = load_and_preprocess_images(train_dir, img_size)
X_test, y_test, _ = load_and_preprocess_images(test_dir, img_size)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
print("Classes:", categories)


Loading actinic keratosis: 100%|██████████| 982/982 [00:02<00:00, 450.51it/s]
Loading basal cell carcinoma: 100%|██████████| 1095/1095 [00:01<00:00, 689.09it/s]
Loading blurry_images: 100%|██████████| 3491/3491 [00:07<00:00, 451.98it/s]
Loading blurry_images_edges: 100%|██████████| 926/926 [00:02<00:00, 424.00it/s]
Loading dermatofibroma: 100%|██████████| 933/933 [00:02<00:00, 456.26it/s]
Loading melanoma: 100%|██████████| 1190/1190 [00:02<00:00, 405.96it/s]
Loading nevus: 100%|██████████| 812/812 [00:01<00:00, 439.39it/s]
Loading pigmented benign keratosis: 100%|██████████| 1068/1068 [00:02<00:00, 448.33it/s]
Loading seborrheic keratosis: 100%|██████████| 1004/1004 [00:02<00:00, 412.04it/s]
Loading squamous cell carcinoma: 100%|██████████| 905/905 [00:02<00:00, 428.18it/s]
Loading vascular lesion: 100%|██████████| 600/600 [00:01<00:00, 443.18it/s]
Loading actinic keratosis: 100%|██████████| 16/16 [00:00<00:00, 1264.44it/s]
Loading basal cell carcinoma: 100%|██████████| 16/16 [00:00<00

Training set shape: (13006, 128, 128, 3) (13006,)
Testing set shape: (118, 128, 128, 3) (118,)
Classes: ['actinic keratosis', 'basal cell carcinoma', 'blurry_images', 'blurry_images_edges', 'dermatofibroma', 'melanoma', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


Split Train/Test

In [11]:
# Split into Train and Validation (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)


NameError: name 'X' is not defined

Balance the Dataset using:

Oversampling

Duplicate minority class samples → risk of overfitting.

In [63]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)
X_resampled, y_resampled = ros.fit_resample(X.reshape(len(X), -1), y)

X_resampled = X_resampled.reshape(-1, img_size, img_size, 3)
print("After oversampling:", X_resampled.shape, y_resampled.shape)


After oversampling: (4410, 128, 128, 3) (4410,)


Data Augmentation (Refinement)

Step 1: Count Images per Class

In [64]:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"

class_counts = {}
for cls in os.listdir(train_dir):
    cls_path = os.path.join(train_dir, cls)
    if os.path.isdir(cls_path):
        class_counts[cls] = len(os.listdir(cls_path))

print("Class distribution before augmentation:")
print(class_counts)



Class distribution before augmentation:
{'actinic keratosis': 401, 'basal cell carcinoma': 494, 'blurry_images': 1157, 'blurry_images_edges': 473, 'dermatofibroma': 288, 'melanoma': 724, 'nevus': 402, 'pigmented benign keratosis': 334, 'seborrheic keratosis': 389, 'squamous cell carcinoma': 419, 'vascular lesion': 245}


Step 2: Identify Majority and Minority Classes

In [65]:
max_count = max(class_counts.values())
minority_classes = [cls for cls, count in class_counts.items() if count < max_count]

print("Majority class has:", max_count, "images")
print("Minority classes (to augment):", minority_classes)


Majority class has: 1157 images
Minority classes (to augment): ['actinic keratosis', 'basal cell carcinoma', 'blurry_images_edges', 'dermatofibroma', 'melanoma', 'nevus', 'pigmented benign keratosis', 'seborrheic keratosis', 'squamous cell carcinoma', 'vascular lesion']


Step 3: Apply Augmentation Only on Minority Classes

In [66]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define augmentation
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode="nearest"
)

img_size = 128  # resize for consistency

# Augment only minority classes
for cls in minority_classes:
    cls_path = os.path.join(train_dir, cls)
    images = os.listdir(cls_path)
    needed = max_count - len(images)   # how many extra images we need

    print(f"Augmenting {cls}: need {needed} more images")

    for img_name in images:
        img_path = os.path.join(cls_path, img_name)
        img = cv2.imread(img_path)
        if img is None:
            continue
        img = cv2.resize(img, (img_size, img_size))
        img = np.expand_dims(img, axis=0)  # add batch dimension

        i = 0
        for batch in datagen.flow(img, batch_size=1,
                                  save_to_dir=cls_path,
                                  save_prefix="aug",
                                  save_format="jpg"):
            i += 1
            if i >= needed // len(images) + 1:  # spread augmentations
                break


Augmenting actinic keratosis: need 756 more images
Augmenting basal cell carcinoma: need 663 more images
Augmenting blurry_images_edges: need 684 more images
Augmenting dermatofibroma: need 869 more images
Augmenting melanoma: need 433 more images
Augmenting nevus: need 755 more images
Augmenting pigmented benign keratosis: need 823 more images
Augmenting seborrheic keratosis: need 768 more images
Augmenting squamous cell carcinoma: need 738 more images
Augmenting vascular lesion: need 912 more images


Normalize pixel values

In [67]:
data_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"

categories = os.listdir(data_dir)
img_size = 128   # resize to 128x128

X = []
y = []

for category in categories:
    class_path = os.path.join(data_dir, category)
    if not os.path.isdir(class_path):
        continue
    
    class_num = categories.index(category)  # assign label
    for img_name in tqdm(os.listdir(class_path), desc=f"Loading {category}"):
        try:
            img_path = os.path.join(class_path, img_name)
            img = cv2.imread(img_path)
            if img is None:
                continue
            img = cv2.resize(img, (img_size, img_size))
            X.append(img)
            y.append(class_num)
        except:
            continue

# Convert to NumPy arrays
X = np.array(X, dtype="float32")
y = np.array(y)

# Normalize pixel values
X = X / 255.0
print("Data shape:", X.shape)
print("Label shape:", y.shape)

Loading actinic keratosis: 100%|██████████| 1152/1152 [00:21<00:00, 53.54it/s] 
Loading basal cell carcinoma: 100%|██████████| 1438/1438 [00:22<00:00, 62.72it/s] 
Loading blurry_images: 100%|██████████| 1157/1157 [00:22<00:00, 51.15it/s]
Loading blurry_images_edges: 100%|██████████| 1361/1361 [00:44<00:00, 30.80it/s]
Loading dermatofibroma: 100%|██████████| 1358/1358 [00:35<00:00, 38.52it/s]
Loading melanoma: 100%|██████████| 1401/1401 [00:24<00:00, 58.20it/s] 
Loading nevus: 100%|██████████| 1169/1169 [00:26<00:00, 44.57it/s] 
Loading pigmented benign keratosis: 100%|██████████| 1287/1287 [00:29<00:00, 43.56it/s] 
Loading seborrheic keratosis: 100%|██████████| 1101/1101 [00:23<00:00, 47.44it/s]
Loading squamous cell carcinoma: 100%|██████████| 1204/1204 [00:17<00:00, 70.40it/s] 
Loading vascular lesion: 100%|██████████| 1169/1169 [00:28<00:00, 41.52it/s] 


Data shape: (13797, 128, 128, 3)
Label shape: (13797,)


Handle Class Imbalance

In [68]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
print("Class Weights:", dict(enumerate(class_weights)))

Class Weights: {0: np.float64(1.8461538461538463), 1: np.float64(0.631578947368421), 2: np.float64(1.6), 3: np.float64(2.3661971830985915), 4: np.float64(0.49122807017543857), 5: np.float64(0.7636363636363637), 6: np.float64(0.47592067988668557), 7: np.float64(84.0), 8: np.float64(1.1748251748251748), 9: np.float64(1.9310344827586208)}


Save Refined Dataset

In [78]:
train_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Train"
test_dir  = r"C:\Users\Administrator\Documents\FYP(skin cancer)\archive (7)\Skin cancer ISIC The International Skin Imaging Collaboration\Test"
output_dir = r"C:\Users\Administrator\Documents\FYP(skin cancer)\Refined_Dataset"

# Load cleaned labels
labels = pd.read_csv("labels_cleaned.csv")

# Parameters
target_size = (224, 224)  # resize all images to 224x224

# Check valid files
valid_records = []
missing = []

for _, row in labels.iterrows():
    image_id = row["image_id"]
    label = row["label"]

    found = False
    for ext in [".jpg", ".jpeg", ".png"]:
        train_path = os.path.join(train_dir, label, image_id + ext)
        test_path = os.path.join(test_dir, label, image_id + ext)

        if os.path.exists(train_path):
            file_path = train_path
            found = True
            break
        elif os.path.exists(test_path):
            file_path = test_path
            found = True
            break

    if found:
        # Prepare save path
        save_folder = os.path.join(output_dir, label)
        os.makedirs(save_folder, exist_ok=True)
        save_path = os.path.join(save_folder, image_id + ".jpg")

        try:
            # Open, resize, normalize (0-255 → 0-1 range), then save back
            img = Image.open(file_path).convert("RGB")
            img = img.resize(target_size)
            img.save(save_path, "JPEG")

            valid_records.append([image_id, label, save_path])
        except Exception as e:
            print(f"⚠️ Error processing {file_path}: {e}")
    else:
        missing.append(image_id)

# Save refined CSV
df_refined = pd.DataFrame(valid_records, columns=["image_id", "label", "path"])
df_refined.to_csv("refined_labels.csv", index=False)

print(f"✅ Refined dataset saved at: {output_dir}")
print(f"✅ Metadata saved as refined_labels.csv")
print(f"✅ Valid images: {len(valid_records)} | ❌ Missing images: {len(missing)}")


✅ Refined dataset saved at: C:\Users\Administrator\Documents\FYP(skin cancer)\Refined_Dataset
✅ Metadata saved as refined_labels.csv
✅ Valid images: 1149 | ❌ Missing images: 0
