In [3]:
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [4]:
isic_2024_challenge_path = kagglehub.competition_download('isic-2024-challenge')

Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/isic-2024-challenge...


100%|██████████| 2.00G/2.00G [01:47<00:00, 20.0MB/s]

Extracting files...





In [5]:
import pandas as pd
import numpy as np
import os
import cv2
import torch
from PIL import Image

import h5py
from tqdm import tqdm
import gc

import torch.nn as nn
from torchvision import models
import hashlib
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as transforms
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from collections import Counter


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings("ignore")

# Adjusting Row Column Settings

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [6]:
## LOAD DATA

final_data_path = '/content/drive/MyDrive/Skin Cancer Detection/preprocessing_data.csv'
final_data = pd.read_csv(final_data_path)

train_image_dir = f"{isic_2024_challenge_path}/train-image"

In [7]:
final_data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,381533
1,393


In [8]:
##########################
# IMAGE PREPROCESSING CLASS
##########################

class CustomResizeCenterCrop:
    def __init__(self, target_size):
        if isinstance(target_size, int):
            self.target_size = (target_size, target_size)
        else:
            self.target_size = target_size

    def __call__(self, img):
        width, height = img.size
        scale = min(self.target_size[0] / width, self.target_size[1] / height)
        new_width = int(width * scale)
        new_height = int(height * scale)
        img_resized = img.resize((new_width, new_height), Image.LANCZOS)
        return self._center_crop(img_resized)

    def _center_crop(self, img):
        width, height = img.size
        target_width, target_height = self.target_size
        left = (width - target_width) // 2
        top = (height - target_height) // 2
        right = left + target_width
        bottom = top + target_height
        return img.crop((left, top, right, bottom))




class AdaptiveHairRemoval:
    def __init__(self, kernel_sizes=[11, 15, 17], threshold_range=(5, 20), cache_size=100):
        self.kernel_sizes = kernel_sizes
        self.threshold_range = threshold_range
        self.cache = {}
        self.cache_size = cache_size

    def _get_image_hash(self, image):
        return hashlib.md5(np.array(image).tobytes()).hexdigest()

    def remove_hair(self, image):
        image_hash = self._get_image_hash(image)
        if image_hash in self.cache:
            return self.cache[image_hash]

        image_np = np.array(image)
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        best_result = None
        best_score = float('inf')

        for kernel_size in self.kernel_sizes:
            for threshold in range(*self.threshold_range):
                result = self._hair_removal_attempt(image_np, gray, kernel_size, threshold)
                score = self._evaluate_hair_removal(result)
                if score < best_score:
                    best_result = result
                    best_score = score

        result_image = Image.fromarray(cv2.cvtColor(best_result, cv2.COLOR_BGR2RGB))
        self._update_cache(image_hash, result_image)
        return result_image

    def _hair_removal_attempt(self, image_np, gray, kernel_size, threshold):
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
        blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
        _, thresh = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)
        return cv2.inpaint(image_np, thresh, inpaintRadius=1, flags=cv2.INPAINT_TELEA)

    def _evaluate_hair_removal(self, image):
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 100, 200)
        return np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])

    def _update_cache(self, key, value):
        if len(self.cache) >= self.cache_size:
            self.cache.pop(next(iter(self.cache)))
        self.cache[key] = value

In [9]:
################
# DATASET CLASS
################

class SkinCancerDataset(Dataset):
    def __init__(self, df, image_dir, target_size=(224, 224)):
        self.df = df
        self.image_dir = image_dir
        self.target_size = target_size
        self.hair_remover = AdaptiveHairRemoval()
        self.tabular_features = [col for col in df.columns if col not in ['target', 'img_paths']]
        self.feature_means = df[self.tabular_features].mean()
        self.feature_stds = df[self.tabular_features].std()

        self.transform = transforms.Compose([
            CustomResizeCenterCrop(self.target_size),
            transforms.Lambda(self._hair_remove),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def _hair_remove(self, image):
        return self.hair_remover.remove_hair(image)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.df.iloc[idx]['img_paths'])
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        tabular_features = (self.df.iloc[idx][self.tabular_features] - self.feature_means) / self.feature_stds
        tabular_features = torch.tensor(tabular_features.values.astype(np.float32))
        label = torch.tensor(self.df.iloc[idx]['target'], dtype=torch.long)
        return image, tabular_features, label

    def __len__(self):
        return len(self.df)

In [10]:
####################
# IMBALANCED DATASET
####################

from collections import Counter

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, cache_enabled=True):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.cache_enabled = cache_enabled
        self.cache = {}

    def forward(self, inputs, targets):
        inputs_hash = self._get_input_hash(inputs, targets)
        if self.cache_enabled and inputs_hash in self.cache:
            return self.cache[inputs_hash]

        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        if self.cache_enabled:
            self.cache[inputs_hash] = focal_loss.mean()
        return focal_loss.mean()

    def _get_input_hash(self, inputs, targets):
        inputs_cpu = inputs.cpu()
        targets_cpu = targets.cpu()
        return hash((inputs_cpu.data.numpy().tobytes(), targets_cpu.data.numpy().tobytes()))

In [11]:
####################
# DATA PREPROCESSING
####################

zero_samples = final_data[final_data['target'] == 0]

if len(zero_samples) > 1000:
    zero_samples_sampled = zero_samples.sample(n=1000, random_state=42)
else:
    zero_samples_sampled = zero_samples

final_data_sampled = pd.concat([zero_samples_sampled, final_data[final_data['target'] == 1]])




train_val_df, test_df = train_test_split(
    final_data_sampled,
    test_size=0.15,
    stratify=final_data_sampled['target'],
    random_state=42
)


effective_val_size = 0.15 / (1 - 0.15)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=effective_val_size,
    stratify=train_val_df['target'],
    random_state=42
)



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def calculate_class_weights(train_df):
    class_counts = train_df['target'].value_counts()
    total = len(train_df)
    class_weights = {cls: total / (len(class_counts) * count)
                    for cls, count in class_counts.items()}
    return torch.FloatTensor([class_weights[cls]
                             for cls in sorted(class_weights.keys())])


class_weights = calculate_class_weights(train_df)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

train_dataset = SkinCancerDataset(train_df, train_image_dir)
val_dataset = SkinCancerDataset(val_df, train_image_dir)
test_dataset = SkinCancerDataset(test_df, train_image_dir)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)

In [12]:
#######################
# DENSENET HYBRID MODEL
#######################


class DenseNetHybrid(nn.Module):
    def __init__(self, num_tabular_features, num_classes=2, dropout_rate=0.5):
        super().__init__()

        self.image_model = models.densenet121(pretrained=True)
        num_image_features = self.image_model.classifier.in_features
        self.image_model.classifier = nn.Identity()

        self.tabular_network = nn.Sequential(
            nn.BatchNorm1d(num_tabular_features),
            nn.Linear(num_tabular_features, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.classifier = nn.Sequential(
            nn.Linear(num_image_features + 64, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.tabular_network.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        for m in self.classifier.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, images, tabular_data):
        image_features = self.image_model(images)
        tabular_features = self.tabular_network(tabular_data)
        combined_features = torch.cat([image_features, tabular_features], dim=1)
        output = self.classifier(combined_features)
        return output

In [13]:
##############
# TRAIN CLASS
##############

class ModelTrainer:
    def __init__(self, model, train_loader, val_loader, criterion, optimizer,
                 scheduler, device, num_epochs=50, early_stopping_patience=7):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.num_epochs = num_epochs
        self.early_stopping_patience = early_stopping_patience
        self.best_val_auc = 0
        self.patience_counter = 0

    def train_epoch(self):
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0

        for images, tabular, labels in self.train_loader:
            images = images.to(self.device)
            tabular = tabular.to(self.device)
            labels = labels.to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(images, tabular)
            loss = self.criterion(outputs, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        return total_loss / len(self.train_loader), correct / total

    def validate(self):
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for images, tabular, labels in self.val_loader:
                images = images.to(self.device)
                tabular = tabular.to(self.device)
                labels = labels.to(self.device)

                outputs = self.model(images, tabular)
                loss = self.criterion(outputs, labels)

                total_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                all_preds.extend(outputs.softmax(dim=1)[:, 1].cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        auc = roc_auc_score(all_labels, all_preds)
        return total_loss / len(self.val_loader), correct / total, auc

    def train(self):
        print("Starting training...")
        for epoch in range(self.num_epochs):
            train_loss, train_acc = self.train_epoch()
            val_loss, val_acc, val_auc = self.validate()
            self.scheduler.step(val_auc)

            print(f'Epoch {epoch+1}/{self.num_epochs}:')
            print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
            print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val AUC: {val_auc:.4f}')
            print(f'Learning Rate: {self.optimizer.param_groups[0]["lr"]:.6f}')

            if val_auc > self.best_val_auc:
                self.best_val_auc = val_auc
                self.patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_auc': val_auc,
                }, 'best_model.pth')
                print("Saved new best model!")
            else:
                self.patience_counter += 1

            if self.patience_counter >= self.early_stopping_patience:
                print(f'Early stopping triggered after {epoch+1} epochs')
                break

            print('-' * 50)

In [14]:
#######
# TRAIN
#######

def initialize_training(train_loader, val_loader, num_tabular_features, device):
    model = DenseNetHybrid(num_tabular_features=num_tabular_features).to(device)

    ct = 0
    for child in model.image_model.features.children():
        ct += 1
        if ct < 11:
            for param in child.parameters():
                param.requires_grad = False

    densenet_params = list(model.image_model.parameters())
    other_params = list(model.tabular_network.parameters()) + list(model.classifier.parameters())

    optimizer = optim.AdamW([{'params': densenet_params, 'lr': 1e-4},{'params': other_params, 'lr': 3e-4}], weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)
    criterion = FocalLoss(alpha=1, gamma=2)

    trainer = ModelTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        num_epochs=50,
        early_stopping_patience=7)

    return trainer


num_tabular_features = len(train_dataset.tabular_features)
trainer = initialize_training(train_loader, val_loader, num_tabular_features, device)
trainer.train()

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:00<00:00, 198MB/s]


Starting training...
Epoch 1/50:
Train Loss: 0.7216, Train Acc: 0.6236
Val Loss: 0.1479, Val Acc: 0.7321, Val AUC: 0.7092
Learning Rate: 0.000100
Saved new best model!
--------------------------------------------------
Epoch 2/50:
Train Loss: 0.3875, Train Acc: 0.6646
Val Loss: 0.1351, Val Acc: 0.7368, Val AUC: 0.7667
Learning Rate: 0.000100
Saved new best model!
--------------------------------------------------
Epoch 3/50:
Train Loss: 0.2920, Train Acc: 0.6749
Val Loss: 0.1329, Val Acc: 0.7273, Val AUC: 0.7593
Learning Rate: 0.000100
--------------------------------------------------
Epoch 4/50:
Train Loss: 0.2330, Train Acc: 0.6831
Val Loss: 0.1251, Val Acc: 0.7799, Val AUC: 0.8234
Learning Rate: 0.000100
Saved new best model!
--------------------------------------------------
Epoch 5/50:
Train Loss: 0.2091, Train Acc: 0.7118
Val Loss: 0.1187, Val Acc: 0.7751, Val AUC: 0.7829
Learning Rate: 0.000100
--------------------------------------------------
Epoch 6/50:
Train Loss: 0.2028, T