In [None]:
# Colab only: mount Google Drive under /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive; the relative paths used later
# (e.g. 'isic2024/...') are resolved against this directory.
%cd /content/drive/MyDrive/skin_kaggle
!ls

/content/drive/MyDrive/skin_kaggle
isic2024				      kaggle.json     resnet18_test_pred.csv
isic-2024-challenge.zip			      models	      resnet18_train_pred.csv
Kaggle_ISIC_2024_Skin_Cancer_Detection.ipynb  resnet18.ipynb


In [None]:
!pip install timm

Collecting timm
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading timm-1.0.9-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: timm
Successfully installed timm-1.0.9


In [None]:
import torch
import numpy as np
import os, io
from sklearn.metrics import roc_auc_score
import pandas as pd

import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.optim import AdamW
import timm
import sys
from tqdm import tqdm

from PIL import Image

import albumentations as A

import math, random
import h5py

In [None]:
NOT_DEBUG = True # True -> run normally, False -> debug mode with a lower compute cost

# NOTE: despite the "train_" prefix, both paths point at the *test* split files.
train_data_path = 'isic2024/test-image.hdf5'
train_meta_path = 'isic2024/test-metadata.csv'

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Number of DataLoader worker processes.
N_WORKERS = 32
# Enables the autocast (mixed-precision) context in the inference loop.
USE_AMP = True
SEED = 3407

# Target image size passed to A.Resize (first two positional arguments).
IMG_SIZE = [112, 112]
IN_CHANS = 3
# Single output unit; ISICModel applies a sigmoid to it.
N_CLASSES = 1

AUG_PROB = 0.75

# The constants below look like training hyper-parameters; apart from MODEL_NAME
# they are not referenced by the cells visible in this notebook.
N_FOLDS = 5 if NOT_DEBUG else 2
EPOCHS = 10 if NOT_DEBUG else 2
MODEL_NAME = 'resnet18'

GRAD_ACC = 1
TGT_BATCH_SIZE = 256
# Per-step batch size so that GRAD_ACC steps add up to TGT_BATCH_SIZE samples.
BATCH_SIZE = TGT_BATCH_SIZE // GRAD_ACC
MAX_GRAD_NORM = None
EARLY_STOPPING_EPOCH = 10

# Learning rate scaled linearly with the batch size (base 2e-4 at batch size 32).
LR = 2e-4 * TGT_BATCH_SIZE / 32
WD = 1e-2
AUG = True

In [None]:

def set_random_seed(seed: int = 3407, deterministic: bool = False):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value used for every generator.
        deterministic: When True, request deterministic cuDNN kernels and
            switch off the cuDNN autotuner, which may otherwise select
            different algorithms from run to run. When False (default) the
            autotuner stays on for speed, as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects Python processes started after this point (e.g. workers);
    # the hash seed of the running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    # Benchmark mode and determinism are mutually exclusive goals.
    torch.backends.cudnn.benchmark = not deterministic
    torch.backends.cudnn.deterministic = deterministic  # type: ignore

# Seed the Python / NumPy / PyTorch generators once, before any data or model objects are created.
set_random_seed(SEED)



# image_path = '/kaggle/input/isic-2024-challenge/train-image/image'
class ISICDataset(Dataset):
    """Dataset yielding ``(isic_id, image)`` pairs decoded from an HDF5 archive.

    Args:
        df: Metadata frame with an ``isic_id`` column; one row per sample.
        test_hd5: Path of the HDF5 file. Each ``isic_id`` is used as a key and
            the stored value is the encoded image as bytes.
        transform: Optional albumentations-style callable invoked as
            ``transform(image=...)`` and returning a dict with an ``'image'``
            entry.

    The HDF5 file is opened lazily and read-only the first time a sample is
    requested, so every DataLoader worker process ends up with its own handle
    instead of sharing one that was opened in the parent process.
    """

    def __init__(self, df, test_hd5, transform=None):
        self.df = df
        self.transform = transform
        self.hd5_path = test_hd5
        self._hd5 = None
        # Fail fast on a bad path (as the previous eager open did), but do not
        # keep the handle around in the parent process.
        with h5py.File(test_hd5, 'r'):
            pass

    @property
    def test_hd5(self):
        """Read-only HDF5 handle, opened on first access in the current process."""
        if self._hd5 is None:
            self._hd5 = h5py.File(self.hd5_path, 'r')
        return self._hd5

    @test_hd5.setter
    def test_hd5(self, handle):
        # Kept so code that assigns ``ds.test_hd5 = h5py.File(...)`` still works.
        self._hd5 = handle

    def __getstate__(self):
        # Never pickle an open handle (relevant for spawn-based workers).
        state = self.__dict__.copy()
        state['_hd5'] = None
        return state

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(isic_id, image)`` with the image in channels-first layout."""
        isic_id = str(self.df['isic_id'].iloc[idx])
        encoded = self.test_hd5[isic_id][()]
        img = np.array(Image.open(io.BytesIO(encoded)).convert('RGB'))
        # Per-image min-max stretch to the 0..255 range; the epsilon guards
        # against division by zero on constant images.
        img = (img - img.min()) / (img.max() - img.min() +1e-6) * 255
        if self.transform:
            img = self.transform(image=img.astype(np.uint8))['image']
        # HWC -> CHW
        return isic_id, img.transpose(2, 0, 1)

In [None]:
# Validation / inference preprocessing: resize to IMG_SIZE, then normalise with mean 0.5 and std 0.5.
transforms_val = A.Compose(
    [
        A.Resize(*IMG_SIZE),
        A.Normalize(mean=0.5, std=0.5),
    ]
)

In [None]:
class ISICModel(nn.Module):
    """timm backbone whose output is passed through a sigmoid.

    Args:
        model_name: Architecture name handed to ``timm.create_model``.
        in_c: Number of input channels (timm's ``in_chans``).
        n_classes: Size of the classifier output (timm's ``num_classes``).
        pretrained: Forwarded to ``timm.create_model`` unchanged.
        features_only: Forwarded to ``timm.create_model`` unchanged.
    """

    def __init__(self, model_name, in_c=30, n_classes=75, pretrained=True, features_only=False):
        super().__init__()
        backbone_kwargs = dict(
            pretrained=pretrained,
            features_only=features_only,
            in_chans=in_c,
            num_classes=n_classes,
            global_pool='avg',
        )
        self.model = timm.create_model(model_name, **backbone_kwargs)

    def forward(self, x):
        """Return ``sigmoid(self.model(x))``."""
        return torch.sigmoid(self.model(x))

In [None]:
import glob

# Kaggle variant, kept for reference: glob every *.pt checkpoint from the attached model dataset.
# checkpoints = glob.glob('/kaggle/input/resnet18/pytorch/resnet18/1/resnet18_baseline_112/*.pt')

# Checkpoints loaded for inference below; currently a single fold-4 file stored on Drive.
checkpoints = ['/content/drive/MyDrive/skin_kaggle/models/best_val_model_fold-4.pt']

In [None]:
# Build one ISICModel per checkpoint and put it in inference mode.
models = []
for c in checkpoints:
    model = ISICModel(MODEL_NAME, in_c=IN_CHANS, n_classes=N_CLASSES, pretrained=False)
    print(model)
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
    # runtime; weights_only=True restricts unpickling to tensors/containers.
    model.load_state_dict(torch.load(c, map_location='cpu', weights_only=True))
    model.eval()
    # Half-precision weights are only used on the GPU path. On CPU the inputs
    # stay float32 (autocast is disabled there), so the weights must too.
    if str(device).startswith('cuda'):
        model.half()
    model.to(device)
    models.append(model)

ISICModel(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop_block): Identity()
        (act1): ReLU(inplace=True)
        (aa): Identity()
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act2): ReLU(inplace=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1

  model.load_state_dict(torch.load(c, map_location=torch.device('cpu')))


In [None]:
# NOTE: the "train_" names are misleading: train_meta_path / train_data_path are set to the
# test metadata CSV and the test image HDF5 in the config cell, so this loader feeds inference.
train_df = pd.read_csv(train_meta_path)
train_hd5 = train_data_path
train_ds = ISICDataset(train_df, test_hd5=train_hd5, transform=transforms_val)
# shuffle=False and drop_last=False: every sample is visited once, in metadata order.
train_dl = DataLoader(
            train_ds,
            batch_size=256,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
            num_workers=N_WORKERS
            )



In [None]:
# Mixed-precision context for inference. torch.autocast('cuda', ...) is the current
# spelling of the deprecated torch.cuda.amp.autocast(...); without CUDA it disables itself.
autocast = torch.autocast(device_type='cuda', enabled=USE_AMP, dtype=torch.half)
y_preds = []  # one probability per image, in loader order
y_names = []  # matching isic_id strings

with tqdm(train_dl, leave=True) as pbar:
    with torch.no_grad():
        for names, imgs in pbar:
            x = imgs.to(device)
            with autocast:
                # One 1-D array of probabilities per model.
                per_model = [m(x).flatten().cpu().numpy() for m in models]
            # Stack to (batch, n_models) and ensemble by taking the maximum
            # probability across models for each image.
            pred_batch = np.stack(per_model, axis=1).max(axis=1)
            y_preds.extend(pred_batch.tolist())
            y_names.extend(names)

# # submission_df.to_csv("/kaggle/input/resnet18/pytorch/resnet18/1/resnet18_baseline_112/resnet18_pred.csv", index=False)


In [None]:
# Collect the per-image ids and ensembled ResNet-18 probabilities into one frame.
test_resnet_18_df = pd.DataFrame({'isic_id':y_names, 'resnet_18_pred':y_preds})

In [None]:
# Preview the ResNet-18 predictions.
display(test_resnet_18_df)

Unnamed: 0,isic_id,resnet_18_pred
0,ISIC_0015657,0.105774
1,ISIC_0015729,0.026764
2,ISIC_0015740,0.041077


In [None]:
# Persist the predictions to the current working directory (no index column).
test_resnet_18_df.to_csv("resnet18_test_pred.csv", index=False)

## Logic: use the ResNet-18 predicted probabilities (sigmoid outputs) as an extra feature, alongside the tabular metadata, to train a LightGBM model

In [None]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    """Partial AUC restricted to the region where TPR >= ``min_tpr``.

    Args:
        solution: Binary ground-truth labels (array-like of 0/1).
        submission: Predicted scores, higher meaning more likely positive.
        min_tpr: Lower bound on the true-positive rate of the region scored.

    Returns:
        The un-normalised partial AUC, in ``[0.5 * max_fpr**2, max_fpr]`` with
        ``max_fpr = |1 - min_tpr|``.
    """
    # Flip labels and scores so that "TPR >= min_tpr" on the original problem
    # becomes "FPR <= 1 - min_tpr", which roc_auc_score can restrict to.
    v_gt = np.abs(np.asarray(solution) - 1)
    # Vectorised; avoids a Python-level loop over every prediction.
    v_pred = 1.0 - np.asarray(submission)
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [None]:
# Load the tabular metadata of both splits with polars (paths relative to the working directory).
train_meta = pl.read_csv('isic2024/train-metadata.csv')
test_meta = pl.read_csv('isic2024/test-metadata.csv')

In [None]:
# Keep only the columns that also exist in the test metadata (plus the label) and
# turn every string column into a polars Categorical.
train_df = (
    train_meta
    .select(test_meta.columns + ['target'])
    .with_columns(pl.col(pl.String).cast(pl.Categorical))
)
# Feature matrix and label as pandas objects (y is a one-column DataFrame).
X = train_df.drop('target').to_pandas()
y = train_df.select('target').to_pandas()

# NOTE(review): this reads a Kaggle input path although the notebook otherwise runs from the
# Drive folder, whose listing shows a resnet18_train_pred.csv — confirm which file is intended.
import pandas as pd
resnet_18_pred = pd.read_csv("/kaggle/input/resnet18-pred-max/resnet18_pred.csv")
# NOTE(review): both frames carry a default positional index here, so the feature is attached
# row-by-row; this assumes the CSV rows are in the same order as train-metadata.csv — verify.
X['resnet_18_pred'] = resnet_18_pred['target']

  .with_columns(pl.col(pl.String).cast(pl.Categorical))


In [None]:
# Names of every feature column that was cast to Categorical above.
# NOTE(review): isic_id is a string column that is also selected from the test metadata, so it
# appears to end up in cat_cols and be fed to LightGBM as a categorical feature — confirm intended.
cat_cols = train_df.drop('target').select(pl.col(pl.Categorical)).columns

# free_raw_data=False keeps the raw pandas data attached to the Dataset.
train_df_dataset = lgb.Dataset(
    X,
    y,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

In [None]:
# Stratified splitter with library defaults (no explicit n_splits / shuffle / random_state).
kf = StratifiedKFold()

def pauc_80(preds, data):
    """Custom LightGBM evaluation hook built on ``score`` with ``min_tpr=0.8``.

    Args:
        preds: Predictions handed over by LightGBM.
        data: Object exposing ``get_label()`` for the evaluated rows.

    Returns:
        ``('pauc_80', value, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    labels = data.get_label()
    return 'pauc_80', score(labels, preds, min_tpr=0.8), True


# LightGBM hyper-parameters for the binary target.
lgb_params = {
    'objective': 'binary',
    # No built-in metric: the custom pauc_80 feval passed to lgb.cv is the only one reported.
    'metric': 'none',
    'verbose': -1,
    'learning_rate': 0.01,
    'num_leaves': 31,
    'min_data_in_leaf': 50,
    # Separate bagging fractions for positive and negative rows, re-drawn every
    # iteration (bagging_freq=1); negatives are sampled far more sparsely.
    'pos_bagging_fraction': 0.75,
    'neg_bagging_fraction': 0.05,
    'bagging_freq': 1,
    'feature_fraction': 0.7,
    'lambda_l1': 0.25,
    'lambda_l2': 1.0
}

# Cross-validate with the explicit StratifiedKFold splits; early stopping watches pauc_80.
# NOTE(review): per the LightGBM docs an explicit `folds` argument takes priority over the other
# split arguments, so `stratified=True` presumably has no additional effect here — confirm.
# return_cvbooster=True exposes the per-fold boosters under cv_results['cvbooster'].
cv_results = lgb.cv(
    lgb_params,
    train_df_dataset,
    folds=kf.split(X, y),
    feval=pauc_80,
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(20)
    ],
    stratified=True,
    return_cvbooster=True,
)

Training until validation scores don't improve for 50 rounds
[20]	cv_agg's valid pauc_80: 0.163981 + 0.0134335
[40]	cv_agg's valid pauc_80: 0.167867 + 0.0109358
[60]	cv_agg's valid pauc_80: 0.170125 + 0.0095106
[80]	cv_agg's valid pauc_80: 0.171475 + 0.00868064
[100]	cv_agg's valid pauc_80: 0.172521 + 0.00828191
[120]	cv_agg's valid pauc_80: 0.173218 + 0.00780807
[140]	cv_agg's valid pauc_80: 0.173226 + 0.00786652
[160]	cv_agg's valid pauc_80: 0.173062 + 0.00785856
Early stopping, best iteration is:
[125]	cv_agg's valid pauc_80: 0.173263 + 0.00786269


In [None]:
# test_resnet_18_df['resnet_18_pred']

In [None]:
# Per-fold boosters produced by lgb.cv(return_cvbooster=True).
boosters = cv_results['cvbooster'].boosters
# Give the test frame the same categorical columns as the training features.
test_df = test_meta.with_columns(pl.col(cat_cols).cast(pl.String).cast(pl.Categorical))
test_df = test_df.to_pandas()
# Attach the CNN feature by isic_id instead of by row position, so a reordered or
# filtered prediction frame cannot silently misalign predictions and metadata.
resnet_pred_by_id = test_resnet_18_df.set_index('isic_id')['resnet_18_pred']
test_df['resnet_18_pred'] = test_df['isic_id'].astype(str).map(resnet_pred_by_id).to_numpy()
assert test_df['resnet_18_pred'].notna().all(), "missing or NaN ResNet-18 prediction for some isic_id"
# Average the fold models' predictions.
pred_per_cv = [b.predict(test_df) for b in boosters]
pred_average = np.array(pred_per_cv).mean(axis=0)

In [None]:
# print(len(test_df['isic_id'].tolist()), len(pred_average))

In [None]:
# Build the submission (isic_id plus fold-averaged probability), write it to
# submission.csv in the working directory, and display it.
submission_df = pd.DataFrame({'isic_id':test_df['isic_id'].tolist(), 'target':pred_average})
submission_df.to_csv("submission.csv", index=False)
submission_df

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.003467
1,ISIC_0015729,0.000542
2,ISIC_0015740,0.000729
