In [60]:
# %%
import os
import torch
from torch import optim
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import logging
from matplotlib import transforms
from torch.utils.data import Dataset
from torchvision import transforms, models
from PIL import Image
import os
import random
import torch
import numpy as np
import pandas as pd
import h5py
from tqdm import tqdm
import io
import gc

PRO_DIR = r"/home/kenny/Projects/kaggle/isic2024"
os.chdir(PRO_DIR)
print("project_directory:", PRO_DIR)

def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
SEED = 42
seed_everything(SEED)


class CustomDataset(Dataset):
    def __init__(self, df_meta , undersample_rate=2, transform=None, n="all"):   
        df_meta = df_meta.reset_index(drop=True)  # Reset the index
        file_list = {}
        for _, row in df_meta.iterrows():        
            if row["set"]=="org":
                file_list[row["isic_id"]] = f"{PRO_DIR}/input/train-image/image/{row['isic_id']}.jpg"
            else:
                file_list[row["isic_id"]] = f"{PRO_DIR}/data/external/{row['isic_id']}.jpg"
        self.file_list = file_list
        self.df = df_meta
        # print(f"filelist {len(self.file_list)}, df {self.df.shape}")
        assert len(self.file_list.keys()) == self.df.shape[0]
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        print("go to here", idx)
        print("df ", self.df.shape)
        isic_id = self.df.loc[idx,"isic_id"]
        target = self.df.loc[idx,"target"]        
        img_path = self.file_list[isic_id]
        # print(f"isic_id {isic_id}. target {target}, path {img_path}")
        try:
            img = Image.open(img_path).convert("RGB")            
            if self.transform:
                img = self.transform(img)
        except Exception as ex:
            raise ex

        return img, img_path, target
class ISICDatasetTest(Dataset):
    def __init__(self, df_meta, file_list, transform=None, n="all"):        
        self.file_list = file_list
        self.df = df_meta
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        isic_id = self.df.loc[idx,"isic_id"]
        img_path = self.file_list[isic_id]
        try:
            img = Image.open(img_path).convert("RGB")            
            if self.transform:
                img = self.transform(img)
        except Exception as ex:
            raise ex

        return img

def read_images_from_hdf5_and_save(file_path, output_dir):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    with h5py.File(file_path, 'r') as file:
        ids_list = list(file.keys())        
        ids_paths = {}
        for img_id in tqdm(ids_list):
            image_data = file[img_id][()]
            image_path = os.path.join(output_dir, f"{img_id}.png")  # Define how you want to save the file
            if os.path.exists(image_path):
                ids_paths[img_id] = image_path
                continue
            # Save the image data to a file
            with Image.open(io.BytesIO(image_data)) as image:
                image.save(image_path)

            # Store the path instead of the image data
            ids_paths[img_id] = image_path
    return ids_paths

## 2. DEFINE MODEL -----------------------------------------------------------
IMG_SIZE=224
BATCH_SIZE=32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model(model_name="efficientnet_v2_m"):
    if model_name == "mobilenet_v3_small":
    # mobile net
        model = models.mobilenet_v3_small()
        model.classifier[3] = torch.nn.Linear(model.classifier[3].in_features, 1)
    if model_name == "efficientnet_v2_m":
        model = models.efficientnet_v2_m(weights=None)
        model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 1)
    if model_name == "efficientnet_b0":
        model = models.efficientnet_b0(weights=None)
        model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 1)

    if model_name == "vgg16":
        model = models.vgg16(pretrained=True)
        model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 1)        
    

    model = model.to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = BCEWithLogitsLoss()
    return model, optimizer, criterion

## 3. DEFINE METRICS -------------------------------------------------------

from sklearn.metrics import auc, roc_curve
def compute_pauc(y_true, y_scores, tpr_threshold=0.8):
    """
    Compute the partial AUC above a given TPR threshold.

    Parameters:
    y_true (np.array): True binary labels.
    y_scores (np.array): Target scores.
    tpr_threshold (float): TPR threshold above which to compute the pAUC.
    Returns:
    float: The partial AUC above the given TPR threshold.
    """
    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    # Find the indices where the TPR is above the threshold
    tpr_above_threshold_indices = np.where(tpr >= tpr_threshold)[0]
    if len(tpr_above_threshold_indices) == 0:
        return 0.0

    # Extract the indices for the ROC segment above the threshold
    start_index = tpr_above_threshold_indices[0] 
    fpr_above_threshold = fpr[start_index:]
    tpr_above_threshold = tpr[start_index:] - tpr_threshold
    partial_auc = auc(fpr_above_threshold, tpr_above_threshold)    
    return partial_auc

## 4. LOAD DATASET -------------------------------------------------------
class PATHS:
    train_images_h5_path = f"{PRO_DIR}/input/train-image.hdf5"
    test_images_h5_path = f"{PRO_DIR}/input/test-image.hdf5"    
    train_metadata_path = f"{PRO_DIR}/input/train-metadata.csv"
    test_metadata_path = f"{PRO_DIR}/input/test-metadata.csv"    
    submission_path = f"{PRO_DIR}/input/sample_submission.csv"
    train_metadata_path_ext = f"{PRO_DIR}/data/external/metadata.csv"

# # get more 2000 malignant data from external dataset
df_meta = pd.read_csv(PATHS.train_metadata_path)[["isic_id","patient_id","target"]]
df_meta["set"] = "org"

df_meta_ext = pd.read_csv(PATHS.train_metadata_path_ext)
df_meta_ext = df_meta_ext.loc[df_meta_ext["benign_malignant"]=="malignant",["isic_id","patient_id","benign_malignant"]]
df_meta_ext = df_meta_ext.loc[(df_meta_ext["benign_malignant"]=="malignant")&(~df_meta_ext["patient_id"].isna())]
df_meta_ext["benign_malignant"] = 1
df_meta_ext.rename(columns={"benign_malignant":"target"}, inplace=True)
df_meta_ext["set"]="ext"
df_meta_ext = df_meta_ext.sample(frac=1).sample(n=2000)

meta_train = pd.concat([df_meta, df_meta_ext])
meta_train["target"]=meta_train["target"].astype("int")

df_meta_pos = meta_train.loc[meta_train["target"]==1]
df_meta_neg = meta_train.loc[meta_train["target"]==0].sample(frac=1).sample(n=len(df_meta_pos)*1, random_state=SEED)
meta_train = pd.concat([df_meta_pos, df_meta_neg]).reset_index()
# meta_train = meta_train.set_index("index")

## 5. TRAIN MODEL ----------------------------------------------
def train(model, train_loader, optimizer, criterion):
    total_loss = 0
    all_targets = []
    all_probs = []        

    model.train()
    for input,_, targets in train_loader:        
        input = input.to(DEVICE)
        targets = targets.to(DEVICE)

        targets = targets.unsqueeze(1) # make the target [batch, 1]
        targets = targets.float() # BCEWithLogitsLoss requires targets as float()
        # print(f"input shape {input.shape}")
        optimizer.zero_grad()
        output = model(input)
        loss = criterion(output, targets)
        total_loss += loss.item()
        
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(output).cpu().detach().numpy()
        # predictions = (probs > 0.5)

        all_targets.extend(targets.cpu().detach().numpy().flatten())
        all_probs.extend(probs.flatten())

        loss.backward()
        optimizer.step()
    
    pauc = compute_pauc(np.array(all_targets), np.array(all_probs))
    return total_loss, pauc

def val(model, val_loader, criterion):
    total_loss= 0
    all_targets = []
    all_probs = []        
    model.eval()
    with torch.no_grad():
        for input, _, targets in val_loader:
            input = input.to(DEVICE)
            targets = targets.to(DEVICE)

            targets = targets.unsqueeze(1) # make the target [batch, 1]
            targets = targets.float() # BCEWithLogitsLoss requires targets as float()

            output = model(input)
            val_loss = criterion(output, targets)
            total_loss +=  val_loss.item()

            sigmoid = torch.nn.Sigmoid()
            probs = sigmoid(output).cpu().detach().numpy().flatten()
            # predictions = (probs > 0.5)
            
            all_targets.extend(targets.cpu().detach().numpy().flatten())
            all_probs.extend(probs)           
    
    # pauc = compute_pauc(all_targets, all_predictions)
    print(f"all_targets {len(all_targets)}, all_probs {len(all_probs)}")
    pauc = compute_pauc(np.array(all_targets), np.array(all_probs))
    return total_loss, pauc, all_probs, all_targets

def get_mean_std(df):
    trn_dataset = CustomDataset(df,
                                transform=transforms.Compose([
                                    transforms.Resize((IMG_SIZE, IMG_SIZE)),            
                                    transforms.ToTensor(),
                                ])
                            ) 
    train_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, shuffle=True) 
    mean = 0.0
    for images, _,_ in train_loader:
        batch_samples = images.size(0) # batch size (the last batch can have smaller size!)        
        images = images.view(batch_samples, images.size(1), -1)  # print(images.shape) # will be (64, 3, 224x224)
        mean += images.mean(2).sum(0)  
    mean = mean / len(train_loader.dataset)

    var = 0.0
    for images, _,_ in train_loader:
        batch_samples = images.size(0)
        images = images.view(batch_samples, images.size(1), -1)
        var += ((images - mean.unsqueeze(1))**2).sum([0,2])
    std = torch.sqrt(var / (len(train_loader.dataset)*IMG_SIZE*IMG_SIZE))
    return mean, std

EXP_ID    = 3
MODEL_NAME = "efficientnet_v2_m"
NUM_EPOCHS = 30
# BATCH_SIZE = 32
NOTE="with_2k_external_db"
EXP_NAME = "{:03}_{}_{}_{}_{}".format(EXP_ID, MODEL_NAME, NUM_EPOCHS, BATCH_SIZE, NOTE)  # you can name your experiment whatever you like
# SAVE_PATH = "/kaggle/working"
SAVE_PATH = "models"

logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(EXP_NAME))

TRAIN_CV = True
if TRAIN_CV:
    logging.info(f"TRAIN CV ---------------------------------")
    from sklearn.model_selection import StratifiedKFold,GroupKFold
    meta_train['group'] = meta_train['target'].astype(str) + "_" + meta_train['patient_id'].astype(str)
    cv = GroupKFold(n_splits=5)
    

    # train_data_stat = {}
    # for i, (i_trn, i_val) in enumerate(cv.split(meta_train.drop("target", axis=1), meta_train["target"], groups=meta_train["group"])):   
    #     mean, std = get_mean_std(meta_train.loc[i_trn])        
    #     train_data_stat[i] = {"mean": mean, "std": std}
    #     logging.info(f"fold {i}, mean {mean}, std {std}")

    train_data_stat ={
        0: {"mean": [0.6822, 0.5201, 0.4416], "std":[0.1682, 0.1580, 0.1617]},
        1: {"mean": [0.6863, 0.5066, 0.4145], "std":[0.1333, 0.1276, 0.1193]},
        2: {"mean": [0.6831, 0.5211, 0.4434], "std":[0.1682, 0.1581, 0.1614]},
        3: {"mean": [0.6855, 0.5224, 0.4436], "std":[0.1676, 0.1577, 0.1615]},
        4: {"mean": [0.6840, 0.5217, 0.4445], "std":[0.1667, 0.1565, 0.1601]},
    }
        
    for fold, (i_trn, i_val) in enumerate(cv.split(meta_train.drop("target", axis=1), meta_train["target"], groups=meta_train["patient_id"])):
        train_data_mean = train_data_stat[fold]["mean"]
        train_data_std = train_data_stat[fold]["std"]
        train_trans = transforms.Compose([    
            transforms.Resize((IMG_SIZE, IMG_SIZE)),       
            transforms.ToTensor(),
            transforms.Normalize(mean=train_data_mean, std=train_data_std),
        ])
        val_trans =  transforms.Compose([    
            transforms.Resize((IMG_SIZE, IMG_SIZE)),  
            transforms.ToTensor(),
            transforms.Normalize(mean=train_data_mean, std=train_data_std),
        ])

        trn_dataset = CustomDataset(
            meta_train.loc[i_trn],
            transform=train_trans
        )
        val_dataset = CustomDataset(
            meta_train.loc[i_val],
            transform=val_trans
        )
        
        # Now, you can create separate data loaders for each split:
        train_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        logging.info(f"Fold {fold}, train {len(trn_dataset)}, val {len(val_dataset)}")
        model, optimizer, criterion = load_model("efficientnet_v2_m")

        best_val_loss, best_val_pauc = 100, 0        
        for epoch in range(30):
            train_loss, train_pauc = train(model, train_loader, optimizer, criterion)
            val_loss, val_pauc, all_probs, all_targets = val(model, val_loader, criterion)        
            if val_pauc > best_val_pauc:
                best_val_pauc = val_pauc
                os.makedirs(f"{SAVE_PATH}/{EXP_NAME}", exist_ok=True)            
                torch.save(model.state_dict(),f"{SAVE_PATH}/{EXP_NAME}/best_{fold}.pth")
                logging.info(f"Epoch {epoch}, train_loss {train_loss:.4f}, train_pauc {train_pauc:.2f}, val_loss {val_loss:.4f}, val_pauc {val_pauc:.2f} --> Best val_pauc {val_pauc:.2f} at epoch {epoch}")    
            else:        
                logging.info(f"Epoch {epoch}, train_loss {train_loss:.4f}, train_pauc {train_pauc:.2f}, val_loss {val_loss:.4f}, val_pauc {val_pauc:.2f}") 

project_directory: /home/kenny/Projects/kaggle/isic2024


  df_meta = pd.read_csv(PATHS.train_metadata_path)[["isic_id","patient_id","target"]]
  df_meta_ext = pd.read_csv(PATHS.train_metadata_path_ext)


go to here 2686
df  (3828, 6)
go to here 1334
df  (3828, 6)
go to here 2027
df  (3828, 6)
go to here 3239
df  (3828, 6)
go to here 274
df  (3828, 6)
go to here 3360
df  (3828, 6)
go to here 3496
df  (3828, 6)
go to here 847
df  (3828, 6)
go to here 2029
df  (3828, 6)
go to here 3044
df  (3828, 6)
go to here 1054
df  (3828, 6)
go to here 2615
df  (3828, 6)
go to here 1815
df  (3828, 6)
go to here 1454
df  (3828, 6)
go to here 2523
df  (3828, 6)
go to here 3497
df  (3828, 6)
go to here 2605
df  (3828, 6)
go to here 3262
df  (3828, 6)
go to here 2237
df  (3828, 6)
go to here 1801
df  (3828, 6)
go to here 167
df  (3828, 6)
go to here 2900
df  (3828, 6)
go to here 2513
df  (3828, 6)
go to here 3119
df  (3828, 6)
go to here 1714
df  (3828, 6)
go to here 97
df  (3828, 6)
go to here 133
df  (3828, 6)
go to here 1336
df  (3828, 6)
go to here 3722
df  (3828, 6)
go to here 3093
df  (3828, 6)
go to here 1854
df  (3828, 6)
go to here 1832
df  (3828, 6)
go to here 2076
df  (3828, 6)
go to here 2193


KeyboardInterrupt: 

In [32]:
x = meta_train.loc[i_trn]
x.loc[x["patient_id"]=="IP_7198665"]

Unnamed: 0,index,isic_id,patient_id,target,set,group


In [15]:
i_trn

array([   0,    1,    2, ..., 4782, 4783, 4785])

In [28]:
meta_train.loc[i_trn]

Unnamed: 0,index,isic_id,patient_id,target,set,group
0,387,ISIC_0082829,IP_3249371,1,org,1_IP_3249371
1,935,ISIC_0096034,IP_6723298,1,org,1_IP_6723298
2,1245,ISIC_0104229,IP_9057861,1,org,1_IP_9057861
3,1846,ISIC_0119495,IP_6856511,1,org,1_IP_6856511
5,4812,ISIC_0190307,IP_4890448,1,org,1_IP_4890448
...,...,...,...,...,...,...
4780,57198,ISIC_1484996,IP_7797815,0,org,0_IP_7797815
4781,137288,ISIC_3477875,IP_0294957,0,org,0_IP_0294957
4782,46771,ISIC_1226780,IP_6598006,0,org,0_IP_6598006
4783,236260,ISIC_5939194,IP_2023739,0,org,0_IP_2023739


In [31]:
val = meta_train.loc[i_val]
val.loc[val["patient_id"]=="IP_7198665"]

Unnamed: 0,index,isic_id,patient_id,target,set,group
2523,355693,ISIC_8881221,IP_7198665,0,org,0_IP_7198665


In [4]:
trn_dataset.df.shape, trn_dataset.file_list["ISIC_8299496"]

KeyError: 'ISIC_8299496'

In [61]:

val_dataset.df.shape, val_dataset.file_list["ISIC_8299496"]

((958, 6),
 '/home/kenny/Projects/kaggle/isic2024/input/train-image/image/ISIC_8299496.jpg')

In [30]:
val_dataset.df.loc[2523]

index               355693
isic_id       ISIC_8881221
patient_id      IP_7198665
target                   0
set                    org
group         0_IP_7198665
Name: 2523, dtype: object

In [17]:
# # get more 2000 malignant data from external dataset
df_meta = pd.read_csv(PATHS.train_metadata_path)[["isic_id","patient_id","target"]]
df_meta["set"] = "org"

df_meta_ext = pd.read_csv(PATHS.train_metadata_path_ext)
df_meta_ext = df_meta_ext.loc[df_meta_ext["benign_malignant"]=="malignant",["isic_id","patient_id","benign_malignant"]]
df_meta_ext = df_meta_ext.loc[(df_meta_ext["benign_malignant"]=="malignant")&(~df_meta_ext["patient_id"].isna())]
df_meta_ext["benign_malignant"] = 1
df_meta_ext.rename(columns={"benign_malignant":"target"}, inplace=True)
df_meta_ext["set"]="ext"
df_meta_ext = df_meta_ext.sample(frac=1).sample(n=2000)

meta_train = pd.concat([df_meta, df_meta_ext])
meta_train["target"]=meta_train["target"].astype("int")

df_meta_pos = meta_train.loc[meta_train["target"]==1]
df_meta_neg = meta_train.loc[meta_train["target"]==0].sample(frac=1).sample(n=len(df_meta_pos)*1, random_state=SEED)
# meta_train = pd.concat([df_meta_pos, df_meta_neg]).reset_index(drop=True)

  df_meta = pd.read_csv(PATHS.train_metadata_path)[["isic_id","patient_id","target"]]
  df_meta_ext = pd.read_csv(PATHS.train_metadata_path_ext)


In [18]:
meta_train = pd.concat([df_meta_pos, df_meta_neg])

In [19]:
meta_train = meta_train.set_index("index")

KeyError: "None of ['index'] are in the columns"

In [21]:
meta_train

Unnamed: 0,isic_id,patient_id,target,set
387,ISIC_0082829,IP_3249371,1,org
935,ISIC_0096034,IP_6723298,1,org
1245,ISIC_0104229,IP_9057861,1,org
1846,ISIC_0119495,IP_6856511,1,org
3478,ISIC_0157834,IP_3927284,1,org
...,...,...,...,...
154579,ISIC_3906966,IP_8672132,0,org
247118,ISIC_6209201,IP_6613669,0,org
116536,ISIC_2962560,IP_9175987,0,org
319341,ISIC_7983182,IP_1959951,0,org


In [36]:



class CustomDataset(Dataset):
    def __init__(self, df_meta , undersample_rate=2, transform=None, n="all"):   
        file_list = {}
        for _, row in df_meta.iterrows():        
            if row["set"]=="org":
                file_list[row["isic_id"]] = f"{PRO_DIR}/input/train-image/image/{row['isic_id']}.jpg"
            else:
                file_list[row["isic_id"]] = f"{PRO_DIR}/data/external/{row['isic_id']}.jpg"
        self.file_list = file_list
        self.df = df_meta
        # print(f"filelist {len(self.file_list)}, df {self.df.shape}")
        assert len(self.file_list.keys()) == self.df.shape[0]
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        isic_id = self.df.loc[idx,"isic_id"]
        target = self.df.loc[idx,"target"]        
        img_path = self.file_list[isic_id]
        # print(f"isic_id {isic_id}. target {target}, path {img_path}")
        try:
            img = Image.open(img_path).convert("RGB")            
            if self.transform:
                img = self.transform(img)
        except Exception as ex:
            raise ex

        return img, img_path, target
cv = GroupKFold(n_splits=5)    
for fold, (i_trn, i_val) in enumerate(cv.split(meta_train.drop("target", axis=1), meta_train["target"], groups=meta_train["patient_id"])):
    train_data_mean = [0.6822, 0.5201, 0.4416]
    train_data_std = [0.1682, 0.1580, 0.1617]

    trn_dataset = CustomDataset(meta_train.loc[i_trn])
    val_dataset = CustomDataset(meta_train.loc[i_val])
    
    # Now, you can create separate data loaders for each split:
    train_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    print(f"Fold {fold}, train {len(trn_dataset)}, val {len(val_dataset)}")
    input, _, target = next(iter(train_loader))

Fold 0, train 3828, val 958
Fold 1, train 3829, val 957
Fold 2, train 3829, val 957
Fold 3, train 3829, val 957
Fold 4, train 3829, val 957


In [45]:
meta_train.loc[i_trn].shape, meta_train.loc[i_val].shape

((3828, 6), (958, 6))

In [43]:
input, _, target = next(iter(train_loader))

go to here 247
df  (3828, 6)
go to here 3555
df  (3828, 6)


KeyError: 3555

In [49]:
val_dataset.df.loc[3555]
# trn_dataset.df.loc[3555]

index               221943
isic_id       ISIC_5580416
patient_id      IP_5714646
target                   0
set                    org
group         0_IP_5714646
Name: 3555, dtype: object

In [50]:
df = val_dataset.df

In [52]:

df.loc[3555,:]

index               221943
isic_id       ISIC_5580416
patient_id      IP_5714646
target                   0
set                    org
group         0_IP_5714646
Name: 3555, dtype: object

In [56]:
3555 in i_trn

False

In [54]:
i_trn

array([   0,    1,    2, ..., 4782, 4783, 4785])

In [55]:
df.index

Index([   4,   24,   27,   45,   47,   49,   55,   60,   63,   66,
       ...
       4737, 4738, 4742, 4747, 4750, 4751, 4756, 4757, 4767, 4784],
      dtype='int64', length=958)

In [59]:
meta_train.loc[i_trn]

Unnamed: 0,index,isic_id,patient_id,target,set,group
0,387,ISIC_0082829,IP_3249371,1,org,1_IP_3249371
1,935,ISIC_0096034,IP_6723298,1,org,1_IP_6723298
2,1245,ISIC_0104229,IP_9057861,1,org,1_IP_9057861
3,1846,ISIC_0119495,IP_6856511,1,org,1_IP_6856511
5,4812,ISIC_0190307,IP_4890448,1,org,1_IP_4890448
...,...,...,...,...,...,...
4780,57198,ISIC_1484996,IP_7797815,0,org,0_IP_7797815
4781,137288,ISIC_3477875,IP_0294957,0,org,0_IP_0294957
4782,46771,ISIC_1226780,IP_6598006,0,org,0_IP_6598006
4783,236260,ISIC_5939194,IP_2023739,0,org,0_IP_2023739


In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random
import glob
from matplotlib import pyplot as plt

import h5py
from PIL import Image
from io import BytesIO

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import sys
# /kaggle/input/isic-pytorch-training-baseline/Final_model.bin
BEST_WEIGHT = sys.argv[1]
print(f"BEST_WEIGHT = {BEST_WEIGHT}")
CONFIG = {
    "seed": 42,
    "img_size": 256,
    "model_name": "edgenext_base.in21k_ft_in1k",
    "valid_batch_size": 32,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])
ROOT_DIR = "/kaggle/input/isic-2024-challenge"
TEST_CSV = f'{ROOT_DIR}/test-metadata.csv'
TEST_HDF = f'{ROOT_DIR}/test-image.hdf5'
SAMPLE = f'{ROOT_DIR}/sample_submission.csv'

# BEST_WEIGHT = "/kaggle/input/isic-pytorch-training-baseline/Final_model.bin"
df = pd.read_csv(TEST_CSV)
df['target'] = 0 # dummy
df
df_sub = pd.read_csv(SAMPLE)
df_sub
class ISICDataset(Dataset):
    def __init__(self, df, file_hdf, transforms=None):
        self.df = df
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.isic_ids = df['isic_id'].values
        self.targets = df['target'].values
        self.transforms = transforms
        
    def __len__(self):
        return len(self.isic_ids)
    
    def __getitem__(self, index):
        isic_id = self.isic_ids[index]
        img = np.array( Image.open(BytesIO(self.fp_hdf[isic_id][()])) )
        target = self.targets[index]
        
        if self.transforms:
            img = self.transforms(image=img)["image"]
            
        return {
            'image': img,
            'target': target,
        }
train_data_mean=[0.6939, 0.5256, 0.4579]
train_data_std=[0.1612, 0.1567, 0.1678]

test_trans =  transforms.Compose([    
    transforms.Resize((IMG_SIZE, IMG_SIZE)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=train_data_mean, std=train_data_std),
])
class ISICDataset(Dataset):
    def __init__(self, hdf5_file, isic_ids, targets=None, transform=None):
        self.hdf5_file = h5py.File(hdf5_file, 'r')  # Keep file open
        self.isic_ids = isic_ids
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.isic_ids)

    def __getitem__(self, idx):
        img_bytes = self.hdf5_file[self.isic_ids[idx]][()]
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")    
#         img = np.array(img)

        if self.transform:
#             transformed = self.transform(image=img)
#             img = transformed['image']
            img = self.transform(img)

        target = self.targets[idx] if self.targets is not None else torch.tensor(-1)
        return img, target

    def __del__(self):
        self.hdf5_file.close()  # Ensure file is closed when object is destroyed

IMG_SIZE=224
BATCH_SIZE=32

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def load_one_model(model_name="efficientnet_v2_m"):
#     model = models.efficientnet_v2_m(weights=None)
#     model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 1)
#     model = model.to(DEVICE)
#     return model, optimizer, criterion
def load_models(folds):
    cv_models = []
    for fold in folds:
        model = models.efficientnet_v2_m(weights=None)
        model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 1)
        model = model.to(DEVICE)
        model.load_state_dict(torch.load(BEST_WEIGHT, map_location=DEVICE))
        model.eval()
        cv_models.append(model)
    return cv_models
    
cv_models = load_models(folds)
df_sub["target"] = preds
df_sub.to_csv("submission.csv", index=False)