In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import h5py
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
from PIL import Image
import io
import torch.nn.functional as F


class MyDataset(Dataset):
    """
    Dataset pairing encoded images stored in an HDF5 file with tabular features from a CSV.

    Each item is ``(image, features, isic_id)``: the image is decoded with PIL from the
    bytes stored under the ``isic_id`` key of the HDF5 file (and passed through
    ``transform`` when one is given), and ``features`` is a float tensor built from the
    matching CSV row.
    """

    def __init__(self, hdf5_file, csv_file, class_mapping, columns=None, transform=None):
        """
        Initializes the dataset.
        :param hdf5_file: Path to the HDF5 file containing image data.
        :param csv_file: Path to the CSV file containing data. Must have an 'isic_id' column.
        :param class_mapping: Dictionary mapping class names to numeric values (stored, not used here).
        :param columns: List of column names to include as features. If None, every numeric
                        column of the CSV (other than 'isic_id') is included.
        :param transform: Optional transform to be applied on a sample.
        :raises FileNotFoundError: If hdf5_file does not point to an existing file.
        """
        # Assigned first so close()/__del__ stay safe even if a later step raises.
        self._hdf5 = None
        self.hdf5_file = hdf5_file
        self.transform = transform
        self.class_mapping = class_mapping

        # The handle is opened lazily (see `hdf5`), so check the path up front to
        # keep the original fail-at-construction behaviour for a wrong path.
        if not os.path.isfile(hdf5_file):
            raise FileNotFoundError(f'HDF5 file not found: {hdf5_file}')

        if columns is None:
            # Read everything; restricting to 'isic_id' here would leave no features.
            usecols = None
        else:
            columns = list(columns)
            usecols = ['isic_id'] + [name for name in columns if name != 'isic_id']
        self.data_frame = pd.read_csv(csv_file, low_memory=False, usecols=usecols)

        if columns is None:
            # Only numeric columns can be mean-imputed and converted to a float tensor.
            features = self.data_frame.drop(columns=['isic_id']).select_dtypes(include='number')
        else:
            features = self.data_frame[columns]

        # Handle missing values in the CSV data: column mean first, then 0.0 for
        # columns that are entirely missing (their mean is NaN as well).
        mean_filled = features.apply(lambda col: col.fillna(col.mean()), axis=0)
        self.csv_data = mean_filled.fillna(0.0)

    @property
    def hdf5(self):
        """
        Read-only HDF5 handle, opened on first access.

        Opening lazily means a handle is created in whichever process first reads an
        item rather than in the process that constructed the dataset.
        """
        if getattr(self, '_hdf5', None) is None:
            self._hdf5 = h5py.File(self.hdf5_file, 'r')
        return self._hdf5

    def __getstate__(self):
        # Never ship an open handle to another process; it is reopened on demand.
        state = self.__dict__.copy()
        state['_hdf5'] = None
        return state

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        # Index the id column directly instead of materialising the whole row.
        img_name = self.data_frame['isic_id'].iloc[idx]
        image_data = self.hdf5[img_name][()]
        image = Image.open(io.BytesIO(image_data))
        if self.transform:
            image = self.transform(image)
        csv_data_row = self.csv_data.iloc[idx]
        csv = torch.tensor(csv_data_row.values, dtype=torch.float)
        return image, csv, img_name

    def close(self):
        """Close the HDF5 handle if one was opened; safe to call repeatedly."""
        handle = getattr(self, '_hdf5', None)
        if handle is not None:
            handle.close()
            self._hdf5 = None

    def __del__(self):
        self.close()  # Close the HDF5 file when the dataset object is deleted

def get_dataloader(hdf5_file, test_csv, columns_to_use, class_mapping, val_transform=None, batch_size=32):
    """
    Build a sequential (non-shuffling) DataLoader over a MyDataset.

    :param hdf5_file: Path to the HDF5 file with the images.
    :param test_csv: Path to the metadata CSV.
    :param columns_to_use: CSV columns forwarded to the dataset as features.
    :param class_mapping: Class-name to index mapping forwarded to the dataset.
    :param val_transform: Optional image transform forwarded to the dataset.
    :param batch_size: Number of samples per batch.
    :return: DataLoader that yields batches in dataset order.
    """
    dataset_kwargs = dict(
        hdf5_file=hdf5_file,
        csv_file=test_csv,
        class_mapping=class_mapping,
        columns=columns_to_use,
        transform=val_transform,
    )
    # Order must stay fixed so predictions can be matched back to their rows.
    return DataLoader(MyDataset(**dataset_kwargs), batch_size=batch_size, shuffle=False)



import matplotlib.pyplot as plt

def visualize_data(data_loader, num_images=5):
    """
    Visualizes a few images and their corresponding CSV data from the DataLoader.

    Only the first batch is used, so at most one batch worth of samples is shown;
    a larger ``num_images`` is clamped to the batch size.

    :param data_loader: DataLoader from which to fetch data. Each batch is expected to be
                        an (images, csv_data, names) triple.
    :param num_images: Number of images to display.
    """
    # Fetch the first batch
    images, csv_data, _ = next(iter(data_loader))

    # Never index past the end of the batch.
    n_show = min(num_images, len(images))
    if n_show <= 0:
        return

    # squeeze=False always yields a 2-D grid of axes, so a single image works too
    # (with the default, a 1x1 layout returns a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(1, n_show, figsize=(15, 5), squeeze=False)
    for i, ax in enumerate(axes[0]):
        image, csv_values = images[i], csv_data[i]

        # Handle different types of image tensor formats
        if image.shape[0] == 3:  # Check for color channels
            # Convert from CxHxW to HxWxC for visualization
            ax.imshow(image.permute(1, 2, 0).numpy())
        else:
            ax.imshow(image.numpy()[0], cmap='gray')  # For grayscale images

        ax.axis('off')
        ax.set_title(f'Sample {i+1}')

        # Print CSV data
        print(f'Sample {i+1} CSV Data:', csv_values.numpy())

    plt.show()


In [3]:
# Root directory of the competition input files.
kaggle_base_path = '/kaggle/input/isic-2024-challenge/'

# Metadata columns fed to the tabular branch of the model, one per line so the
# list is easy to diff and count.
columns_to_use = [
    'age_approx',
    'clin_size_long_diam_mm',
    'tbp_lv_A',
    'tbp_lv_Aext',
    'tbp_lv_B',
    'tbp_lv_Bext',
    'tbp_lv_C',
    'tbp_lv_Cext',
    'tbp_lv_H',
    'tbp_lv_Hext',
    'tbp_lv_L',
    'tbp_lv_Lext',
    'tbp_lv_areaMM2',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaA',
    'tbp_lv_deltaB',
    'tbp_lv_deltaL',
    'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm',
    'tbp_lv_eccentricity',
    'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence',
    'tbp_lv_norm_border',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL',
    'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis',
    'tbp_lv_symm_2axis_angle',
    'tbp_lv_x',
    'tbp_lv_y',
    'tbp_lv_z',
]

# Test-time loader: images are resized to 256x256 and converted to tensors,
# batched 64 at a time.
test_dl = get_dataloader(
    os.path.join(kaggle_base_path, "test-image.hdf5"),
    os.path.join(kaggle_base_path, "test-metadata.csv"),
    columns_to_use,
    {'benign': 0, 'malignant': 1},
    val_transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ]),
    batch_size=64,
)



# Call the function with the test DataLoader
# visualize_data(test_dl, num_images=3)

# Define Model

In [4]:
import torch
import torch.nn as nn
from torchvision.models import resnet18 as resnet

class MLP(nn.Module):
    """
    Small feed-forward network: Linear -> ReLU -> Linear -> ReLU.

    The stack is stored as ``self.layers`` (an ``nn.Sequential``), so the two linear
    layers sit at indices 0 and 2.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        :param input_size: Number of input features.
        :param hidden_size: Width of the hidden layer.
        :param output_size: Number of output features.
        """
        super().__init__()
        stages = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the activated output."""
        activated = self.layers(x)
        return activated

class CombinedModel(nn.Module):
    """
    Two-branch classifier: an image backbone plus a tabular MLP, fused by a small head.

    The backbone's final ``fc`` layer is replaced with ``nn.Identity`` so it emits its
    feature vector; that vector is concatenated with the MLP output and passed through
    ``self.combined`` to produce ``n_classes`` logits.
    """

    def __init__(self, mlp, n_classes, train_resnet=False):
        """
        :param mlp: Tabular branch. Its ``layers[-2]`` must expose ``out_features``.
        :param n_classes: Number of output logits.
        :param train_resnet: Value assigned to ``requires_grad`` on every backbone parameter.
        """
        super().__init__()
        backbone = resnet()
        self.resnet18 = backbone
        self.mlp = mlp

        # Freeze (or unfreeze) the whole backbone in one pass.
        for weight in backbone.parameters():
            weight.requires_grad = train_resnet

        # Read the feature width before dropping the classification layer.
        image_feature_dim = backbone.fc.in_features
        backbone.fc = nn.Identity()

        # The MLP ends in an activation, so its last Linear is second from the end.
        tabular_feature_dim = mlp.layers[-2].out_features

        # Fusion head: two Linear/BatchNorm/ReLU stages followed by the logit layer.
        head = []
        width_in = image_feature_dim + tabular_feature_dim
        for width_out in (128, 16):
            head.extend([
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
            ])
            width_in = width_out
        head.append(nn.Linear(width_in, n_classes))
        self.combined = nn.Sequential(*head)

    def forward(self, image, csv_data):
        """Return logits for a batch of images and their matching tabular rows."""
        image_features = self.resnet18(image)
        tabular_features = self.mlp(csv_data)
        fused = torch.cat((image_features, tabular_features), dim=1)
        return self.combined(fused)
    
    
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [5]:
import os 
# Show what is currently present in the notebook's working directory.
working_dir_entries = os.listdir("/kaggle/working")
print(working_dir_entries)

['__notebook__.ipynb']


In [6]:
import logging, os


# The tabular branch takes one input per selected metadata column, so derive the
# width from the column list instead of repeating it as a magic number.
mlp = MLP(input_size=len(columns_to_use), hidden_size=128, output_size=16)
model = CombinedModel(mlp=mlp, n_classes=2, train_resnet=False).to(device)

model_saved_path = "/kaggle/input/miml-v2/miml_v1.pt"
if not os.path.exists(model_saved_path):
    # Without the checkpoint the model keeps its random initialisation and every
    # prediction below would be meaningless, so stop here instead of continuing.
    raise FileNotFoundError(f'Model checkpoint not found: {model_saved_path}')

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_saved_path, map_location=device))
# print rather than logging.info: INFO records are dropped by the default logging setup.
print(f'Model loaded from {model_saved_path}')



In [7]:
# Inference mode: disables dropout and makes BatchNorm use its running statistics.
model.eval()
predictions = []
# Collect ids from the same batches as the predictions so the two lists are
# aligned by construction (no second read of the metadata CSV needed).
test_isic_ids = []

with torch.no_grad():
    for images, csv_data, ids in test_dl:
        images = images.to(device)
        csv_data = csv_data.to(device)
        outputs = model(images, csv_data)
        probabilities = F.softmax(outputs, dim=1)
        class_one_prob = probabilities[:, 1].cpu().numpy()  # Probability of class index 1
        predictions.extend(class_one_prob)
        test_isic_ids.extend(ids)

assert len(predictions) == len(test_isic_ids), "every prediction needs a matching isic_id"



In [8]:
# df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
# df_sub["target"] = predictions
# df_sub

In [9]:
# df_sub.to_csv("submission.csv", index=False)

In [10]:
# Write the submission file: one row per image id with its predicted probability.
df = pd.DataFrame(dict(isic_id=test_isic_ids, target=predictions))
df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")


Predictions saved to submission.csv


In [11]:
# import pandas as pd


# # Load the test metadata to determine the expected number of rows
# test_metadata = pd.read_csv(os.path.join(kaggle_base_path, "test-metadata.csv"))
# expected_rows = len(test_metadata)

# # Load your predictions DataFrame
# df = pd.read_csv('/kaggle/working/submission.csv')

# # Check that the column names and their order are correct
# if df.columns.tolist() != ['isic_id', 'target']:
#     print("Error: The column names or their order is incorrect.")

# # Check for the correct number of rows (expected_rows comes from the test metadata loaded above)
# if len(df) != expected_rows:
#     print(f"Error: There should be exactly {expected_rows} rows but found {len(df)}.")

# # Check for any missing values
# if df.isna().any().any():
#     print("Error: There are empty values in your DataFrame.")

# # Ensure that all 'target' values are probabilities between 0 and 1
# if not df['target'].between(0, 1).all():
#     print("Error: Target values should be between 0 and 1.")

# # Save the DataFrame correctly
# df.to_csv('/kaggle/working/submission.csv', index=False)
# print("Submission saved to submission.csv")


In [12]:
import pandas as pd

# Read the saved file back and display at most the first 100 rows as a sanity check.
df = pd.read_csv('submission.csv')
df.iloc[:100]


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.998294
1,ISIC_0015729,0.332079
2,ISIC_0015740,1.0
