In [1]:
from sklearn.utils import resample
import pandas as pd

# Load metadata
metadata = pd.read_csv("/Users/lucabernecker/PHD_UiT/Skin_lesions/HAM10000_metadata.csv")

# Check class distribution in the 'dx' column
class_counts = metadata['dx'].value_counts()

# Calculate the average class size
avg_class_size = int(class_counts.mean())

# Oversample minority classes to match the average class size
balanced_metadata = pd.concat(
    [
        
        resample(metadata[metadata['dx'] == label], 
                 replace=True,  # Enable replacement for oversampling
                 n_samples=avg_class_size, 
                 random_state=42)
        for label in class_counts.index
    ]
)

# Check new class distribution after oversampling
balanced_class_counts = balanced_metadata['dx'].value_counts()

# Output class distributions
class_counts, balanced_class_counts

(dx
 nv       6705
 mel      1113
 bkl      1099
 bcc       514
 akiec     327
 vasc      142
 df        115
 Name: count, dtype: int64,
 dx
 nv       1430
 mel      1430
 bkl      1430
 bcc      1430
 akiec    1430
 vasc     1430
 df       1430
 Name: count, dtype: int64)

In [2]:
categorical_columns = ['dx', 'sex', 'localization']
import numpy as np
print(balanced_metadata.head())
import pandas as pd
label_encoders = {}
from sklearn.preprocessing import LabelEncoder
for col in categorical_columns:
    le = LabelEncoder()
    balanced_metadata[col] = le.fit_transform(balanced_metadata[col])
    label_encoders[col] = le  # Store the encoder for inverse transformations if needed


print(balanced_metadata.head())
data = balanced_metadata
data['age'] = data['age'].fillna(data['age'].mean())
print(np.isnan(data["sex"]).sum())  # Check metadata
print(np.isnan(data["localization"]).sum())  # Check images
print(np.isnan(data["age"]).sum())  # Check labels
print(np.isnan(data["dx"]).sum())  # Check labels
print(len(np.unique(data['dx'])))

        lesion_id      image_id  dx    dx_type   age     sex localization
3835  HAM_0000474  ISIC_0030099  nv  follow_up  45.0  female         hand
8367  HAM_0000597  ISIC_0030654  nv      histo  35.0  female      abdomen
8203  HAM_0007585  ISIC_0032347  nv      histo  35.0  female         back
8168  HAM_0005902  ISIC_0027285  nv      histo  40.0  female         foot
6747  HAM_0004380  ISIC_0026251  nv      histo  30.0  female         face
        lesion_id      image_id  dx    dx_type   age  sex  localization
3835  HAM_0000474  ISIC_0030099   5  follow_up  45.0    0             8
8367  HAM_0000597  ISIC_0030654   5      histo  35.0    0             0
8203  HAM_0007585  ISIC_0032347   5      histo  35.0    0             2
8168  HAM_0005902  ISIC_0027285   5      histo  40.0    0             6
6747  HAM_0004380  ISIC_0026251   5      histo  30.0    0             5
0
0
0
0
7


# Model

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Flatten, BatchNormalization, Concatenate, Lambda, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.regularizers import l2
import numpy as np

def MLP(input_shape_demo):
    inputs = Input(shape=input_shape_demo)  # Input layer for metadata
    x = Dense(64, activation='relu')(inputs)  # First dense layer
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(32, activation='relu')(x) 
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    return Model(inputs=inputs, outputs=x)  # Return the full MLP model

def CNN(input_shape):
    # Input for the original scale
    input_layer = Input(shape=input_shape)

    # Downscale the input image
    scale_small = Lambda(lambda x: tf.image.resize(x, (input_shape[0] // 2, input_shape[1] // 2)))(input_layer)

    # ResNet50 backbone for the original scale
    base_model_original = ResNet50(
        input_shape=input_shape,
        include_top=False,
        weights=None,
        name="resnet50_original"  # Unique name for the original scale model
    )
    x_original = base_model_original(input_layer)
    x_original = Flatten()(x_original)
    x_original = BatchNormalization()(x_original)

    # ResNet50 backbone for the smaller scale
    base_model_small = ResNet50(
        input_shape=(input_shape[0] // 2, input_shape[1] // 2, input_shape[2]),
        include_top=False,
        weights=None,
        name="resnet50_small"  # Unique name for the smaller scale model
    )
    x_small = base_model_small(scale_small)
    x_small = Flatten()(x_small)
    x_small = BatchNormalization()(x_small)

    # Concatenate features from both scales
    combined_features = Concatenate()([x_original, x_small])
    x = BatchNormalization()(combined_features)
    x = Dense(1024, activation="relu",kernel_regularizer=l2(0.01))(x)
    x = BatchNormalization()(x)
    # Create the final model
    model = Model(inputs=input_layer, outputs=x)

    return model

# Input layers for metadata and images
input_meta = Input(shape=(3,))  # Metadata input (age, sex, localization)
input_image = Input(shape=(450, 450, 3))  # Image input

# CNN model for image processing
cnn_model = CNN(input_shape=(450, 450, 3))  # Create the CNN model
x2_out = cnn_model(input_image)  # Pass image input to the CNN model

# MLP model for metadata processing
mlp_model = MLP(input_shape_demo=(3,))  # Create the MLP model
x1_out = mlp_model(input_meta)  # Pass metadata input to the MLP model

# Concatenate the outputs of both models
combined_input = Concatenate()([x1_out, x2_out])

# Apply Dense layers after concatenation
x = Dense(128, activation="relu",kernel_regularizer=l2(0.01))(combined_input)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
x = Dense(128, activation="relu",kernel_regularizer=l2(0.01))(x)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
# Output layer (softmax for multi-class classification)
output = Dense(len(np.unique(data['dx'])), activation="softmax")(x)

# Create the final model
model = Model(inputs=[input_meta, input_image], outputs=output)




2025-01-08 10:36:25.406265: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-01-08 10:36:25.406286: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-01-08 10:36:25.406292: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-01-08 10:36:25.406307: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-08 10:36:25.406315: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import Sequence
data = balanced_metadata
import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os

class DataGenerator(Sequence):
    def __init__(self, dataframe, image_dir, batch_size, augment=False):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.batch_size = batch_size
        self.indices = np.arange(len(dataframe))
        self.augment = augment  # Flag to enable/disable augmentation
        self.scaler = MinMaxScaler()  # Initialize scaler for metadata

    def __len__(self):
        return int(np.ceil(len(self.dataframe) / self.batch_size))
    
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_data = self.dataframe.iloc[batch_indices]
        
        # Process inputs (e.g., images) and labels
        X_images = np.array([
            self.load_image(os.path.join(self.image_dir, f"{image_id}.jpg"))
            for image_id in batch_data['image_id']
        ], dtype=np.float32)  # Cast to float32 for consistency
        
        if self.augment:
            X_images = self.augment_images(X_images)  # Apply augmentation
        
        X_meta = batch_data[['age', 'sex', 'localization']].values.astype(np.float32)  # Ensure float32
        X_meta = self.scaler.fit_transform(X_meta)  # Scale metadata
        
        # Prepare labels
        y = batch_data['dx'].values
        y = to_categorical(y, num_classes=7)
        
        return (X_meta, X_images), y
    
    def load_image(self, image_path):
        img = tf.keras.preprocessing.image.load_img(image_path, target_size=(450, 450))
        img = tf.keras.preprocessing.image.img_to_array(img)
        img = img / 255.0  # Normalize pixel values to [0, 1]
        return img

    def augment_images(self, images):
        augmented_images = []
        for img in images:
            # Apply random augmentations using tf.image
            img = tf.image.random_flip_left_right(img)  # Random horizontal flip
            img = tf.image.random_flip_up_down(img)  # Random vertical flip
            img = tf.image.random_brightness(img, max_delta=0.1)  # Random brightness adjustment
            img = tf.image.random_contrast(img, lower=0.8, upper=1.2)  # Random contrast adjustment
            img = tf.image.random_saturation(img, lower=0.8, upper=1.2)  # Random saturation adjustment
            augmented_images.append(img)
        return np.array(augmented_images, dtype=np.float32)


# Image directory (replace this with the actual path where images are stored)
image_dir = "/Users/lucabernecker/PHD_UiT/Skin_lesions/HAM10000_images_part_1"

# Hyperparameters
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)

# Hyperparameters
batch_size = 4
input_shape_demo = (3,)  # For metadata (age, sex, localization)
input_shape = (450, 450, 3)  # For image data

# Create the training and validation generators
train_generator = DataGenerator(
    dataframe=train_df,
    image_dir=image_dir,
    batch_size=batch_size,
    augment=True  
)

val_generator = DataGenerator(
    dataframe=val_df,
    image_dir=image_dir,
    batch_size=batch_size,
    augment=False  
)

# Create the model
optimizer = tf.keras.optimizers.SGD(learning_rate=0.005, clipvalue=1.0)

# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Define the ModelCheckpoint callback


# Train the model
history = model.fit(
    train_generator, 
    validation_data=val_generator, 
    epochs=10,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator)
)

# Optionally, print the training history
print(history.history)

Epoch 1/10


  self._warn_if_super_not_called()
2025-01-08 10:36:29.436738: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m  51/2002[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m56:12[0m 2s/step - accuracy: 0.0832 - loss: 26.6583    