In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Concatenate, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import psutil
import os
import time
import gc
from sklearn.utils import class_weight
import math



2024-07-05 17:40:26.415939: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 17:40:26.416063: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 17:40:26.540103: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
train_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')


In [3]:
# Map 'sex' to binary values: male -> 1, female -> 0
train_df['sex'] = train_df['sex'].map({'male': 1, 'female': 0})
test_df['sex'] = test_df['sex'].map({'male': 1, 'female': 0})



In [4]:
# Fill missing values with 'unknown'
train_df['anatom_site_general_challenge'] = train_df['anatom_site_general_challenge'].fillna('unknown')
test_df['anatom_site_general_challenge'] = test_df['anatom_site_general_challenge'].fillna('unknown')

# Convert all values in 'anatom_site_general_challenge' and 'sex' to strings
train_df['anatom_site_general_challenge'] = train_df['anatom_site_general_challenge'].astype(str)
test_df['anatom_site_general_challenge'] = test_df['anatom_site_general_challenge'].astype(str)

# Encode 'anatom_site_general_challenge' column
le_anatom_site = LabelEncoder()
train_df['anatom_site_general_challenge'] = le_anatom_site.fit_transform(train_df['anatom_site_general_challenge'])
test_df['anatom_site_general_challenge'] = le_anatom_site.transform(test_df['anatom_site_general_challenge'])

In [5]:
# Convert 'target' to float
train_df['target'] = train_df['target'].astype(float)

In [6]:
def create_tf_data_generator(df, img_dir, batch_size=32, target_size=(128, 128), is_train=True):
    def load_data(row):
        img_path = tf.strings.join([img_dir, '/', row['image_name'], '.jpg'])
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, target_size)
        img = img / 255.0
        patient_data = tf.stack([
            tf.cast(row['sex'], tf.float32),
            tf.cast(row['age_approx'], tf.float32),
            tf.cast(row['anatom_site_general_challenge'], tf.float32)
        ], axis=-1)
        return img, patient_data

    def load_data_with_labels(row):
        img, patient_data = load_data(row)
        label = tf.cast(row['target'], tf.float32)
        return (img, patient_data), label

    dataset = tf.data.Dataset.from_tensor_slices(dict(df))
    if is_train:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.map(load_data_with_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset


In [7]:
# Create data generators
train_generator = create_tf_data_generator(train_df, img_dir='/kaggle/input/siim-isic-melanoma-classification/jpeg/train', is_train=True)
val_generator = create_tf_data_generator(train_df.sample(frac=0.2, random_state=42), img_dir='/kaggle/input/siim-isic-melanoma-classification/jpeg/train', is_train=False)


In [8]:
# Define the directory containing the test images
test_img_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/test'

# Filter test_df to include only the image files that exist in the directory
existing_files = os.listdir(test_img_dir)
test_df = test_df[test_df['image_name'].apply(lambda x: f'{x}.jpg' in existing_files)]

def create_tf_test_data_generator(df, img_dir, batch_size=32, target_size=(128, 128)):
    def load_data(row):
        img_path = tf.strings.join([img_dir, '/', row['image_name'], '.jpg'])
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, target_size)
        img = img / 255.0
        patient_data = tf.stack([
            tf.cast(row['sex'], tf.float32),
            tf.cast(row['age_approx'], tf.float32),
            tf.cast(row['anatom_site_general_challenge'], tf.float32)
        ], axis=-1)
        return img, patient_data

    def load_data_no_labels(row):
        img, patient_data = load_data(row)
        return img, patient_data

    dataset = tf.data.Dataset.from_tensor_slices(dict(df))
    dataset = dataset.map(load_data_no_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset


In [9]:
# Check for NaN values in the input data
print(test_df.isna().sum())


image_name                       0
patient_id                       0
sex                              0
age_approx                       0
anatom_site_general_challenge    0
dtype: int64


In [10]:

# Create test data generator
test_generator = create_tf_test_data_generator(test_df, img_dir=test_img_dir)

# Generate predictions
test_images, test_patient_data = [], []
for batch in test_generator:
    images, patient_data = batch
    test_images.append(images)
    test_patient_data.append(patient_data)

test_images = tf.concat(test_images, axis=0)
test_patient_data = tf.concat(test_patient_data, axis=0)

In [11]:
# Ensure shapes are as expected
print(f"Test images shape: {test_images.shape}")
print(f"Test patient data shape: {test_patient_data.shape}")


Test images shape: (10982, 128, 128, 3)
Test patient data shape: (10982, 3)


In [12]:
train_df

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,1.0,45.0,0,unknown,benign,0.0
1,ISIC_0015719,IP_3075186,0.0,45.0,6,unknown,benign,0.0
2,ISIC_0052212,IP_2842074,0.0,50.0,1,nevus,benign,0.0
3,ISIC_0068279,IP_6890425,0.0,45.0,0,unknown,benign,0.0
4,ISIC_0074268,IP_8723313,0.0,55.0,6,unknown,benign,0.0
...,...,...,...,...,...,...,...,...
33121,ISIC_9999134,IP_6526534,1.0,50.0,4,unknown,benign,0.0
33122,ISIC_9999320,IP_3650745,1.0,65.0,4,unknown,benign,0.0
33123,ISIC_9999515,IP_2026598,1.0,20.0,1,unknown,benign,0.0
33124,ISIC_9999666,IP_7702038,1.0,50.0,1,unknown,benign,0.0


In [13]:
# Build the model
image_input = Input(shape=(128, 128, 3))
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_tensor=image_input)
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
image_features = Model(inputs=image_input, outputs=x)

# Define patient data input
patient_input = Input(shape=(train_df[['sex', 'age_approx', 'anatom_site_general_challenge']].shape[1],))
y = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(patient_input)
y = Dropout(0.5)(y)
y = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(y)

# Combine image and patient data features
combined = Concatenate()([image_features.output, y])
z = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(combined)
z = Dropout(0.5)(z)
output = Dense(1, activation='sigmoid', dtype=tf.float32)(z)  # Ensure correct output dtype for mixed precision

# Define and compile the model
optimizer = Adam(learning_rate=1e-5, clipvalue=1.0)
model = Model(inputs=[image_features.input, patient_input], outputs=output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [19]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dropout, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Build the image processing model using ResNet50
image_input = Input(shape=(128, 128, 3))
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
image_features = Model(inputs=image_input, outputs=x)

# Define patient data input
patient_input = Input(shape=(train_df[['sex', 'age_approx', 'anatom_site_general_challenge']].shape[1],))
y = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(patient_input)
y = Dropout(0.5)(y)
y = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(y)

# Combine image and patient data features
combined = Concatenate()([image_features.output, y])
z = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(combined)
z = Dropout(0.5)(z)
output = Dense(1, activation='sigmoid', dtype=tf.float32)(z)  # Ensure correct output dtype for mixed precision

# Define and compile the model
optimizer = Adam(learning_rate=1e-5, clipvalue=1.0)
model = Model(inputs=[image_features.input, patient_input], outputs=output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [14]:
# Function to monitor memory usage
def print_memory_usage():
    process = psutil.Process(os.getpid())
    print(f"Memory Usage: {process.memory_info().rss / 1024 ** 2:.2f} MB")


In [15]:
# Custom callback to log epoch duration and monitor memory
class MemoryCallback(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch+1} took {time.time() - self.epoch_time_start:.2f} seconds")
        print_memory_usage()
        gc.collect()  # Trigger garbage collection to free up memory

In [16]:
# Callbacks

early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001, mode='min', restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)
memory_callback = MemoryCallback()



In [17]:
# Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['target'].values),
    y=train_df['target'].values
)

class_weights_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weights_dict}")


Class weights: {0: 0.5089730194825149, 1: 28.361301369863014}


In [18]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    steps_per_epoch=10,
    validation_steps=80,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr, memory_callback]
)

Epoch 1/10
[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m15:39[0m 104s/step - accuracy: 0.6875 - loss: 2.4543

I0000 00:00:1720201820.661342     116 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1720201820.778601     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step - accuracy: 0.6754 - loss: nan 

W0000 00:00:1720201836.515552     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


Epoch 1 took 185.32 seconds
Memory Usage: 8367.72 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 9s/step - accuracy: 0.6776 - loss: nan - val_accuracy: 0.9809 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.9817 - loss: nanEpoch 2 took 79.10 seconds
Memory Usage: 8403.86 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 9s/step - accuracy: 0.9828 - loss: nan - val_accuracy: 0.9832 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.9925 - loss: nan

  self.gen.throw(typ, value, traceback)


Epoch 3 took 53.60 seconds
Memory Usage: 8328.58 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 6s/step - accuracy: 0.9921 - loss: nan - val_accuracy: 0.9821 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.9740 - loss: nanEpoch 4 took 71.58 seconds
Memory Usage: 8607.18 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 8s/step - accuracy: 0.9747 - loss: nan - val_accuracy: 0.9809 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.9797 - loss: nan
Epoch 5: ReduceLROnPlateau reducing learning rate to 1.9999999494757505e-06.
Epoch 5 took 71.03 seconds
Memory Usage: 8613.11 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 8s/step - accuracy: 0.9795 - loss: nan - val_accuracy: 0.9832 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 5: early stopping

In [20]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    steps_per_epoch=10,
    validation_steps=80,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr, memory_callback]
)

Epoch 1/10
[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m11:47[0m 79s/step - accuracy: 0.4062 - loss: 2.9364

W0000 00:00:1720203293.830662     114 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step - accuracy: 0.4069 - loss: 2.9558

W0000 00:00:1720203306.863228     115 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


Epoch 1 took 156.24 seconds
Memory Usage: 10146.30 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 9s/step - accuracy: 0.4080 - loss: 2.9555 - val_accuracy: 0.0215 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.4360 - loss: nan   Epoch 2 took 74.61 seconds
Memory Usage: 10217.71 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 8s/step - accuracy: 0.4358 - loss: nan - val_accuracy: 0.9832 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.9834 - loss: nan

  self.gen.throw(typ, value, traceback)


Epoch 3 took 49.41 seconds
Memory Usage: 10061.07 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 6s/step - accuracy: 0.9835 - loss: nan - val_accuracy: 0.9821 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.9596 - loss: nanEpoch 4 took 70.82 seconds
Memory Usage: 10068.09 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 8s/step - accuracy: 0.9610 - loss: nan - val_accuracy: 0.9809 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - accuracy: 0.9827 - loss: nan
Epoch 5: ReduceLROnPlateau reducing learning rate to 1.9999999494757505e-06.
Epoch 5 took 70.75 seconds
Memory Usage: 10211.02 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 8s/step - accuracy: 0.9831 - loss: nan - val_accuracy: 0.9832 - val_loss: nan - learning_rate: 1.0000e-05
Epoch 5: early sto

In [None]:
# Make predictions on test data
test_predictions = model.predict([test_images, test_patient_data], verbose=1)

In [None]:
np.isnan(test_predictions).sum()

In [None]:
# Prepare the submission
submission = pd.DataFrame({'image_name': test_df['image_name'],'target': test_predictions.squeeze()})
submission.to_csv('submission.csv', index=False)

In [None]:
submission