In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Concatenate, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import psutil
import os
import time
import gc
from sklearn.utils import class_weight
import math



In [3]:
train_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')


In [4]:
# Map 'sex' to binary values: male -> 1, female -> 0
train_df['sex'] = train_df['sex'].map({'male': 1, 'female': 0})
test_df['sex'] = test_df['sex'].map({'male': 1, 'female': 0})

In [5]:
# Function to impute missing values with mode
def impute_mode(df, column_name):
    mode_value = df[column_name].mode()[0]  # Calculate mode
    df[column_name] = df[column_name].fillna(mode_value)  # Fill missing values with mode

# Impute missing values in 'sex' column for both train and test datasets
impute_mode(train_df, 'sex')
impute_mode(test_df, 'sex')

# Impute missing values in 'age_approx' column for both train and test datasets
impute_mode(train_df, 'age_approx')
impute_mode(test_df, 'age_approx')

# Continue with the existing code for 'anatom_site_general_challenge'
# Calculate mode excluding 'unknown' for the training dataset
train_mode_anatom_site = train_df.loc[train_df['anatom_site_general_challenge'] != 'unknown', 'anatom_site_general_challenge'].mode()[0]

# Fill missing values with the mode in the training dataset
train_df['anatom_site_general_challenge'] = train_df['anatom_site_general_challenge'].replace('unknown', np.nan)
train_df['anatom_site_general_challenge'] = train_df['anatom_site_general_challenge'].fillna(train_mode_anatom_site)

# Calculate mode excluding 'unknown' for the test dataset
test_mode_anatom_site = test_df.loc[test_df['anatom_site_general_challenge'] != 'unknown', 'anatom_site_general_challenge'].mode()[0]

# Fill missing values with the mode in the test dataset
test_df['anatom_site_general_challenge'] = test_df['anatom_site_general_challenge'].replace('unknown', np.nan)
test_df['anatom_site_general_challenge'] = test_df['anatom_site_general_challenge'].fillna(test_mode_anatom_site)

# Convert all values in 'anatom_site_general_challenge' and 'sex' to strings
train_df['anatom_site_general_challenge'] = train_df['anatom_site_general_challenge'].astype(str)
test_df['anatom_site_general_challenge'] = test_df['anatom_site_general_challenge'].astype(str)

# Encode 'anatom_site_general_challenge' column
le_anatom_site = LabelEncoder()
train_df['anatom_site_general_challenge'] = le_anatom_site.fit_transform(train_df['anatom_site_general_challenge'])
test_df['anatom_site_general_challenge'] = le_anatom_site.transform(test_df['anatom_site_general_challenge'])


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33126 entries, 0 to 33125
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     33126 non-null  object 
 1   patient_id                     33126 non-null  object 
 2   sex                            33126 non-null  float64
 3   age_approx                     33126 non-null  float64
 4   anatom_site_general_challenge  33126 non-null  int64  
 5   diagnosis                      33126 non-null  object 
 6   benign_malignant               33126 non-null  object 
 7   target                         33126 non-null  int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 2.0+ MB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10982 entries, 0 to 10981
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     10982 non-null  object 
 1   patient_id                     10982 non-null  object 
 2   sex                            10982 non-null  int64  
 3   age_approx                     10982 non-null  float64
 4   anatom_site_general_challenge  10982 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 429.1+ KB


In [8]:
# Convert 'target' to float
train_df['target'] = train_df['target'].astype(float)

In [9]:
def create_tf_data_generator(df, img_dir, batch_size=32, target_size=(128, 128), is_train=True):
    def load_data(row):
        img_path = tf.strings.join([img_dir, '/', row['image_name'], '.jpg'])
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, target_size)
        img = img / 255.0
        patient_data = tf.stack([
            tf.cast(row['sex'], tf.float32),
            tf.cast(row['age_approx'], tf.float32),
            tf.cast(row['anatom_site_general_challenge'], tf.float32)
        ], axis=-1)
        return img, patient_data

    def load_data_with_labels(row):
        img, patient_data = load_data(row)
        label = tf.cast(row['target'], tf.float32)
        return (img, patient_data), label

    dataset = tf.data.Dataset.from_tensor_slices(dict(df))
    if is_train:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.map(load_data_with_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset


In [10]:
# Create data generators
train_generator = create_tf_data_generator(train_df, img_dir='/kaggle/input/siim-isic-melanoma-classification/jpeg/train', is_train=True)
val_generator = create_tf_data_generator(train_df.sample(frac=0.2, random_state=42), img_dir='/kaggle/input/siim-isic-melanoma-classification/jpeg/train', is_train=False)


In [11]:
# Define the directory containing the test images
test_img_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/test'

# Filter test_df to include only the image files that exist in the directory
existing_files = os.listdir(test_img_dir)
test_df = test_df[test_df['image_name'].apply(lambda x: f'{x}.jpg' in existing_files)]

def create_tf_test_data_generator(df, img_dir, batch_size=32, target_size=(128, 128)):
    def load_data(row):
        img_path = tf.strings.join([img_dir, '/', row['image_name'], '.jpg'])
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, target_size)
        img = img / 255.0
        patient_data = tf.stack([
            tf.cast(row['sex'], tf.float32),
            tf.cast(row['age_approx'], tf.float32),
            tf.cast(row['anatom_site_general_challenge'], tf.float32)
        ], axis=-1)
        return img, patient_data

    def load_data_no_labels(row):
        img, patient_data = load_data(row)
        return img, patient_data

    dataset = tf.data.Dataset.from_tensor_slices(dict(df))
    dataset = dataset.map(load_data_no_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset


In [12]:
# Check for NaN values in the input data
print(test_df.isna().sum())


image_name                       0
patient_id                       0
sex                              0
age_approx                       0
anatom_site_general_challenge    0
dtype: int64


In [13]:

# Create test data generator
test_generator = create_tf_test_data_generator(test_df, img_dir=test_img_dir)

# Generate predictions
test_images, test_patient_data = [], []
for batch in test_generator:
    images, patient_data = batch
    test_images.append(images)
    test_patient_data.append(patient_data)

test_images = tf.concat(test_images, axis=0)
test_patient_data = tf.concat(test_patient_data, axis=0)

In [14]:
# Ensure shapes are as expected
print(f"Test images shape: {test_images.shape}")
print(f"Test patient data shape: {test_patient_data.shape}")


Test images shape: (10982, 128, 128, 3)
Test patient data shape: (10982, 3)


In [15]:
train_df

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,1.0,45.0,0,unknown,benign,0.0
1,ISIC_0015719,IP_3075186,0.0,45.0,5,unknown,benign,0.0
2,ISIC_0052212,IP_2842074,0.0,50.0,1,nevus,benign,0.0
3,ISIC_0068279,IP_6890425,0.0,45.0,0,unknown,benign,0.0
4,ISIC_0074268,IP_8723313,0.0,55.0,5,unknown,benign,0.0
...,...,...,...,...,...,...,...,...
33121,ISIC_9999134,IP_6526534,1.0,50.0,4,unknown,benign,0.0
33122,ISIC_9999320,IP_3650745,1.0,65.0,4,unknown,benign,0.0
33123,ISIC_9999515,IP_2026598,1.0,20.0,1,unknown,benign,0.0
33124,ISIC_9999666,IP_7702038,1.0,50.0,1,unknown,benign,0.0


In [16]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dropout, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Build the image processing model using ResNet50
image_input = Input(shape=(128, 128, 3))
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
image_features = Model(inputs=image_input, outputs=x)

# Define patient data input
patient_input = Input(shape=(train_df[['sex', 'age_approx', 'anatom_site_general_challenge']].shape[1],))
y = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(patient_input)
y = Dropout(0.5)(y)
y = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(y)

# Combine image and patient data features
combined = Concatenate()([image_features.output, y])
z = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(combined)
z = Dropout(0.5)(z)
output = Dense(1, activation='sigmoid', dtype=tf.float32)(z)  # Ensure correct output dtype for mixed precision

# Define and compile the model
optimizer = Adam(learning_rate=1e-5, clipvalue=1.0)
model = Model(inputs=[image_features.input, patient_input], outputs=output)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [17]:
# Function to monitor memory usage
def print_memory_usage():
    process = psutil.Process(os.getpid())
    print(f"Memory Usage: {process.memory_info().rss / 1024 ** 2:.2f} MB")


In [18]:
# Custom callback to log epoch duration and monitor memory
class MemoryCallback(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch+1} took {time.time() - self.epoch_time_start:.2f} seconds")
        print_memory_usage()
        gc.collect()  # Trigger garbage collection to free up memory

In [19]:
# Callbacks

early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001, mode='min', restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)
memory_callback = MemoryCallback()



In [20]:
# Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['target'].values),
    y=train_df['target'].values
)

class_weights_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weights_dict}")


Class weights: {0: 0.5089730194825149, 1: 28.361301369863014}


In [21]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    steps_per_epoch=10,
    validation_steps=80,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr, memory_callback]
)

Epoch 1/10


I0000 00:00:1724772785.011102     105 service.cc:145] XLA service 0x7d63ec004010 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1724772785.011161     105 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1724772785.011168     105 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5

I0000 00:00:1724772818.900006     105 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - accuracy: 0.1047 - loss: 3.9895Epoch 1 took 146.14 seconds
Memory Usage: 7730.92 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 9s/step - accuracy: 0.1037 - loss: 3.9843 - val_accuracy: 0.0191 - val_loss: 4.8680 - learning_rate: 1.0000e-05
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.1304 - loss: 4.0113Epoch 2 took 77.02 seconds
Memory Usage: 7830.50 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 9s/step - accuracy: 0.1311 - loss: 4.0003 - val_accuracy: 0.0168 - val_loss: 4.8204 - learning_rate: 1.0000e-05
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.1487 - loss: 3.8899

  self.gen.throw(typ, value, traceback)


Epoch 3 took 48.95 seconds
Memory Usage: 7735.34 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5s/step - accuracy: 0.1486 - loss: 3.8743 - val_accuracy: 0.0179 - val_loss: 4.8027 - learning_rate: 1.0000e-05
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.1668 - loss: 3.5911Epoch 4 took 71.99 seconds
Memory Usage: 7893.19 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 8s/step - accuracy: 0.1701 - loss: 3.5842 - val_accuracy: 0.0191 - val_loss: 4.8071 - learning_rate: 1.0000e-05
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.1992 - loss: 3.5270Epoch 5 took 72.65 seconds
Memory Usage: 8011.49 MB
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 8s/step - accuracy: 0.1979 - loss: 3.5655 - val_accuracy: 0.0168 - val_loss: 4.5555 - learning_rate: 1.0000e-05
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [22]:
# Save the model as an .h5 file
model.save('model.h5')

In [23]:
# Make predictions on test data
test_predictions = model.predict([test_images, test_patient_data], verbose=1)

[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 38ms/step


In [24]:
np.isnan(test_predictions).sum()

0

In [25]:
# Prepare the submission
submission = pd.DataFrame({'image_name': test_df['image_name'],'target': test_predictions.squeeze()})
submission.to_csv('submission.csv', index=False)

In [26]:
submission

Unnamed: 0,image_name,target
0,ISIC_0052060,0.608435
1,ISIC_0052349,0.548653
2,ISIC_0058510,0.627943
3,ISIC_0073313,0.607167
4,ISIC_0073502,0.547075
...,...,...
10977,ISIC_9992485,0.533559
10978,ISIC_9996992,0.522832
10979,ISIC_9997917,0.480732
10980,ISIC_9998234,0.613093
