# Load Data

In [None]:
# Allow multiple df print-outs within a single code block: with "all", every
# top-level expression in a cell is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [None]:
try:
    from datasets import load_dataset
except:
    !pip install datasets
    from datasets import load_dataset

# Load dataset
skin_cancer_ds = load_dataset("marmal88/skin_cancer")

# Print structure
print(skin_cancer_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00005-7eed077f2f8e6d(…):   0%|          | 0.00/521M [00:00<?, ?B/s]

data/train-00001-of-00005-50ba64fd20294b(…):   0%|          | 0.00/525M [00:00<?, ?B/s]

data/train-00002-of-00005-36c02a25cbdd54(…):   0%|          | 0.00/527M [00:00<?, ?B/s]

data/train-00003-of-00005-27da80cf1cb259(…):   0%|          | 0.00/528M [00:00<?, ?B/s]

data/train-00004-of-00005-264fb0c337457a(…):   0%|          | 0.00/548M [00:00<?, ?B/s]

data/validation-00000-of-00002-9cc6b2a1d(…):   0%|          | 0.00/341M [00:00<?, ?B/s]

data/validation-00001-of-00002-900252bc4(…):   0%|          | 0.00/348M [00:00<?, ?B/s]

data/test-00000-of-00001-61e7cf54bf274ae(…):   0%|          | 0.00/355M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9577 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2492 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1285 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'],
        num_rows: 9577
    })
    validation: Dataset({
        features: ['image', 'image_id', 'lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'],
        num_rows: 2492
    })
    test: Dataset({
        features: ['image', 'image_id', 'lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'],
        num_rows: 1285
    })
})


# Initial Pre-Processing

* Merge pre-given splits
* Extract image data
* Perform basic cleaning
* Split into training and test sets

In [None]:
import pandas as pd

# Convert each pre-given split to its own pandas DataFrame
skin_train_df = skin_cancer_ds["train"].to_pandas()
skin_valid_df = skin_cancer_ds["validation"].to_pandas()
skin_test_df = skin_cancer_ds["test"].to_pandas()

# Merge all splits, manually split later.
# ignore_index=True renumbers the rows 0..n-1; without it every split keeps its
# own row labels, so the merged frame would contain repeated index labels and
# label-based selection/alignment could match several rows at once.
skin_df = pd.concat(
    (skin_train_df, skin_valid_df, skin_test_df),
    axis=0,
    ignore_index=True,
)

# Check
print(len(skin_df))
skin_df.head()

13354


Unnamed: 0,image,image_id,lesion_id,dx,dx_type,age,sex,localization
0,{'bytes': b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\...,ISIC_0024329,HAM_0002954,actinic_keratoses,histo,75.0,female,lower extremity
1,{'bytes': b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\...,ISIC_0024372,HAM_0005389,actinic_keratoses,histo,70.0,male,lower extremity
2,{'bytes': b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\...,ISIC_0024418,HAM_0003380,actinic_keratoses,histo,75.0,female,lower extremity
3,{'bytes': b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\...,ISIC_0024450,HAM_0005505,actinic_keratoses,histo,50.0,male,upper extremity
4,{'bytes': b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\...,ISIC_0024463,HAM_0004568,actinic_keratoses,histo,50.0,male,upper extremity


In [None]:
# Images are stored as bytes inside a dictionary; extract the bytes so the
# column is hashable and can be checked for duplicates and null values
def extract_bytes(dict_object):
    """Return the raw image bytes held under the ``"bytes"`` key.

    Args:
        dict_object: Mapping with a ``"bytes"`` entry. A value that is already
            ``bytes``/``bytearray``, or ``None``, is also accepted.

    Returns:
        The value stored under ``"bytes"``; or the input itself when it is
        already bytes-like or ``None``.
    """
    # Pass-through keeps the extraction idempotent (re-running the cell on an
    # already-converted column is a no-op) and lets missing images survive as
    # nulls so the later null-dropping step can remove them.
    if dict_object is None or isinstance(dict_object, (bytes, bytearray)):
        return dict_object
    return dict_object["bytes"]

# Swap each image dictionary for its raw bytes payload
skin_df["image"] = skin_df["image"].map(extract_bytes)

# Check
skin_df.head()

Unnamed: 0,image,image_id,lesion_id,dx,dx_type,age,sex,localization
0,b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...,ISIC_0024329,HAM_0002954,actinic_keratoses,histo,75.0,female,lower extremity
1,b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...,ISIC_0024372,HAM_0005389,actinic_keratoses,histo,70.0,male,lower extremity
2,b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...,ISIC_0024418,HAM_0003380,actinic_keratoses,histo,75.0,female,lower extremity
3,b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...,ISIC_0024450,HAM_0005505,actinic_keratoses,histo,50.0,male,upper extremity
4,b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...,ISIC_0024463,HAM_0004568,actinic_keratoses,histo,50.0,male,upper extremity


In [None]:
# Basic cleaning: discard rows with any missing value, then exact duplicate rows
skin_df = skin_df.dropna().drop_duplicates()

# Check
print(skin_df.shape[0])

9958


In [None]:
# Keep only a random fraction of the rows to save resources (seeded for repeatability)
sampling_percentage = 0.5
skin_df = skin_df.sample(random_state=60, frac=sampling_percentage)

# Check
print(skin_df.shape[0])

4979


In [None]:
from sklearn.model_selection import train_test_split

# Separate the image column, the tabular columns and the labels
image_df = skin_df["image"]
tabular_df = skin_df[["age", "sex", "localization"]]
labels_df = skin_df["dx"]

# Split all three in a single call so their rows stay aligned; doing this now
# means feature-engineering steps are fitted on the training portion only.
# NOTE(review): the split is neither stratified by `dx` nor grouped by
# `lesion_id` - confirm this is intended.
(
    image_train_df,
    image_test_df,
    tabular_train_df,
    tabular_test_df,
    labels_train_df,
    labels_test_df,
) = train_test_split(
    image_df,
    tabular_df,
    labels_df,
    test_size=0.2,
    random_state=60,
)

# Check
for split_part in (
    image_train_df,
    image_test_df,
    tabular_train_df,
    tabular_test_df,
    labels_train_df,
    labels_test_df,
):
    print(len(split_part))

3983
996
3983
996
3983
996


# Feature Engineering: Labels

* Use label encoding
* Export

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encoding is used because it's preferred for multi-class with scikit-learn
# Fit the encoder on the training labels only
label_encoder = LabelEncoder()
label_encoder.fit(labels_train_df)
# Encode both splits with the fitted encoder
y_train, y_test = (
    label_encoder.transform(label_split)
    for label_split in (labels_train_df, labels_test_df)
)

# Check
for encoded_labels in (y_train, y_test):
    print(encoded_labels.shape)

(3983,)
(996,)


In [None]:
import pickle

# Export label NumPy arrays as a (train, test) tuple.
# The `with` block closes the file even if pickling raises part-way through.
with open("skin_cancer_labels.data", "wb") as file_to_write:
    pickle.dump((y_train, y_test), file_to_write)

# Export the fitted label encoder so predictions can be mapped back to class names
with open("skin_cancer_encode_labels.preprocess", "wb") as file_to_write:
    pickle.dump(label_encoder, file_to_write)

# Feature Engineering: Tabular

* Use one-hot encoding for categorical features
* Use standard scaling for numerical features

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups by feature type
categorical_features = ["sex", "localization"]
numerical_features = ["age"]

# Preprocessing: standard-scale the numerical columns and one-hot encode the
# categorical ones (dropping the first level of each)
tabular_transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        (
            "cat",
            OneHotEncoder(sparse_output=False, drop="first"),
            categorical_features,
        ),
    ]
)
tabular_transformer.fit(tabular_train_df)

# Transform both splits with the transformer fitted on the training split
X_tabular_train, X_tabular_test = (
    tabular_transformer.transform(tabular_split)
    for tabular_split in (tabular_train_df, tabular_test_df)
)

# Check
for tabular_matrix in (X_tabular_train, X_tabular_test):
    print(tabular_matrix.shape)

(3983, 17)
(996, 17)


In [None]:
import pickle

# Export the fitted tabular transformer.
# The `with` block closes the file even if pickling raises part-way through.
with open("skin_cancer_tabular.preprocess", "wb") as file_to_write:
    pickle.dump(tabular_transformer, file_to_write)

# Feature Engineering: Image(s)

* Build ResNet50 featuriser and apply to images
* Apply PCA to resulting features
* Export

In [None]:
from io import BytesIO
from PIL import Image
import numpy as np

# Convert image bytes data to NumPy for CNN use
def bytes_to_np(bytes_data):
    """Decode an encoded image (e.g. JPEG bytes) into an RGB NumPy array.

    Args:
        bytes_data: Contents of an encoded image file as a bytes object.

    Returns:
        NumPy array of shape (height, width, 3) holding the decoded pixels.
    """
    # The context manager releases the image's resources once the pixel data
    # has been copied out.
    with Image.open(BytesIO(bytes_data)) as image:
        # Force three channels so greyscale / palette / RGBA inputs still match
        # the 3-channel input expected by the CNN; RGB images are unchanged.
        return np.asarray(image.convert("RGB"))

In [None]:
try:
    import tensorflow as tf
except:
    !pip install tensorflow
    import tensorflow as tf

# Pre-trained ResNet50 backbone, frozen and used purely as a feature extractor
pre_cnn_model = tf.keras.applications.ResNet50(
    weights="imagenet",       # Pre-trained weights
    include_top=False,        # Remove classification head
    pooling="avg",            # Return vector
    input_shape=(85, 85, 3),
)
pre_cnn_model.trainable = False

# Graph from the original NumPy array shape to the CNN features:
# input -> centre crop to a square -> resize down -> ResNet preprocessing -> CNN
inputs = tf.keras.Input(shape=(450, 600, 3))
cropped = tf.keras.layers.CenterCrop(height=450, width=450)(inputs)
resized = tf.keras.layers.Resizing(height=85, width=85)(cropped)
# Reorder RGB to BGR and normalise
scaled = tf.keras.applications.resnet.preprocess_input(resized)
output_features = pre_cnn_model(scaled)

# Wrap the whole graph in a single, frozen model
image_featuriser = tf.keras.Model(inputs=inputs, outputs=output_features)
image_featuriser.trainable = False

# Check (uncomment to inspect the architectures)
# pre_cnn_model.summary()
# image_featuriser.summary()

# Full per-image pipeline: raw bytes -> NumPy pixels -> CNN feature vector
def to_np_and_featurise(bytes_data):
    """Decode one image from bytes and return its CNN feature vector.

    Args:
        bytes_data: Encoded image bytes, as accepted by ``bytes_to_np``.

    Returns:
        NumPy array of features produced by ``image_featuriser`` for this
        image, with the batch dimension removed.
    """
    pixels = bytes_to_np(bytes_data)
    # The model works on batches, so wrap the single image in a batch of one
    single_image_batch = pixels[np.newaxis, ...]
    features = image_featuriser(single_image_batch, training=False)
    # Strip the size-1 batch dimension from the output again
    return np.squeeze(features.numpy())

# Run every image of both splits through the featuriser
image_train_df = image_train_df.apply(to_np_and_featurise)
image_test_df = image_test_df.apply(to_np_and_featurise)

# Check
for featurised_split in (image_train_df, image_test_df):
    print(len(featurised_split))
image_train_df.head()
image_test_df.head()

# Stack the per-image feature vectors into 2-D NumPy matrices
X_image_train = np.stack(image_train_df.tolist())
X_image_test = np.stack(image_test_df.tolist())

# Check
for image_matrix in (X_image_train, X_image_test):
    print(image_matrix.shape)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
3983
996


Unnamed: 0,image
713,"[0.24834803, 0.5357977, 4.305394, 1.5008922, 0..."
9374,"[0.10455877, 0.018607192, 0.12080545, 0.0, 0.0..."
1292,"[3.7213407, 0.33441028, 0.0, 0.06540613, 0.0, ..."
758,"[0.8991038, 1.8635287, 0.44194046, 0.0, 0.0, 4..."
2836,"[0.44252193, 0.14044666, 0.0, 0.0, 0.0, 0.0, 3..."


Unnamed: 0,image
4747,"[0.505203, 0.0, 0.06547478, 0.0, 0.0, 0.0, 1.0..."
97,"[0.28265223, 0.20712323, 0.636814, 0.0, 0.2086..."
402,"[0.012687988, 0.034066744, 0.81564593, 0.55973..."
8199,"[0.55982643, 2.2833056, 0.13409866, 0.0, 0.184..."
8463,"[0.0, 0.5467317, 0.0, 0.032581076, 0.0, 0.0, 1..."


(3983, 2048)
(996, 2048)


In [None]:
import pickle

# Store "raw" data without upcoming PCA: image and tabular features are joined
# column-wise and saved as a (train, test) tuple.
# The `with` block closes the file even if pickling raises part-way through.
with open("skin_cancer_features_no_pca.data", "wb") as file_to_write:
    pickle.dump((
        np.concatenate((X_image_train, X_tabular_train), axis=1),
        np.concatenate((X_image_test, X_tabular_test), axis=1),
    ), file_to_write)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the image features with PCA to 10% of their original number of columns.
# NOTE(review): this cell overwrites X_image_train / X_image_test, so running it
# twice applies PCA to already-reduced features.
image_pca = PCA(
    n_components=round(0.1 * X_image_train.shape[1]),
    random_state=60,
)
image_pca.fit(X_image_train)
# Project both splits onto the components learned from the training split
X_image_train, X_image_test = (
    image_pca.transform(image_split)
    for image_split in (X_image_train, X_image_test)
)

# Check
for reduced_matrix in (X_image_train, X_image_test):
    print(reduced_matrix.shape)

(3983, 205)
(996, 205)


In [None]:
import pickle

# Export all features, concatenating image and tabular column-wise, as a
# (train, test) tuple.
# The `with` blocks close each file even if pickling raises part-way through.
with open("skin_cancer_features.data", "wb") as file_to_write:
    pickle.dump((
        np.concatenate((X_image_train, X_tabular_train), axis=1),
        np.concatenate((X_image_test, X_tabular_test), axis=1),
    ), file_to_write)

# Export the fitted PCA for images so the same projection can be reused later
with open("skin_cancer_image_pca.preprocess", "wb") as file_to_write:
    pickle.dump(image_pca, file_to_write)