In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore') 
import os
import keras
import zipfile
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from io import BytesIO
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from concurrent.futures import ThreadPoolExecutor
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from keras.models import Model, Sequential
from keras.applications import EfficientNetV2M
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, BatchNormalization, Activation, Input, Conv2D, Multiply, Reshape
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.losses import BinaryCrossentropy
from keras.utils import Sequence
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

# Loading the Data

In [2]:
def load_data(undersample_strat=0.07):
    """
    Returns the metadata and image generators for training and validation

    The zipped dataset is extracted to ``Data/`` on first use, the non-cancerous lesions are randomly
    undersampled, and the remaining rows are split 70/30 (stratified by target) into training and
    validation image generators.

    :param undersample_strat: the desired proportion of cancerous to non-cancerous lesions in the dataset
    :return: a tuple ``(metadata, train_generator, val_generator)``; ``metadata`` is the undersampled
             dataframe with an integer ``target`` column and without the image path column
    """
    # Extract zipped data to a local directory, if not done already
    if not os.path.isdir('Data'):
        with zipfile.ZipFile('anon-patient-data.zip', 'r') as zip_ref:
            zip_ref.extractall('Data')
        
    # Load the metadata
    skin_cancer_df = pd.read_csv('Data/train-metadata.csv', low_memory=False, usecols=[num for num in range(0, 43) if num not in [2, 7]], index_col='isic_id')
        
    # Randomly undersample the cancer-free lesions in the dataset to enhance performance time and address class imbalance
    rus = RandomUnderSampler(random_state=42, sampling_strategy=undersample_strat)
    (skin_cancer_df, targets) = rus.fit_resample(skin_cancer_df.drop('target', axis=1), skin_cancer_df['target'])
    
    # Add an image path column and turn targets into binary strings (class_mode='binary' expects string labels)
    skin_cancer_df['image_filepath'] = ['Data/image/' + img_id + '.jpg' for img_id in skin_cancer_df.index]
    skin_cancer_df['target'] = targets.astype(str)
    
    # Initialize ImageDataGenerators for training (augmented) and validation (rescaled only)
    train_df, val_df = train_test_split(skin_cancer_df, test_size=0.3, stratify=targets, random_state=42)
    train_datagen = ImageDataGenerator(
        rescale=1./255, 
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True)
    val_datagen = ImageDataGenerator(rescale=1./255)

    # Create training and validation generators from their respective datasets
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        directory=None,
        x_col='image_filepath',
        y_col='target',
        target_size=(224, 224),
        batch_size=32,
        class_mode='binary',
        seed=42)  # seeded so shuffling and augmentation are repeatable across runs
    val_generator = val_datagen.flow_from_dataframe(
        dataframe=val_df,
        directory=None,
        x_col='image_filepath',
        y_col='target',
        target_size=(224, 224),
        batch_size=32,
        class_mode='binary',
        shuffle=False)  # keep val_df row order so predictions can be matched back to rows
    
    # Turn the targets back to integers
    skin_cancer_df['target'] = targets.astype(int)
    
    return skin_cancer_df.drop('image_filepath', axis=1), train_generator, val_generator

def extract_image_features(train_generator, val_generator):
    """
    Fine-tunes a ResNet50 model on images of skin lesions and uses it to extract image features
    
    :param train_generator: stores augmented images for training
    :param val_generator: stores scaled images for validation
    :return: a tuple ``(images_features, history, base_model)``: the globally average-pooled ResNet50
             features of the training images stacked on top of those of the validation images, the
             Keras training history, and the fine-tuned ResNet50 backbone
    """
    # Load ResNet50 with pre-trained ImageNet weights
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze lower layers and allow fine-tuning on upper layers
    for layer in base_model.layers[:int(len(base_model.layers) * 0.3)]:
        layer.trainable = False
    
    # Add global pooling and temporary classification layers for fine-tuning
    pooled_features = GlobalAveragePooling2D()(base_model.output)
    x = Dense(512, activation='relu', kernel_regularizer=l2(0.04), name='fc1')(pooled_features)
    x = BatchNormalization(name='bn1')(x)
    x = Dropout(0.1, name='dropout1')(x)
    x = Dense(256, activation='relu', kernel_regularizer=l2(0.04), name='fc2')(x)
    x = BatchNormalization(name='bn2')(x)
    x = Dropout(0.1, name='dropout2')(x)
    x = Dense(128, activation='relu', kernel_regularizer=l2(0.04), name='fc3')(x)
    x = BatchNormalization(name='bn3')(x)
    x = Dropout(0.1, name='dropout3')(x)
    output = Dense(1, activation='sigmoid', name='output')(x)
    model = Model(inputs=base_model.input, outputs=output)

    # Initialize callback functions.
    # Recall must be maximised: with the default mode='auto' a monitor named 'val_recall' can be
    # treated as a quantity to minimise, which would stop on (and restore) the worst epoch.
    early_stopping = EarlyStopping(monitor='val_recall', mode='max', patience=8, restore_best_weights=True)
    # min_lr has to sit below the starting learning rate (1e-5), otherwise the rate can never be reduced
    reduce_lr = ReduceLROnPlateau(monitor='val_recall', mode='max', factor=0.1, patience=5, min_lr=1e-7)
    
    # Compile and train the model
    model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', 
                  metrics=[keras.metrics.TruePositives(name='true_positives'), keras.metrics.Recall(name='recall'), 
                           keras.metrics.SpecificityAtSensitivity(sensitivity=0.8, name='specificity_at_sensitivity'),
                           keras.metrics.BinaryCrossentropy(name='BinaryCrossentropy')])
    history = model.fit(train_generator, validation_data=val_generator, epochs=20, callbacks=[early_stopping, reduce_lr])

    # Use the fine-tuned backbone to extract image features. Pooling happens inside the extractor so
    # predict() returns one vector per image instead of the full spatial feature maps.
    feature_extractor = Model(inputs=base_model.input, outputs=pooled_features)
    train_features = feature_extractor.predict(train_generator, verbose=1)
    val_features = feature_extractor.predict(val_generator, verbose=1)
    # NOTE(review): rows follow the order in which each generator yields its images. A generator that
    # shuffles or augments produces features that cannot be matched back to dataframe rows - confirm
    # the ordering before joining these features with the metadata.
    images_features = np.vstack([train_features, val_features])

    return images_features, history, base_model

def plot_history(history, fig_name):
    """
    Plots the training and validation performance across epochs for every tracked metric

    :param history: the History object returned by ``model.fit``; its ``history`` dict maps metric
                    names to per-epoch values
    :param fig_name: the file name (or path) the finished figure is saved to
    """
    # Only metrics that also have a 'val_' counterpart can be drawn as a training/validation pair;
    # this also skips any extra history keys (e.g. a learning-rate entry) without relying on key order
    all_metrics = [key for key in history.history if f'val_{key}' in history.history]
    num_plots = len(all_metrics)

    # Two charts per row; subplot() needs an integer row count, so round up with integer division
    num_rows = max(1, (num_plots + 1) // 2)
    plt.figure(figsize=(18, 16))
    
    # Plot training & validation performance for every metric
    for p, metric in enumerate(all_metrics):
        plt.subplot(num_rows, 2, p + 1)
        plt.plot(history.history[metric], label=f'Training {metric}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric}')
        plt.title(f'Training and Validation {metric}')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.legend()
        
    # Save the figure
    plt.savefig(fig_name)

In [None]:
# Build the undersampled metadata frame together with the training/validation image generators
skin_cancer_df, train_generator, val_generator = load_data(undersample_strat=0.07)

# Fine-tune ResNet50 on the images and collect the extracted feature matrix
images_features, resnet_history, feature_extractor = extract_image_features(
    train_generator=train_generator,
    val_generator=val_generator)

# Persist the feature matrix without an index column or header row
resnet_features_frame = pd.DataFrame(images_features)
resnet_features_frame.to_csv('resnet50_features.csv', header=False, index=False)

# Chart the ResNet50 training curves and save the figure
plot_history(history=resnet_history, fig_name='ResNet50 Performance')

Found 4204 validated image filenames belonging to 2 classes.
Found 1803 validated image filenames belonging to 2 classes.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

# Exploratory Data Analysis of Metadata by Lesion Type (Cancer vs Non-Cancer)

### How balanced is the data?

In [None]:
# Split the metadata by label; both subsets are reused by the later EDA cells
target_values = skin_cancer_df['target']
not_cancer = skin_cancer_df[target_values == 0]
cancer = skin_cancer_df[target_values == 1]
print(f'Out of the {len(skin_cancer_df)} lesions in our dataset, {len(not_cancer)} are not cancerous and {len(cancer)} are cancerous.')

# Show the share of each class as a pie chart
group_sizes = {'Not Cancer': len(not_cancer), 'Cancer': len(cancer)}
fig, ax = plt.subplots()
ax.pie(list(group_sizes.values()), labels=list(group_sizes.keys()), autopct='%1.1f%%')
ax.set_title('Proportion of Cancerous vs Non-Cancerous Lesions')
plt.show()

#### The data is heavily imbalanced, with almost all available lesions being non-cancerous. This characteristic of the data is our primary motivator for utilizing anomaly detection rather than binary classification as our method for cancer detection.

### Do men and women make up different proportions of cancerous vs non-cancerous lesions?

In [None]:
# Obtain the frequencies of each sex for cancerous and non-cancerous lesions.
# Missing values are relabelled 'NA' first: looking a Counter up by np.nan only succeeds when the
# stored key is the very same NaN object, so missing entries could otherwise be reported as zero.
gender_freqs_cancer = Counter(cancer['sex'].fillna('NA'))
gender_freqs_noncancer = Counter(not_cancer['sex'].fillna('NA'))

# Visualize the frequencies (a Counter returns 0 for categories that never occur)
sex_labels = ['male', 'female', 'NA']
fig, ax = plt.subplots(1,2)
ax[0].pie([gender_freqs_noncancer[label] for label in sex_labels],
       labels=sex_labels, autopct='%1.1f%%')
ax[0].set_title('Non-Cancerous Patients')
ax[1].pie([gender_freqs_cancer[label] for label in sex_labels],
       labels=sex_labels, autopct='%1.1f%%')
ax[1].set_title('Cancerous Patients')
plt.show()

#### Men are more represented among cancerous lesions than among non-cancerous lesions, which aligns with the notion that men are more likely to develop skin cancer.

### Is there a significant difference in the age distribution for cancerous vs non-cancerous patients?

In [None]:
# Drop missing ages up front: NaNs would propagate through the Mann-Whitney U test and yield a NaN
# p-value, which the comparison below would silently report as "no significant difference"
cancer_ages = cancer['age_approx'].dropna()
not_cancer_ages = not_cancer['age_approx'].dropna()

# Visualize the age distributions
plt.hist(cancer_ages, histtype='step', color='red', density=True, label='Cancerous')
plt.hist(not_cancer_ages, histtype='step', color='green', density=True, label='Non-Cancerous')
plt.legend()
plt.xlabel('Age Approximations')
plt.ylabel('Probability')
plt.title('Age Distribution of Cancerous vs Non-Cancerous Patients')
plt.show()

# Perform the Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(cancer_ages, not_cancer_ages)

# Print the result
print(f'Mann-Whitney U test: U-stat = {u_stat}, p-value = {p_value}')

# Interpretation
if p_value < 0.05:
    print('There is a significant difference in the age distribution between cancerous and non-cancerous patients.')
else:
    print('There is no significant difference in the age distribution between cancerous and non-cancerous patients.')

### Summary Statistics

In [None]:
# Metadata columns compared between the two groups (chosen to align with the ABCD factors used for
# skin cancer detection)
use_cols = [
    'tbp_lv_symm_2axis',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'clin_size_long_diam_mm',
]

# Summary statistics of those columns for the cancerous lesions
cancer.loc[:, use_cols].describe()

In [None]:
# Summary statistics of the same columns for the non-cancerous lesions
not_cancer.loc[:, use_cols].describe()

In [None]:
# Use the Mann-Whitney U test to determine if any of these differences are significant
for col in use_cols:
    # Missing values are dropped so a NaN p-value cannot be misreported as "no significant difference"
    u_stat, p_value = stats.mannwhitneyu(cancer[col].dropna(), not_cancer[col].dropna())
    if p_value < 0.05:
        print(f'There is a significant difference in {col} between cancerous and non-cancerous patients.')
    else:
        print(f'There is no significant difference in {col} between cancerous and non-cancerous patients.')

### Null Values

In [None]:
# Report, for each group, every column that contains null values. The percentage is computed
# against the size of the group being reported and is scaled to 0-100 to match the "As a %" label.
null_report_groups = (
    ('Columns with null values for non-cancerous patients:', not_cancer),
    ('\nColumns with null values for cancerous patients:', cancer),
)
for heading, group in null_report_groups:
    print(heading)
    null_counts = group.isnull().sum()
    for column, num_nulls in null_counts[null_counts > 0].items():
        print(f'Column: {column}, No. of Nulls: {num_nulls}, As a %: {round(num_nulls / len(group) * 100, 2)}')

# Data Preprocessing

In [None]:
# Obtain the categorical (nominal) features
skin_cancer_prepro = skin_cancer_df.copy()
categorical_features = skin_cancer_prepro.select_dtypes(include=['object', 'category', 'string']).columns.tolist()

# Impute and encode values in categorical columns
encoded_frames = []
for feature in categorical_features:
    
    # Impute null values in categorical features with the mode
    skin_cancer_prepro[feature] = skin_cancer_prepro[feature].fillna(skin_cancer_prepro[feature].mode()[0])
    
    # Apply one-hot encoding to categorical (nominal) variables
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_feature = encoder.fit_transform(skin_cancer_prepro[[feature]])
    
    # Collect the encoded columns; they are appended to the dataframe in one go after the loop
    encoded_col_names = [f"{feature}_{cat}" for cat in encoder.categories_[0]]
    encoded_frames.append(pd.DataFrame(encoded_feature, columns=encoded_col_names, index=skin_cancer_prepro.index))
    
# Replace the unencoded categorical columns with their one-hot encoded versions
skin_cancer_prepro = pd.concat([skin_cancer_prepro.drop(columns=categorical_features)] + encoded_frames, axis=1)
updated_cols = skin_cancer_prepro.columns
 
# Use KNN to impute null values in the numerical columns.
# The target is held out of the imputer so the label cannot influence which neighbours are chosen
# (and therefore cannot leak into the imputed feature values).
feature_cols = [col for col in updated_cols if col != 'target']
imputer = KNNImputer(n_neighbors=5)
imputed_array = imputer.fit_transform(skin_cancer_prepro[feature_cols])
imputed_features = pd.DataFrame(imputed_array, columns=feature_cols, index=skin_cancer_prepro.index)

# Re-attach the target as floats in its original column position
imputed_features['target'] = skin_cancer_prepro['target'].astype(float)
skin_cancer_prepro = imputed_features[updated_cols]

# Feature Engineering

In [None]:
def create_features(df):
    """
    Creates new features to help the model evaluate the ABCD factors used by dermatologists

    The input dataframe is left untouched; all new columns are added to a copy.

    :param df: a dataframe to add new features to
    :return: a copy of the input dataframe with the engineered features added and the raw lesion
             coordinates (``tbp_lv_x``, ``tbp_lv_y``, ``tbp_lv_z``) removed
    """
    df = df.copy()  # never mutate the caller's dataframe
    og_cols = len(df.columns)
    # A - Asymmetry, Border irregularity/bluriness, and Diameter (skin cancer diameter usually > 6 mm)
    df['diameter_ratio'] = df['tbp_lv_minorAxisMM'] / df['clin_size_long_diam_mm']
    # Gap between the area of a circle with the lesion's longest diameter and the measured lesion
    # area; both terms are areas so the difference is dimensionally consistent
    df['area_irregularity'] = np.abs((np.pi * (df['clin_size_long_diam_mm'] / 2)**2) - df['tbp_lv_areaMM2'])
    # Gap between the circumference of that same circle and the measured perimeter
    df['perimeter_irregularity'] = np.abs((np.pi * df['clin_size_long_diam_mm']) - df['tbp_lv_perimeterMM'])
    df['area_perimeter_ratio'] = df['tbp_lv_areaMM2'] / (df['tbp_lv_perimeterMM'] ** 2)
    df['large_diameter'] = (df['clin_size_long_diam_mm'] > 6).astype(int) # skin cancer diameters tend to be larger than 6 mm
    df['perimeter_to_area'] = (df['tbp_lv_perimeterMM']**2) / df['tbp_lv_areaMM2']
    df['avg_normalized_irregularity'] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"]) / 2
    
    # Color (variation)  
    df['hc_mean_contrast'] = ((df['tbp_lv_H'] + df['tbp_lv_Hext']) / 2) + ((df['tbp_lv_C'] + df['tbp_lv_Cext']) / 2)
    df['tbp_lv_deltaH'] = np.abs(df['tbp_lv_H'] - df['tbp_lv_Hext'])
    # A delta is a difference between the two values, matching tbp_lv_deltaH above
    df['tbp_lv_deltaC'] = np.abs(df['tbp_lv_C'] - df['tbp_lv_Cext'])
    df['overall_lab_contrast'] = np.sqrt(df['tbp_lv_deltaL']**2 + df['tbp_lv_deltaA']**2 + df['tbp_lv_deltaB']**2)
    df['large_color_variance'] = (df['tbp_lv_color_std_mean'] > 4).astype(int)
    df['average_lab_contrast'] = (df['tbp_lv_deltaL'] + df['tbp_lv_deltaA'] + df['tbp_lv_deltaB']) / 3
    
    # Features to maximize other features
    df['lesion_location'] = np.sqrt(df['tbp_lv_x']**2 + df['tbp_lv_y']**2 + df['tbp_lv_z']**2) # l2 norm of lesion coordinates
    # Counted before the raw coordinates are dropped so only the additions are reported
    print(f'Created {len(df.columns) - og_cols} New Features During Feature Engineering')
    df = df.drop(['tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z'], axis=1)

    return df

# Run feature engineering on a copy so the preprocessed dataframe itself stays unchanged
prepro_snapshot = skin_cancer_prepro.copy()
skin_cancer_enhanced = create_features(prepro_snapshot)

In [None]:
# Display the feature-engineered dataframe to inspect the newly created columns
skin_cancer_enhanced

# Feature Importances

In [None]:
# Calculate feature importances using mutual information classification
fs = SelectKBest(score_func=mutual_info_classif, k='all')
skin_cancer_array = fs.fit_transform(skin_cancer_enhanced.drop(['target'], axis=1), skin_cancer_enhanced['target'])

In [None]:
# Draw one bar per feature, labelled with the feature's column name
use_cols = skin_cancer_enhanced.drop(['target'], axis=1).columns
bar_labels = [use_cols[position] for position, _ in enumerate(fs.scores_)]
plt.figure(figsize=(15,5))
plt.bar(bar_labels, fs.scores_)
plt.xticks(rotation='vertical')
plt.show()

# Aggregating Image Features and Metadata

In [None]:
# Combine enhanced metadata and image features into one data set
images_features = pd.DataFrame(images_features)

# The join below is purely positional, so a row-count mismatch would silently pad one side with NaNs
if len(images_features) != len(skin_cancer_enhanced):
    raise ValueError(
        f'Cannot combine {len(images_features)} image feature rows with '
        f'{len(skin_cancer_enhanced)} metadata rows')

# NOTE(review): row i of images_features is assumed to describe the same lesion as row i of
# skin_cancer_enhanced. The image features are stacked as training rows followed by validation
# rows, which does not look like the metadata's row order - confirm the alignment.
skin_cancer_full = pd.concat([skin_cancer_enhanced.reset_index(drop=True), images_features], axis=1)
skin_cancer_full.columns = skin_cancer_full.columns.astype(str)

# Isolation Forest

In [None]:
# Best (n_estimators, contamination, f1 score, classification report) found so far
best = (0, 0, 0, None)

# Train on 60% of the non-cancerous lesions; validate on the remaining 40% plus every cancerous lesion
target_column = skin_cancer_full['target']
cancer = skin_cancer_full[target_column == 1]
no_cancer = skin_cancer_full[target_column == 0]
training, val_no_cancer = train_test_split(no_cancer, test_size=0.40, random_state=42)
validation = pd.concat([val_no_cancer, cancer], axis=0)
val_counter = Counter(validation['target'])
print(f'Validation Data: {val_counter[1.0]} cancer, {val_counter[0.0]} not cancer, {round(val_counter[1.0] / val_counter[0.0]*100, 3)}%')

# Fit the 0-1 scaling on the training features only, then apply it to the validation features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(training.drop('target', axis=1))
X_test = scaler.transform(validation.drop('target', axis=1))

# Grid search over the contamination rate (0.20 to 0.50) and the number of trees (70 to 140)
for c in range(20, 51, 1):
    contamination = c*0.01
    print(contamination)
    for estimators in range(70, 150, 10):
        # Fit an Isolation Forest on the non-cancerous training lesions
        isf = IsolationForest(n_estimators=estimators, contamination=contamination, random_state=42)
        scores_prediction = isf.fit(X_train)

        # The forest labels outliers as -1; map those to the positive (cancer) class
        preds = isf.predict(X_test)
        y_preds = [int(p == -1) for p in preds]

        # Read the f1 score of the cancer class out of the text report
        cr = classification_report(validation['target'], y_preds)
        f1_score = float(cr.split()[12])

        # Keep going unless this configuration beats the best score so far
        if not f1_score > best[2]:
            continue
        best = (estimators, contamination, f1_score, cr)
        print(f'\nEstimators: {estimators}, C: {c*0.01}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:3], '\n', best[3])

# Autoencoder

In [None]:
@keras.saving.register_keras_serializable(name="weighted_bincrossentropy")
class WeightedBinaryCrossentropy(tf.keras.losses.Loss):
    """
    A custom loss function to address class imbalance

    Binary crossentropy in which terms whose true value is 1 are scaled by ``weight_one`` and terms
    whose true value is 0 are scaled by ``weight_zero``.
    """
    def __init__(self, weight_zero=1.0, weight_one=2.0, **kwargs):
        """
        :param weight_zero: multiplier applied to the loss where the true value is 0
        :param weight_one: multiplier applied to the loss where the true value is 1
        :param kwargs: forwarded unchanged to ``tf.keras.losses.Loss``
        """
        super().__init__(**kwargs)
        self.weight_zero = tf.constant(weight_zero, dtype=tf.float32)
        self.weight_one = tf.constant(weight_one, dtype=tf.float32)

    def call(self, y_true, y_pred):
        """
        Returns the weighted binary crossentropy between the true values and the predictions

        :param y_true: tensor of true values
        :param y_pred: tensor of predicted probabilities
        :return: the weighted crossentropy averaged over the last axis
        """
        # Ensure types are compatible and keep predictions away from 0 and 1 so log() stays finite
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)

        # Compute weighted binary crossentropy
        bin_crossentropy = -(y_true * tf.math.log(y_pred) + (1. - y_true) * tf.math.log(1. - y_pred))
        weights = y_true * self.weight_one + (1. - y_true) * self.weight_zero
        weighted_bin_crossentropy = weights * bin_crossentropy

        # Average over the last axis only and leave the remaining reduction to the base Loss class,
        # so its reduction setting and any sample weights are applied to per-sample values
        return tf.reduce_mean(weighted_bin_crossentropy, axis=-1)

    def get_config(self):
        """
        Returns the configuration needed to re-create this loss, with the weights as plain floats
        """
        config = super().get_config()
        config.update({"weight_zero": float(self.weight_zero.numpy()), "weight_one": float(self.weight_one.numpy())})
        return config

In [None]:
# Best (dropout multiplier, threshold, best epoch, f1 score, classification report) across ALL
# dropout settings; initialised once so an earlier, better configuration is not overwritten
best = (0,0,0,0, None)

# Separate data into training and validation - train only on normal data and validate on mixed data
cancer = skin_cancer_full[skin_cancer_full['target'] == 1]
no_cancer = skin_cancer_full[skin_cancer_full['target'] == 0]
training, val_no_cancer = train_test_split(no_cancer, test_size=0.4, random_state=42)
validation = pd.concat([val_no_cancer, cancer], axis=0)

# Scale data between 0 and 1. The scaler is fitted on the training data only and then re-used for
# the validation data, so both sets share one scale and validation statistics never leak in.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(training.drop('target', axis=1))
X_val = scaler.transform(validation.drop('target', axis=1))

# Construct and evaluate one autoencoder per dropout setting (rates 0.2 and 0.4)
for d in range(2, 6, 2):
    autoencoder = Sequential([
        # Encoder
        Dense(1024, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn1'),
        Dropout(d * 0.1),  
        Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn2'),
        Dropout(d * 0.1),  
        Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn3'),
        Dropout(d * 0.1),  
        Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn4'),
        
        # Bottleneck (increased capacity)
        Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        
        # Decoder
        Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn5'),
        Dropout(d * 0.1),
        Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn6'),
        Dropout(d * 0.1),  
        Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        BatchNormalization(name='bn7'),
        Dense(1024, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
        
        # Output Layer
        Dense(X_train.shape[1], activation='sigmoid')
    ])

    loss = WeightedBinaryCrossentropy(weight_zero=1.0, weight_one=20.0)
    autoencoder.compile(optimizer=Adam(learning_rate=0.0001), loss=loss, metrics = [keras.metrics.Precision(name='precision'), keras.metrics.Recall(name='recall'), 
                                                                                    keras.metrics.TruePositives(name='true_positives'), keras.metrics.AUC(name='auc')])
    # Monitor the reconstruction loss (lower is better), the same quantity used below to pick the
    # best epoch. A monitor named 'val_recall' with the default mode can be treated as "minimise".
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True)
    # min_lr must be below the starting learning rate (1e-4) or the rate can never be reduced
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.1, patience=5, min_lr=1e-6)

    # Train the autoencoder using only the non-cancerous patients
    autoencoder_history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr])

    # Find the epoch with the lowest validation loss
    best_epoch = np.argmin(autoencoder_history.history['val_loss']) + 1
    best_val_loss = np.min(autoencoder_history.history['val_loss'])

    # Calculate reconstruction error (mean absolute error per row) for each validation sample
    reconstructed = autoencoder.predict(X_val)
    reconstruction_error = np.mean(np.abs(reconstructed - X_val), axis=1)
    
    # Identify the optimal threshold that maximizes the distance between TPR and FPR
    fpr, tpr, thresholds = roc_curve(validation['target'], reconstruction_error)
    optimal_threshold = thresholds[np.argmax(tpr - fpr)]
    
    # Samples reconstructed worse than the threshold are flagged as cancer
    predictions = (reconstruction_error > optimal_threshold).astype(int)
        
    # Read the f1 score of the cancer class out of the text report
    cr = classification_report(validation['target'], predictions)
    print(cr)
    f1_score = float(cr.split()[12])  
    
    if f1_score > best[3]:
        best = (d, optimal_threshold, best_epoch, f1_score, cr)

# NOTE: reconstruction_error, predictions and autoencoder_history still hold the values of the LAST
# dropout setting tried, which is not necessarily the best one reported below
print('Overall Best (d, tresh, best_epoch, f1_score, cr)')
print(best[:4])
print(best[4])

## Autoencoder Performance

In [None]:
# Plot the autoencoder predictions against the true values
plt.figure()
colors = ['red' if p == 1.0 else 'green' for p in predictions]
plt.title('Reconstruction Errors and Predictions against True Targets')
plt.scatter(reconstruction_error, validation['target'], color=colors)
plt.xlabel('Reconstruction Error')
plt.ylabel('True Targets (1 - cancer, 0 - not cancer)')
red_patch = mpatches.Patch(color='red', label='Cancer Prediction')
green_patch = mpatches.Patch(color='green', label='Benign Prediction')
plt.legend(handles=[red_patch, green_patch])
plt.show()

In [None]:
# ROC curve and AUC, using the reconstruction error as the anomaly score
fpr, tpr, thresholds = roc_curve(validation['target'], reconstruction_error)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the chance diagonal
roc_label = 'ROC curve (AUC = %0.2f)' % roc_auc
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Autoencoder Receiver Operating Characteristic (ROC)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Plot the training/validation curves of the most recently trained autoencoder and save the
# figure under the name 'Autoencoder Performance'
plot_history(autoencoder_history, 'Autoencoder Performance')

# Simple Binary Classification Models

## Logistic Regression

In [None]:
# Setting the class weight to balanced increases sensitivity (recall) by double!
lr = LogisticRegression(random_state=42, max_iter=800, class_weight='balanced')

# Hold out 20% of the data, then split that hold-out into validation (90%) and test (10%)
X_train, X_rest, y_train, y_rest = train_test_split(skin_cancer_full.drop('target', axis=1), skin_cancer_full['target'], test_size=0.2, 
                                                    stratify=skin_cancer_full['target'], random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.1, stratify=y_rest, random_state=42)

# Train the model and predict on every evaluation split
lr.fit(X_train, y_train)
val_preds = lr.predict(X_val)
test_preds = lr.predict(X_test)
rest_preds = lr.predict(X_rest)

# Print performance on the full hold-out (validation and test combined); labelled separately so
# it is not mistaken for the validation-only report below
rest_counter = Counter(y_rest)
print(f'Held-out (validation + test) Data: {rest_counter[1.0]} cancer, {rest_counter[0.0]} not cancer, {round(rest_counter[1.0] / rest_counter[0.0]*100, 3)}%')
print('Held-out (validation + test) Performance:\n', classification_report(y_rest, rest_preds))

# Print validation data performance
val_counter = Counter(y_val)
print(f'Validation Data: {val_counter[1.0]} cancer, {val_counter[0.0]} not cancer, {round(val_counter[1.0] / val_counter[0.0]*100, 3)}%')
print('Validation Performance:\n', classification_report(y_val, val_preds))

# Print test data performance
test_counter = Counter(y_test)
print(f'\nTest Data: {test_counter[1.0]} cancer, {test_counter[0.0]} not cancer, {round(test_counter[1.0] / test_counter[0.0]*100, 3)}%')
print('Test Performance:\n', classification_report(y_test, test_preds))

## CatBoostClassifier

In [None]:
# Initialize CatBoostClassifier (the cancer class is weighted twice as heavily as the benign class)
catboost_model = CatBoostClassifier(random_state=42,class_weights=[1, 2], iterations=1000, verbose=200)

# Split the data: hold out 20%, then split that hold-out into validation (95%) and test (5%)
X_train, X_rest, y_train, y_rest = train_test_split(skin_cancer_full.drop('target', axis=1), skin_cancer_full['target'], 
                                                    test_size=0.2, stratify=skin_cancer_full['target'], random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.05, stratify=y_rest, random_state=42)

# Train the CatBoost model and utilize it for predictions
catboost_model.fit(X_train, y_train)
val_preds = catboost_model.predict(X_val)
test_preds = catboost_model.predict(X_test)
rest_preds = catboost_model.predict(X_rest)

# Print performance on the full hold-out (validation and test combined); labelled separately so
# it is not mistaken for the validation-only report below
rest_counter = Counter(y_rest)
print(f'Held-out (validation + test) Data: {rest_counter[1.0]} cancer, {rest_counter[0.0]} not cancer, {round(rest_counter[1.0] / rest_counter[0.0]*100, 3)}%')
print('Held-out (validation + test) Performance:\n', classification_report(y_rest, rest_preds))

# Print validation data performance
val_counter = Counter(y_val)
print(f'Validation Data: {val_counter[1.0]} cancer, {val_counter[0.0]} not cancer, {round(val_counter[1.0] / val_counter[0.0]*100, 3)}%')
print('Validation Performance:\n', classification_report(y_val, val_preds))

# Print test data performance
test_counter = Counter(y_test)
print(f'\nTest Data: {test_counter[1.0]} cancer, {test_counter[0.0]} not cancer, {round(test_counter[1.0] / test_counter[0.0]*100, 3)}%')
print('Test Performance:\n', classification_report(y_test, test_preds))

# Autoencoder and CatBoost Classifier

In [None]:
# Use the autoencoder's reconstruction error as the single input feature of a CatBoost classifier
error_feature = reconstruction_error.reshape(-1, 1)
error_labels = validation['target']

# Initialize the CatBoost model and split the reconstruction errors into training and testing
catboost_model = CatBoostClassifier(random_state=42,class_weights=[1, 2], iterations=1000, verbose=200)
X_train, X_test, y_train, y_test = train_test_split(
    error_feature,
    error_labels,
    test_size=0.2,
    stratify=error_labels,
    random_state=42)

# Fit on the training errors and report performance on the held-out testing errors
catboost_model.fit(X_train, y_train)
preds = catboost_model.predict(X_test)
print(classification_report(y_test, preds))

# Conclusion

Binary classification models outperform anomaly detection models. This likely stems from the fact that anomaly detection algorithms only succeed when normal data looks significantly different from anomalies, yet in the real world, cancerous and non-cancerous lesions can often appear identical, with very similar features. Doctors frequently attest to this trait of skin lesions, as many admit that some of the lesions they believed to be non-cancerous based on appearance were actually cancerous. Unlike anomaly detection algorithms, many binary classification models do not require stark differences in the appearance of lesions.