In [3]:
# To do:

"""
1) PCA after feature engineering
2) figure out how to feature engineer with images (rotations, crops, etc.)
3) figure out how to combine image features with metadata
4) load images and metadata into autoencoder and get predictions
"""

'\n1) PCA after feature engineering\n2) loading images and metadata into autoencoder\n'

In [1]:
# Import libraries
import os
import zipfile
import numpy as np
import pandas as pd
from io import BytesIO
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model, Sequential
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.inspection import permutation_importance

In [2]:
# Define image directories
train_image_dir = "train-image/image"

# Load in the metadata
zip_folder = zipfile.ZipFile('anon-patient-data.zip')
skin_cancer_df = pd.read_csv(zip_folder.open('train-metadata.csv'), low_memory=False, 
                            usecols=[num for num in range(0, 43) if num not in [2, 7]], index_col='isic_id')

# Initialize the ResNet model
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Load images and extract features using ResNet50 and batch procesing
def extract_features(img):
    """
    Extracts features from a given image
    :param img: an image instance in PIL format
    :return: the image features
    """
    # Preprocess the image to align with ResNet50 requirements
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array) 
    
    # Extract features
    features = resnet_model.predict(img_array)
    return features.flatten()

def feature_batch_generator(zip_file, img_size=(224, 224), batch_size=32):
    """
    Loads images and extracts feautres with batch processing and a ResNet50 model
    """
    # Indetify all the image files from the zipped folder
    with zipfile.ZipFile(zip_file, 'r') as z:
        files = [file for file in z.namelist() if file.startswith('image/') and file.endswith('.jpg')]
        # Employ batch processing to load images and extract features
        for i in range(0, len(files), batch_size):
            batch_files = files[i:i + batch_size]
            features = []
            for file in batch_files:
                with z.open(file) as img_file:
                    img = load_img(BytesIO(img_file.read()), target_size=img_size)
                    img_features = extract_features(img)
                    features.append(img_features)
            # Return the features of the images in the batch
            yield np.array(features)

# Data Preprocessing

In [3]:
# Obtain the categorical (nominal) features
categorical_features = skin_cancer_df.select_dtypes(include=['object', 'category', 'string']).columns.tolist()

# Impute and encode values in categorical columns
updated_features = []
for feature in categorical_features:
    
    # Impute null values in categorical features with the mode
    skin_cancer_df[feature] = skin_cancer_df[feature].fillna(skin_cancer_df[feature].mode()[0])
    
    # Apply one-hot encoding to categorical (nominal) variables
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_feature = encoder.fit_transform(skin_cancer_df[[feature]])
    
    # Add the encoded columns to the dataframe
    encoded_col_names = [f"{feature}_{cat}" for cat in encoder.categories_[0]]
    encoded_feature_df = pd.DataFrame(encoded_feature, columns=encoded_col_names, index=skin_cancer_df.index)
    skin_cancer_df = pd.concat([skin_cancer_df, encoded_feature_df], axis=1)
    updated_features += encoded_col_names
    
# Remove unencoded categorical columns
skin_cancer_df = skin_cancer_df.drop(columns=categorical_features)
updated_cols = skin_cancer_df.columns
 
# Use KNN to impute null values in the numerical columns
imputer = KNNImputer(n_neighbors=5)
imputed_array = imputer.fit_transform(skin_cancer_df)
skin_cancer_df = pd.DataFrame(imputed_array, columns=updated_cols, index=skin_cancer_df.index)

# Feature Engineering

In [13]:
def create_features(df):
    """
    Create new features to help the model evaluate the ABCD factors used by dermatologists
    """
    # A - Asymmetry, Border irregularity/bluriness, and Diameter (skin cancer diameter usually > 6 mm)
    df['diameter_ratio'] = df['tbp_lv_minorAxisMM'] / df['clin_size_long_diam_mm']
    df['area_irregularity'] = np.abs((np.pi * (df['clin_size_long_diam_mm'] / 2)**2) - (df['tbp_lv_areaMM2'])**(1/2))
    df['perimeter_irregularity'] = np.abs((np.pi * df['clin_size_long_diam_mm']) - df['tbp_lv_perimeterMM'])
    df['area_perimeter_ratio'] = df['tbp_lv_areaMM2'] / (df['tbp_lv_perimeterMM'] ** 2)
    df['large_diameter'] = [1 if val > 5.5 else 0 for val in df['clin_size_long_diam_mm']]
    df['perimeter_to_area'] = (df['tbp_lv_perimeterMM']**2) / df['tbp_lv_areaMM2']
    df['avg_normalized_irregularity'] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"]) / 2
    
    # Color (variation)  
    df['hc_mean_contrast'] = ((df['tbp_lv_H'] + df['tbp_lv_Hext']) / 2) + ((df['tbp_lv_C'] + df['tbp_lv_Cext']) / 2)
    df['tbp_lv_deltaH'] = np.abs(df['tbp_lv_H'] - df['tbp_lv_Hext'])
    df['tbp_lv_deltaC'] = np.abs(df['tbp_lv_C'] + df['tbp_lv_Cext'])
    df['overall_lab_contrast'] = np.sqrt(df['tbp_lv_deltaL']**2 + df['tbp_lv_deltaA']**2 + df['tbp_lv_deltaB']**2)
    df['large_color_variance'] = [1 if val > 4 else 0 for val in df['tbp_lv_color_std_mean']]
    df['average_lab_contrast'] = (df['tbp_lv_deltaL'] + df['tbp_lv_deltaA'] + df['tbp_lv_deltaB']) / 3
    
    # Features to maximize other features
    df['lesion_location'] = np.sqrt(df['tbp_lv_x']**2 + df['tbp_lv_y']**2 + df['tbp_lv_z']**2)
    df = df.drop(['tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z'], axis=1)

    return df

# Apply feature engineering
skin_cancer_enhanced = create_features(skin_cancer_df.copy())

# Add column containing image paths
skin_cancer_enhanced['image_path'] = pd.Series(skin_cancer_enhanced.index).apply(lambda x:
                                    train_image_dir + str(x) + '.jpg').values

In [None]:
# maybe
@njit
def calculate_color_uniformity(color_std_mean, radial_color_std_max):
    return np.where(radial_color_std_max == 0, 0, color_std_mean / radial_color_std_max)


def create_new_features(df):
    # Create new features
    df["color_uniformity"] = calculate_color_uniformity(df["tbp_lv_color_std_mean"].values, 
                                                        df["tbp_lv_radial_color_std_max"].values)
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2

    return df

In [24]:
from numba import vectorize, njit, jit

# Numba-compatible functions
@njit
def calculate_lesion_size_ratio(minor_axis, long_diam):
    return np.where(long_diam <= 0.05, 0, minor_axis / long_diam)

@njit
def calculate_lesion_shape_index(area, perimeter):
    return area / (perimeter ** 2)

@vectorize(['float64(float64, float64)'], target='parallel')
def calculate_hue_contrast(h_in, h_out):
    return np.abs(h_in - h_out)

@vectorize(['float64(float64, float64)'], target='parallel')
def calculate_luminance_contrast(l_in, l_out):
    return np.abs(l_in - l_out)

@vectorize(['float64(float64, float64, float64)'], target='parallel')
def calculate_lesion_color_difference(delta_a, delta_b, delta_l):
    return np.sqrt(delta_a ** 2 + delta_b ** 2 + delta_l ** 2)

@njit
def calculate_border_complexity(norm_border, symm_2axis):
    return norm_border + symm_2axis

@njit
def calculate_color_uniformity(color_std_mean, radial_color_std_max):
    return np.where(radial_color_std_max == 0, 0, color_std_mean / radial_color_std_max)

@vectorize(['float64(float64, float64, float64)'], target='parallel')
def calculate_3d_position_distance(x, y, z):
    return np.sqrt(x ** 2 + y ** 2 + z ** 2)

@njit
def calculate_perimeter_to_area_ratio(perimeter, area):
    return perimeter / area

@njit
def calculate_lesion_visibility_score(deltaLBnorm, norm_color):
    return deltaLBnorm + norm_color

def create_new_features(df):
    # Create new features
    df["lesion_size_ratio"] = calculate_lesion_size_ratio(df["tbp_lv_minorAxisMM"].values, 
                                                          df["clin_size_long_diam_mm"].values)
    df["lesion_shape_index"] = calculate_lesion_shape_index(df["tbp_lv_areaMM2"].values, df["tbp_lv_perimeterMM"].values)
    df["hue_contrast"] = calculate_hue_contrast(df["tbp_lv_H"].values, df["tbp_lv_Hext"].values)
    df["luminance_contrast"] = calculate_luminance_contrast(df["tbp_lv_L"].values, df["tbp_lv_Lext"].values)
    df["lesion_color_difference"] = calculate_lesion_color_difference(df["tbp_lv_deltaA"].values, 
                                                                      df["tbp_lv_deltaB"].values,
                                                                      df["tbp_lv_deltaL"].values)
    df["border_complexity"] = calculate_border_complexity(df["tbp_lv_norm_border"].values, df["tbp_lv_symm_2axis"].values)
    df["color_uniformity"] = calculate_color_uniformity(df["tbp_lv_color_std_mean"].values, 
                                                        df["tbp_lv_radial_color_std_max"].values)
    df["3d_position_distance"] = calculate_3d_position_distance(df["tbp_lv_x"].values, df["tbp_lv_y"].values, 
                                                                df["tbp_lv_z"].values)
    df["perimeter_to_area_ratio"] = calculate_perimeter_to_area_ratio(df["tbp_lv_perimeterMM"].values, 
                                                                      df["tbp_lv_areaMM2"].values)
    df["lesion_visibility_score"] = calculate_lesion_visibility_score(df["tbp_lv_deltaLBnorm"].values, 
                                                                      df["tbp_lv_norm_color"].values)
    df["color_consistency"] = np.where(df["tbp_lv_Lext"] == 0, 0, df["tbp_lv_stdL"] / df["tbp_lv_Lext"])
    df["normalized_lesion_size"] = np.where(df["age_approx"] == 0, 0, df["clin_size_long_diam_mm"] / df["age_approx"])
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["eccentricity_squared"] = df["tbp_lv_eccentricity"] ** 2
    df["area_to_perimeter_ratio"] = df["tbp_lv_areaMM2"] / df["tbp_lv_perimeterMM"]
    df["circularity"] = (4 * np.pi * df["tbp_lv_areaMM2"]) / (df["tbp_lv_perimeterMM"] ** 2)  
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + 
                                         df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + 
                                        df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    return df

# Apply feature engineering
skin_cancer_enhanced = create_new_features(skin_cancer_df.copy())

# Add column containing image paths
skin_cancer_enhanced['image_path'] = pd.Series(skin_cancer_enhanced.index).apply(lambda x: 
                                               os.path.join(train_image_dir, str(x) + '.jpg')).values
skin_cancer_enhanced = skin_cancer_enhanced.drop(columns=['tbp_lv_Lext', 'tbp_lv_Cext'])

# PCA Feature Selection

# Isolation Forest - gives worse results than autoencoders

In [58]:
# Hyperparameter tuning - training on non-cancerous patientsw, w/o feature engineering
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(17, 23, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.drop('target', axis=1), skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len(y_resampled[y_resampled == 1]) / len(y_resampled)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the test data
        preds = isf.predict(X_test)
        y_preds = [1 if p == -1 else 0 for p in preds]

        # Evaluate the models performance on testing data
        cr = classification_report(y_resampled, y_preds)
        f1_score = float(cr.split()[12])
        if f1_score > best[2]:
            best = (ss, estimators, f1_score, cr)
        print(f'\nS.S.: {ss}, Estimators: {estimators}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])


S.S.: 0.17, Estimators: 50, f1_score: 0.43
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.37      0.50      0.43     68113

    accuracy                           0.80    468779
   macro avg       0.64      0.68      0.65    468779
weighted avg       0.83      0.80      0.82    468779


S.S.: 0.17, Estimators: 60, f1_score: 0.44
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.38      0.53      0.44     68113

    accuracy                           0.81    468779
   macro avg       0.65      0.69      0.66    468779
weighted avg       0.84      0.81      0.82    468779


S.S.: 0.17, Estimators: 70, f1_score: 0.43
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.38      0.51      0.43     68113

    accuracy                           0.81    468

In [15]:
# Hyperparameter tuning - training on non-cancerous patients, w feature engineering
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(17, 23, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_enhanced.drop(['target', 'image_path'], axis=1), 
                                                  skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len(y_resampled[y_resampled == 1]) / len(y_resampled)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the test data
        preds = isf.predict(X_test)
        y_preds = [1 if p == -1 else 0 for p in preds]

        # Evaluate the models performance on testing data
        cr = classification_report(y_resampled, y_preds)
        f1_score = float(cr.split()[12])
        if f1_score > best[2]:
            best = (ss, estimators, f1_score, cr)
        print(f'\nS.S.: {ss}, Estimators: {estimators}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])


S.S.: 0.17, Estimators: 50, f1_score: 0.42
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.37      0.49      0.42     68113

    accuracy                           0.80    468779
   macro avg       0.64      0.67      0.65    468779
weighted avg       0.83      0.80      0.81    468779


S.S.: 0.17, Estimators: 60, f1_score: 0.44
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.38      0.53      0.44     68113

    accuracy                           0.81    468779
   macro avg       0.65      0.69      0.66    468779
weighted avg       0.84      0.81      0.82    468779


S.S.: 0.17, Estimators: 70, f1_score: 0.43
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88    400666
         1.0       0.37      0.51      0.43     68113

    accuracy                           0.80    468

In [25]:
# Hyperparameter tuning - training on non-cancerous patients, w feature engineering and feature importance

# Sample data
X_full = skin_cancer_enhanced.drop(['target', 'image_path'], axis=1)
y_full = skin_cancer_df['target']
X_train = pd.concat([X_full[y_full == 1][:200], X_full[y_full == 0][:250]], ignore_index=True)
y_train = list(y_full[y_full == 1][:200].values) + list(y_full[y_full == 0][:250].values)
X_test = pd.concat([X_full[y_full == 1][200:], X_full[y_full == 0][250:450]], ignore_index=True)
y_test = list(y_full[y_full == 1][200:].values) + list(y_full[y_full == 0][250:450].values)   

# Scale the data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Determine important features
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns.to_list())
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns.to_list())
c = len([y for y in y_train if y == 1]) / len(y_train)
isf = IsolationForest(n_estimators=80, contamination=c, random_state=42)
isf.fit(X_train)
result = permutation_importance(isf, X_train, y_train, scoring='f1_weighted', n_repeats=10, random_state=42)
importance_means = result.importances_mean
feature_importances = dict(zip(X_train.columns.to_list(), importance_means))
sorted_importances = sorted(feature_importances.items(), key=lambda x:x[1], reverse=True)
use_features = [item[0] for item in sorted_importances if item[1] > 0]

# Use important features to hyperparameter tune
skin_cancer_enhanced_features = skin_cancer_enhanced.copy().drop(use_features, axis=1)

best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(17, 23, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_enhanced_features.drop(['target', 'image_path'], axis=1), 
                                                  skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len(y_resampled[y_resampled == 1]) / len(y_resampled)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the test data
        preds = isf.predict(X_test)
        y_preds = [1 if p == -1 else 0 for p in preds]

        # Evaluate the models performance 114on testing data
        cr = classification_report(y_resampled, y_preds)
        f1_score = float(cr.split()[12])
        if f1_score > best[2]:
            best = (ss, estimators, f1_score, cr)
        print(f'\nS.S.: {ss}, Estimators: {estimators}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])


S.S.: 0.17, Estimators: 50, f1_score: 0.29
              precision    recall  f1-score   support

         0.0       0.88      0.85      0.87    400666
         1.0       0.27      0.32      0.29     68113

    accuracy                           0.78    468779
   macro avg       0.58      0.59      0.58    468779
weighted avg       0.79      0.78      0.78    468779


S.S.: 0.17, Estimators: 60, f1_score: 0.3
              precision    recall  f1-score   support

         0.0       0.88      0.85      0.87    400666
         1.0       0.27      0.32      0.30     68113

    accuracy                           0.78    468779
   macro avg       0.58      0.59      0.58    468779
weighted avg       0.79      0.78      0.78    468779


S.S.: 0.17, Estimators: 70, f1_score: 0.3
              precision    recall  f1-score   support

         0.0       0.88      0.85      0.87    400666
         1.0       0.28      0.33      0.30     68113

    accuracy                           0.78    46877

In [103]:
from sklearn.tree import DecisionTreeClassifier

In [104]:
dt = DecisionTreeClassifier(random_state=42, criterion = 'gini')
dt.fit(X_train, y_train)
feature_importances_dt = pd.Series(dt.feature_importances_, index=X_train.columns).sort_values(ascending=False)

In [106]:
feature_importances_dt[feature_importances_dt > 0]

tbp_lv_H                                   0.234218
clin_size_long_diam_mm                     0.230615
tbp_lv_radial_color_std_max                0.084644
tbp_lv_minorAxisMM                         0.047223
tbp_lv_deltaB                              0.033882
tbp_lv_stdLExt                             0.024059
size_age_interaction                       0.023143
color_uniformity                           0.021359
tbp_lv_Bext                                0.021205
tbp_lv_L                                   0.019266
lesion_size_ratio                          0.018315
tbp_lv_x                                   0.016823
tbp_lv_deltaLBnorm                         0.016383
anatom_site_general_posterior torso        0.016336
log_lesion_area                            0.016013
tbp_lv_nevi_confidence                     0.015753
lesion_color_difference                    0.015429
3d_position_distance                       0.014521
lesion_severity_index                      0.014469
tbp_lv_C    

# Autoencoder - use permutation feauture importances and give it autoencoder

In [22]:
# Hyperparameter tuning - training on non-cancerous patients, w feature engineering
best = (0, 0, 0, None)
# Oversample the minority group to make the data more balanced
for ss in range(5, 35, 10):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_enhanced.drop(['target', 'image_path'], axis=1), 
                                                  skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the autoencoder model - dropout of 5 works best
    for d in range(0, 10, 2):
        
        autoencoder = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation='relu'),
            Dropout(d*0.1),  
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(32, activation='relu'),
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(128, activation='relu'),
            Dropout(d*0.1),
            Dense(X_train.shape[1], activation='sigmoid')  # Output layer should match input
        ])

        autoencoder.compile(optimizer='adam', loss='mse')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Train the autoencoder using only the non-cancerous patients
        history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                                 callbacks=[early_stopping])

        # Find the epoch with the lowest validation loss
        best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
        best_val_loss = np.min(history.history['val_loss'])

        # Calculate reconstruction error for each sample
        reconstructed = autoencoder.predict(X_test)
        reconstruction_error = np.mean(np.abs(reconstructed - X_test), axis=1)

        # Threshold the reconstruction error to detect anomalies
        for thresh in range(97, 100):
            threshold = np.percentile(reconstruction_error, thresh)  # Set threshold (e.g., 99th percentile)
            predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
            cr = classification_report(y_resampled, predictions_autoencoder)
            f1_score = float(cr.split()[12])
            print(f'\nS.S.: {ss*.01}, Dropout: {d*.1}, Threshold: {thresh}, Best Epoch {best_epoch}',
                  f'f1 score: {f1_score}\n', cr)
            if f1_score > best[2]:
                best = (ss, d, thresh, best_epoch, f1_score, cr)

print('Best:')
print(best[:4])
print(best[5])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

KeyboardInterrupt: 

In [26]:
# Hyperparameter tuning - training on non-cancerous patients, w feature engineering and feature importances

# Use important features to hyperparameter tune
skin_cancer_enhanced_features = skin_cancer_enhanced.copy().drop(use_features, axis=1)

best = (0, 0, 0, None)
# Oversample the minority group to make the data more balanced
for ss in range(5, 35, 10):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_enhanced_features.drop(['target', 'image_path'], axis=1), 
                                                  skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    
    # reduce training size and then split remainder of training into validation and test - for training, just use a 
    # subset of non-cancerous, then use remainder with cancerous patients to form the validation and test sets
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the autoencoder model - dropout of 5 works best
    for d in range(0, 10, 2):
        
        autoencoder = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation='relu'),
            Dropout(d*0.1),  
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(32, activation='relu'),
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(128, activation='relu'),
            Dropout(d*0.1),
            Dense(X_train.shape[1], activation='sigmoid')  # Output layer should match input
        ])

        autoencoder.compile(optimizer='adam', loss='mse')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Train the autoencoder using only the non-cancerous patients
        history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                                 callbacks=[early_stopping])

        # Find the epoch with the lowest validation loss
        best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
        best_val_loss = np.min(history.history['val_loss'])

        # Calculate reconstruction error for each sample
        reconstructed = autoencoder.predict(X_test)
        reconstruction_error = np.mean(np.abs(reconstructed - X_test), axis=1)

        # Threshold the reconstruction error to detect anomalies
        for thresh in range(97, 100):
            threshold = np.percentile(reconstruction_error, thresh)  # Set threshold (e.g., 99th percentile)
            predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
            cr = classification_report(y_resampled, predictions_autoencoder, labels=['Not Cancer', 'Cancer'])
            f1_score = float(cr.split()[12])
            print(f'\nS.S.: {ss*.01}, Dropout: {d*.1}, Threshold: {thresh}, Best Epoch {best_epoch}',
                  f'f1 score: {f1_score}\n', cr)
            if f1_score > best[2]:
                best = (ss, d, thresh, best_epoch, f1_score, cr)

print('Best:')
print(best[:4])
print(best[5])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

S.S.: 0.05, Dropout: 0.0, Threshold: 97, Best Epoch 28 f1 score: 0.77
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99    400666
         1.0       1.00      0.63      0.77     20033

    accuracy                           0.98    420699
   macro avg       0.99      0.81      0.88    420699
weighted avg       0.98      0.98      0.98    420699


S.S.: 0.05, Dropout: 0.0, Threshold: 98, Best Epoch 28 f1 score: 0.59
               precision    recall  f1-score 

KeyboardInterrupt: 

In [None]:
# Hyperparameter tuning - training on non-cancerous patients, w/o feature engineering
best = (0, 0, 0, None)
# Oversample the minority group to make the data more balanced
for ss in range(5, 35, 10):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_enhanced.drop('target', axis=1), 
                                                  skin_cancer_df['target'])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the autoencoder model - dropout of 5 works best
    for d in range(0, 10, 2):
        
        autoencoder = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation='relu'),
            Dropout(d*0.1),  
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(32, activation='relu'),
            Dense(64, activation='relu'),
            Dropout(d*0.1),
            Dense(128, activation='relu'),
            Dropout(d*0.1),
            Dense(X_train.shape[1], activation='sigmoid')  # Output layer should match input
        ])

        autoencoder.compile(optimizer='adam', loss='mse')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Train the autoencoder using only the non-cancerous patients
        history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                                 callbacks=[early_stopping])

        # Find the epoch with the lowest validation loss
        best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
        best_val_loss = np.min(history.history['val_loss'])

        # Calculate reconstruction error for each sample
        reconstructed = autoencoder.predict(X_test)
        reconstruction_error = np.mean(np.abs(reconstructed - X_test), axis=1)

        # Threshold the reconstruction error to detect anomalies
        for thresh in range(97, 100):
            threshold = np.percentile(reconstruction_error, thresh)  # Set threshold (e.g., 99th percentile)
            predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
            cr = classification_report(y_resampled, predictions_autoencoder)
            f1_score = float(cr.split()[12])
            print(f'\nS.S.: {ss*.01}, Dropout: {d*.1}, Threshold: {thresh}, Best Epoch {best_epoch}',
                  f'f1 score: {f1_score}\n', cr)
            if f1_score > best[2]:
                best = (ss, d, thresh, best_epoch, f1_score, cr)

print('Best:')
print(best[:4])
print(best[5])