In [1]:
# Import libraries
import zipfile
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.losses import BinaryCrossentropy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# Data Preprocessing

In [2]:
# Load data
# Read the metadata CSV straight out of the archive. The context managers close
# both the archive and the member handle once the frame is in memory.
with zipfile.ZipFile('anon-patient-data.zip') as zip_folder:
    with zip_folder.open('train-metadata.csv') as metadata_file:
        skin_cancer_df = pd.read_csv(metadata_file,
                                     usecols=[num for num in range(0, 43) if num not in [2, 7, 8, 31]],
                                     index_col='isic_id')

# Convert categorical data to numbers
# Record which rows have no 'sex' value *before* encoding: LabelEncoder gives a
# missing value a class code of its own, which would leave the KNN imputer below
# with nothing to fill in that column.
sex_missing = skin_cancer_df['sex'].isna()
encoder = LabelEncoder()
for feature in ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']:
    skin_cancer_df[feature] = encoder.fit_transform(skin_cancer_df[feature])
# Put the gaps back so that 'sex' is really imputed. The other categorical columns
# keep "missing" as its own category because they are not imputed.
skin_cancer_df['sex'] = skin_cancer_df['sex'].astype(float).mask(sex_missing)

# Fill in blank values in columns using a KNN imputer
# Imputed values are neighbour averages, so an imputed 'sex' can be fractional.
imputer = KNNImputer(n_neighbors=5)
skin_cancer_df[['age_approx', 'sex']] = imputer.fit_transform(skin_cancer_df[['age_approx', 'sex']])

# Oversample the minority group to make the data more balanced
# The first column is the target; every remaining column is a feature.
# NOTE(review): resampling happens before the split, so synthetic minority rows in
# the validation/test sets can be interpolated from rows that ended up in training.
# Consider splitting first and resampling only the training portion.
smote = SMOTE(sampling_strategy=0.15, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

# Split the data
# Two stratified splits: train vs. the rest, then the rest into validation and test.
X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

# Scale the data between 0 and 1
# The scaler is fitted on the training split only and then applied to all three.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

# Method 1: Isolation Forest

In [None]:
# Hyperparameter tuning
# Sweep the SMOTE sampling ratio and the number of trees, keeping the combination
# with the highest validation F1 for the positive class (label 1).
# `best` holds (sampling ratio, n_estimators, F1, text report).
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(3, 19, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data
    X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    # Contamination is the share of positive rows in the training split.
    c = float((y_train == 1).mean())
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the validation data
        # IsolationForest returns -1 for outliers; map those to the positive class.
        isf_validation_preds = isf.predict(X_validation)
        isf_valid_pred = np.where(isf_validation_preds == -1, 1, 0)

        # Evaluate the model's performance on validation data
        # Read the F1 from the structured report instead of a token position in the
        # printed table: it does not depend on the layout and is not rounded to two
        # decimals, so near-ties are ranked correctly.
        scores = classification_report(y_validation, isf_valid_pred, output_dict=True)
        positive_f1 = scores['1']['f1-score']
        if positive_f1 > best[2]:
            # Only render the text report for a new best configuration.
            cr = classification_report(y_validation, isf_valid_pred)
            best = (ss, estimators, positive_f1, cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])

In [None]:
### Other way to train the isolation forest - only train on the non-cancerous patients

# Hyperparameter tuning
# Sweep the SMOTE sampling ratio and the number of trees, keeping the combination
# with the highest F1 for the positive class (label 1).
# `best` holds (sampling ratio, n_estimators, F1, text report).
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(17, 23, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    # NOTE(review): the evaluation set contains every training row, and the same set
    # is used to pick the hyperparameters, so the reported scores are optimistic.
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    # Contamination is the share of positive rows in the whole resampled set, even
    # though the forest itself is fitted on negative rows only.
    c = float((y_resampled == 1).mean())
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the test data
        # IsolationForest returns -1 for outliers; map those to the positive class.
        preds = isf.predict(X_test)
        y_preds = np.where(preds == -1, 1, 0)

        # Evaluate the model's performance on testing data
        # Read the F1 from the structured report instead of a token position in the
        # printed table: it does not depend on the layout and is not rounded to two
        # decimals, so near-ties are ranked correctly.
        scores = classification_report(y_resampled, y_preds, output_dict=True)
        positive_f1 = scores['1']['f1-score']
        if positive_f1 > best[2]:
            # Only render the text report for a new best configuration.
            cr = classification_report(y_resampled, y_preds)
            best = (ss, estimators, positive_f1, cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])

# Make predictions for the entire data set 

# Method 2: One-Class SVM - too slow to run, even with dimensionality reduction

In [None]:
"""
# SVM takes too long on large data
for comp in range(7, 25, 2):
    pca = PCA(n_components=comp, random_state=42)  # Reduce to 10 components (you can adjust this number)
    X_train_pca = pca.fit_transform(X_train)
    X_validation_pca = pca.transform(X_validation)

    # Fit One-Class SVM with a linear kernel on the reduced data
    ocsvm = OneClassSVM(kernel='linear')
    ocsvm.fit(X_train_pca)

    # Predict the targets for the validation data
    ocsvm_validation_preds = ocsvm.predict(X_validation_pca)

    # Convert One-Class SVM predictions to binary (1 for cancerous, 0 for non-cancerous)
    y_pred_valid = [1 if p == -1 else 0 for p in ocsvm_validation_preds]

    # Evaluate the model's performance on the validation data
    print(classification_report(y_validation, y_pred_valid))
"""

# Method 3: Autoencoders

In [5]:
# Oversample the minority group to make the data more balanced
# Recomputed here instead of reusing whatever an earlier cell left behind: the
# Isolation Forest tuning loops reassign X_resampled / y_resampled with other
# sampling ratios, so on a top-to-bottom run this cell would otherwise silently
# use the last ratio of those loops rather than 0.15.
smote = SMOTE(sampling_strategy=0.15, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

# Split the data - training is non-cancerous, test is on all patients to detect anomalies
X_train = X_resampled[y_resampled == 0]
X_test = X_resampled

# Scale the data between 0 and 1
# The scaler is fitted on the non-cancerous rows only, so scaled values of the
# full set may fall outside [0, 1].
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build the autoencoder model - dropout of 5 works best
# range(5, 15, 10) yields only d=5; widen the range to sweep other dropout rates.
for d in range(5, 15, 10):
    # Symmetric 128-64-32-64-128 network; the sigmoid output matches the scaled inputs.
    autoencoder = Sequential([
        Dense(128, input_dim=X_train.shape[1], activation='relu'),
        Dropout(d*0.01),  # Add dropout layer to reduce overfitting
        Dense(64, activation='relu'),
        Dropout(d*0.01),
        Dense(32, activation='relu'),
        Dense(64, activation='relu'),
        Dropout(d*0.01),
        Dense(128, activation='relu'),
        Dropout(d*0.01),
        Dense(X_train.shape[1], activation='sigmoid')
    ])

    # NOTE(review): accuracy / AUC / precision / recall are classification metrics,
    # but the targets here are the scaled inputs themselves, so these numbers are
    # presumably not meaningful - confirm whether they are needed.
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy', 
                       metrics=['accuracy', AUC(name='auc'), Precision(name='precision'), 
                       Recall(name='recall'), BinaryCrossentropy(name='BinaryCrossentropy')])
    
    # Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the autoencoder using only the non-cancerous patients
    history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                             callbacks=[early_stopping])
    
    # Find the epoch with the lowest validation loss
    best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
    best_val_loss = np.min(history.history['val_loss'])

    print(f"The best epoch is: {best_epoch}")
    print(f"The validation loss at the best epoch is: {best_val_loss}")

    # Calculate reconstruction error for each sample (mean absolute error per row)
    reconstructed = autoencoder.predict(X_test)
    reconstruction_error = np.mean(np.abs(reconstructed - X_test), axis=1)

    # Threshold the reconstruction error to detect anomalies
    # NOTE(review): the cut-off is the 98th percentile of the errors of the set being
    # evaluated, so about 2% of rows are flagged by construction no matter how many
    # are actually positive. Deriving it from the training errors would avoid that.
    threshold = np.percentile(reconstruction_error, 98)  # Set threshold at the 98th percentile
    predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
    cf = classification_report(y_resampled, predictions_autoencoder)
    print(f'\nDropout: {d}\n')
    print(cf)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
The best epoch is: 17
The validation loss at the best epoch is: 0.5080110430717468

Dropout: 5

              precision    recall  f1-score   support

           0       0.88      1.00      0.94    400666
           1       0.83      0.13      0.22     60099

    accuracy                           0.88    460765
   macro avg       0.86      0.56      0.58    460765
weighted avg       0.88      0.88      0.84    460765



In [None]:
# Build the autoencoder model - training is mix of cancerous and non-cancerous: dropout of 5 works best

# Build the mixed train/validation split inside this cell. The previous autoencoder
# cell reassigns X_train to the non-cancerous rows only and refits the scaler, so
# reusing the global X_train / X_validation here would neither train on a mix nor
# scale both sets consistently. Cell-private names keep the globals untouched.
ae_smote = SMOTE(sampling_strategy=0.15, random_state=42)
X_ae, y_ae = ae_smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])
X_ae_train, X_ae_rest, y_ae_train, y_ae_rest = train_test_split(X_ae, y_ae, stratify=y_ae, random_state=42)
X_ae_validation, X_ae_test, y_ae_validation, y_ae_test = train_test_split(X_ae_rest, y_ae_rest,
                                                                          stratify=y_ae_rest, random_state=42)

# Scale between 0 and 1 with a scaler fitted on the mixed training split only
ae_scaler = MinMaxScaler().fit(X_ae_train)
X_ae_train = ae_scaler.transform(X_ae_train)
X_ae_validation = ae_scaler.transform(X_ae_validation)

# Sweep dropout rates of 5%, 15%, 25% and 35%
for d in range(5, 45, 10):
    autoencoder = Sequential([
        Dense(128, input_dim=X_ae_train.shape[1], activation='relu'),
        Dropout(d*0.01),  # Add dropout layer to reduce overfitting
        Dense(64, activation='relu'),
        Dropout(d*0.01),
        Dense(32, activation='relu'),
        Dropout(d*0.01),
        Dense(64, activation='relu'),
        Dropout(d*0.01),
        Dense(128, activation='relu'),
        Dense(X_ae_train.shape[1], activation='sigmoid')
    ])

    autoencoder.compile(optimizer='adam', loss='mse')

    # Train the autoencoder on the mixed (cancerous and non-cancerous) training split
    autoencoder.fit(X_ae_train, X_ae_train, epochs=20, batch_size=32, validation_split=0.1)

    # Calculate reconstruction error for each sample (mean absolute error per row)
    reconstructed = autoencoder.predict(X_ae_validation)
    reconstruction_error = np.mean(np.abs(reconstructed - X_ae_validation), axis=1)

    # Threshold the reconstruction error to detect anomalies
    # NOTE(review): the cut-off is the 98th percentile of the validation errors, so
    # about 2% of validation rows are flagged by construction.
    threshold = np.percentile(reconstruction_error, 98)  # Set threshold at the 98th percentile
    predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
    cf = classification_report(y_ae_validation, predictions_autoencoder)
    print(f'\nDropout: {d}\n')
    print(cf)

# Method 4: Local Outlier Factor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Build this cell's own split and contamination instead of relying on leftover
# kernel state: `c` is only ever assigned inside the Isolation Forest tuning loops,
# and X_train / X_validation are reassigned by several earlier cells with different
# sampling ratios and scalers. Cell-private names keep the globals untouched.
lof_smote = SMOTE(sampling_strategy=0.15, random_state=42)
X_lof, y_lof = lof_smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])
X_lof_train, X_lof_rest, y_lof_train, y_lof_rest = train_test_split(X_lof, y_lof, stratify=y_lof, random_state=42)
X_lof_validation, X_lof_test, y_lof_validation, y_lof_test = train_test_split(X_lof_rest, y_lof_rest,
                                                                              stratify=y_lof_rest, random_state=42)

# Scale between 0 and 1 with a scaler fitted on the training split only
lof_scaler = MinMaxScaler().fit(X_lof_train)
X_lof_train = lof_scaler.transform(X_lof_train)
X_lof_validation = lof_scaler.transform(X_lof_validation)

# Contamination is the share of positive rows in the training split
lof_contamination = float((y_lof_train == 1).mean())

# n= 25 ==> best
for n in range(25, 55, 10):
    # Perform LOF on the training data
    # novelty=True is required to call predict() on data other than the training set.
    lof = LocalOutlierFactor(n_neighbors=n, contamination=lof_contamination, novelty=True)

    # Fit the model on the training data
    lof.fit(X_lof_train)

    # Predict the targets for the validation data
    lof_validation_preds = lof.predict(X_lof_validation)

    # Convert LOF predictions to binary (1 for cancerous, 0 for non-cancerous)
    y_pred_valid = np.where(lof_validation_preds == -1, 1, 0)

    # Evaluate the model's performance on the validation data
    cr = classification_report(y_lof_validation, y_pred_valid)
    print(f'\nN={n}\n', cr)