In [1]:
# Import libraries
import zipfile
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.losses import BinaryCrossentropy
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.callbacks import EarlyStopping

In [6]:
# Load data
zip_folder = zipfile.ZipFile('anon-patient-data.zip')
skin_cancer_df = pd.read_csv(zip_folder.open('train-metadata.csv'), 
                             usecols=[num for num in range(0, 43) if num not in [2, 7]], index_col='isic_id')

# Convert categorical data to numbers

encoder = LabelEncoder()
for feature in ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_tile_type']:
    skin_cancer_df[feature] = encoder.fit_transform(skin_cancer_df[feature])

# Fill in blank values in columns using a KNN imputer
imputer = KNNImputer(n_neighbors=5)
skin_cancer_df[['age_approx', 'sex']] = imputer.fit_transform(skin_cancer_df[['age_approx', 'sex']])

ValueError: could not convert string to float: 'male'

# Data Preprocessing

In [2]:
# Load data
zip_folder = zipfile.ZipFile('anon-patient-data.zip')
skin_cancer_df = pd.read_csv(zip_folder.open('train-metadata.csv'), 
                             usecols=[num for num in range(0, 43) if num not in [2, 7]], index_col='isic_id')

# Convert categorical data to numbers
encoder = LabelEncoder()
for feature in ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_tile_type']:
    skin_cancer_df[feature] = encoder.fit_transform(skin_cancer_df[feature])

# Fill in blank values in columns using a KNN imputer
imputer = KNNImputer(n_neighbors=5)
skin_cancer_df[['age_approx', 'sex']] = imputer.fit_transform(skin_cancer_df[['age_approx', 'sex']])

# Oversample the minority group to make the data more balanced
smote = SMOTE(sampling_strategy=0.21, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

# Split the data
X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

# Scale the data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

# Method 1: Isolation Forest

In [None]:
smote = SMOTE(sampling_strategy=0.21, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

X_train = X_train[y_train == 0]

# Scale the data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

# Develop and train the Isolation Forest model
c = len([1 for target in y_train if target == 1]) / len(y_train)

isf = IsolationForest(n_estimators=50, contamination=c, random_state=42)
isf.fit(X_train)

# Predict the targets for the validation data
isf_validation_preds = isf.predict(X_validation)
isf_valid_pred = [1 if p == -1 else 0 for p in isf_validation_preds]

cr = classification_report(y_validation, isf_valid_pred)
print(cr)

In [None]:
smote = SMOTE(sampling_strategy=0.21, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

# Scale the data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

# Develop and train the Isolation Forest model
c = len([1 for target in y_train if target == 1]) / len(y_train)

isf = IsolationForest(n_estimators=50, contamination=c, random_state=42)
isf.fit(X_train)

# Predict the targets for the validation data
isf_validation_preds = isf.predict(X_validation)
isf_valid_pred = [1 if p == -1 else 0 for p in isf_validation_preds]

cr = classification_report(y_validation, isf_valid_pred)

In [None]:
# Hyperparameter tuning
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(3, 19, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data
    X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len([1 for target in y_train if target == 1]) / len(y_train)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the validation data
        isf_validation_preds = isf.predict(X_validation)
        isf_valid_pred = [1 if p == -1 else 0 for p in isf_validation_preds]

        # Evaluate the models performance on validation data
        #print(f'\nS.S.: {ss}, Estimators: {estimators}')
        cr = classification_report(y_validation, isf_valid_pred)
        f1_score = float(cr.split()[12])
        if f1_score > best[2]:
            best = (ss, estimators, f1_score, cr)
        print(f'\nS.S.: {ss}, Estimators: {estimators}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])

In [3]:
### Other way to train the isolation forest - only train on the non-cancerous patients

# Hyperparameter tuning
best = (0, 0, 0, None)
for ss in [x*0.01 for x in range(17, 23, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len(y_resampled[y_resampled == 1]) / len(y_resampled)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the test data
        preds = isf.predict(X_test)
        y_preds = [1 if p == -1 else 0 for p in preds]

        # Evaluate the models performance on testing data
        cr = classification_report(y_resampled, y_preds)
        f1_score = float(cr.split()[12])
        if f1_score > best[2]:
            best = (ss, estimators, f1_score, cr)
        print(f'\nS.S.: {ss}, Estimators: {estimators}, f1_score: {f1_score}')
        print(cr)
print('Best Hyperparameters + result:', best[:2], '\n', best[3])

# Make predictions for the entire data set 


S.S.: 0.17, Estimators: 50, f1_score: 0.43
              precision    recall  f1-score   support

           0       0.91      0.85      0.88    400666
           1       0.37      0.51      0.43     68113

    accuracy                           0.80    468779
   macro avg       0.64      0.68      0.66    468779
weighted avg       0.83      0.80      0.82    468779


S.S.: 0.17, Estimators: 60, f1_score: 0.43
              precision    recall  f1-score   support

           0       0.91      0.85      0.88    400666
           1       0.37      0.50      0.43     68113

    accuracy                           0.80    468779
   macro avg       0.64      0.68      0.65    468779
weighted avg       0.83      0.80      0.82    468779


S.S.: 0.17, Estimators: 70, f1_score: 0.43
              precision    recall  f1-score   support

           0       0.91      0.85      0.88    400666
           1       0.37      0.51      0.43     68113

    accuracy                           0.80    468

# Method 3: Autoencoders

In [4]:
best = (0, 0, 0, None)
# Oversample the minority group to make the data more balanced
for ss in range(5, 35, 10):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data - training is non-cancerous, test is on all patients to detect anomalies
    X_train = X_resampled[y_resampled == 0]
    X_test = X_resampled

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the autoencoder model - dropout of 5 works best
    for d in range(0, 10, 2):
        
        autoencoder = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation='relu'),
            #BatchNormalization(),  # Apply BatchNormalization
            #Activation('relu'),     # Then apply activation function
            Dropout(d*0.1),           # Optional dropout for regularization

            Dense(64, activation='relu'),
            #BatchNormalization(),
            #Activation('relu'),
            Dropout(d*0.1),

            Dense(32, activation='relu'),
            #BatchNormalization(),
            #Activation('relu'),

            Dense(64, activation='relu'),
            #BatchNormalization(),
            #Activation('relu'),
            Dropout(d*0.1),

            Dense(128, activation='relu'),
            #BatchNormalization(),
            #Activation('relu'),
            Dropout(d*0.1),

            Dense(X_train.shape[1], activation='sigmoid')  # Output layer should match input
        ])

        autoencoder.compile(optimizer='adam', loss='mse')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Train the autoencoder using only the non-cancerous patients
        history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                                 callbacks=[early_stopping])

        # Find the epoch with the lowest validation loss
        best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
        best_val_loss = np.min(history.history['val_loss'])

        #print('Sampling Strategy:', ss*0.01)
        #print('Dropout', d)
        #print(f"The best epoch is: {best_epoch}")
        #print(f"The validation loss at the best epoch is: {best_val_loss}")

        # Calculate reconstruction error for each sample
        reconstructed = autoencoder.predict(X_test)
        reconstruction_error = np.mean(np.abs(reconstructed - X_test), axis=1)

        # Threshold the reconstruction error to detect anomalies
        # Can also try to expirement with this 98 val try values from 97 to 99?
        for thresh in range(97, 100):
            threshold = np.percentile(reconstruction_error, thresh)  # Set threshold (e.g., 99th percentile)
            predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
            cr = classification_report(y_resampled, predictions_autoencoder)
            f1_score = float(cr.split()[12])
            print(f'\nS.S.: {ss*.01}, Dropout: {d*.1}, Threshold: {thresh}, Best Epoch {best_epoch}',
                  f'f1 score: {f1_score}\n', cr)
            if f1_score > best[2]:
                best = (ss, d, thresh, best_epoch, f1_score, cr)

print('Best:')
print(best[:4])
print(best[5])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

S.S.: 0.05, Dropout: 0.0, Threshold: 97, Best Epoch 13 f1 score: 0.37
               precision    recall  f1-score   support

           0       0.97      0.98      0.97    400666
           1       0.48      0.30      0.37     20033

    accuracy                           0.95    420699
   macro avg       0.72      0.64      0.67    420699
weighted avg       0.94      0.95      0.95    420699


S.S.: 0.05, Dropout: 0.0, Threshold: 98, Best Epoch 13 f1 score: 0.32
               precision    recall  f1-score   support

           0       0.96      0.99      0.98    400666
           1       0.54      0.23      0.32     20033

    accuracy                           0.95    420699
   macro avg       0.

Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100

S.S.: 0.05, Dropout: 0.4, Threshold: 97, Best Epoch 11 f1 score: 0.12
               precision    recall  f1-score   support

           0       0.96      0.97      0.96    400666
           1       0.15      0.10      0.12     20033

    accuracy                           0.93    420699
   macro avg       0.55      0.53      0.54    420699
weighted avg       0.92      0.93      0.92    420699


S.S.: 0.05, Dropout: 0.4, Threshold: 98, Best Epoch 11 f1 score: 0.09
               precision    recall  f1-score   support

           0       0.95      0.98      0.97    400666
           1       0.15      0.06      0.09     20033

    accuracy                           0.94    420699
   macro avg       0.55      0.52      0.53    420699
weighted avg       0.92      0.94      0.93    420699


S.S.: 0.05, Dropout: 


S.S.: 0.05, Dropout: 0.8, Threshold: 98, Best Epoch 5 f1 score: 0.11
               precision    recall  f1-score   support

           0       0.96      0.98      0.97    400666
           1       0.19      0.08      0.11     20033

    accuracy                           0.94    420699
   macro avg       0.57      0.53      0.54    420699
weighted avg       0.92      0.94      0.93    420699


S.S.: 0.05, Dropout: 0.8, Threshold: 99, Best Epoch 5 f1 score: 0.08
               precision    recall  f1-score   support

           0       0.95      0.99      0.97    400666
           1       0.23      0.05      0.08     20033

    accuracy                           0.95    420699
   macro avg       0.59      0.52      0.53    420699
weighted avg       0.92      0.95      0.93    420699

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch

Epoch 24/100
Epoch 25/100

S.S.: 0.15, Dropout: 0.2, Threshold: 97, Best Epoch 15 f1 score: 0.25
               precision    recall  f1-score   support

           0       0.89      0.99      0.93    400666
           1       0.68      0.16      0.25     60099

    accuracy                           0.88    460765
   macro avg       0.78      0.57      0.59    460765
weighted avg       0.86      0.88      0.85    460765


S.S.: 0.15, Dropout: 0.2, Threshold: 98, Best Epoch 15 f1 score: 0.19
               precision    recall  f1-score   support

           0       0.88      0.99      0.93    400666
           1       0.71      0.11      0.19     60099

    accuracy                           0.88    460765
   macro avg       0.79      0.55      0.56    460765
weighted avg       0.86      0.88      0.84    460765


S.S.: 0.15, Dropout: 0.2, Threshold: 99, Best Epoch 15 f1 score: 0.11
               precision    recall  f1-score   support

           0       0.88      1.00      0.93    40

Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

S.S.: 0.15, Dropout: 0.8, Threshold: 97, Best Epoch 4 f1 score: 0.15
               precision    recall  f1-score   support

           0       0.88      0.98      0.93    400666
           1       0.41      0.09      0.15     60099

    accuracy                           0.86    460765
   macro avg       0.64      0.54      0.54    460765
weighted avg       0.82      0.86      0.83    460765


S.S.: 0.15, Dropout: 0.8, Threshold: 98, Best Epoch 4 f1 score: 0.12
               precision    recall  f1-score   support

           0       0.88      0.99      0.93    400666
           1       0.44      0.07      0.12     60099

    accuracy                           0.87    460765
   macro avg       0.66      0.53      0.52    460765
weighted avg       0.82      0.87      0.82    460765


S.S.: 0.15, Dropout: 0.8, Threshold: 99, Best Epoch 4 f1 score: 0.

Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

S.S.: 0.25, Dropout: 0.2, Threshold: 97, Best Epoch 28 f1 score: 0.2
               precision    recall  f1-score   support

           0       0.82      0.99      0.90    400666
           1       0.76      0.11      0.20    100166

    accuracy                           0.82    500832
   macro avg       0.79      0.55      0.55    500832
weighted avg       0.81      0.82      0.76    500832


S.S.: 0.25, Dropout: 0.2, Threshold: 98, Best Epoch 28 f1 score: 0.14
               precision    recall  f1-score   support

           0       0.81      0.99      0.89    400666
           1       0.78      0.08      0.14    100166

    accuracy                           0.81    500832
   macro avg       0.80      0.54      

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

S.S.: 0.25, Dropout: 0.6000000000000001, Threshold: 97, Best Epoch 22 f1 score: 0.17
               precision    recall  f1-score   support

           0       0.81      0.99      0.89    400666
           1       0.64      0.10      0.17    100166

    accuracy                           0.81    500832
   macro avg       0.73      0.54      0.53    500832
weighted avg       0.78      0.81      0.75    500832


S.S.: 0.25, Dropout: 0.6000000000000001, Threshold: 98, Best Epoch 22 f1 score: 0.12
               precision    recall  f1-score   support

           0       0.81      0.99      0.89    40

In [7]:
# Build the autoencoder model - training is mix of cancerous and non-cancerous: dropout of 5 works best
# Oversample the minority group to make the data more balanced
best = ()
for ss in range(5, 50, 15):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])
    
    # Split the data
    X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)
    
    autoencoder = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation=None),
            BatchNormalization(),  # Apply BatchNormalization
            Activation('relu'),     # Then apply activation function
            Dropout(0.5),           # Optional dropout for regularization

            Dense(64, activation=None),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(32, activation=None),
            BatchNormalization(),
            Activation('relu'),

            Dense(64, activation=None),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(128, activation=None),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),
        
            Dense(X_train.shape[1], activation='sigmoid')  # Output layer should match input
    ])


    autoencoder.compile(optimizer='adam', loss='mse')
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the autoencoder
    history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_split=0.1,
                                 callbacks=[early_stopping])

    # Find the epoch with the lowest validation loss
    best_epoch = np.argmin(history.history['val_loss']) + 1  # Add 1 since epochs are 1-indexed
    best_val_loss = np.min(history.history['val_loss'])

    print('Sampling Strategy:', ss*0.01)
    print('Dropout', d)
    print(f"The best epoch is: {best_epoch}")
    print(f"The validation loss at the best epoch is: {best_val_loss}")

    # Calculate reconstruction error for each sample
    reconstructed = autoencoder.predict(X_validation)
    reconstruction_error = np.mean(np.abs(reconstructed - X_validation), axis=1)  

    # Threshold the reconstruction error to detect anomalies
    threshold = np.percentile(reconstruction_error, 98)  # Set threshold (e.g., 99th percentile)
    predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
    cf = classification_report(y_validation, predictions_autoencoder)
    print(f'\nDropout: {d}\n')
    print(cf)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Sampling Strategy: 0.05
Dropout 5
The best epoch is: 31
The validation loss at the best epoch is: 0.005869598593562841

Dropout: 5

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     75125
           1       0.19      0.08      0.11      3756

    accuracy                           0.94     78881
   macro avg       0.57      0.53      0.54     78881
weighted avg       0.92      0.94      0.93     78881

Epoch 1/100
Epoch

# Method 4: Local Outlier Factor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

for ss in range(5, 15, 10):
    smote = SMOTE(sampling_strategy=ss*0.01, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])
    
    # Split the data
    X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

    # Scale the data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)
    
    # n= 25 ==> best
    for n in range(25, 55, 10):
        # Perform LOF on the training data
        c = len([1 for target in y_train if target == 1]) / len(y_train)
        lof = LocalOutlierFactor(n_neighbors=n, contamination=c, novelty=True)

        # Fit the model on the training data
        lof.fit(X_train)

        # Predict the targets for the validation data
        lof_validation_preds = lof.predict(X_validation)

        # Convert LOF predictions to binary (1 for cancerous, 0 for non-cancerous)
        y_pred_valid = [1 if p == -1 else 0 for p in lof_validation_preds]

        # Evaluate the model's performance on the validation data
        cr = classification_report(y_validation, y_pred_valid)
        print(f'\nS={ss}, N={n}\n', cr)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



S=5, N=25
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     75125
           1       0.13      1.00      0.23     11269

    accuracy                           0.13     86394
   macro avg       0.07      0.50      0.12     86394
weighted avg       0.02      0.13      0.03     86394



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



S=5, N=35
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     75125
           1       0.13      1.00      0.23     11269

    accuracy                           0.13     86394
   macro avg       0.07      0.50      0.12     86394
weighted avg       0.02      0.13      0.03     86394

