In [1]:
# Import libraries
import zipfile
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.decomposition import PCA

# Data Preprocessing

In [2]:
# Load data
zip_folder = zipfile.ZipFile('anon-patient-data.zip')
skin_cancer_df = pd.read_csv(zip_folder.open('train-metadata.csv'), 
                             usecols=[num for num in range(0, 43) if num not in [2, 7, 8, 31]], index_col='isic_id')

# Convert categorical data to numbers
encoder = LabelEncoder()
for feature in ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']:
    skin_cancer_df[feature] = encoder.fit_transform(skin_cancer_df[feature])

# Fill in blank values in columns using a KNN imputer
imputer = KNNImputer(n_neighbors=5)
skin_cancer_df[['age_approx', 'sex']] = imputer.fit_transform(skin_cancer_df[['age_approx', 'sex']])

# Oversample the minority group to make the data more balanced
smote = SMOTE(sampling_strategy=0.15, random_state=42)
X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

# Split the data
X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

# Scale the numerical data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

# Method 1: Isolation Forest

In [7]:
# Tuning parameters
best = (None, 0)
for ss in [x*0.01 for x in range(3, 19, 2)]:
    # Oversample the minority group to make the data more balanced
    smote = SMOTE(sampling_strategy=ss, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(skin_cancer_df.iloc[:, 1:], skin_cancer_df.iloc[:, 0])

    # Split the data
    X_train, X_rest, y_train, y_rest = train_test_split(X_resampled, y_resampled, stratify=y_resampled, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, stratify=y_rest, random_state=42)

    # Scale the numerical data between 0 and 1
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)

    # Develop and train the Isolation Forest model
    c = len([1 for target in y_train if target == 1]) / len(y_train)
    for estimators in range(50, 120, 10):
        isf = IsolationForest(n_estimators=estimators, contamination=c, random_state=42)
        isf.fit(X_train)

        # Predict the targets for the validation data
        isf_validation_preds = isf.predict(X_validation)
        isf_valid_pred = [1 if p == -1 else 0 for p in isf_validation_preds]

        # Evaluate the models performance on validation data
        print(f'\nS.S.: {ss}, Estimators: {estimators}')
        cr = classification_report(y_validation, isf_valid_pred)
        best_f1_score = float(cr.split()[12])
        if best_f1_score > best[1]:
            best_print = (f'\nS.S.: {ss}, Estimators: {estimators}', best_f1_score)
            best = (ss, estimators)
        print(cr)
print(f'best F1 Score for 1: {best_print}')


S.S.: 0.03, Estimators: 50
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     75125
           1       0.20      0.19      0.20      2254

    accuracy                           0.95     77379
   macro avg       0.59      0.59      0.59     77379
weighted avg       0.95      0.95      0.95     77379


S.S.: 0.03, Estimators: 60
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     75125
           1       0.18      0.18      0.18      2254

    accuracy                           0.95     77379
   macro avg       0.58      0.58      0.58     77379
weighted avg       0.95      0.95      0.95     77379


S.S.: 0.03, Estimators: 70
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     75125
           1       0.19      0.19      0.19      2254

    accuracy                           0.95     77379
   macro avg       0.58      0.58      0.58 


S.S.: 0.09, Estimators: 80
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     75125
           1       0.29      0.29      0.29      6761

    accuracy                           0.88     81886
   macro avg       0.62      0.62      0.62     81886
weighted avg       0.88      0.88      0.88     81886


S.S.: 0.09, Estimators: 90
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     75125
           1       0.30      0.30      0.30      6761

    accuracy                           0.89     81886
   macro avg       0.62      0.62      0.62     81886
weighted avg       0.89      0.89      0.89     81886


S.S.: 0.09, Estimators: 100
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     75125
           1       0.31      0.30      0.30      6761

    accuracy                           0.89     81886
   macro avg       0.62      0.62      0.62


S.S.: 0.15, Estimators: 110
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     75125
           1       0.39      0.39      0.39     11269

    accuracy                           0.84     86394
   macro avg       0.65      0.65      0.65     86394
weighted avg       0.84      0.84      0.84     86394


S.S.: 0.17, Estimators: 50
              precision    recall  f1-score   support

           0       0.89      0.90      0.90     75125
           1       0.38      0.38      0.38     12771

    accuracy                           0.82     87896
   macro avg       0.64      0.64      0.64     87896
weighted avg       0.82      0.82      0.82     87896


S.S.: 0.17, Estimators: 60
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     75125
           1       0.40      0.39      0.39     12771

    accuracy                           0.83     87896
   macro avg       0.65      0.65      0.65

In [3]:
c = len([1 for target in y_train if target == 1]) / len(y_train)
isf = IsolationForest(n_estimators=90, contamination=c, random_state=42)
isf.fit(X_train)

# Predict the targets for the validation data
isf_validation_preds = isf.predict(X_validation)
isf_valid_pred = [1 if p == -1 else 0 for p in isf_validation_preds]

# Evaluate the models performance on validation data
cr = classification_report(y_validation, isf_valid_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     75125
           1       0.40      0.40      0.40     11269

    accuracy                           0.84     86394
   macro avg       0.65      0.65      0.65     86394
weighted avg       0.84      0.84      0.84     86394



In [None]:
"""
# other way to train
X_train = skin_cancer_df[skin_cancer_df['target'] == 0].drop('target', axis=1)
X_test = skin_cancer_df.drop('target', axis=1)

# Scale the numerical data between 0 and 1
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Develop and train the Isolation Forest model
c = len(skin_cancer_df[skin_cancer_df['target'] == 1]) / len(skin_cancer_df)
isf = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
isf.fit(X_train)

# Predict the targets for the validation data
preds = isf.predict(X_test)
y_preds = [1 if p == -1 else 0 for p in preds]

# Evaluate the models performance on validation data
print(classification_report(skin_cancer_df['target'].values, y_preds))

# Make predictions for the entire data set 
"""

# Method 2: One Class SVM - takes too long to run

In [None]:
"""
# SVM takes too long on large data
for comp in range(7, 25, 2):
    pca = PCA(n_components=comp, random_state=42)  # Reduce to 10 components (you can adjust this number)
    X_train_pca = pca.fit_transform(X_train)
    X_validation_pca = pca.transform(X_validation)

    # Fit One-Class SVM with a linear kernel on the reduced data
    ocsvm = OneClassSVM(kernel='linear')
    ocsvm.fit(X_train_pca)

    # Predict the targets for the validation data
    ocsvm_validation_preds = ocsvm.predict(X_validation_pca)

    # Convert One-Class SVM predictions to binary (1 for cancerous, 0 for non-cancerous)
    y_pred_valid = [1 if p == -1 else 0 for p in ocsvm_validation_preds]

    # Evaluate the model's performance on the validation data
    print(classification_report(y_validation, y_pred_valid))
"""

# Method 3: Autoencoders

In [7]:
# Build the autoencoder model
for d in range(5, 45, 10):
    autoencoder = Sequential([
        Dense(128, input_dim=X_train.shape[1], activation='relu'),
        Dropout(d*0.01),  # Add dropout layer to reduce overfitting
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(64, activation='relu'),
        Dense(128, activation='relu'),
        Dense(X_train.shape[1], activation='sigmoid')
    ])

    autoencoder.compile(optimizer='adam', loss='mse')

    # Train the autoencoder using only the non-cancerous patients
    autoencoder.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.1)

    # Calculate reconstruction error for each sample
    reconstructed = autoencoder.predict(X_validation)
    reconstruction_error = np.mean(np.abs(reconstructed - X_validation), axis=1)

    # Threshold the reconstruction error to detect anomalies
    threshold = np.percentile(reconstruction_error, 98)  # Set threshold (e.g., 99th percentile)
    predictions_autoencoder = (reconstruction_error > threshold).astype(int)  # 1 = anomaly (cancer), 0 = normal
    cf = classification_report(y_validation, predictions_autoencoder)
    print(f'\nDropout: {d}\n')
    # best dropout = 25 (0.25)
    print(cf)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Dropout: 5

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     75125
           1       0.41      0.06      0.11     11269

    accuracy                           0.87     86394
   macro avg       0.64      0.52      0.52     86394
weighted avg       0.82      0.87      0.82     86394

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Dropout: 15

              precision    recall  f1-score   support

           0       0.87      0.99      0.93     75125
           1       0.37      0.06      0.10     11269

    accuracy        

Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Dropout: 35

              precision    recall  f1-score   support

           0       0.87      0.98      0.93     75125
           1       0.32      0.05      0.09     11269

    accuracy                           0.86     86394
   macro avg       0.60      0.52      0.51     86394
weighted avg       0.80      0.86      0.82     86394



# Method 4: Local Outlier Factor

In [9]:
from sklearn.neighbors import LocalOutlierFactor

# n= 25 ==> best
for n in range(25, 55, 10):
    # Perform LOF on the training data
    lof = LocalOutlierFactor(n_neighbors=n, contamination=c, novelty=True)

    # Fit the model on the training data
    lof.fit(X_train)

    # Predict the targets for the validation data
    lof_validation_preds = lof.predict(X_validation)

    # Convert LOF predictions to binary (1 for cancerous, 0 for non-cancerous)
    y_pred_valid = [1 if p == -1 else 0 for p in lof_validation_preds]

    # Evaluate the model's performance on the validation data
    cr = classification_report(y_validation, y_pred_valid)
    print(f'\nN={n}\n', cr)


N=25
               precision    recall  f1-score   support

           0       0.90      0.89      0.90     75125
           1       0.31      0.32      0.32     11269

    accuracy                           0.82     86394
   macro avg       0.61      0.61      0.61     86394
weighted avg       0.82      0.82      0.82     86394


N=35
               precision    recall  f1-score   support

           0       0.89      0.89      0.89     75125
           1       0.30      0.30      0.30     11269

    accuracy                           0.82     86394
   macro avg       0.59      0.60      0.60     86394
weighted avg       0.82      0.82      0.82     86394


N=45
               precision    recall  f1-score   support

           0       0.89      0.89      0.89     75125
           1       0.25      0.26      0.25     11269

    accuracy                           0.80     86394
   macro avg       0.57      0.57      0.57     86394
weighted avg       0.81      0.80      0.80     86394