In [3]:
# Import libraries
import os
import zipfile
from io import BytesIO

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

## To get richer data for the cancerous class, we can either:
1) augment the cancerous images (rotations, crops, and other alterations), or 2) apply SMOTE to the extracted features to oversample the cancerous class, which may give better results


# Data Preprocessing

In [12]:
# Obtain the true target values from the metadata.
# Only the first two CSV columns are read; 'isic_id' becomes the index,
# so the resulting frame holds a single target column.
# The archive handle itself is deliberately left open, as before, in case
# other cells read further members from `zip_folder`.
zip_folder = zipfile.ZipFile('anon-patient-data.zip')
# Close the member handle once parsing is done: pandas does not close
# file objects that it did not open itself.
with zip_folder.open('train-metadata.csv') as metadata_file:
    true_targets = pd.read_csv(metadata_file, usecols=[0, 1], index_col='isic_id')

In [None]:
# Extract features using ResNet50 and batch processing
# ImageNet-pretrained weights, without the classification head (include_top=False);
# pooling='avg' requests global average pooling on the last feature map, so the
# model outputs one feature vector per input image.
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def extract_features_from_image(img):
    """Return the ResNet50 feature vector for a single image.

    Args:
        img: Image accepted by ``img_to_array`` (e.g. the object returned by
            ``load_img``).
            NOTE(review): presumably already resized to the input size the
            model expects -- confirm with callers.

    Returns:
        1-D ``np.ndarray`` holding the flattened output of ``resnet_model``
        for this image.
    """
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
    img_array = preprocess_input(img_array)  # Preprocess for ResNet50
    # verbose=0: predict() otherwise prints a progress bar on every call,
    # i.e. once per image, which floods the notebook output.
    features = resnet_model.predict(img_array, verbose=0)  # Extract features
    return features.flatten()

def feature_batch_generator(zip_file, img_size=(224, 224), batch_size=32):
    """Yield ResNet50 feature matrices for the JPEG images inside a zip archive.

    Images are read directly from the archive (nothing is extracted to disk),
    in ``namelist()`` order, and each batch is pushed through ``resnet_model``
    with a single ``predict`` call instead of one call per image.

    Args:
        zip_file: Path or file-like object accepted by ``zipfile.ZipFile``.
        img_size: Target size passed to ``load_img`` for every image.
        batch_size: Maximum number of images per yielded batch; the final
            batch may be smaller.

    Yields:
        ``np.ndarray`` with one row of flattened features per image in the
        batch, rows in the same order as the archive entries.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with zipfile.ZipFile(zip_file, 'r') as z:
        # Only entries under images/ with a .jpg suffix are considered.
        files = [file for file in z.namelist() if file.startswith('images/') and file.endswith('.jpg')]

        for i in range(0, len(files), batch_size):
            batch_files = files[i:i + batch_size]
            images = []
            for file in batch_files:
                with z.open(file) as img_file:
                    img = load_img(BytesIO(img_file.read()), target_size=img_size)
                images.append(img_to_array(img))

            # One forward pass per batch; verbose=0 keeps the notebook output
            # free of a progress bar per call.
            batch = preprocess_input(np.stack(images))
            features = resnet_model.predict(batch, verbose=0)
            # Flatten per image so the yielded shape matches stacking the
            # per-image flattened vectors.
            yield features.reshape(len(batch_files), -1)

# Method 1: Isolation Forest

In [None]:
# Initialize tools
# contamination=0.1 tells the forest which proportion of samples to flag as outliers.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
scaler = MinMaxScaler()

# Gather the features of every image before fitting anything: the scaler and
# the forest must see the whole dataset once, not be refit on each 32-image batch.
feature_batches = list(feature_batch_generator('anon-patient-data.zip', batch_size=32))
if not feature_batches:
    raise ValueError("No 'images/*.jpg' entries found in anon-patient-data.zip")
all_features = np.vstack(feature_batches)

# Rebuild the image ids in the order the generator visits the archive
# (same filter, same namelist() order) so labels can be aligned row by row.
# NOTE(review): assumes each image is stored as images/<isic_id>.jpg -- confirm.
with zipfile.ZipFile('anon-patient-data.zip', 'r') as archive:
    image_ids = [
        os.path.splitext(os.path.basename(name))[0]
        for name in archive.namelist()
        if name.startswith('images/') and name.endswith('.jpg')
    ]

# .loc raises KeyError if an image id is missing from the metadata.
y_true = true_targets.loc[image_ids].iloc[:, 0].to_numpy()
assert len(y_true) == len(all_features), "labels and features are misaligned"

# Rescale every feature to the [0, 1] range
features_scaled = scaler.fit_transform(all_features)

# Fit Isolation Forest and predict anomalies, -1 = anomaly, 1 = normal
raw_predictions = iso_forest.fit_predict(features_scaled)

# Map to the label encoding of the metadata before scoring.
# NOTE(review): assumes the target column is 0/1 with 1 = cancerous, and that
# cancerous cases are the anomalies -- confirm against the metadata.
predictions = np.where(raw_predictions == -1, 1, 0)

cr = classification_report(y_true, predictions)
print(cr)

# Method 2: Convolutional Autoencoder