In [1]:
# Import libraries
import zipfile
from io import BytesIO

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Data Preprocessing

In [None]:
def image_batch_generator(zip_file, img_size=(128, 128), batch_size=32):
    """Lazily yield batches of normalized images read from a zip archive.

    Only archive members whose name starts with ``image/`` and ends with
    ``.jpg`` (case-sensitive) are read, in the order the archive lists them.

    Args:
        zip_file: Path or file-like object accepted by ``zipfile.ZipFile``.
        img_size: ``target_size`` forwarded to ``load_img`` for every image.
        batch_size: Maximum number of images per yielded batch; the final
            batch holds fewer when the image count is not a multiple of it.

    Yields:
        A NumPy array stacking the images of one batch, each divided by
        255.0 (presumably shape ``(n, height, width, channels)`` — confirm
        against ``img_to_array``'s output).
    """
    with zipfile.ZipFile(zip_file, 'r') as archive:
        # Select the JPEG members that live under the image/ folder.
        jpg_members = []
        for name in archive.namelist():
            if name.startswith('image/') and name.endswith('.jpg'):
                jpg_members.append(name)

        total = len(jpg_members)
        for start in range(0, total, batch_size):
            batch = []
            for member in jpg_members[start:start + batch_size]:
                with archive.open(member) as handle:
                    # Decode from an in-memory buffer; nothing is extracted to disk.
                    raw = BytesIO(handle.read())
                    pixels = img_to_array(load_img(raw, target_size=img_size))
                    pixels /= 255.0  # scale 8-bit pixel values into [0, 1]
                    batch.append(pixels)

            # One array per batch, so callers never hold more than they ask for.
            yield np.array(batch)

# Example usage: stream image batches out of the archive, flatten them into
# one feature matrix, then fit and score an Isolation Forest on that matrix.
zip_file = 'anon-patient-data.zip'
batch_size = 32
# A fixed random_state makes the randomly grown trees — and therefore the
# predictions and scores below — reproducible across kernel restarts.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

# Step 1: Flatten every batch to 2-D (images in batch, values per image).
# NOTE: every flattened image is kept in memory; batching only bounds the
# decoding step, not the size of the final matrix.
feature_batches = []
for image_batch in image_batch_generator(zip_file, batch_size=batch_size):
    flattened_images = image_batch.reshape(image_batch.shape[0], -1)
    feature_batches.append(flattened_images)

# np.vstack fails with an opaque "need at least one array" error on an empty
# list, so report the real cause when the archive holds no matching images.
if not feature_batches:
    raise ValueError(
        f"No 'image/*.jpg' entries found in {zip_file!r}; nothing to fit."
    )

# Concatenate all batches into a single feature matrix
feature_vectors = np.vstack(feature_batches)

# Step 2: Fit Isolation Forest on the image data
iso_forest.fit(feature_vectors)

# Step 3: Predict anomalies
predictions = iso_forest.predict(feature_vectors)  # -1 for anomaly, 1 for normal
anomaly_scores = iso_forest.decision_function(feature_vectors)  # Negative values indicate anomaly risk

# Method 1: Isolation Forest

In [None]:
# Rescale every feature to the [0, 1] range (min-max normalization, not
# zero-mean/unit-variance standardization).
scaler = MinMaxScaler()
# `features` was never defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; use the feature matrix built in the
# preprocessing cell instead.
# NOTE(review): confirm `feature_vectors` is the intended input here.
features_scaled = scaler.fit_transform(feature_vectors)

# Fit Isolation Forest (fixed random_state so results are reproducible)
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(features_scaled)

# Predict anomalies (treated here as candidate cancerous lesions)
predictions = iso_forest.predict(features_scaled)  # -1 = anomaly, 1 = normal

# Method 2: Autoencoder