In [None]:
import cv2
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import pickle


In [None]:
def load_and_process_image(image_path):
    """Compute SIFT descriptors for the image stored at ``image_path``.

    The file is read as a single-channel grayscale image and passed to
    OpenCV's SIFT ``detectAndCompute``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The descriptor array returned by ``detectAndCompute``. When SIFT
        reports no descriptors, an empty float32 array of shape
        ``(0, sift.descriptorSize())`` is returned instead, so the result is
        always 2-D and can be stacked with descriptors from other images.

    Raises:
        ValueError: If OpenCV cannot read ``image_path`` as an image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of deep inside SIFT.
        raise ValueError(f"Could not read image: {image_path!r}")

    sift = cv2.SIFT_create()

    # Keypoints are not used by callers; only the descriptors are returned.
    _, descriptors = sift.detectAndCompute(image, None)
    if descriptors is None:
        # Keep the "no features" result shaped like a real descriptor matrix
        # (zero rows) so np.vstack / np.concatenate over many images works.
        descriptors = np.empty((0, sift.descriptorSize()), dtype=np.float32)

    return descriptors


In [None]:
def load_data(
    image_folder,
    metadata_file,
    image_ext=".JPG",
    test_size=0.2,
    random_state=42,
):
    """Extract SIFT descriptors for every image listed in a metadata CSV and split them.

    Each row's ``isic_id`` value plus ``image_ext`` is looked up inside
    ``image_folder``. Rows whose image file does not exist are skipped (and
    counted in a summary message); the remaining rows are processed with
    ``load_and_process_image`` and split into train and test sets.

    Args:
        image_folder: Directory containing the image files.
        metadata_file: Path of the CSV file; it must have an ``isic_id`` column.
        image_ext: File extension appended to ``isic_id`` to form the file
            name. Defaults to ``".JPG"``.
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.2``.
        random_state: Forwarded to ``train_test_split``. Defaults to ``42``.

    Returns:
        ``((x_train, metadata_train), (x_test, metadata_test))`` where the
        ``x_*`` items hold the per-image descriptor arrays and the
        ``metadata_*`` items hold the corresponding metadata rows.

    Raises:
        ValueError: If none of the metadata rows has a matching image file.
    """
    metadata = pd.read_csv(metadata_file)
    total = len(metadata)

    image_features = []
    image_metadata = []
    missing = 0

    # Count positions with enumerate so the progress message does not depend
    # on the DataFrame index being a 0-based integer range.
    for position, (_, row) in enumerate(metadata.iterrows()):
        if position % 100 == 0:
            print(f"Processing image {position} / {total}")

        file_id = row["isic_id"]
        # f-string formatting also copes with non-string ids (e.g. numeric).
        image_path = os.path.join(image_folder, f"{file_id}{image_ext}")

        if not os.path.exists(image_path):
            missing += 1
            continue

        image_features.append(load_and_process_image(image_path))
        image_metadata.append(row)  # Keep the entire metadata row

    if missing:
        print(
            f"Skipped {missing} / {total} metadata rows with no "
            f"'{image_ext}' file in {image_folder!r}"
        )

    if not image_features:
        # Without this check train_test_split fails with a message about
        # n_samples=0 that says nothing about the missing files.
        raise ValueError(
            f"No images found in {image_folder!r} for the {total} rows of "
            f"{metadata_file!r} (extension {image_ext!r})"
        )

    x_train, x_test, metadata_train, metadata_test = train_test_split(
        image_features,
        image_metadata,
        test_size=test_size,
        random_state=random_state,
    )

    return (x_train, metadata_train), (x_test, metadata_test)


In [None]:
def save_datasets(data, filename):
    """Pickle ``data`` to ``filename``, creating the parent directory if needed.

    Args:
        data: Any picklable object.
        filename: Destination path of the pickle file. An existing file is
            overwritten.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # open(..., "wb") does not create directories; make sure the target
        # folder exists so the write cannot fail on a fresh checkout.
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as f:
        pickle.dump(data, f)


In [None]:
# Inputs: the image directory and the metadata CSV
IMG_DIR = "../data/images"
metadata_file = "../data/ham10000_metadata_2023-11-27.csv"

# Output: directory that receives the pickled SIFT datasets
DATA_DIR = "../data/sift_datasets"

# Lower-case aliases used by the cells below
image_folder, sift_datasets_folder = IMG_DIR, DATA_DIR


In [None]:
# Extract descriptors for every available image and split them,
# together with their metadata rows, into train and test sets
(x_train, metadata_train), (x_test, metadata_test) = load_data(
    image_folder=image_folder,
    metadata_file=metadata_file,
)


In [None]:
# Persist each split (features plus metadata) as its own pickle file
train_pkl_path = os.path.join(sift_datasets_folder, "train_data.pkl")
save_datasets((x_train, metadata_train), train_pkl_path)

test_pkl_path = os.path.join(sift_datasets_folder, "test_data.pkl")
save_datasets((x_test, metadata_test), test_pkl_path)
