In [None]:
from collections import Counter
import numpy as np
import os
import pandas as pd

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Shared augmentation pipeline; image_generator draws from it when called with
# augment_data=True. Configured for small random rotations, shifts, brightness
# and zoom changes plus horizontal/vertical flips, with fill_mode="nearest".
data_gen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.05,
    brightness_range=[0.9, 1.1],
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode="nearest",
)


In [None]:
def image_generator(
    image_directory,
    filtered_metadata,
    label_encoder,
    additional_augmentations,
    batch_size=32,
    num_classes=8,
    augment_data=False,
    label="diagnosis",
    filter=True,
):
    """Endlessly yield ``(X_batch, Y_batch)`` pairs built from image files.

    Rows of ``filtered_metadata`` are walked in order, ``batch_size`` rows at a
    time, and the walk restarts from the top after the last row.

    Parameters:
        image_directory (str): Directory holding ``<isic_id>.JPG`` files.
        filtered_metadata (pandas.DataFrame): Must contain an ``isic_id``
            column and the ``label`` column.
        label_encoder: Fitted encoder exposing ``transform``.
        additional_augmentations (dict): Maps a label *value* to the number of
            extra augmented copies to emit for every image carrying it. Labels
            missing from the mapping get no extra copies.
        batch_size (int): Number of metadata rows consumed per batch. The
            yielded batch can be smaller (skipped rows) or larger (augmented
            copies) than this.
        num_classes (int): Width of the one-hot label vectors.
        augment_data (bool): When False, ``additional_augmentations`` is ignored.
        label (str): Name of the label column.
        filter (bool): When True, rows whose label is NaN are skipped.

    Yields:
        tuple: ``(X_batch, Y_batch)`` where ``X_batch`` stacks the preprocessed
        224x224 images and ``Y_batch`` is the one-hot encoding of their labels.

    Note:
        Images that fail to load or encode are reported with ``print`` and
        skipped. A batch in which every row was skipped is not yielded, so if
        no row ever produces a sample the generator never yields.
    """
    num_samples = len(filtered_metadata)

    while True:  # Generator loops indefinitely
        for offset in range(0, num_samples, batch_size):
            batch_samples = filtered_metadata.iloc[offset : offset + batch_size]

            images = []
            labels = []
            for _, row in batch_samples.iterrows():
                raw_label = row[label]
                if filter is True and pd.isna(raw_label):
                    continue

                img_path = os.path.join(image_directory, row["isic_id"] + ".JPG")
                try:
                    img = image.load_img(img_path, target_size=(224, 224))
                    img = image.img_to_array(img)
                    img = K.applications.densenet.preprocess_input(img)

                    encoded_label = label_encoder.transform([raw_label])[0]

                    # Look the count up by the row's label value. Indexing by
                    # the column name raised KeyError, which the handler below
                    # swallowed, silently dropping every image.
                    extra_copies = (
                        additional_augmentations.get(raw_label, 0)
                        if augment_data
                        else 0
                    )
                    if extra_copies > 0:
                        # data_gen expects a leading batch axis.
                        flow = data_gen.flow(
                            img.reshape((1,) + img.shape), batch_size=1
                        )
                        for _ in range(extra_copies):
                            # Builtin next() instead of the iterator's .next()
                            # method, which is not available in every Keras
                            # release.
                            images.append(next(flow)[0])
                            labels.append(encoded_label)

                    # The original image always goes in after its augmented copies.
                    images.append(img)
                    labels.append(encoded_label)
                except Exception as e:
                    print(f"Error processing file {img_path}: {e}")
                    continue

            if images:
                X_batch = np.array(images)
                Y_batch = K.utils.to_categorical(labels, num_classes=num_classes)
                yield X_batch, Y_batch


In [None]:
def label_generator(filtered_metadata, label_encoder, batch_size=32, num_classes=8, label="diagnosis", filter=True):
    """Endlessly yield one-hot label batches, without touching any image.

    Rows are consumed ``batch_size`` at a time in metadata order and the walk
    restarts after the last row. With ``filter`` set to True, rows whose
    ``label`` value is NaN are dropped; a batch left with no labels is not
    yielded at all.
    """
    total_rows = len(filtered_metadata)

    while True:
        for start in range(0, total_rows, batch_size):
            chunk = filtered_metadata.iloc[start : start + batch_size]

            # Encode row by row, skipping missing labels when filtering is on.
            encoded = [
                label_encoder.transform([row[label]])[0]
                for _, row in chunk.iterrows()
                if not (filter is True and pd.isna(row[label]))
            ]

            if encoded:
                yield K.utils.to_categorical(encoded, num_classes=num_classes)


In [None]:
def get_percentage_of_classes(Y, hot_encoded=False):
    """Return ``{class: percentage}`` describing how often each class occurs in Y.

    When ``hot_encoded`` is True, Y is first reduced to class indices with
    ``np.argmax`` along axis 1.
    """
    class_ids = np.argmax(Y, axis=1) if hot_encoded else Y
    tally = Counter(class_ids)
    n_total = sum(tally.values())

    shares = {}
    for cls, occurrences in tally.items():
        shares[cls] = occurrences / n_total * 100
    return shares


In [None]:
def preprocess_data(X, Y, num_classes=8):
    """Prepare in-memory features and labels for the DenseNet classifier.

    X goes through DenseNet's ``preprocess_input``. Y is integer-encoded with a
    freshly fitted ``LabelEncoder`` and then one-hot encoded to ``num_classes``
    columns.

    Returns:
        tuple: ``(preprocessed_X, one_hot_Y, fitted_label_encoder)``.
    """
    features = K.applications.densenet.preprocess_input(X)

    label_encoder = LabelEncoder()
    class_ids = label_encoder.fit_transform(Y)
    one_hot = K.utils.to_categorical(class_ids, num_classes=num_classes)

    return features, one_hot, label_encoder


In [None]:
def get_fine_tuned_DenseNet(
    input_tensor,
    num_classes=8,
    unfreeze_layers=30,
    learning_rate=0.0001,
):
    """Build a DenseNet201 classifier with only its last layers trainable.

    An ImageNet-initialised DenseNet201 (without its top) is extended with
    global average pooling, a 1024-unit ReLU layer and a softmax output. All
    layers of the assembled model are frozen except the last
    ``unfreeze_layers`` (counted over ``model.layers``, so the new head is
    included). The model is returned compiled but untrained.

    Parameters:
        input_tensor: Keras tensor used as the network input.
        num_classes (int): Size of the softmax output layer.
        unfreeze_layers (int): How many trailing layers stay trainable. ``0``
            freezes everything; a value above the layer count unfreezes
            everything.
        learning_rate (float): Learning rate for the Adam optimizer.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    print("Getting finetuned DenseNet")

    # Step 1: load the pretrained backbone.
    base_model = tf.keras.applications.DenseNet201(
        include_top=False, weights="imagenet", input_tensor=input_tensor
    )

    # Step 2: attach the classification head.
    x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
    x = tf.keras.layers.Dense(1024, activation="relu")(x)
    predictions = tf.keras.layers.Dense(num_classes, activation="softmax")(x)
    model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

    # Step 3: set trainability for every layer in one pass. The earlier
    # freeze-all loop and first compile were overwritten before any training
    # happened, so they are dropped. An explicit index is used because a
    # negative slice of -0 would select every layer.
    layers = model.layers
    first_trainable = max(0, len(layers) - max(0, unfreeze_layers))
    for index, layer in enumerate(layers):
        layer.trainable = index >= first_trainable

    # Step 4: compile after the trainable flags are final so they take effect.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Actual fine-tuning (model.fit) is left to the caller.
    return model


In [None]:
def get_metadata(
    image_directory,
    metadata_path,
    label_column,
    minimum_count=10,
    filter_images=True,
    min_count_threshold=0.25,
):
    """Read a metadata CSV and fit a label encoder on one of its columns.

    With ``filter_images`` set, only rows whose label occurs at least
    ``minimum_count`` times are kept (rows with a missing label are dropped as
    a consequence). Without it, the CSV is returned unchanged and the encoder
    is fitted on the raw column, missing values included.

    ``image_directory`` and ``min_count_threshold`` are accepted but not used
    by this function.

    Returns:
        tuple: ``(metadata_frame, fitted_label_encoder, number_of_classes)``.
    """
    metadata = pd.read_csv(metadata_path)
    occurrences = metadata[label_column].value_counts()

    if filter_images:
        frequent_labels = occurrences[occurrences >= minimum_count].index
        filtered_metadata = metadata[metadata[label_column].isin(frequent_labels)]
        labels_to_fit = filtered_metadata[label_column].dropna()
    else:
        filtered_metadata = metadata
        labels_to_fit = filtered_metadata[label_column]

    label_encoder = LabelEncoder()
    label_encoder.fit(labels_to_fit)

    return filtered_metadata, label_encoder, len(label_encoder.classes_)


In [None]:
def view_data(y1, preds, mapping, num_clusters=8):
    """Print and collect how predicted cluster ids are spread within each y1 group.

    Samples are grouped by their value in ``y1``. For every group the
    occurrences of each value in ``preds`` are counted, a summary is printed,
    and running aggregates per ``preds`` value are updated.

    Parameters:
        y1: 1-D sequence of group ids, one per sample.
        preds: 1-D sequence of predicted cluster ids, aligned with ``y1``.
        mapping (dict): Maps a ``preds`` value to a printable name; it must
            contain the most frequent ``preds`` value of every group.
        num_clusters (int): Ids ``0..num_clusters-1`` are pre-seeded in the
            aggregates so they appear even when never predicted. Ids outside
            that range are added on demand instead of raising KeyError.

    Returns:
        tuple: ``(cluster_stats, totals, max_values)`` where
            ``cluster_stats[group]`` is ``{pred_id: count}``,
            ``totals[pred_id]`` is the overall count of that id, and
            ``max_values[pred_id]`` is ``{"label": group, "value": count}`` for
            the group contributing the most samples to that id.
    """
    y1 = np.asarray(y1)
    preds = np.asarray(preds)

    cluster_stats = {}
    totals = {key: 0 for key in range(num_clusters)}
    max_values = {key: {"label": None, "value": 0} for key in range(num_clusters)}

    # np.unique gives the groups in sorted order, so output is deterministic.
    for val in np.unique(y1):
        counts = dict(Counter(preds[y1 == val]))
        cluster_stats[val] = counts

        for key, value in counts.items():
            totals[key] = totals.get(key, 0) + value
            if value > max_values.get(key, {"value": 0})["value"]:
                # Record which group produced the maximum; storing `key` here
                # merely repeated the dictionary key.
                max_values[key] = {"label": val, "value": value}

        largest = max(counts.values())
        group_size = sum(counts.values())
        max_label = max(counts, key=counts.get)

        print("Cluster:", val)
        print("Counts:", counts)
        # Max value and label
        print("Max value:", largest)
        print("Max label:", max_label, mapping[max_label])
        percentage = round((largest / group_size) * 100, 2)
        print(f"{percentage}% exclusive")
        # Total count
        print("Total count:", group_size)
        print("----------------")

    # After processing all groups, print the aggregated statistics
    print("Total Counts for Each Cluster:", totals)
    print("Maximum Value and Label for Each Cluster:", max_values)

    return cluster_stats, totals, max_values


In [None]:
def is_standard_python() -> bool:
    """Return True only when ``get_ipython`` is not defined.

    A NameError on ``get_ipython`` is taken to mean a plain Python interpreter.
    Whenever the name resolves (Jupyter notebook/qtconsole, terminal IPython,
    or any other shell type), the result is False.
    """
    try:
        get_ipython()
    except NameError:
        # Probably a standard Python interpreter
        return True
    return False

In [None]:
def view_results(y_data, pred_data, kmeans, label_encoder, num_classes=None):
    """Cluster ``pred_data`` with a fitted KMeans model and print per-class stats.

    Parameters:
        y_data: One-hot labels; reduced to class indices with ``np.argmax``.
        pred_data: Feature rows passed to ``kmeans.predict``.
        kmeans: Fitted clustering model exposing ``predict``.
        label_encoder: Fitted encoder exposing ``inverse_transform``; it is
            applied to the cluster ids to obtain a display name per cluster.
        num_classes (int, optional): When given, building the cluster->name
            mapping stops as soon as this many distinct clusters have been
            seen. When omitted, every prediction is scanned. It is optional so
            that four-argument calls keep working.

    Returns:
        None. The statistics are printed by ``view_data``.
    """
    y1 = np.argmax(y_data, axis=1)
    preds = kmeans.predict(pred_data)

    decoded_labels = label_encoder.inverse_transform(preds)

    # cluster id -> decoded name, for the printout in view_data
    reverse_mapping = {}
    for decoded, cluster_id in zip(decoded_labels, preds):
        reverse_mapping[cluster_id] = decoded
        if num_classes is not None and len(reverse_mapping) == num_classes:
            break

    view_data(y1, preds, reverse_mapping)


In [None]:
# Metadata CSVs are referenced by absolute path, independent of data_dir below.
# NOTE(review): these are machine-specific locations; consider deriving them
# from data_dir so the notebook runs elsewhere.
metadata_path = "/uw/ml_unsuper/ISIC_proc/data/ham10000_metadata_2023-11-27.csv"
densenet_metadata_path = (
    "/uw/ml_unsuper/ISIC_proc/data/challenge-2020-training_metadata_2023-11-28.csv"
)


# Run as a script under plain Python: data lives in "<parent of this file's
# directory>/data". In any IPython/Jupyter shell (where __file__ is not relied
# on): "../data" relative to the current working directory.
if __name__ == "__main__" and is_standard_python():
    d = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_dir = os.path.join(d, "data")
else:
    data_dir = "../data"

# Image folders: "images" is used with metadata_path, "images2" with
# densenet_metadata_path.
image_directory = os.path.join(data_dir, "images")
densenet_image_directory = os.path.join(data_dir, "images2")
X_path = os.path.join(data_dir, "X.npy")
Y_path = os.path.join(data_dir, "Y.npy")

# Cache files for model predictions on the train split, test split and full set.
num_classes_path = os.path.join(data_dir, "num_classes.npy")
save_preds_train = os.path.join(data_dir, "preds_train.npy")
save_preds_test = os.path.join(data_dir, "preds_test.npy")
save_preds_X = os.path.join(data_dir, "preds_X.npy")


# Cache files for the true labels matching the prediction files above.
save_true_Ytrain = os.path.join(data_dir, "true_Ytrain.npy")
save_true_Ytest = os.path.join(data_dir, "true_Ytest.npy")
save_true_Y = os.path.join(data_dir, "true_Y.npy")

# Where the fine-tuned Keras model is saved and reloaded from.
MODEL_NAME = os.path.join(data_dir, "fine_tuned_model.keras")
densenet_X_path = os.path.join(data_dir, "densenet_X.npy")
densenet_Y_path = os.path.join(data_dir, "densenet_Y.npy")


# Fail fast if any required input location is missing.
assert os.path.exists(data_dir), "Data directory not found"
assert os.path.exists(image_directory), "Image directory not found"
assert os.path.exists(metadata_path), "Metadata file not found"
assert os.path.exists(densenet_metadata_path), "Metadata file not found"
assert os.path.exists(densenet_image_directory), "Image directory not found"


In [None]:
def calculate_augmentations(filtered_metadata, label_column, min_count_threshold=0.25):
    """
    Work out how many extra images each label needs to reach a minimum count.

    The minimum is the size of the most frequent label multiplied by
    ``min_count_threshold`` and rounded. Labels already at or above it need
    zero extra images.

    Parameters:
        filtered_metadata (pandas.DataFrame): Metadata holding one row per image.
        label_column (str): Column of ``filtered_metadata`` containing the labels.
        min_count_threshold (float, optional): Fraction of the largest label
            count that every label should reach. Default is 0.25.

    Returns:
        dict: ``{label: number_of_additional_images}`` for every label that
            ``value_counts`` reports.
    """
    per_label = filtered_metadata[label_column].value_counts()
    target = round(max(per_label) * min_count_threshold)

    shortfall = {}
    for name, have in per_label.items():
        missing = target - have
        shortfall[name] = missing if missing > 0 else 0
    return shortfall


In [None]:
# Load the metadata at metadata_path, keep only rows whose `label` value occurs
# at least `minimum_count` times, and fit the label encoder on what remains.
label = "diagnosis"
minimum_count = 10
filtered_metadata, label_encoder, num_classes = get_metadata(
    image_directory, metadata_path, label, minimum_count=minimum_count
)


In [None]:
def predict_generator(model, metadata, x_generator, batch_size=32):
    """Run ``model.predict`` over one pass of batches drawn from a generator.

    ``ceil(len(metadata) / batch_size)`` batches are pulled from
    ``x_generator``, each expected to be an ``(X_batch, Y_batch)`` pair.

    Returns:
        tuple: ``(predictions, true_labels)`` as arrays, one row per sample, in
        the order the generator produced them. Labels are passed through
        exactly as the generator yields them.
    """
    n_batches = int(np.ceil(len(metadata) / batch_size))

    predicted_rows = []
    label_rows = []
    for _ in range(n_batches):
        # Features and labels come from the same generator, so they stay aligned.
        features, targets = next(x_generator)
        predicted_rows.extend(model.predict(features))
        label_rows.extend(targets)

    return np.array(predicted_rows), np.array(label_rows)


In [None]:
# Assuming filtered_metadata is already prepared
train_metadata, test_metadata = train_test_split(
    filtered_metadata, test_size=0.2, random_state=42
)
# Calculate additional augmentations needed

label_column = "diagnosis"
train_augments = calculate_augmentations(train_metadata, label_column)
test_augments = calculate_augmentations(test_metadata, label_column)
X_augments = calculate_augmentations(filtered_metadata, label_column)

train_generator = image_generator(
    image_directory,
    train_metadata,
    label_encoder,
    train_augments,
    batch_size=32,
    num_classes=num_classes,
)
test_generator = image_generator(
    image_directory,
    test_metadata,
    label_encoder,
    test_augments,
    batch_size=32,
    num_classes=num_classes,
)

X_generator = image_generator(
    image_directory,
    filtered_metadata,
    label_encoder,
    X_augments,
    batch_size=32,
    num_classes=num_classes,
)

# Usage


Y_generator = label_generator(
    filtered_metadata,
    label_encoder,
    batch_size=32,
    num_classes=num_classes,
)

Ytrain_generator = label_generator(
    train_metadata,
    label_encoder,
    batch_size=32,
    num_classes=num_classes,
)

Ytest_generator = label_generator(
    test_metadata,
    label_encoder,
    batch_size=32,
    num_classes=num_classes,
)



In [None]:

# Set force_load_model to True to retrain even when a saved model exists.
force_load_model = False
EPOCHS = 4
# One constant drives both the generator batch size and steps_per_epoch, so
# the two cannot drift apart.
DENSENET_BATCH_SIZE = 32

if force_load_model or not os.path.exists(MODEL_NAME):
    print("Fine tuning the model. This will take a while. Please wait.")
    print("Loading training data for DenseNet")

    # The training metadata gets its own encoder and class count. Rebinding the
    # notebook-wide `label_encoder` / `num_classes` here would make every later
    # cell behave differently depending on whether the model was retrained or
    # loaded from disk.
    densenet_metadata, densenet_label_encoder, densenet_num_classes = get_metadata(
        densenet_image_directory,
        densenet_metadata_path,
        label,
        minimum_count=minimum_count,
        filter_images=False,
    )
    # Computed for inspection only: the training generator below runs with
    # augment_data=False and an empty augmentation mapping.
    densenet_augments = calculate_augmentations(
        densenet_metadata, label, min_count_threshold=0.25
    )

    densenetX_generator = image_generator(
        densenet_image_directory,
        densenet_metadata,
        densenet_label_encoder,
        {},
        batch_size=DENSENET_BATCH_SIZE,
        num_classes=densenet_num_classes,
        augment_data=False,
        filter=False,
    )

    print("Fine-tuning DenseNet ...")
    input_tensor = K.Input(shape=(224, 224, 3))
    model = get_fine_tuned_DenseNet(input_tensor, num_classes=densenet_num_classes)

    print("Training pre-trained model")
    model.fit(
        densenetX_generator,
        epochs=EPOCHS,
        # The generator is infinite, so the epoch length must be given; keep
        # it at least 1 for metadata smaller than one batch.
        steps_per_epoch=max(1, len(densenet_metadata) // DENSENET_BATCH_SIZE),
    )
    model.save(MODEL_NAME)

else:
    print("Loading pre-trained model from local storage")
    model = load_model(MODEL_NAME)

model.summary()

In [None]:
# Set force_load_preds to True to recompute predictions even when cached.
force_load_preds = False

# The six arrays are written together and loaded together, so the cache only
# counts as valid when every file is present. Checking one file alone let
# np.load fail on a partially written cache.
prediction_cache_files = [
    save_preds_train,
    save_preds_test,
    save_preds_X,
    save_true_Ytrain,
    save_true_Ytest,
    save_true_Y,
]
prediction_cache_complete = all(
    os.path.isfile(path) for path in prediction_cache_files
)

if force_load_preds or not prediction_cache_complete:
    # Each image generator yields (X_batch, Y_batch), so predictions and true
    # labels come from the same batches and stay aligned.
    preds_train, true_Ytrain = predict_generator(model, train_metadata, train_generator)
    preds_test, true_Ytest = predict_generator(model, test_metadata, test_generator)
    preds_X, true_Y = predict_generator(model, filtered_metadata, X_generator)

    np.save(save_preds_train, preds_train)
    np.save(save_preds_test, preds_test)
    np.save(save_preds_X, preds_X)

    np.save(save_true_Ytrain, true_Ytrain)
    np.save(save_true_Ytest, true_Ytest)
    np.save(save_true_Y, true_Y)
else:
    print("Loading precomputed predictions")

    preds_train = np.load(save_preds_train)
    preds_test = np.load(save_preds_test)
    preds_X = np.load(save_preds_X)

    true_Ytrain = np.load(save_true_Ytrain)
    true_Ytest = np.load(save_true_Ytest)
    true_Y = np.load(save_true_Y)


In [None]:

# # Usage
# X_generator = image_generator(
#     image_directory,
#     filtered_metadata,
#     label_encoder,
#     X_augments,
#     batch_size=32,
#     num_classes=num_classes,
# )

# Y_generator = label_generator(
#     filtered_metadata,
#     label_encoder,
#     batch_size=32,
#     num_classes=num_classes,
# )

# preds_X, true_Y = predict_generator(model, filtered_metadata, X_generator, Y_generator)

In [None]:


# Cluster the model's outputs for the test split into num_classes groups.
# random_state is fixed so the clustering is repeatable.
kmeans = KMeans(
    n_clusters=num_classes,
    random_state=42,
    max_iter=1000,
    algorithm="elkan",
    tol=0.000001,
).fit(preds_test)



In [None]:
# num_classes is passed explicitly: it bounds the cluster -> label mapping that
# view_results builds, and the call fails without it when the parameter is
# required.
view_results(true_Ytest, preds_test, kmeans, label_encoder, num_classes)


In [None]:
# Repeat the clustering on the outputs for the full filtered set. This rebinds
# `kmeans`, replacing the model fitted on the test split.
kmeans = KMeans(
    n_clusters=num_classes,
    random_state=42,
    max_iter=1000,
    algorithm="elkan",
    tol=0.000001,
).fit(preds_X)
# num_classes is passed explicitly: it bounds the cluster -> label mapping that
# view_results builds, and the call fails without it when the parameter is
# required.
view_results(true_Y, preds_X, kmeans, label_encoder, num_classes)




In [None]:
def clear_local_saved_data():
    """Delete the cached arrays and the saved model written by this notebook.

    Every path is read from a module-level variable. If any of those variables
    is undefined (for example because the configuration cell has not run), a
    message is printed and nothing is deleted. Paths that are not existing
    files are reported and skipped; a failed deletion is reported and the
    remaining files are still processed.
    """
    try:
        remove_these = [
            X_path,
            Y_path,
            num_classes_path,
            save_preds_train,
            save_preds_test,
            save_preds_X,
            # The true-label arrays are saved together with the predictions, so
            # they are cleared together too. Leaving them behind would keep
            # stale labels next to freshly computed predictions.
            save_true_Ytrain,
            save_true_Ytest,
            save_true_Y,
            MODEL_NAME,
        ]
    except NameError:
        print("No local saved data to clear or local saved data is incomplete.")
        return

    for r in remove_these:
        if not os.path.isfile(r):
            print(f"{r} does not exist.")
            continue
        try:
            os.remove(r)
        except OSError:
            print(f"Failed to remove {r}")


# Ask interactively whether the cached arrays and saved model should be
# deleted; only an answer of "y" (case-insensitive) triggers the cleanup.
# NOTE(review): input() waits for the user, so an unattended "Run All" will
# stop here until answered.
remove_data = input("Do you want to remove the data? (y/n): ")
if remove_data.lower() == "y":
    clear_local_saved_data()
