In [None]:
pip install umap-learn

In [None]:
import json
import os
import random

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import smart_resize

# ---------------------------------------------------------------------------
# K-means based diverse subset selection.
#
# For every (group, subgroup) listed in the spreadsheet: load the images from
# the HDF5 file, embed each image with ResNet50, cluster the embeddings with
# K-means, draw the same number of images from every cluster, and compare the
# mean pairwise Euclidean feature distance of that subset with the one of an
# equally sized uniformly random subset.
#
# NOTE(review): `destination_hdf5_file` is not defined in any visible cell —
# it must be set before this cell runs.
# ---------------------------------------------------------------------------

# Rows 0, 5, 10 and 13 of the sheet are dropped before the index columns are read.
database = pd.read_excel('/Final Table- Process V').drop([0,5,10,13])
# NOTE(review): each column is dropna()'d on its own, so position i of both
# lists refers to the same sheet row only if NaNs occur on the same rows — confirm.
source_index = database['Source Index'].dropna().tolist()
dataset_index = database['Dataset Index'].dropna().tolist()

# "<group>_<subgroup>" -> mean pairwise feature distance of the sampled subset.
random_gain_dict = {}
diverse_gain_dict = {}

NUM_CLUSTERS = 20
SUBSET_SIZE = 1000
IMAGES_PER_CLUSTER = SUBSET_SIZE // NUM_CLUSTERS

# Configure the GPU once, before the model is built. Doing this inside the
# dataset loop (as before) re-ran it on every iteration after the model had
# already been created.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"Running on GPU: {gpus[0].name}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Running on CPU.")

# The feature extractor does not depend on the dataset, so it is built once
# instead of once per subgroup.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')


def extract_features(img_array):
    """Return the flattened ResNet50 (avg-pooled, no top) output for one image.

    The input is replicated three times along a new last axis, given a leading
    batch axis, passed through `preprocess_input`, resized to 224x224 and fed
    to `model`.

    Assumes `img_array` is a single-channel 2-D image — TODO confirm against
    the HDF5 contents.
    """
    img_array = np.stack((img_array,) * 3, axis=-1)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    img_array = smart_resize(img_array, (224,224))
    features = model.predict(img_array, verbose=0)
    return features.flatten()


for source_pos, group_name in enumerate(source_index):
    # One spreadsheet cell holds a comma-separated list of subgroup names.
    subgroup_names = [name.strip() for name in dataset_index[source_pos].split(',')]

    for subgroup_name in subgroup_names:
        try:
            # Read the whole dataset into memory, then release the file handle
            # before the (slow) feature extraction starts.
            with h5py.File(destination_hdf5_file, 'r') as hdf:
                subgroup = hdf[group_name][subgroup_name]
                dataset = subgroup['processed'] if 'processed' in subgroup else subgroup['rawdata']
                data_array = np.array(dataset)

            image_features = np.array([extract_features(image) for image in data_array])

            kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42).fit(image_features)
            labels = kmeans.labels_

            # Draw the same number of images from every cluster. Clusters
            # smaller than the quota contribute all of their members, so the
            # subset can end up with fewer than SUBSET_SIZE images.
            selected_images = []
            for cluster_idx in range(NUM_CLUSTERS):
                cluster_images = np.where(labels == cluster_idx)[0]
                quota = min(IMAGES_PER_CLUSTER, len(cluster_images))
                selected_images.extend(np.random.choice(cluster_images, quota, replace=False))
            selected_image_indices = selected_images

            print(group_name,subgroup_name)

            # Diversity of the cluster-balanced subset: mean pairwise distance.
            selected_features = image_features[selected_image_indices]
            distances = pdist(selected_features, 'euclidean')
            diverse_gain = np.mean(distances)
            print(f"Average optimized Distance Gain: {diverse_gain}")

            # Baseline: a uniformly random subset of the same size.
            random_sample_indices = np.random.choice(image_features.shape[0], selected_features.shape[0], replace=False)
            random_sample_features = image_features[random_sample_indices]
            distances = pdist(random_sample_features, 'euclidean')
            random_gain = np.mean(distances)
            print(f"Average random Distance Gain: {random_gain}")

            # Record both numbers together so a failure half-way through never
            # leaves the two dictionaries with different key sets.
            key = f"{group_name}_{subgroup_name}"
            diverse_gain_dict[key] = diverse_gain
            random_gain_dict[key] = random_gain
            average_distance_gain = random_gain

            file_name = f"{group_name}_{subgroup_name}.npy"
            # Define the path to save the .npy file
            save_path = f'/content/drive/MyDrive/MeltPoolViT/Diverse_index_KDPP/{file_name}'
            # NOTE(review): `save_path` is computed but nothing is written to it
            # in this cell; add `np.save(save_path, selected_image_indices)` if
            # the selected indices should be persisted here.
        except Exception as exc:
            # The original `try` had no handler at all (a SyntaxError). Report
            # the failing dataset and the reason, then move on to the next one.
            print(f"Error at {group_name}, {subgroup_name}: {exc!r}")
            continue

# Persist both gain dictionaries as JSON files next to each other.
output_dir = "/Diverse_index_KDPP2"
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

for name, gain_dict in (("random", random_gain_dict), ("diverse", diverse_gain_dict)):
    # Define the filename for the JSON file
    json_filename = f"{output_dir}/{name}.json"

    # Save the dictionary as a JSON file. Values are coerced to built-in floats
    # because json cannot serialise every NumPy scalar type.
    with open(json_filename, 'w') as json_file:
        json.dump({key: float(value) for key, value in gain_dict.items()}, json_file)

    print(f"Dictionary saved to {json_filename}")

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np


# Display a few images from each cluster
def display_three_images_per_cluster(image_array, labels, num_clusters, images_per_cluster=3):
    """Show a grid with one row per cluster and up to `images_per_cluster` images per row.

    Parameters
    ----------
    image_array : indexable of images
        `image_array[i]` is handed to `Axes.imshow` as-is.
    labels : array-like of int
        Cluster label of every image; position i belongs to `image_array[i]`.
    num_clusters : int
        Number of rows; clusters are assumed to be labelled 0..num_clusters-1.
    images_per_cluster : int, default 3
        Number of columns, i.e. the maximum number of images drawn per cluster.

    Images are drawn at random without replacement (global NumPy RNG). Clusters
    with fewer members than `images_per_cluster` leave the rest of their row blank.
    """
    # A plain list compared with `==` would yield a single bool, not a mask.
    labels = np.asarray(labels)

    num_rows = num_clusters
    num_cols = images_per_cluster

    # squeeze=False always returns a 2-D array of axes, so the 1-row, 1-column
    # and 1x1 layouts need no special casing.
    _fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols * 4, num_rows * 4), squeeze=False)

    for cluster_idx in range(num_clusters):
        # Find the indices of images in the current cluster
        cluster_indices = np.where(labels == cluster_idx)[0]

        num_shown = min(images_per_cluster, len(cluster_indices))
        selected_indices = np.random.choice(cluster_indices, num_shown, replace=False)

        for img_idx, sample_index in enumerate(selected_indices):
            ax = axes[cluster_idx, img_idx]
            ax.imshow(image_array[sample_index])
            ax.set_title(f"Cluster {cluster_idx}")
            ax.axis('off')

        # Hide the frames of the slots this cluster could not fill.
        for ax in axes[cluster_idx, num_shown:]:
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# Display three images from each cluster of the most recent clustering.
# The number of clusters is derived from `labels` itself (labels are 0-based
# integers), because `optimal_clusters` is not defined in any visible cell.
# `data_array` and `labels` are the values left over from the last dataset
# processed by the clustering loop.
num_clusters_found = int(np.max(labels)) + 1
display_three_images_per_cluster(data_array, labels, num_clusters_found, images_per_cluster=3)

In [None]:
# ---------------------------------------------------------------------------
# k-DPP based diverse subset selection.
#
# Same pipeline as the K-means cell, but the subset is drawn with an MCMC
# k-DPP sampler whose likelihood kernel is the Gram matrix of the ResNet50
# features. Results are written into `diverse_gain_dict` / `random_gain_dict`
# and the selected indices are saved as .npy files.
#
# NOTE(review): `FiniteDPP` and `destination_hdf5_file` are not imported or
# defined in any visible cell; `source_index`, `dataset_index` and the two
# gain dictionaries come from the first cell.
# ---------------------------------------------------------------------------

# Configure the GPU once, before the model is built, instead of on every dataset.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"Running on GPU: {gpus[0].name}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Running on CPU.")

# The feature extractor is dataset-independent: build it a single time.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')


def extract_features(img_array):
    """Return the flattened ResNet50 (avg-pooled, no top) output for one image.

    The input is replicated three times along a new last axis, given a leading
    batch axis, passed through `preprocess_input`, resized to 224x224 and fed
    to `model`.

    Assumes `img_array` is a single-channel 2-D image — TODO confirm against
    the HDF5 contents.
    """
    img_array = np.stack((img_array,) * 3, axis=-1)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    img_array = smart_resize(img_array, (224,224))
    features = model.predict(img_array, verbose=0)
    return features.flatten()


# Number of images requested from the k-DPP sampler.
k = 1000

for source_pos, group_name in enumerate(source_index):
    # One spreadsheet cell holds a comma-separated list of subgroup names.
    subgroup_names = [name.strip() for name in dataset_index[source_pos].split(',')]

    for subgroup_name in subgroup_names:
        try:
            # Read the whole dataset into memory, then release the file handle
            # before the (slow) feature extraction starts.
            with h5py.File(destination_hdf5_file, 'r') as hdf:
                subgroup = hdf[group_name][subgroup_name]
                dataset = subgroup['processed'] if 'processed' in subgroup else subgroup['rawdata']
                data_array = np.array(dataset)

            image_features = np.array([extract_features(image) for image in data_array])

            # Likelihood kernel L = F F^T (one row/column per image).
            DPP = FiniteDPP('likelihood', **{'L': image_features.dot(image_features.T)})
            DPP.sample_mcmc_k_dpp(size=k, random_state=42)

            print(DPP.list_of_samples)

            # NOTE(review): this takes element [1] of the first stored entry.
            # If that entry is the whole MCMC chain, [1] is an early state
            # rather than the final one — confirm against the DPPy docs
            # whether [-1][-1] was intended.
            selected_image_indices = np.array(DPP.list_of_samples[0][1])

            print(group_name,subgroup_name)

            # Diversity of the k-DPP subset: mean pairwise Euclidean distance.
            selected_features = image_features[selected_image_indices]
            distances = pdist(selected_features, 'euclidean')
            diverse_gain = np.mean(distances)
            print(f"Average optimized Distance Gain: {diverse_gain}")

            # Baseline: a uniformly random subset of the same size.
            random_sample_indices = np.random.choice(image_features.shape[0], selected_features.shape[0], replace=False)
            random_sample_features = image_features[random_sample_indices]
            distances = pdist(random_sample_features, 'euclidean')
            random_gain = np.mean(distances)
            print(f"Average random Distance Gain: {random_gain}")

            # Record both numbers together so the dictionaries keep equal key sets.
            key = f"{group_name}_{subgroup_name}"
            diverse_gain_dict[key] = diverse_gain
            random_gain_dict[key] = random_gain
            average_distance_gain = random_gain

            file_name = f"{group_name}_{subgroup_name}.npy"
            # Define the path to save the .npy file
            save_path = f'/content/drive/MyDrive/MeltPoolViT/Diverse_index2/{file_name}'

            np.save(save_path, selected_image_indices)
        except Exception as exc:
            # Best-effort: keep going with the next dataset, but say WHY this
            # one failed. A bare `except:` also hid NameErrors and swallowed
            # KeyboardInterrupt.
            print(f"Error at {group_name}, {subgroup_name}: {exc!r}")
            continue

In [None]:
import json
# Compare the diverse and random distance gains stored by each experiment.
# For every result directory, print diverse/random per dataset key and the
# mean ratio; `results[key]` collects the ratios of one key across directories.
pathes = ['/Diverse_index','/Diverse_index_KDPP','/Diverse_index_KDPP2']
results = {}
for path in pathes:
    # Local names deliberately avoid `random` and `all`, which would rebind the
    # stdlib module imported at the top of the notebook and the builtin.
    with open(path+'/random.json', 'r') as file:
        random_gains = json.load(file)
    with open(path+'/diverse.json', 'r') as file:
        diverse_gains = json.load(file)

    ratios = []
    for key, random_gain in random_gains.items():
        if key not in diverse_gains:
            # The two files are written independently; report the gap instead
            # of aborting the whole comparison with a KeyError.
            print(key, ' : missing from diverse.json, skipped')
            continue

        ratio = diverse_gains[key] / random_gain
        ratios.append(ratio)
        results.setdefault(key, []).append(ratio)
        print(key, ' :', ratio)

    # An empty result file would otherwise divide by zero.
    mean_ratio = sum(ratios) / len(ratios) if ratios else float('nan')
    print('mean: ', mean_ratio)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image

def cosine_similarity_images(image_path1, image_path2):
    """Cosine similarity between the flattened pixel values of two image files.

    Parameters
    ----------
    image_path1, image_path2 :
        Anything accepted by `PIL.Image.open`.

    Returns
    -------
    The single entry of the 1x1 matrix returned by `cosine_similarity`.

    Raises
    ------
    ValueError
        If the two images do not flatten to the same number of values
        (different size or number of channels).
    """
    # Context managers close both image files as soon as the pixels are copied.
    with Image.open(image_path1) as img1, Image.open(image_path2) as img2:
        # Convert images to flat (1, n_values) numpy arrays
        img_array1 = np.array(img1).reshape(1, -1)
        img_array2 = np.array(img2).reshape(1, -1)

    if img_array1.shape != img_array2.shape:
        raise ValueError(
            f"Images must contain the same number of values to be compared: "
            f"{image_path1!r} has {img_array1.shape[1]}, {image_path2!r} has {img_array2.shape[1]}"
        )

    # Compute cosine similarity
    similarity = cosine_similarity(img_array1, img_array2)

    return similarity[0][0]

from vendi_score import vendi

# Score `selected` with vendi, using cosine_similarity_images as the pairwise
# similarity function.
# NOTE(review): `selected` is not defined in any visible cell. Since
# cosine_similarity_images opens its arguments with Image.open, it presumably
# has to be a collection of image paths — confirm and define it before this call.
vendi.score(selected, cosine_similarity_images)