## Data prepration

In [1]:
import os
import cv2
import numpy as np

In [2]:
bouding_boxes_image_path = "/kaggle/input/cub2002011/CUB_200_2011/bounding_boxes.txt" #1 60.0 27.0 325.0 304.0
classes_image_path = "/kaggle/input/cub2002011/CUB_200_2011/image_class_labels.txt" # 2 002.Laysan_Albatross
image_path_file = "/kaggle/input/cub2002011/CUB_200_2011/images.txt" #1 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
train_test_file = "/kaggle/input/cub2002011/CUB_200_2011/train_test_split.txt" #1 0

file_paths = [
    image_path_file,
    train_test_file,
    bouding_boxes_image_path,
    classes_image_path,
]
# Open all files at once
files = [open(file_path, 'r') for file_path in file_paths]
# # Initialize an empty list to store the line-by-line extracted info
dataset = {}
# # Loop through the files line by line simultaneously
# max_class_names = 20

#only select 10 for each class now
try:
    while True:
        # Read one line from each file
        lines = [(file.name, file.readline().strip()) for file in files]
#         print(len(lines))
        # If any file reaches the end, break the loop
        if any(line == '' for _, line in lines):
            break
#         is_training_set = lines[0][1].split(" ")[0] == '1'
        class_name = lines[3][1].split(" ")[1]
        dataset[lines[0][1].split(" ")[1]] = {
            "is_training": True if lines[1][1].split(" ")[1] == '1' else False,
            "bounding_boxes": lines[2][1].split(" ")[1:],
            "class_name": class_name
        }

        
finally:
    # Make sure to close all files after reading
    for file in files:
        file.close()




In [3]:
### count the datasets
training_dataset_count = 0
testing_dataset_count = 0
class_counts = {}
max_count = 0
min_count = 0
for key,value in dataset.items():
    if value["is_training"] == True:
        training_dataset_count +=1
    else:
        testing_dataset_count +=1
    class_name = value["class_name"]
    if class_name in class_counts.keys():
        class_counts[class_name] += 1
    else:
        class_counts[class_name] = 0
max_class = max(class_counts, key=class_counts.get)
min_class = min(class_counts, key=class_counts.get)
print("Training count={0}".format(training_dataset_count))
print("Testing count={0}".format(testing_dataset_count))
### create embeddings for cub

Training count=5994
Testing count=5794


In [4]:
def crop_image_with_bounding_box(image_cv, bounding_boxes):
    x, y, w, h = bounding_boxes
    # Crop the image using NumPy slicing
    cropped_image = image_cv[y:y+h, x:x+w]
    return cropped_image

def read_image_return_cropped(image_path, bounding_boxes, target_size=(224, 224)):
    # Step 1: Load the image using OpenCV
    image_cv = cv2.imread(image_path)

    # Step 2: Convert BGR to RGB (OpenCV loads images in BGR by default)
    image_cv_rgb = cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB)

    # Step 3: Crop the image using the bounding box
    cropped_image_cv = crop_image_with_bounding_box(image_cv_rgb, bounding_boxes)

    # Step 4: Resize the cropped image to the target size using cv2
    cropped_image_resized = cv2.resize(cropped_image_cv, target_size)

    # Step 5: Check if the image is grayscale and convert to RGB if necessary
    if len(cropped_image_resized.shape) == 2 or cropped_image_resized.shape[2] == 1:
        cropped_image_resized = cv2.cvtColor(cropped_image_resized, cv2.COLOR_GRAY2RGB)

    # Step 6: Preprocess the image for VGG16 model (normalize pixel values)
    return cropped_image_resized

In [5]:
images_train = []
labels_train = []
images_test = []
labels_test = []
main_path = "/kaggle/input/cub2002011/CUB_200_2011/images/"
count = 0
for key,value in dataset.items():
    (x,y,w,h) = value['bounding_boxes']
    is_training = value["is_training"]
    class_label = value["class_name"]
    image_path = os.path.join(main_path, key)
    cropped_image = read_image_return_cropped(image_path, (int(float(x)), int(float(y)), int(float(w)), int(float(h))))
    if is_training: 
        images_train.append(cropped_image)
        labels_train.append(class_label)
    else:
        images_test.append(cropped_image)
        labels_test.append(class_label)

In [6]:
X_train = np.array(images_train)
y_train = np.array(labels_train)
X_temp = np.array(images_test)
y_temp = np.array(labels_test)

# Verify the shapes
print(f'Images shape: {X_train.shape}')  # (num_images, 224, 224, 3)
print(f'Labels shape: {y_train.shape}')  # (num_images,)
print(f'Images shape: {X_temp.shape}')  # (num_images, 224, 224, 3)
print(f'Labels shape: {y_temp.shape}')  # (num_images,)

Images shape: (5994, 224, 224, 3)
Labels shape: (5994,)
Images shape: (5794, 224, 224, 3)
Labels shape: (5794,)


## Data preprocessing

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# ViT expects images normalized to [-1, 1]
def vit_preprocess(images):
    return (np.array(images).astype(np.float32) / 127.5) - 1.0  # → [-1, 1]

# One-hot encode labels
one_hot_encoder = OneHotEncoder(sparse=False)
y_train = one_hot_encoder.fit_transform(np.array(labels_train).reshape(-1, 1))
y_temp = one_hot_encoder.transform(np.array(labels_test).reshape(-1, 1))

# Preprocess image data for ViT
X_train = vit_preprocess(images_train)
X_temp = vit_preprocess(images_test)

# Split test portion into validation and test sets
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    random_state=42
)

# Print shapes to verify
print(f"Train set:      {X_train.shape}, {y_train.shape}")
print(f"Test set:       {X_test.shape},  {y_test.shape}")
print(f"Validation set: {X_val.shape},  {y_val.shape}")




Train set:      (5994, 224, 224, 3), (5994, 200)
Test set:       (4635, 224, 224, 3),  (4635, 200)
Validation set: (1159, 224, 224, 3),  (1159, 200)


## Model

In [8]:
!pip install transformers



In [9]:
print(X_train.shape)  # should be (batch_size, 224, 224, 3)

(5994, 224, 224, 3)


In [12]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator

class CenterLossLayer(tf.keras.layers.Layer):
    def __init__(self, num_classes, embedding_dim, margin=1.5, alpha=0.001, **kwargs):
        super(CenterLossLayer, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.embedding_dim = embedding_dim
        self.margin = tf.cast(margin, tf.float32)
        self.alpha = alpha  # EMA smoothing factor
        self.distance = 0.1  # The length between P1 and P2

        # Initialize center_p1 randomly using a normal distribution
        self.centers_P1 = self.add_weight(name='centers_P1',
                                          shape=(num_classes, embedding_dim),
                                          initializer='random_normal',
                                          trainable=False,
                                          dtype=tf.float32)

        # Create center_p2 without initialization
        self.centers_P2 = self.add_weight(name='centers_P2',
                                          shape=(num_classes, embedding_dim),
                                          initializer='zeros',
                                          trainable=False,
                                          dtype=tf.float32)

    def build(self, input_shape):
        # Generate a random unit vector direction for center_p2
        random_direction = tf.random.normal((self.num_classes, self.embedding_dim))
        unit_vector = random_direction / tf.norm(random_direction, axis=1, keepdims=True)  # Normalize

        # Assign center_p2 to be 'distance' units away from center_p1
        center_p2_value = self.distance * unit_vector
        self.centers_P2.assign(center_p2_value)

        super(CenterLossLayer, self).build(input_shape)
    
    def call(self, inputs):
        embeddings, labels = inputs

        # Ensure embeddings and labels are float32
        embeddings = tf.cast(embeddings, tf.float32)
        labels = tf.argmax(labels, axis=-1, output_type=tf.int32)

        # Step 1: Compute the midpoint for each class (mean of embeddings for each class)
        batch_midpoint = tf.math.unsorted_segment_mean(embeddings, labels, num_segments=self.num_classes)

        # Step 2: Calculate the variance for each class
        squared_diff = tf.square(embeddings - tf.gather(batch_midpoint, labels))
        batch_variance = tf.math.unsorted_segment_mean(squared_diff, labels, num_segments=self.num_classes)

        # Step 3: Compute the standard deviation (sqrt of variance)
        batch_stddev = tf.sqrt(batch_variance)

        # Step 4: Position centers_P1 and centers_P2 around the midpoint
        # Center_P1 closer to the midpoint, Center_P2 further from the midpoint
        batch_centers_P1 = batch_midpoint - 0.5 * batch_stddev  # Center_P1 inside dense region
        batch_centers_P2 = batch_midpoint + 0.5 * batch_stddev  # Center_P2 outside dense region

        # Gather the centers corresponding to the labels
        batch_centers_P1_gathered = tf.gather(batch_centers_P1, labels)
        batch_centers_P2_gathered = tf.gather(batch_centers_P2, labels)

        # Step 5: Update centers using EMA (Exponential Moving Average)
        center_updates_P1 = tf.scatter_nd(tf.expand_dims(labels, 1),
                                          batch_centers_P1_gathered,
                                          shape=tf.shape(self.centers_P1))
        center_updates_P2 = tf.scatter_nd(tf.expand_dims(labels, 1),
                                          batch_centers_P2_gathered,
                                          shape=tf.shape(self.centers_P2))

        # EMA update for centers_P1 and centers_P2
        new_centers_P1 = self.centers_P1 * (1 - self.alpha) + center_updates_P1 * self.alpha
        new_centers_P2 = self.centers_P2 * (1 - self.alpha) + center_updates_P2 * self.alpha

        # Assign updated centers
        self.centers_P1.assign(new_centers_P1)
        self.centers_P2.assign(new_centers_P2)

        # Step 6: Compute distances to all class segments for each embedding
        distances = self.compute_distance_to_segment_all_classes(embeddings)

        # Step 7: Get the correct distances by indexing with labels
        correct_distances = tf.gather_nd(distances, tf.expand_dims(labels, axis=-1), batch_dims=1)

        # Step 8: Mask out correct class distances and find the minimum incorrect distance
        mask = tf.one_hot(labels, depth=self.num_classes, on_value=False, off_value=True)
        masked_distances = tf.where(mask, distances, tf.fill(tf.shape(distances), float('inf')))
        min_incorrect_distances = tf.reduce_min(masked_distances, axis=1)

        # Step 9: Compute the loss
        incorrect_loss = tf.maximum(0.0, self.margin - min_incorrect_distances)
        center_loss = tf.reduce_mean(tf.square(correct_distances))

        return center_loss + tf.reduce_mean(incorrect_loss)


    def compute_distance_to_segment_all_classes(self, embeddings):
        """
        Compute the Euclidean distance from each embedding to the nearest point on the line segment
        defined by P1 and P2 for each class.
        """
        # Get P1 and P2 for all classes
        P1 = self.centers_P1
        P2 = self.centers_P2

        # Vector from P1 to P2 for all classes
        P1_P2 = P2 - P1

        # Expand dims for broadcasting
        P1 = tf.expand_dims(P1, axis=0)
        P2 = tf.expand_dims(P2, axis=0)
        P1_P2 = tf.expand_dims(P1_P2, axis=0)
        embeddings = tf.expand_dims(embeddings, axis=1)

        # Vector from P1 to the embeddings
        P1_emb = embeddings - P1

        # Project embeddings onto the line segment
        proj = tf.reduce_sum(P1_emb * P1_P2, axis=2, keepdims=True) / tf.maximum(tf.reduce_sum(P1_P2 ** 2, axis=2, keepdims=True), 1e-8)

        # Clamp projection to the range [0, 1] to restrict to the segment
        proj_clamped = tf.clip_by_value(proj, 0.0, 1.0)

        # Compute the nearest point on the line segment
        nearest_point = P1 + proj_clamped * P1_P2

        # Compute the Euclidean distance to the nearest point on the segment
        distances = tf.norm(embeddings - nearest_point, axis=2)

        return distances
        
import tensorflow as tf
from tensorflow.keras import layers, models
from transformers import TFViTModel

class ViTCLSLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.vit = TFViTModel.from_pretrained("WinKawaks/vit-small-patch16-224")
        self.vit.trainable = False

    def call(self, inputs):
        # Convert NHWC to NCHW for ViT (if required)
        inputs_nchw = tf.transpose(inputs, perm=[0, 3, 1, 2])
        outputs = self.vit(pixel_values=inputs_nchw, training=False)
        return outputs.last_hidden_state[:, 0, :]


def build_vit_center_loss(input_shape, num_classes, embedding_dim, dropout_rate, weight_decay):
    # Inputs
    inputs = tf.keras.Input(shape=input_shape, name='input')
    labels_input = tf.keras.Input(shape=(num_classes,), name='labels_input')  # assuming one-hot labels

    # Normalize to [-1, 1]
    # x = tf.keras.layers.Rescaling(scale=1./127.5, offset=-1)(inputs)
    x = inputs

    # ViT feature extractor (CLS token)
    x = ViTCLSLayer(name='vit_cls_token')(x)

    # Optional projection to embedding space
    if embedding_dim != 768:
        x = layers.Dense(embedding_dim, activation='swish')(x)

    # MLP head with regularization and dropout
    x = layers.Dense(1024, activation='swish',
                     kernel_regularizer=regularizers.l2(weight_decay),
                     bias_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(dropout_rate)(x)

    x = layers.Dense(512, activation='swish',
                     kernel_regularizer=regularizers.l2(weight_decay),
                     bias_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(dropout_rate)(x)

    x = layers.Dense(512, activation='swish',
                     kernel_regularizer=regularizers.l2(weight_decay),
                     bias_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(dropout_rate)(x)

    x = layers.Dense(1024, activation='swish',
                     kernel_regularizer=regularizers.l2(weight_decay),
                     bias_regularizer=regularizers.l2(0.01))(x)

    # Final projection to embedding space (again)
    x = layers.Dense(embedding_dim, activation='swish')(x)

    # Embedding model
    embedding_model = models.Model(inputs=inputs, outputs=x, name='vit_embedding_model')
    embedding = embedding_model(inputs)

    # Classification head
    logits = layers.Dense(num_classes, activation='softmax', name='classification_layer')(embedding)

    # Center loss branch
    center_loss_output = CenterLossLayer(num_classes=num_classes, embedding_dim=embedding_dim)([embedding, labels_input])

    # Full model
    full_model = models.Model(
        inputs=[inputs, labels_input],
        outputs=[logits, center_loss_output],
        name='full_model_vit'
    )

    return embedding_model, full_model




def train_and_evaluate(model, train_generator, val_generator, steps_per_epoch, validation_steps, epochs, center_loss_weight, learning_rate):
    optimizer = tf.keras.optimizers.RMSprop(learning_rate)

    history = {
        "train_loss": [],
        "train_class_loss": [],
        "train_center_loss": [],
        "train_acc": [],
        "val_class_loss": [],
        "val_center_loss": [],
        "val_acc": []
    }

    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits, center_loss = model([inputs, labels], training=True)
            classification_loss = tf.keras.losses.CategoricalCrossentropy()(labels, logits)
            total_loss = classification_loss + center_loss_weight * center_loss

        gradients = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        predictions = tf.argmax(logits, axis=-1)
        labels_true = tf.argmax(labels, axis=-1)
        train_acc = tf.reduce_mean(tf.cast(tf.equal(predictions, labels_true), tf.float32))

        return total_loss, classification_loss, center_loss, train_acc

    @tf.function
    def eval_step(inputs, labels):
        logits, center_loss = model([inputs, labels], training=False)
        classification_loss = tf.keras.losses.CategoricalCrossentropy()(labels, logits)

        predictions = tf.argmax(logits, axis=-1)
        labels_true = tf.argmax(labels, axis=-1)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels_true), tf.float32))

        return classification_loss, center_loss, accuracy

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        # Training loop
        epoch_loss, epoch_class_loss, epoch_center_loss, epoch_acc = 0, 0, 0, 0
        for step in range(steps_per_epoch):
            inputs_batch, labels_batch = next(train_generator)
            loss, class_loss, center_loss, acc = train_step(inputs_batch, labels_batch)
            epoch_loss += loss
            epoch_class_loss += class_loss
            epoch_center_loss += center_loss
            epoch_acc += acc

        epoch_loss /= steps_per_epoch
        epoch_class_loss /= steps_per_epoch
        epoch_center_loss /= steps_per_epoch
        epoch_acc /= steps_per_epoch

        print(f"Train Loss: {epoch_loss:.4f}, Class Loss: {epoch_class_loss:.4f}, Center Loss: {epoch_center_loss:.4f}, Acc: {epoch_acc:.4f}")

        history["train_loss"].append(epoch_loss)
        history["train_class_loss"].append(epoch_class_loss)
        history["train_center_loss"].append(epoch_center_loss)
        history["train_acc"].append(epoch_acc)

        # Validation loop
        val_class_loss, val_center_loss, val_acc = 0, 0, 0
        for step in range(validation_steps):
            inputs_batch, labels_batch = next(val_generator)
            class_loss, center_loss, acc = eval_step(inputs_batch, labels_batch)
            val_class_loss += class_loss
            val_center_loss += center_loss
            val_acc += acc

        val_class_loss /= validation_steps
        val_center_loss /= validation_steps
        val_acc /= validation_steps

        print(f"Val Class Loss: {val_class_loss:.4f}, Val Center Loss: {val_center_loss:.4f}, Val Acc: {val_acc:.4f}")

        history["val_class_loss"].append(val_class_loss)
        history["val_center_loss"].append(val_center_loss)
        history["val_acc"].append(val_acc)

    return history

# Initialize ImageDataGenerator with augmentation options (without rescaling)
train_datagen = ImageDataGenerator(
    rotation_range=20,           # Randomly rotate images by 20 degrees
    width_shift_range=0.2,       # Randomly shift images horizontally
    height_shift_range=0.2,      # Randomly shift images vertically
    shear_range=0.2,             # Shear transformation
    zoom_range=0.2,              # Zoom in/out
    horizontal_flip=True,        # Random horizontal flipping
    fill_mode='nearest'          # Filling pixels after transformations
)

# Example usage with specified values
input_shape = (224, 224, 3)        # Input shape for images (224x224 RGB)
num_classes = y_train.shape[1]                  # Number of classes in the dataset
embedding_dim = 1000              # Dimensionality of the embedding space
dropout_rate = 0.2               # Dropout rate for regularization
weight_decay = 0.05 #0.005            # L2 regularization weight
center_loss_weight = 0.01 #0.0001          # Weight for center loss
learning_rate = 1e-4               # Learning rate for the optimizer
batch_size = 64                    # Batch size for training
epochs = 40                      # Number of epochs to train

val_datagen = ImageDataGenerator()  # No additional augmentations for validation

# Load and augment training data
train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)

# Load validation data
val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)

steps_per_epoch = len(X_train) // batch_size
validation_steps = len(X_val)  // batch_size


In [13]:
# Build the model using VGG19 and center loss
embedding_model, full_model = build_vit_center_loss(input_shape, num_classes, embedding_dim, dropout_rate, weight_decay)
# Train the model using data generators
history = train_and_evaluate(full_model, train_generator, val_generator, steps_per_epoch, validation_steps, epochs, center_loss_weight, learning_rate)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing TFViTModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFViTModel were not initialized from the PyTorch model and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/40
Train Loss: 5.1037, Class Loss: 5.0160, Center Loss: 8.7742, Acc: 0.1742
Val Class Loss: 4.3008, Val Center Loss: 16.7388, Val Acc: 0.4766
Epoch 2/40
Train Loss: 4.0659, Class Loss: 3.5208, Center Loss: 54.5073, Acc: 0.4497
Val Class Loss: 2.5190, Val Center Loss: 56.5722, Val Acc: 0.6395
Epoch 3/40
Train Loss: 3.1405, Class Loss: 2.3567, Center Loss: 78.3792, Acc: 0.6094
Val Class Loss: 1.8530, Val Center Loss: 58.5822, Val Acc: 0.7325
Epoch 4/40
Train Loss: 2.5732, Class Loss: 1.7831, Center Loss: 79.0147, Acc: 0.7049
Val Class Loss: 1.3677, Val Center Loss: 61.0172, Val Acc: 0.7777
Epoch 5/40
Train Loss: 2.1933, Class Loss: 1.4490, Center Loss: 74.4229, Acc: 0.7570
Val Class Loss: 1.1664, Val Center Loss: 56.3721, Val Acc: 0.8053
Epoch 6/40
Train Loss: 1.9122, Class Loss: 1.2171, Center Loss: 69.5045, Acc: 0.7983
Val Class Loss: 0.9969, Val Center Loss: 49.5370, Val Acc: 0.8297
Epoch 7/40
Train Loss: 1.7101, Class Loss: 1.0683, Center Loss: 64.1839, Acc: 0.8138
Val Class 

## Modal evaluation

In [16]:
# Save the history to a .pkl file
import pickle
with open('/kaggle/working/cub_history_standard_vit.pkl', 'wb') as file:
    pickle.dump(history, file)

# Save the embedding model
embedding_model.save('/kaggle/working/cub_embedding_model_standard_vit.keras')

# Save the full model
full_model.save('/kaggle/working/cub_full_model_standard_vit.keras')

In [18]:
# Load the embedding model
import tensorflow as tf
embedding_model_loaded = tf.keras.models.load_model('/kaggle/working/cub_embedding_model_standard_vit.keras')

TypeError: <class 'keras.src.models.functional.Functional'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras.src.models.functional', 'class_name': 'Functional', 'config': {'name': 'vit_embedding_model', 'trainable': True, 'layers': [{'module': 'keras.layers', 'class_name': 'InputLayer', 'config': {'batch_shape': [None, 224, 224, 3], 'dtype': 'float32', 'sparse': False, 'name': 'input'}, 'registered_name': None, 'name': 'input', 'inbound_nodes': []}, {'module': None, 'class_name': 'ViTCLSLayer', 'config': {'name': 'vit_cls_token', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}, 'registered_name': 'ViTCLSLayer', 'build_config': {'input_shape': [None, 224, 224, 3]}, 'name': 'vit_cls_token', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 224, 224, 3], 'dtype': 'float32', 'keras_history': ['input', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_6', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 1000, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 384]}, 'name': 'dense_6', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 384], 'dtype': 'float32', 'keras_history': ['vit_cls_token', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_7', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 1024, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.05}, 'registered_name': None}, 'bias_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.01}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1000]}, 'name': 'dense_7', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 1000], 'dtype': 'float32', 'keras_history': ['dense_6', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dropout', 'config': {'name': 'dropout_3', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'rate': 0.2, 'seed': None, 'noise_shape': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1024]}, 'name': 'dropout_3', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 1024], 'dtype': 'float32', 'keras_history': ['dense_7', 0, 0]}}], 'kwargs': {'training': False}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_8', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 512, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.05}, 'registered_name': None}, 'bias_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.01}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1024]}, 'name': 'dense_8', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 1024], 'dtype': 'float32', 'keras_history': ['dropout_3', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dropout', 'config': {'name': 'dropout_4', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'rate': 0.2, 'seed': None, 'noise_shape': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dropout_4', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 512], 'dtype': 'float32', 'keras_history': ['dense_8', 0, 0]}}], 'kwargs': {'training': False}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_9', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 512, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.05}, 'registered_name': None}, 'bias_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.01}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dense_9', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 512], 'dtype': 'float32', 'keras_history': ['dropout_4', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dropout', 'config': {'name': 'dropout_5', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'rate': 0.2, 'seed': None, 'noise_shape': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dropout_5', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 512], 'dtype': 'float32', 'keras_history': ['dense_9', 0, 0]}}], 'kwargs': {'training': False}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_10', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 1024, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.05}, 'registered_name': None}, 'bias_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.01}, 'registered_name': None}, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 512]}, 'name': 'dense_10', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 512], 'dtype': 'float32', 'keras_history': ['dropout_5', 0, 0]}}], 'kwargs': {}}]}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_11', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 136934670513168}, 'units': 1000, 'activation': 'silu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 1024]}, 'name': 'dense_11', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 1024], 'dtype': 'float32', 'keras_history': ['dense_10', 0, 0]}}], 'kwargs': {}}]}], 'input_layers': [['input', 0, 0]], 'output_layers': [['dense_11', 0, 0]]}, 'registered_name': 'Functional', 'build_config': {'input_shape': None}}.

Exception encountered: Could not locate class 'ViTCLSLayer'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'ViTCLSLayer', 'config': {'name': 'vit_cls_token', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None}}, 'registered_name': 'ViTCLSLayer', 'build_config': {'input_shape': [None, 224, 224, 3]}, 'name': 'vit_cls_token', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 224, 224, 3], 'dtype': 'float32', 'keras_history': ['input', 0, 0]}}], 'kwargs': {}}]}

In [19]:
from sklearn.metrics import normalized_mutual_info_score
import numpy as np
from sklearn.cluster import KMeans

def evaluation(X, Y, Kset):
    num = X.shape[0]
    classN = np.max(Y) + 1
    kmax = np.max(Kset)
    recallK = np.zeros(len(Kset))
    
    # Compute NMI using KMeans clustering
    kmeans = KMeans(n_clusters=classN).fit(X)
    nmi = normalized_mutual_info_score(Y, kmeans.labels_, average_method='arithmetic')
    
    # Compute Recall@K
    sim = X.dot(X.T)
    minval = np.min(sim) - 1.
    sim -= np.diag(np.diag(sim))
    sim += np.diag(np.ones(num) * minval)
    indices = np.argsort(-sim, axis=1)[:, :kmax]
    YNN = Y[indices]
    
    for i in range(len(Kset)):
        pos = 0.
        for j in range(num):
            if Y[j] in YNN[j, :Kset[i]]:
                pos += 1.
        recallK[i] = pos / num
    
    return nmi, recallK

def calculate_metrics(embedding_model, val_data, labels, k_values):
    # Generate embeddings for the validation set
    embeddings = embedding_model.predict(val_data)
    
    # Convert labels to the appropriate format
    labels = np.argmax(labels, axis=1)  # Assuming labels are one-hot encoded
    
    # Calculate NMI and Recall@K
    nmi_score, recall_scores = evaluation(embeddings, labels, k_values)
    
    return recall_scores, nmi_score

In [20]:
k_values = [1, 2, 4, 8, 16, 32, 64, 128]
recall_scores_test, nmi_score_test = calculate_metrics(embedding_model, X_test, y_test, k_values)

I0000 00:00:1746968005.512024     103 service.cc:148] XLA service 0x7c8708033650 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1746968005.512789     103 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1746968005.512810     103 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m  3/145[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 95ms/step

I0000 00:00:1746968007.688568     103 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 139ms/step




In [21]:
print(recall_scores_test, nmi_score_test)

[0.85738943 0.90226537 0.92729234 0.94886731 0.96591154 0.98360302
 0.99007551 0.99395901] 0.9309972836880591
