<a href="https://colab.research.google.com/github/MarshaGomez/DNN-Sketches-image-analysis/blob/main/Code/lsh_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Setting up the project

In [None]:
# Mount Google Drive under /content/gdrive (force_remount=True re-mounts even if it is already mounted).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import numpy as np
import time
import itertools
import os, shutil
import tensorflow as tf
import matplotlib.pyplot as plt

from random import random
from ipywidgets import Image
from numpy.linalg import norm
from IPython.display import display
from keras.models import Model
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping, ModelCheckpoint

# tensorflow version 2.4.0
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input, InceptionV3

In [None]:
# Copy the zipped dataset from the shared drive into the working directory,
# extract it quietly (-q) and remove the local archive afterwards.
zip_path = '/content/gdrive/Shareddrives/COMPUTER_VISION/MIM_zipped.zip'
!cp "{zip_path}" .
!unzip -q MIM_zipped.zip
!rm MIM_zipped.zip

In [None]:
# Per-class split sizes: the test share is fixed, the train share is what remains out of 80.
test_images_per_class = 20 # each class has at most 80 images
train_images_per_class = 80 - test_images_per_class

In [None]:
# Dataset locations used by the rest of the notebook.
BASE_DIR = "/content/content/gdrive/Shareddrives/COMPUTER_VISION/MIRCV"
SKETCHES_DIR = os.path.join(BASE_DIR, "sketches")
MIRFLICKR_DIR = os.path.join(BASE_DIR, "mirflickr/mirflickr25k")

# Show the entries of the sketches "png" folder (the output below lists class folders plus filelist.txt).
print(os.listdir(os.path.join(SKETCHES_DIR, "png")))
def copytree(src, dst, symlinks=False, ignore=None):
    """Copy every entry of directory ``src`` into directory ``dst``.

    Sub-directories are copied recursively with ``shutil.copytree`` (which
    receives ``symlinks`` and ``ignore`` unchanged); plain files are copied
    with ``shutil.copy2``.

    Args:
        src: Path of the directory whose content is copied.
        dst: Path of the destination directory; created if it is missing.
        symlinks: Forwarded to ``shutil.copytree`` for sub-directories.
        ignore: Forwarded to ``shutil.copytree`` for sub-directories.

    Raises:
        OSError: If ``src`` cannot be listed or an entry cannot be copied
            (e.g. a sub-directory already exists in ``dst``).
    """
    # shutil.copy2 does not create missing parent directories, so a file that
    # is listed before any sub-directory would fail when dst does not exist.
    os.makedirs(dst, exist_ok=True)
    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)
      
# Keep a full copy of the sketches under "all" before the folder is split.
copytree(f'{SKETCHES_DIR}/png', f'{SKETCHES_DIR}/all')

# "png" becomes the training folder. Only filesystem errors (e.g. the folder
# was already renamed) are tolerated; anything else should surface.
try:
  os.rename(f'{SKETCHES_DIR}/png', f'{SKETCHES_DIR}/train')
except OSError:
  print('already renamed')

# Build the test split once: move the first `test_images_per_class` listed
# images of every class folder from train/ to test/.
# NOTE(review): os.listdir order is not guaranteed, so which images end up in
# the test split may differ between machines — confirm whether that matters.
if not os.path.exists(f'{SKETCHES_DIR}/test'):
  os.mkdir(f'{SKETCHES_DIR}/test')
  for class_folder in os.listdir(f'{SKETCHES_DIR}/train'):
    if os.path.isdir(f'{SKETCHES_DIR}/train/{class_folder}'):
      images = os.listdir(f'{SKETCHES_DIR}/train/{class_folder}')
      os.mkdir(f'{SKETCHES_DIR}/test/{class_folder}')
      # Slicing instead of indexing avoids an IndexError for classes that
      # contain fewer than `test_images_per_class` images.
      for image_name in images[:test_images_per_class]:
        os.rename(f'{SKETCHES_DIR}/train/{class_folder}/{image_name}', f'{SKETCHES_DIR}/test/{class_folder}/{image_name}')

['rooster', 'head-phones', 'door handle', 'flashlight', 'panda', 'bottle opener', 'revolver', 'binoculars', 'piano', 'lightbulb', 'saxophone', 'snowboard', 'giraffe', 'filelist.txt', 'scissors', 'rabbit', 'pear', 'ship', 'helicopter', 'van', 'barn', 'tiger', 'dragon', 'ice-cream-cone', 'wheel', 'dolphin', 'cell phone', 'tree', 'squirrel', 'speed-boat', 'wheelbarrow', 'screwdriver', 'rifle', 'arm', 'bee', 'house', 'penguin', 'blimp', 'pretzel', 'motorbike', 'mosquito', 'hourglass', 'moon', 'spoon', 'pig', 'pizza', 'rollerblades', 'ant', 'cabinet', 'car (sedan)', 'book', 'bed', 'sailboat', 'candle', 'tooth', 'train', 'strawberry', 'palm tree', 'streetlight', 'present', 'angel', 'envelope', 'table', 'eye', 'bathtub', 'lion', 'fan', 'crocodile', 'castle', 'keyboard', 'crown', 'flying bird', 'human-skeleton', 'octopus', 'laptop', 'nose', 'parrot', 'umbrella', 'bookshelf', 'sponge bob', 'satellite', 'couch', 'bicycle', 'stapler', 'kangaroo', 'door', 'ear', 'mug', 'hot-dog', 'suv', 'fish', 'k

In [None]:
# Same dataset paths as in the setup cell, re-declared here (this cell does not touch the filesystem).
BASE_DIR = "/content/content/gdrive/Shareddrives/COMPUTER_VISION/MIRCV"
SKETCHES_DIR = os.path.join(BASE_DIR, "sketches")
MIRFLICKR_DIR = os.path.join(BASE_DIR, "mirflickr/mirflickr25k")

# LSH definition

In [None]:
class LSH:
  """Locality-sensitive-hashing index over fixed-size feature vectors.

  The index holds ``g`` hash tables; each table key is built from ``h`` hash
  values computed with random projections. A vector is stored once per table,
  in the bucket named ``"<table index>_<h comma-separated hash values>"``.

  Attributes:
    x: Random projection vectors, shape ``(g, h, feature_dim)``.
    w: Bucket width, shape ``(g, h, 1)`` (only used when ``bitwise_hash`` is False).
    b: Random offsets in ``[0, w)``, shape ``(g, h, 1)`` (same restriction).
    bitwise_hash: When True each hash value is 0/1 (sign of the projection).
  """

  def __init__(self, feature_dim, g = 10, h = 20, w = 4, bitwise_hash = False):
    """Draw the random hash parameters and start with an empty index.

    TODO: find a way to load a previously stored index, if one exists, instead
    of always initialising an empty structure.
    """
    self._index = {}
    # Both hashing modes draw their projections from a normal distribution.
    self.x = np.random.normal(size=(g, h, feature_dim))
    self.w = np.ones((g, h, 1)) * w
    self.b = np.random.rand(g, h, 1) * w
    self.bitwise_hash = bitwise_hash # alternative hash: every h value is only 0 or 1

  def _hash(self, features):
    """Hash several vectors at once.

    Args:
      features: 2-D array with one feature vector per row, shape ``(n, feature_dim)``.

    Returns:
      Array of strings with shape ``(g, n, h)``: for every table and every
      input row, the ``h`` hash values.
    """
    if self.bitwise_hash:
      return (np.transpose(np.dot(self.x, features.T), (0,2,1)) > 0).astype(int).astype(str)
    return np.transpose(np.trunc(((np.dot(self.x, features.T) + self.b) / self.w)), (0,2,1)).astype(int).astype(str)

  @staticmethod
  def _select_best(scores, limit, largest):
    """Return the (unordered) positions of the ``limit`` best scores.

    "Best" means largest when ``largest`` is True, smallest otherwise. When
    fewer than ``limit`` scores exist, all positions are returned.
    """
    count = scores.shape[0]
    if limit <= 0:
      return np.arange(0)
    if count <= limit:
      return np.arange(count)
    # argpartition only separates the array around the pivot; the full sort is
    # done once at the end of query().
    if largest:
      return np.argpartition(scores, count - limit)[count - limit:]
    return np.argpartition(scores, limit - 1)[:limit]

  def insert(self, features, ids, labels):
    """Add vectors to the index; may be called more than once.

    Args:
      features: 2-D array, one feature vector per row.
      ids: Sequence of identifiers, one per row of ``features``.
      labels: Sequence of labels, one per row of ``features``.

    An id that is already present in a bucket is not stored there again.

    Raises:
      AssertionError: If ``ids``/``labels`` lengths do not match ``features``.
    """
    assert features.shape[0] == len(ids), "mismatch between ids length and features"
    assert len(labels) == len(ids), "mismatch between ids length and labels"
    hashes = self._hash(features)

    touched = set()
    for g_index, g_function in enumerate(hashes):
      # `position` is the row of the input the hash belongs to; using enumerate
      # keeps hash rows and input rows aligned even when a duplicate is skipped.
      for position, row in enumerate(g_function):
        bucket_id = str(g_index) + '_' + ','.join(row)
        bucket = self._index.get(bucket_id)
        if bucket is None:
          bucket = self._index[bucket_id] = {'features': [], 'ids': [], 'labels': []}
        elif bucket_id not in touched:
          # Buckets left by a previous insert() hold arrays; switch back to
          # lists so that appending stays cheap during this call.
          for field in ('features', 'ids', 'labels'):
            bucket[field] = list(bucket[field])
        touched.add(bucket_id)

        if ids[position] in bucket['ids']:
          continue  # already stored in this bucket
        bucket['features'].append(features[position])
        bucket['ids'].append(ids[position])
        bucket['labels'].append(labels[position])

    # query() relies on array indexing, so every modified bucket is frozen
    # back into arrays.
    for bucket_id in touched:
      bucket = self._index[bucket_id]
      for field in ('features', 'ids', 'labels'):
        bucket[field] = np.array(bucket[field])

  def query(self, features, top_k, mode = 'euclidean', return_cost = False):
    """Return the best ``top_k`` stored items among the buckets hit by the query.

    Args:
      features: 1-D query feature vector.
      top_k: Maximum number of results.
      mode: ``'euclidean'`` (smaller distance is better) or ``'similarity'``
        (cosine similarity, larger is better).
      return_cost: When True, also return the number of distance/similarity
        computations performed.

    Returns:
      A dict with keys ``'ids'``, ``'labels'`` and ``'distances'`` (or
      ``'similarities'``), sorted best first and holding at most ``top_k``
      entries; ``None`` when no bucket matched. With ``return_cost`` the
      result is the tuple ``(dict_or_None, cost)``.

    Raises:
      AssertionError: If ``mode`` is not one of the two supported values.
    """
    assert mode in ['similarity', 'euclidean'], "mode must be similarity or euclidean"
    by_similarity = mode == 'similarity'
    score_key = 'similarities' if by_similarity else 'distances'
    query_vector = np.array(features)
    hashes = self._hash(np.array([features]))

    best = None
    cost = 0
    for g_index, g_function in enumerate(hashes):
      for row in g_function:
        bucket = self._index.get(str(g_index) + '_' + ','.join(row))
        if bucket is None:
          continue

        cand_features = bucket['features']
        cand_ids = np.ravel(bucket['ids'])
        cand_labels = np.ravel(bucket['labels'])
        if best is not None:
          # The same item lives in one bucket per table, so drop the ones
          # that are already part of the current result.
          unseen = ~np.isin(cand_ids, best['ids'])
          if not unseen.any():
            continue
          cand_features = cand_features[unseen]
          cand_ids = cand_ids[unseen]
          cand_labels = cand_labels[unseen]

        if by_similarity:
          scores = np.sum(cand_features * query_vector, axis=1) / (norm(cand_features, axis=1) * norm(np.array([features]), axis=1))
        else:
          scores = norm(cand_features - query_vector, axis=1)
        cost += scores.size

        if best is not None:
          current = best[score_key]
          if 0 < top_k <= current.shape[0]:
            # The result is full: only candidates that beat the current worst
            # entry can change it.
            if by_similarity:
              improving = scores > np.min(current)
            else:
              improving = scores < np.max(current)
            if not improving.any():
              continue
            scores = scores[improving]
            cand_ids = cand_ids[improving]
            cand_labels = cand_labels[improving]
          scores = np.concatenate((current, scores))
          cand_ids = np.concatenate((best['ids'], cand_ids))
          cand_labels = np.concatenate((best['labels'], cand_labels))

        keep = self._select_best(scores, top_k, by_similarity)
        best = {'ids': cand_ids[keep], 'labels': cand_labels[keep], score_key: scores[keep]}

    if best is None:
      # No bucket matched. The reported cost is 1, presumably so that callers
      # dividing by the cost never divide by zero — confirm with the callers.
      if return_cost:
        return (None, 1)
      return None

    # Only now are the surviving entries fully sorted, best first.
    order = np.argsort(best[score_key])
    if by_similarity:
      order = order[::-1]
    result = {'ids': best['ids'][order], 'labels': best['labels'][order], score_key: best[score_key][order]}
    if return_cost:
      return (result, cost)
    return result

  def store(self):
    """Placeholder for persisting the index; not implemented."""
    pass


## No Index

In [None]:
# Useful to have comparable performance
class NO_INDEX:
  """Brute-force search over all stored vectors.

  Exposes the same ``insert``/``query`` interface as the LSH index so that
  both can be evaluated with the same code and compared.
  """

  def __init__(self):
    self.features = None
    self.ids = None
    self.labels = None

  def insert(self, features, ids, labels):
    """Replace the stored collection with the given vectors.

    Args:
      features: 2-D array, one feature vector per row.
      ids: Sequence of identifiers, one per row of ``features``.
      labels: Sequence of labels, one per row of ``features``.

    Raises:
      AssertionError: If ``ids``/``labels`` lengths do not match ``features``.
    """
    assert features.shape[0] == len(ids), "mismatch between ids length and features"
    assert len(labels) == len(ids), "mismatch between ids length and labels"
    self.features = features
    # query() uses array indexing on ids and labels.
    self.ids = np.asarray(ids)
    self.labels = np.asarray(labels)

  def query(self, features, top_k, mode = 'euclidean', return_cost = False):
    """Return the best ``top_k`` stored items for the query vector.

    Args:
      features: 1-D query feature vector.
      top_k: Maximum number of results.
      mode: ``'euclidean'`` (smaller distance is better) or ``'similarity'``
        (cosine similarity, larger is better).
      return_cost: When True, also return the number of comparisons, which is
        always the number of stored vectors.

    Returns:
      A dict with keys ``'ids'``, ``'labels'`` and ``'distances'`` (or
      ``'similarities'``), sorted best first and holding at most ``top_k``
      entries. With ``return_cost`` the result is ``(dict, cost)``.

    Raises:
      AssertionError: If ``mode`` is not one of the two supported values.
    """
    assert mode in ['similarity', 'euclidean'], "mode must be similarity or euclidean"
    by_similarity = mode == 'similarity'
    if by_similarity:
      score_key = 'similarities'
      scores = np.sum(self.features * np.array(features), axis=1) / (norm(self.features, axis=1) * norm(np.array([features]), axis=1))
    else:
      score_key = 'distances'
      scores = norm(self.features - np.array(features), axis=1)

    count = scores.shape[0]
    # argpartition only splits around the pivot; the survivors are fully
    # sorted afterwards.
    if top_k >= count:
      keep = np.arange(count)
    elif top_k <= 0:
      keep = np.arange(0)
    elif by_similarity:
      keep = np.argpartition(scores, count - top_k)[count - top_k:]
    else:
      keep = np.argpartition(scores, top_k - 1)[:top_k]

    kept_scores = scores[keep]
    order = np.argsort(kept_scores)
    if by_similarity:
      order = order[::-1]

    k = {}
    k['ids'] = np.ravel(self.ids[keep])[order]
    k['labels'] = np.ravel(self.labels[keep])[order]
    k[score_key] = kept_scores[order]
    if return_cost:
      return (k, scores.size)
    return k
    

## Evaluation utils

In [None]:
def average_precision(requested_label, result_labels, n_ground_truth = 80):
  """Average precision of one ranked result list.

  Args:
    requested_label: Label that was searched for.
    result_labels: Array with the labels of the results, best first.
    n_ground_truth: Number of items that carry the requested label; used as
      the normalisation factor.

  Returns:
    Sum of precision@rank over the ranks holding a correct result, divided by
    ``n_ground_truth``.
  """
  hits = (requested_label == result_labels).astype(int)
  ranks = np.arange(1, hits.shape[0] + 1)
  # precision@rank = correct results seen so far / rank
  precision_at_rank = np.cumsum(hits) / ranks
  # multiplying by `hits` keeps only the ranks where the result is correct
  return np.sum(precision_at_rank * hits) / n_ground_truth


def mAP(index, features, n_queries = 250, n_labels = 250, img_per_labels = 80, mode = 'euclidean', check_first_label=True, n_ground_truth=None):
  """Mean average precision of ``index`` over ``n_queries`` random queries.

  Query ``i`` targets label ``i % n_labels`` and picks a random row inside the
  ``i``-th group of ``img_per_labels`` consecutive rows of ``features``
  (wrapping around after ``n_labels`` groups).

  Args:
    index: Object exposing ``query(features, top_k, mode=...)``.
    features: Array of query vectors, grouped as described above.
    n_queries: Number of queries to run.
    n_labels: Number of distinct labels.
    img_per_labels: Number of consecutive rows per label in ``features``.
    mode: Passed through to ``index.query``.
    check_first_label: When True, assert that the first result has the
      query's label.
    n_ground_truth: Requested result count and AP normaliser; defaults to
      ``img_per_labels``.

  Returns:
    The sum of the per-query average precisions divided by ``n_queries``
    (queries without results count as 0).
  """
  if not n_ground_truth:
    n_ground_truth = img_per_labels
  total_images = n_labels * img_per_labels
  precision_total = 0
  for query_number in range(n_queries):
    label = query_number % n_labels
    offset = int(random() * img_per_labels)
    image_idx = ((query_number * img_per_labels) + offset) % total_images
    res = index.query(features[image_idx], n_ground_truth, mode = mode)
    if res is None:
      # an empty result contributes nothing to the total
      continue
    if check_first_label:
      assert res['labels'][0] == label, 'deve essere della stessa label'
    precision_total += average_precision(label, res['labels'], n_ground_truth)
  return precision_total / n_queries

def ie(cost_with_index, cost_no_index=45000):
  """Index efficiency: cost of a scan without index divided by the cost with index.

  The default of 45000 is the number of images (sketches plus distractors)
  when all features are used.
  """
  efficiency = cost_no_index / cost_with_index
  return efficiency


def mAP_with_ie(index, features, n_queries = 250, n_labels = 250, img_per_labels = 80, mode = 'euclidean', check_first_label=True, n_ground_truth=None):
  """Mean average precision and mean index efficiency over random queries.

  Queries are chosen exactly as in ``mAP``: query ``i`` targets label
  ``i % n_labels`` and uses a random row of the ``i``-th group of
  ``img_per_labels`` consecutive rows of ``features``.

  Args:
    index: Object exposing ``query(features, top_k, mode=..., return_cost=True)``.
    features: Array of query vectors grouped by label.
    n_queries: Number of queries to run.
    n_labels: Number of distinct labels.
    img_per_labels: Number of consecutive rows per label in ``features``.
    mode: Passed through to ``index.query``.
    check_first_label: When True, assert that the first result has the
      query's label.
    n_ground_truth: Requested result count and AP normaliser; defaults to
      ``img_per_labels``.

  Returns:
    ``{'mAP': ..., 'ie': ...}``, both averaged over ``n_queries``.
  """
  if not n_ground_truth:
    n_ground_truth = img_per_labels
  total_images = n_labels * img_per_labels
  # NOTE(review): the efficiency baseline is n_ground_truth * n_labels, which
  # does not include distractor images — confirm this is the intended baseline.
  baseline_cost = n_ground_truth * n_labels
  precision_total = 0
  efficiency_total = 0
  for query_number in range(n_queries):
    label = query_number % n_labels
    offset = int(random() * img_per_labels)
    image_idx = ((query_number * img_per_labels) + offset) % total_images
    res, cost = index.query(features[image_idx], n_ground_truth, mode = mode, return_cost = True)
    if res is not None:
      if check_first_label:
        assert res['labels'][0] == label, 'deve essere della stessa label'
      precision_total += average_precision(label, res['labels'], n_ground_truth)
    # the efficiency is accumulated for every query, with or without results
    efficiency_total += ie(cost, baseline_cost)
  return { 'mAP': precision_total / n_queries, 'ie': efficiency_total / n_queries }


# Model loading and features extraction

In [None]:
def extract_features(extractor, generator, sample_count, dim=2048):
  """Run ``extractor`` over the first ``sample_count`` samples of ``generator``.

  Args:
    extractor: Object with a ``predict(batch)`` method returning one feature
      row per input sample.
    generator: Iterable yielding ``(inputs_batch, labels_batch)`` pairs with
      one-hot ``labels_batch``, and exposing ``labels`` (class index per
      sample, in iteration order).
    sample_count: Number of samples to extract.
    dim: Size of every feature vector.

  Returns:
    Array of shape ``(sample_count, dim)``; rows that the generator never
    produced stay zero.

  Raises:
    AssertionError: If the labels of a batch do not match ``generator.labels``
      at the same positions (the generator was not at its start or shuffles).
  """
  features = np.zeros((sample_count, dim)) # extractor output shape
  if sample_count <= 0:
    return features

  filled = 0
  for inputs_batch, labels_batch in generator:
    features_batch = np.asarray(extractor.predict(inputs_batch))
    # Use the real batch length instead of a global batch size, and never
    # write past the requested number of samples.
    take = min(features_batch.shape[0], sample_count - filled)
    end = filled + take
    features[filled:end, :] = features_batch[:take]
    assert np.array_equal(np.argmax(labels_batch, axis = 1)[:take], generator.labels[filled:end]), 'LABELS NOT CORRESPONDING REINIZIALIZE GENERATOR'
    filled = end
    # Generators such as the Keras ones loop forever, so stop explicitly.
    if filled >= sample_count:
      break

  return features

In [None]:
# Input size and batch size shared by all generators below.
IMG_HEIGHT = 299
IMG_WIDTH = 299
INPUT_SHAPE = (IMG_WIDTH, IMG_HEIGHT, 3)
BATCH_SIZE = 64


# Both datasets go through the InceptionV3 preprocess_input function.
sketches_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
mirflickr_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# shuffle=False on every generator: extract_features compares each batch's
# labels with generator.labels by position, so the order must be stable.
sketches_generator_train = sketches_datagen.flow_from_directory(
      # This is the target directory
      SKETCHES_DIR + '/train',
      shuffle=False,
      target_size=(IMG_HEIGHT, IMG_WIDTH),
      batch_size=BATCH_SIZE)

# Distractor images (MIRFLICKR).
mirflickr_generator = mirflickr_datagen.flow_from_directory(
        # This is the target directory
        MIRFLICKR_DIR,
        shuffle=False,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE)

sketches_generator_test = sketches_datagen.flow_from_directory(
      # This is the target directory
      SKETCHES_DIR + '/test',
      shuffle=False,
      target_size=(IMG_HEIGHT, IMG_WIDTH),
      batch_size=BATCH_SIZE)


Found 15000 images belonging to 250 classes.
Found 25000 images belonging to 1 classes.
Found 5000 images belonging to 250 classes.


In [None]:
# Load the saved fine-tuned model from the shared drive and build a feature
# extractor that outputs the activations of its second-to-last layer.
MODEL_PATH = "/content/gdrive/Shareddrives/COMPUTER_VISION/models"
model = models.load_model(MODEL_PATH + '/inception_finetuning_classification_3_last_one_more_train_all_parameters_more_train.h5')
extractor = Model(model.input, model.layers[-2].output)
extractor.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None, None,  0                                            
__________________________________________________________________________________________________
conv2d_376 (Conv2D)             (None, None, None, 3 864         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_376 (BatchN (None, None, None, 3 96          conv2d_376[0][0]                 
__________________________________________________________________________________________________
activation_376 (Activation)     (None, None, None, 3 0           batch_normalization_376[0][0]    
______________________________________________________________________________________________

In [None]:
# Extract 2048-dimensional features for every image; the sample counts match
# the "Found N images" lines printed when the generators were created.
sketches_features_train = extract_features(extractor, sketches_generator_train, 15000, dim=2048)
sketches_features_test = extract_features(extractor, sketches_generator_test, 5000, dim=2048)
mirflickr_features = extract_features(extractor, mirflickr_generator, 25000, dim=2048)

# NO Index mAP

In [None]:
# Brute-force baseline: index the train sketches together with the distractors.
# Every distractor gets label 250, i.e. one past the sketch class indices 0..249.
print('mAP on train')
no_index_base = NO_INDEX()
no_index_base.insert(np.vstack((sketches_features_train, mirflickr_features)), 
              np.concatenate((sketches_generator_train.filenames, mirflickr_generator.filenames)), 
              np.concatenate((sketches_generator_train.labels, np.array([250] * mirflickr_features.shape[0]))))
print('Euclidean mAP')
print(mAP(no_index_base, sketches_features_train, img_per_labels=train_images_per_class))
print('Cosine mAP')
print(mAP(no_index_base, sketches_features_train, mode = 'similarity', img_per_labels=train_images_per_class))
print('')

# Test sketches are not in the index, so the first result is not required to
# carry the query label (check_first_label=False).
print('mAP using test as query on train')

print('Euclidean mAP')
print(mAP(no_index_base, sketches_features_test, img_per_labels=test_images_per_class, check_first_label=False, n_ground_truth=train_images_per_class))
print('Cosine mAP')
print(mAP(no_index_base, sketches_features_test, mode = 'similarity', img_per_labels=test_images_per_class, check_first_label=False, n_ground_truth=train_images_per_class))
print('')


# Drop the reference to the baseline index.
del no_index_base

mAP on train
Euclidean mAP
0.39624772359313365
Cosine mAP
0.45834064304191197

mAP using test as query on train
Euclidean mAP
0.15376194037501312
Cosine mAP
0.18571864659363127

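Without an index the query cost is always the size of the whole collection: 45000 when all sketches and distractors are indexed, 40000 for the 15000 train sketches + 25000 distractors indexed above.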


# LSH Finetuning

In [None]:
def evaluate_lsh(index, train, train_generator, test, test_generator, distractor, distractor_generator):
  """Fill ``index`` with train + distractor features and print its metrics.

  Distractors are stored with label 250. The function prints the result of
  ``mAP_with_ie`` for Euclidean and cosine search, first using the train
  features as queries and then using the test features as queries.

  Args:
    index: Index exposing ``insert`` and ``query``.
    train: Train feature matrix.
    train_generator: Generator providing ``filenames`` and ``labels`` of the train set.
    test: Test feature matrix.
    test_generator: Unused; kept for a symmetric signature.
    distractor: Distractor feature matrix.
    distractor_generator: Generator providing ``filenames`` of the distractors.
  """
  all_features = np.vstack((train, distractor))
  all_ids = np.concatenate((train_generator.filenames, distractor_generator.filenames))
  distractor_labels = np.array([250] * distractor.shape[0])
  all_labels = np.concatenate((train_generator.labels, distractor_labels))
  index.insert(all_features, all_ids, all_labels)

  searches = (('Euclidean mAP', 'euclidean'), ('Cosine mAP', 'similarity'))

  # Queries taken from the indexed train set.
  for title, search_mode in searches:
    print(title)
    print(mAP_with_ie(index, train, mode = search_mode, img_per_labels=train_images_per_class))
  print('')

  print('mAP using test as query on train')

  # Queries taken from the held-out test set.
  for title, search_mode in searches:
    print(title)
    print(mAP_with_ie(index, test, mode = search_mode, img_per_labels=test_images_per_class, check_first_label=False, n_ground_truth=train_images_per_class))

## LSH BitWise = False

### Test 1 - g = 3, h = 5

In [None]:
# Non-bitwise LSH run with g = 3, h = 5, w = 4; the index is created inline in the call.
evaluate_lsh(
  LSH(2048, g = 3, h = 5, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
)

Euclidean mAP
{'mAP': 0.033346878554332514, 'ie': 2753.0064187374837}
Cosine mAP
{'mAP': 0.03565740640727756, 'ie': 2704.5203132245015}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.014448705677420837, 'ie': 3775.3113447310348}
Cosine mAP
{'mAP': 0.014971287329228498, 'ie': 4000.964065702717}


### Test 2 - g = 3, h = 3

In [None]:
evaluate_lsh(
  LSH(2048, g = 3, h = 3, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.08866491501258762, 'ie': 282.3312579106736}
Cosine mAP
{'mAP': 0.09001556085828957, 'ie': 302.3180885158709}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.06505553613569094, 'ie': 387.6267176154561}
Cosine mAP
{'mAP': 0.06639555759828986, 'ie': 269.2375273430253}


### Test 3 - g = 5, h = 2

In [None]:
evaluate_lsh(
  LSH(2048, g = 5, h = 2, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.22297371394744478, 'ie': 7.644244013878772}
Cosine mAP
{'mAP': 0.23996936825516268, 'ie': 9.160585240444693}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.1801783468594954, 'ie': 6.680432345452961}
Cosine mAP
{'mAP': 0.1975698210198003, 'ie': 9.316108045411216}


### Test 4 - g = 8, h = 2

In [None]:
evaluate_lsh(
  LSH(2048, g = 8, h = 2, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.2946193514735761, 'ie': 4.132316559073223}
Cosine mAP
{'mAP': 0.2962872316555351, 'ie': 3.5115839909751436}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.24829437079734626, 'ie': 3.3804775269211995}
Cosine mAP
{'mAP': 0.28623508681160026, 'ie': 3.819802775411196}


### Test 5 - g = 4, h = 2

In [None]:
evaluate_lsh(
  LSH(2048, g = 4, h = 2, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.20769879359863885, 'ie': 15.595131202189513}
Cosine mAP
{'mAP': 0.22525620682255051, 'ie': 8.34958613171565}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.16848850384572175, 'ie': 7.269454322158421}
Cosine mAP
{'mAP': 0.1711514087763372, 'ie': 7.816050744832702}


### Test 6 - g = 5, h = 1

In [None]:
evaluate_lsh(
  LSH(2048, g = 5, h = 1, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.3707804054005468, 'ie': 0.4374005942381987}
Cosine mAP
{'mAP': 0.4220271451214096, 'ie': 0.4433267613109037}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.309219318757023, 'ie': 0.4119930156338686}
Cosine mAP
{'mAP': 0.3807077876670305, 'ie': 0.38356122725634867}


### Test 7 - g = 5, h = 2

In [None]:
# Same parameters as Test 3 (g = 5, h = 2); the random projections are drawn again for this run.
evaluate_lsh(
  LSH(2048, g = 5, h = 2, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.21666880107414974, 'ie': 4.6549899244315665}
Cosine mAP
{'mAP': 0.24959612076455243, 'ie': 38.64144028999621}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.21723403861928955, 'ie': 5.683951742789867}
Cosine mAP
{'mAP': 0.22852786484843948, 'ie': 4.340974961079275}


### Test 8 - g = 7, h = 2

In [None]:
evaluate_lsh(
  LSH(2048, g = 7, h = 2, w = 4, bitwise_hash = False),
  sketches_features_train, 
  sketches_generator_train, 
  sketches_features_test, 
  sketches_generator_test, 
  mirflickr_features, 
  mirflickr_generator
) # better to pass the index inline like this, so the RAM it occupies is freed right away

Euclidean mAP
{'mAP': 0.25933988934904945, 'ie': 7.252277631469861}
Cosine mAP
{'mAP': 0.29747764957327666, 'ie': 10.102374112431063}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.24274348926511324, 'ie': 3.5384874728011417}
Cosine mAP
{'mAP': 0.2477900264574565, 'ie': 5.068979564879656}


## LSH BitWise = True

### Test 1 - g = 3, h = 5

In [None]:
# Bitwise LSH, g=3 / h=5. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=3, h=5, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.33309311444516065, 'ie': 4.053016053293838}
Cosine mAP
{'mAP': 0.3976903716638469, 'ie': 4.178324701125581}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.1626955021721199, 'ie': 3.707340503175567}
Cosine mAP
{'mAP': 0.18228916377052617, 'ie': 3.994830574455172}


### Test 2 - g = 3, h = 6

In [None]:
# Bitwise LSH, g=3 / h=6. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=3, h=6, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.32148159485939165, 'ie': 9.251448899924094}
Cosine mAP
{'mAP': 0.3452556370669649, 'ie': 9.295563425766309}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.2975683258928303, 'ie': 8.559723965980773}
Cosine mAP
{'mAP': 0.30939495156403996, 'ie': 9.447083291855845}


### Test 3 - g = 5, h = 2


In [None]:
# Bitwise LSH, g=5 / h=2. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=5, h=2, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.3911384970436625, 'ie': 0.2514255391714975}
Cosine mAP
{'mAP': 0.4650808283723997, 'ie': 0.25739805436729146}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.3485418042453301, 'ie': 0.24203368229280825}
Cosine mAP
{'mAP': 0.41995840922665584, 'ie': 0.2564838051577386}


### Test 4 - g = 5, h = 5

In [None]:
# Bitwise LSH, g=5 / h=5. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=5, h=5, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.402926760324066, 'ie': 2.5325466296550663}
Cosine mAP
{'mAP': 0.4303191721103078, 'ie': 2.5300422998755527}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.30984659557617583, 'ie': 2.617817805551897}
Cosine mAP
{'mAP': 0.3945218244205997, 'ie': 2.6405850023454334}


### Test 5 - g = 4, h = 4

In [None]:
# Bitwise LSH, g=4 / h=4. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=4, h=4, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.379875174638251, 'ie': 1.8256262371745438}
Cosine mAP
{'mAP': 0.46092027620329934, 'ie': 1.8770648730406085}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.3259360600474855, 'ie': 1.8430578825317898}
Cosine mAP
{'mAP': 0.41296741831976314, 'ie': 1.823323520180483}


### Test 6 - g = 3, h = 1

In [None]:
# Bitwise LSH, g=3 / h=1. The index is constructed inline, so no notebook
# variable is bound to it; per the original author's note, this lets the RAM
# it occupies be released right after the evaluation.
evaluate_lsh(
    LSH(2048, g=3, h=1, w=4, bitwise_hash=True),
    sketches_features_train, sketches_generator_train,
    sketches_features_test, sketches_generator_test,
    mirflickr_features, mirflickr_generator,
)

Euclidean mAP
{'mAP': 0.41422510059586404, 'ie': 0.24225390913153647}
Cosine mAP
{'mAP': 0.4665102802814581, 'ie': 0.24665013580778442}

mAP using test as query on train
Euclidean mAP
{'mAP': 0.3527937596577726, 'ie': 0.23709095089860183}
Cosine mAP
{'mAP': 0.4295193047128391, 'ie': 0.25421377194091155}
