<a href="https://colab.research.google.com/github/MarshaGomez/DNN-Sketches-image-analysis/blob/main/Code/save_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up the project

In [None]:
# Mount Google Drive inside the Colab runtime; force_remount=True re-mounts
# even when a mount already exists at this path.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import numpy as np
import time
import itertools
import os, shutil
import tensorflow as tf
import matplotlib.pyplot as plt

from random import random
from ipywidgets import Image
from numpy.linalg import norm
from IPython.display import display
from keras.models import Model
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping, ModelCheckpoint

# tensorflow version 2.4.0
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input, InceptionV3

In [None]:
# Copy the dataset archive from the mounted shared drive into the current
# working directory, unpack it quietly (-q) and delete the local copy of the
# archive afterwards.
zip_path = '/content/gdrive/Shareddrives/COMPUTER_VISION/MIM_zipped.zip'
!cp "{zip_path}" .
!unzip -q MIM_zipped.zip
!rm MIM_zipped.zip

In [None]:
# Data loading: paths point at the locally unzipped copy of the dataset.
# NOTE(review): the doubled "/content/content" presumably comes from the
# archive storing full paths and being unpacked under /content - confirm.
BASE_DIR = "/content/content/gdrive/Shareddrives/COMPUTER_VISION/MIRCV"
# FILELIST_PATH = BASE_DIR + "/filelist.txt"
SKETCHES_DIR = os.path.join(BASE_DIR, "sketches")
MIRFLICKR_DIR = os.path.join(BASE_DIR, "mirflickr/mirflickr25k")

# Images are resized to IMG_HEIGHT x IMG_WIDTH before being fed to the network.
IMG_HEIGHT = 299
IMG_WIDTH = 299
# (height, width, channels): same axis order as the
# target_size=(IMG_HEIGHT, IMG_WIDTH) passed to flow_from_directory.
INPUT_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)
BATCH_SIZE = 64

# Directory on the shared drive that holds the trained model files.
MODEL_PATH = "/content/gdrive/Shareddrives/COMPUTER_VISION/models"

In [None]:
def extract_features(extractor, generator, sample_count, dim=2048):
  """Run `extractor` over the first `sample_count` samples of `generator`.

  Args:
    extractor: object with a `predict(batch)` method returning one
      `dim`-long feature vector per input sample.
    generator: iterable yielding `(inputs_batch, labels_batch)` pairs with
      one-hot `labels_batch`, and exposing the integer class of every sample
      in iteration order as `generator.labels`.
    sample_count: number of samples to extract.
    dim: length of the feature vector produced by `extractor`.

  Returns:
    Array of shape `(sample_count, dim)`; row `r` holds the features of the
    r-th sample yielded by `generator`.

  Raises:
    AssertionError: if the labels of a batch do not match the corresponding
      slice of `generator.labels`, i.e. the generator did not start from its
      first sample (or shuffles) and rows would be misaligned.
  """
  features = np.zeros((sample_count, dim)) # extractor output shape
  if sample_count == 0:
    # Nothing to extract: do not consume (or run the model on) any batch.
    return features

  offset = 0
  for inputs_batch, labels_batch in generator:
    features_batch = np.asarray(extractor.predict(inputs_batch))
    # Use the real batch length instead of a global batch size, and never
    # write past `sample_count` (the last batch may be larger than needed).
    take = min(len(features_batch), sample_count - offset)
    stop = offset + take
    features[offset:stop, :] = features_batch[:take]
    assert np.array_equal(np.argmax(labels_batch, axis = 1)[:take], generator.labels[offset:stop]), 'LABELS NOT CORRESPONDING REINIZIALIZE GENERATOR'
    offset = stop
    if offset >= sample_count:
      # Generators may loop forever, so stop explicitly once we have enough.
      break

  return features
  

In [None]:
class LSH:
  """Locality-sensitive hashing index over fixed-length feature vectors.

  `g` independent hash tables are kept; each table hashes a vector with `h`
  random projections. Every inserted item is stored in one bucket per table,
  and a query is only compared against the items found in the buckets that
  the query vector itself hashes to.

  `_index` maps a bucket id ("<table>_<comma separated hash values>") to a
  dict holding the NumPy arrays 'features', 'ids' and 'labels' of the items
  in that bucket.
  """

  def __init__(self, feature_dim, g = 10, h = 20, w = 4, bitwise_hash = False):
    """Draw the random projections and start with an empty index.

    Args:
      feature_dim: length of the feature vectors that will be hashed.
      g: number of hash tables.
      h: number of projections (hash values) per table.
      w: quantisation width, used only when `bitwise_hash` is False.
      bitwise_hash: if True every hash value is 0 or 1 (sign of the
        projection); otherwise it is the shifted projection divided by `w`
        and truncated to an integer.

    TODO: find a way to load a previously stored index when one exists
    instead of always initialising an empty structure.
    """
    self._index = {}
    # Both hashing schemes use normally distributed projection vectors.
    self.x = np.random.normal(size=(g, h, feature_dim))
    self.w = np.ones((g, h, 1)) * w
    self.b = np.random.rand(g, h, 1) * w
    self.bitwise_hash = bitwise_hash

  @staticmethod
  def _bucket_id(table_index, hash_row):
    """Build the `_index` key for one table and one row of hash values."""
    return str(table_index) + '_' + ','.join(hash_row)

  @staticmethod
  def _select_best(scores, limit, largest):
    """Return the (unordered) indices of the `limit` best entries of `scores`.

    "Best" means largest when `largest` is True, smallest otherwise. All
    indices are returned when there are at most `limit` scores.
    """
    count = scores.shape[0]
    if limit <= 0:
      return np.arange(0)
    if count <= limit:
      return np.arange(count)
    # argpartition only guarantees which side of the pivot each element is
    # on; that is enough here because the final result is sorted separately.
    if largest:
      return np.argpartition(scores, count - limit)[count - limit:]
    return np.argpartition(scores, limit - 1)[:limit]

  def _open_bucket(self, bucket_id):
    """Return list-based, appendable contents of a bucket (empty if new)."""
    stored = self._index.get(bucket_id)
    if stored is None:
      return {'features': [], 'ids': [], 'labels': [], 'seen': set()}
    known_ids = list(np.ravel(stored['ids']))
    return {
      'features': list(stored['features']),
      'ids': known_ids,
      'labels': list(stored['labels']),
      'seen': set(known_ids),
    }

  def _hash(self, features):
    """Hash several feature vectors at once, in every table.

    Args:
      features: array of shape (n, feature_dim), one feature vector per row.

    Returns:
      Array of strings of shape (g, n, h): for each table and each input
      row, the `h` hash values.
    """
    projections = np.dot(self.x, features.T)  # shape (g, h, n)
    if self.bitwise_hash:
      codes = projections > 0
    else:
      # np.trunc rounds toward zero.
      codes = np.trunc((projections + self.b) / self.w)
    return np.transpose(codes, (0, 2, 1)).astype(int).astype(str)

  def insert(self, features, ids, labels):
    """Add items to the index.

    Each item is stored once per hash table. An id that is already present
    in a bucket is not stored in that bucket again. The method may be called
    repeatedly to extend an existing index.

    Args:
      features: array of shape (n, feature_dim).
      ids: sequence of n hashable identifiers (e.g. file names).
      labels: sequence of n labels.

    Raises:
      AssertionError: if `features`, `ids` and `labels` differ in length.
    """
    features = np.asarray(features)
    assert features.shape[0] == len(ids), "mismatch between ids length and features"
    assert len(labels) == len(ids), "mismatch between ids length and labels"

    hashes = self._hash(features)

    # Buckets touched by this call are edited as Python lists (cheap appends,
    # set-based duplicate check) and converted back to arrays at the end.
    staged = {}
    for table_index, table in enumerate(hashes):
      # `row_index` comes straight from the enumeration, so item and hash row
      # stay aligned even when a duplicate is skipped.
      for row_index, hash_row in enumerate(table):
        bucket_id = self._bucket_id(table_index, hash_row)
        entry = staged.get(bucket_id)
        if entry is None:
          entry = self._open_bucket(bucket_id)
          staged[bucket_id] = entry

        item_id = ids[row_index]
        if item_id in entry['seen']:
          continue
        entry['seen'].add(item_id)
        entry['features'].append(features[row_index])
        entry['ids'].append(item_id)
        entry['labels'].append(labels[row_index])

    for bucket_id, entry in staged.items():
      self._index[bucket_id] = {
        'features': np.array(entry['features']),
        'ids': np.array(entry['ids']),
        'labels': np.array(entry['labels']),
      }

  def query(self, features, top_k, mode = 'euclidean', return_cost = False):
    """Return the best `top_k` indexed items for one query vector.

    Only items stored in the buckets the query hashes to are considered.

    Args:
      features: 1-D query feature vector of length feature_dim.
      top_k: maximum number of results.
      mode: 'euclidean' ranks by ascending Euclidean distance, 'similarity'
        by descending cosine similarity.
      return_cost: if True also return the number of distance/similarity
        values that were computed.

    Returns:
      A dict with the arrays 'ids', 'labels' and 'distances' (euclidean) or
      'similarities' (similarity), sorted best first and holding at most
      `top_k` entries; `(dict, cost)` when `return_cost` is True.
      When no bucket matches the query an empty dict is returned, also when
      `return_cost` is True.

    Raises:
      AssertionError: if `mode` is not 'similarity' or 'euclidean'.
    """
    assert mode in ['similarity', 'euclidean'], "mode must be similarity or euclidean"
    by_similarity = mode == 'similarity'
    score_key = 'similarities' if by_similarity else 'distances'

    query_vector = np.array(features)
    if by_similarity:
      query_norm = norm(np.array([features]), axis=1)
    hashes = self._hash(np.array([features]))

    best = None
    cost = 0
    for table_index, table in enumerate(hashes):
      for hash_row in table:  # a single row: the query itself
        bucket = self._index.get(self._bucket_id(table_index, hash_row))
        if bucket is None:
          continue

        candidate_ids = np.ravel(bucket['ids'])
        candidate_labels = np.ravel(bucket['labels'])
        candidate_features = bucket['features']

        if best is not None:
          # An item lives in one bucket per table, so it can show up again:
          # skip the ones already among the current best results.
          fresh = ~np.isin(candidate_ids, best['ids'])
          if not fresh.any():
            continue  # everything here is already a result
          candidate_ids = candidate_ids[fresh]
          candidate_labels = candidate_labels[fresh]
          candidate_features = candidate_features[fresh]

        if by_similarity:
          scores = np.sum(candidate_features * query_vector, axis=1) / (norm(candidate_features, axis=1) * query_norm)
        else:
          scores = norm(candidate_features - query_vector, axis=1)
        cost += scores.size

        if best is not None:
          candidate_ids = np.concatenate((best['ids'], candidate_ids))
          candidate_labels = np.concatenate((best['labels'], candidate_labels))
          scores = np.concatenate((best[score_key], scores))

        keep = self._select_best(scores, top_k, by_similarity)
        best = {
          'ids': candidate_ids[keep],
          'labels': candidate_labels[keep],
          score_key: scores[keep],
        }

    if best is None:
      return {} # zero results

    # Fully sort the surviving candidates, best first.
    order = np.argsort(best[score_key])
    if by_similarity:
      order = order[::-1]
    result = {key: values[order] for key, values in best.items()}

    if return_cost:
      return (result, cost)
    return result

  def store(self):
    """Placeholder: persisting the index is not implemented in this class."""
    pass


# Load NN

In [None]:
# Load the trained classifier from the shared models directory and print its
# layer-by-layer summary.
MODEL_FILE = 'inception_finetuning_classification_3_last_one_more_train_all_parameters_more_train.h5'
model = models.load_model(MODEL_PATH + '/' + MODEL_FILE)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None, None,  0                                            
__________________________________________________________________________________________________
conv2d_376 (Conv2D)             (None, None, None, 3 864         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_376 (BatchN (None, None, None, 3 96          conv2d_376[0][0]                 
__________________________________________________________________________________________________
activation_376 (Activation)     (None, None, None, 3 0           batch_normalization_376[0][0]    
____________________________________________________________________________________________

In [None]:
# Feature extractor: same input as the loaded model, output taken from its
# second-to-last layer.
extractor = Model(inputs=model.input, outputs=model.layers[-2].output)

# Extract features

In [None]:
# Both datasets go through the InceptionV3 preprocessing function.
sketches_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
mirflickr_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Options shared by the two directory iterators. shuffle=False keeps the
# batches in the same order as `generator.labels` / `generator.filenames`.
flow_options = dict(
        shuffle=False,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE)

# Target directory of the sketches: the 'png' folder of SKETCHES_DIR.
sketches_generator = sketches_datagen.flow_from_directory(
        SKETCHES_DIR + '/png',
        **flow_options)

# Target directory of the photos.
mirflickr_generator = mirflickr_datagen.flow_from_directory(
        MIRFLICKR_DIR,
        **flow_options)


Found 20000 images belonging to 250 classes.
Found 25000 images belonging to 1 classes.


In [None]:
# Start both iterators from their first batch so the extracted rows line up
# with `generator.labels` / `generator.filenames` even when this cell is
# re-run after an interrupted extraction.
sketches_generator.reset()
mirflickr_generator.reset()

# Use the image counts found by flow_from_directory instead of hard-coded
# literals, so the cell keeps working if the dataset changes.
sketches_features = extract_features(extractor, sketches_generator, sketches_generator.samples)
mirflickr_features = extract_features(extractor, mirflickr_generator, mirflickr_generator.samples)

# Save index

In [None]:
# Every mirflickr image gets the constant label 250; the sketches keep the
# class indices assigned by their generator.
mirflickr_labels = np.full(mirflickr_features.shape[0], 250)
index_ids = np.concatenate((sketches_generator.filenames, mirflickr_generator.filenames))
index_labels = np.concatenate((sketches_generator.labels, mirflickr_labels))

# Index sketches and photos together, identified by their file names.
lsh = LSH(2048, g = 3, h = 5, w = 4, bitwise_hash = True)
lsh.insert(np.vstack((sketches_features, mirflickr_features)), index_ids, index_labels)

In [None]:
import pickle

# Serialise the whole LSH object to the shared drive. pickle stores classes
# by reference, so code loading this file must have the LSH class defined.
index_path = '/content/gdrive/Shareddrives/COMPUTER_VISION/saved_index/index.pickle'
with open(index_path, mode='wb') as index_file:
    pickle.dump(lsh, index_file, protocol=pickle.HIGHEST_PROTOCOL)
