In [None]:
# Author Markus Laubenthal

In [None]:
# Install the two nearest-neighbour libraries imported further down (falconn, annoy)
# and make sure the download directory `input/` exists.
# NOTE(review): no versions are pinned, so a fresh run picks up whatever is current.
!pip install falconn
!pip install annoy
!mkdir -p input

In [None]:
# Replace any previous checkout with a fresh clone into `functions/`;
# the `functions.*` modules imported below are loaded from this directory.
!rm -rf functions
!git clone https://github.com/Lennard-Alms/lab_bd.git functions

In [None]:
# Download the HDF5 inputs into `input/`. Only duplicates.h5 and final_test.h5 are
# fetched; the feature_vectors_75.h5 download is left disabled.
# !wget https://storage.googleapis.com/laubenthal_spatiolab/feature_vectors_75.h5 -O input/feature_vectors_75.h5 --no-verbose
!wget https://storage.googleapis.com/laubenthal_spatiolab/duplicates.h5 -O input/duplicates.h5 --no-verbose
!wget https://storage.googleapis.com/laubenthal_spatiolab/final_test.h5 -O input/final_test.h5 --no-verbose


In [None]:
import tensorflow as tf
import numpy as np
import glob
import cv2
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
import math
import h5py
import keras
from keras.layers import Input
from keras import backend as K
from keras import layers
import gc
from scipy.spatial import distance_matrix
import seaborn as sns
from operator import itemgetter 
from google.colab.patches import cv2_imshow
from functions.preprocessing.BatchToFile import BatchProcessToFile
from functions.preprocessing.FeatureExtractor import VGGFeatureExtractorMax
from functions.preprocessing.FeatureExtracorMaxNoPatches import VGGFeatureExtractorMaxNoPatches

In [None]:
from falconn import LSHIndex, LSHConstructionParameters, get_default_parameters
import falconn
from annoy import AnnoyIndex
from functions.preprocessing.ImageMutation import PatchMutation
from functions.preprocessing.HelperFunctions import get_patches_from_image
from functions.postprocessing.ErrorEvaluation import evaluate_result

In [None]:
# Print the top-level keys of both downloaded HDF5 files, separated by "---",
# so the dataset names used in the loading cell can be checked by eye.
for file_number, h5_path in enumerate(('input/duplicates.h5', 'input/final_test.h5')):
  if file_number > 0:
    print("---")
  # The context manager closes the file even when listing the keys fails.
  with h5py.File(h5_path, 'r') as f:
    for key in f.keys():
      print(key)

In [None]:
# Load the database vectors, their labels and the query vectors into memory.
# Every dataset is read completely (`[:]`) and converted to float32, so both files
# can be closed as soon as the block ends instead of staying open for the session.
# NOTE(review): final_test.h5 (`g`) is opened but nothing is read from it; all three
# datasets, including the queries, come from duplicates.h5 - confirm this is intended.
with h5py.File('input/duplicates.h5', 'r') as f, h5py.File('input/final_test.h5', 'r') as g:
  vectors = f['a2d2_background_horses_50_cover'][:].astype(np.float32)
  labels = f['a2d2_background_horses_50_cover_label'][:].astype(np.float32)
  queries = f['query(200, 200)'][:].astype(np.float32)

# Query i is given the label i + 1 (labels start at 1).
query_labels = np.arange(0, queries.shape[0]) + 1

In [None]:
def do_query(query_vector):
  """Return the LSH candidate ids for a single query vector.

  Uses the module-level ``lsh_index``: a new query object is constructed on every
  call and asked for the de-duplicated candidates of ``query_vector``. The
  candidates are not checked against any distance threshold here.

  Args:
    query_vector: the query, handed unchanged to ``get_unique_candidates``.

  Returns:
    1-D integer np.ndarray built from the returned candidate list. The integer
    dtype is forced so that an empty result can still be used to index the
    vector matrix (a plain ``np.array([])`` would be float64 and fail there).
  """
  query = lsh_index.construct_query_object()
  # query.set_num_probes(70)
  return np.array(query.get_unique_candidates(query_vector), dtype=np.intp)



def evaluate_hash_candidates(candidates, filtered, query_vector, vectors, ground_truth = None):
  """Summarise how well the LSH candidates cover the exhaustive result.

  Only the sizes (``shape[0]``) of the id arrays are used.

  Args:
    candidates: ids returned by the LSH lookup for the query.
    filtered: the candidates that passed the threshold filter.
    query_vector: the query; only used when ``ground_truth`` has to be built.
    vectors: the database matrix; ``shape[0]`` is the database size.
    ground_truth: ids that pass the threshold in an exhaustive scan. When None it
      is rebuilt with ``filter_results`` using the module-level ``threshold`` -
      callers that sweep a scaled threshold should pass it explicitly, otherwise
      the wrong threshold is used.

  Returns:
    Tuple ``(recall, query_ratio, relevant_ratio, false_positives, false_negatives)``:
      recall           filtered / ground truth size
      query_ratio      candidates / database size (a fraction, not a percentage)
      relevant_ratio   filtered / candidates
      false_positives  candidates that did not pass the filter
      false_negatives  ground-truth ids missing from the filtered result
    Each ratio is 0.0 when its denominator is zero.
  """
  if ground_truth is None:
    ground_truth = filter_results(vectors, np.arange(0, vectors.shape[0]), query_vector, threshold).flatten()
  database_size = vectors.shape[0]
  query_size = candidates.shape[0]
  filtered_size = filtered.shape[0]
  truth_size = ground_truth.shape[0]

  false_positives = query_size - filtered_size
  false_negatives = truth_size - filtered_size

  # These ratios used to be hard-coded to 0; the guards avoid the division by zero
  # that occurs when a query yields no candidates or nothing lies within the threshold.
  recall = filtered_size / truth_size if truth_size else 0.0
  query_ratio = query_size / database_size if database_size else 0.0
  relevant_ratio = filtered_size / query_size if query_size else 0.0
  return recall, query_ratio, relevant_ratio, false_positives, false_negatives

def calculate_cosine_sim(feature_vectors, feature_vectors_b = None):
  """Pairwise angles, in radians, between the rows of two matrices.

  Despite the name, the result is the arccos of the cosine similarity, i.e. an
  angular distance: 0 for rows pointing the same way, pi for opposite rows.

  Args:
    feature_vectors: 2-D array, one vector per row.
    feature_vectors_b: optional second 2-D array; when None the rows of
      ``feature_vectors`` are compared with a copy of themselves.

  Returns:
    Array of shape (rows of feature_vectors, rows of feature_vectors_b).
  """
  if feature_vectors_b is None:
    feature_vectors_b = feature_vectors.copy()

  row_norms = np.linalg.norm(feature_vectors, axis=1)
  row_norms_b = np.linalg.norm(feature_vectors_b, axis=1)

  dot_products = np.dot(feature_vectors, feature_vectors_b.T)
  norm_products = np.outer(row_norms, row_norms_b)

  # Rounding can push a cosine slightly outside [-1, 1], where arccos is undefined.
  cosines = np.clip(dot_products / norm_products, -1, 1)
  return np.arccos(cosines)

def filter_results(vectors, result_ids, query, threshold):
  """Keep the ids whose vector lies within ``threshold`` of the query.

  The distance is whatever ``calculate_cosine_sim`` returns (an angle in radians);
  an id is kept when that value is strictly below ``threshold``.

  Args:
    vectors: database matrix, indexed by ``result_ids``.
    result_ids: array of row indices into ``vectors`` to test.
    query: 1-D query vector.
    threshold: upper bound (exclusive) on the distance.

  Returns:
    The surviving ids as an array of shape (k, 1) - callers flatten it.
  """
  candidate_vectors = vectors[result_ids]
  distances = calculate_cosine_sim(candidate_vectors, query[None, :]).ravel()
  # argwhere yields one column of positions, which is what gives the (k, 1) result.
  keep_positions = np.argwhere(distances < threshold)
  return result_ids[keep_positions]


In [None]:
# Build the FALCONN index over all database vectors.
# Start from the library defaults for this data size, then switch to the hyperplane
# hash family and override k and l (in FALCONN: hash functions per table and number
# of tables).
params = get_default_parameters(
    num_points=vectors.shape[0],
    dimension=vectors.shape[1],
    distance=falconn.DistanceFunction.NegativeInnerProduct,
)
params.lsh_family = falconn.LSHFamily.Hyperplane
params.k, params.l = 20, 50

# Echo the settings that are actually in effect.
for setting in (params.k, params.l):
  print(setting)

lsh_index = LSHIndex(params)
lsh_index.setup(vectors)

In [None]:
# NEAR DUPLICATE TEST
#
# Spot check with the first query: the LSH candidates are fetched once and then
# filtered with a few thresholds; each result is compared with the exhaustive scan.

query_index = 0
query_vector = queries[query_index]
query_label = query_labels[query_index]
candidates = do_query(query_vector)
for threshold in np.arange(0.6, 1, 0.1):

  filtered = filter_results(vectors, candidates, query_vector, threshold).flatten()
  ground_truth = filter_results(vectors, np.arange(0, vectors.shape[0]), query_vector, threshold).flatten()

  # Hand over the ground truth computed above; otherwise evaluate_hash_candidates
  # repeats the full scan over all vectors.
  recall, query_ratio, relevant_ratio, fp, fn = evaluate_hash_candidates(candidates, filtered, query_vector, vectors, ground_truth=ground_truth)
  sc,ic = evaluate_result(filtered, labels, query_label)
  # The reported recall is the label-based one from evaluate_result, not the one above.
  precision, recall, accuracy = sc[0], sc[1], sc[2]

  print("Results for threshold: ", threshold)
  print("Recall:                ", recall)
  print("Precision:             ", precision)
  print("Accuracy:              ", accuracy)
  print("Queried % of database: ", query_ratio)
  print("True Positive Ratio:   ", relevant_ratio)
  print("FP / FN:               ", fp, fn)
  print("")

In [None]:
# NEAR DUPLICATE EVALUATION WITH ALL QUERIES
#
# Sweeps the threshold (loop value / 10) and, for each value, averages precision,
# recall and accuracy over the selected queries - once for the LSH result (candidates
# that pass the threshold filter) and once for the exhaustive scan over all vectors
# (the *_gem series). The metrics are the first tuple returned by evaluate_result.
nd_precisions = []
nd_recalls = []
nd_accuracies = []

nd_precisions_gem = []
nd_recalls_gem = []
nd_accuracies_gem = []

indices = [0,1,2,3,4,5,6,7,8,9,10,11,12,13]
# indices = range(queries.shape[0])
query_count = len(indices)

# The LSH lookup does not depend on the threshold, so it is done once per query
# instead of once per (threshold, query) pair.
candidates_by_query = {query_index: do_query(queries[query_index]) for query_index in indices}

for threshold in np.arange(0, 15, 0.1):
  mean_precision = 0
  mean_recall = 0
  mean_accuracy = 0
  mean_query_ratio = 0
  mean_relevant_ratio = 0

  gemmean_precision = 0
  gemmean_recall = 0
  gemmean_accuracy = 0

  for query_index in indices:

    query_vector = queries[query_index]
    query_label = query_labels[query_index]
    candidates = candidates_by_query[query_index]

    filtered = filter_results(vectors, candidates, query_vector, threshold / 10).flatten()
    ground_truth = filter_results(vectors, np.arange(0, vectors.shape[0]), query_vector, threshold / 10).flatten()

    # Pass the ground truth explicitly: without it evaluate_hash_candidates rebuilds it
    # from the global `threshold`, which here is the unscaled loop value, not threshold / 10.
    recall, query_ratio, relevant_ratio, fp, fn = evaluate_hash_candidates(candidates, filtered, query_vector, vectors, ground_truth=ground_truth)
    sc,ic = evaluate_result(filtered, labels, query_label)
    gsc,gic = evaluate_result(ground_truth, labels, query_label)

    precision, recall, accuracy = sc[0], sc[1], sc[2]
    mean_precision += precision
    mean_recall += recall
    mean_accuracy += accuracy
    mean_query_ratio += query_ratio
    mean_relevant_ratio += relevant_ratio

    gemprecision, gemrecall, gemaccuracy = gsc[0], gsc[1], gsc[2]
    gemmean_precision += gemprecision
    gemmean_recall += gemrecall
    gemmean_accuracy += gemaccuracy

  nd_precisions.append(mean_precision / query_count)
  nd_recalls.append(mean_recall / query_count)
  nd_accuracies.append(mean_accuracy / query_count)

  nd_precisions_gem.append(gemmean_precision / query_count)
  nd_recalls_gem.append(gemmean_recall / query_count)
  nd_accuracies_gem.append(gemmean_accuracy / query_count)

  print("Results for threshold: ", threshold / 10)
  print("Recall:                ", mean_recall / query_count)
  print("Precision:             ", mean_precision / query_count)
  print("Accuracy:              ", mean_accuracy / query_count)

  print("gemRecall:             ", gemmean_recall / query_count)
  print("gemPrecision:          ", gemmean_precision / query_count)
  print("gemAccuracy:           ", gemmean_accuracy / query_count)

  print("Queried % of database: ", mean_query_ratio / query_count)
  print("True Positive Ratio:   ", mean_relevant_ratio / query_count)
  print("")

In [None]:
# Plot the averaged metrics collected by the most recently run evaluation cell.
# x_axis repeats the thresholds of that sweep (loop value / 10). The thresholds are
# compared with arccos angles, so the axis is an angular distance in radians.
x_axis = np.arange(0, 15, 0.1) / 10

curve_sets = (
  ("LSH candidates", nd_recalls, nd_precisions, nd_accuracies),
  ("Exhaustive search over all vectors", nd_recalls_gem, nd_precisions_gem, nd_accuracies_gem),
)

for plot_title, recalls, precisions, accuracies in curve_sets:
  fig, ax = plt.subplots(figsize=(7,5))
  ax.plot(x_axis, recalls, label="recall")
  ax.plot(x_axis, precisions, label="precision")
  ax.plot(x_axis, accuracies, label="accuracy")
  ax.legend(loc="center left")
  ax.set_title(plot_title)
  ax.set_xlabel('Angular distance threshold (radians)')
  plt.show()


In [None]:
# NEAR DUPLICATE EVALUATION WITH ALL QUERIES AND QUERY SCALING
#
# Same sweep as the plain near-duplicate evaluation. The multi-scale lookup this cell
# was meant for is disabled because it needs a list `q` of scaled query sets that is
# not defined in this notebook:
#   query_vectors = [q[_i][query_index] for _i in range(len(q))]
#   candidates = np.array(list(set(np.concatenate([do_query(v) for v in query_vectors]))))
# Until `q` exists, each query is looked up at its single available scale.

nd_precisions = []
nd_recalls = []
nd_accuracies = []

nd_precisions_gem = []
nd_recalls_gem = []
nd_accuracies_gem = []

indices = [0,1,2,3,4,5,6,7,8,9,10,11,12,13]
# indices = range(queries.shape[0])
query_count = len(indices)

# Bug fix: candidates were never computed in this cell, so every query was scored
# against whatever `candidates` an earlier cell had left behind. They are now looked
# up per query, once, because the lookup does not depend on the threshold.
candidates_by_query = {query_index: do_query(queries[query_index]) for query_index in indices}

for threshold in np.arange(0, 15, 0.1):
  mean_precision = 0
  mean_recall = 0
  mean_accuracy = 0
  mean_query_ratio = 0
  mean_relevant_ratio = 0

  gemmean_precision = 0
  gemmean_recall = 0
  gemmean_accuracy = 0

  for query_index in indices:

    query_vector = queries[query_index]
    query_label = query_labels[query_index]
    candidates = candidates_by_query[query_index]

    filtered = filter_results(vectors, candidates, query_vector, threshold / 10).flatten()
    ground_truth = filter_results(vectors, np.arange(0, vectors.shape[0]), query_vector, threshold / 10).flatten()

    # Pass the ground truth explicitly: without it evaluate_hash_candidates rebuilds it
    # from the global `threshold`, which here is the unscaled loop value, not threshold / 10.
    recall, query_ratio, relevant_ratio, fp, fn = evaluate_hash_candidates(candidates, filtered, query_vector, vectors, ground_truth=ground_truth)
    sc,ic = evaluate_result(filtered, labels, query_label)
    gsc,gic = evaluate_result(ground_truth, labels, query_label)

    precision, recall, accuracy = sc[0], sc[1], sc[2]
    mean_precision += precision
    mean_recall += recall
    mean_accuracy += accuracy
    mean_query_ratio += query_ratio
    mean_relevant_ratio += relevant_ratio

    gemprecision, gemrecall, gemaccuracy = gsc[0], gsc[1], gsc[2]
    gemmean_precision += gemprecision
    gemmean_recall += gemrecall
    gemmean_accuracy += gemaccuracy

  nd_precisions.append(mean_precision / query_count)
  nd_recalls.append(mean_recall / query_count)
  nd_accuracies.append(mean_accuracy / query_count)

  nd_precisions_gem.append(gemmean_precision / query_count)
  nd_recalls_gem.append(gemmean_recall / query_count)
  nd_accuracies_gem.append(gemmean_accuracy / query_count)

  print("Results for threshold: ", threshold / 10)
  print("Recall:                ", mean_recall / query_count)
  print("Precision:             ", mean_precision / query_count)
  print("Accuracy:              ", mean_accuracy / query_count)

  print("gemRecall:             ", gemmean_recall / query_count)
  print("gemPrecision:          ", gemmean_precision / query_count)
  print("gemAccuracy:           ", gemmean_accuracy / query_count)

  print("Queried % of database: ", mean_query_ratio / query_count)
  print("True Positive Ratio:   ", mean_relevant_ratio / query_count)
  print("")

In [None]:
# SIMILAR IMAGE EVALUATION WITH ALL QUERIES
#
# Same threshold sweep as the near-duplicate evaluation, but the averaged metrics come
# from the second tuple returned by evaluate_result (`ic` / `gic`).
# NOTE(review): presumably that tuple scores "similar image" matches - confirm against
# functions/postprocessing/ErrorEvaluation.
# An experiment that removed the vectors sharing the query's label before scoring
# was disabled in the original cell and is not carried out here.

nd_precisions = []
nd_recalls = []
nd_accuracies = []

nd_precisions_gem = []
nd_recalls_gem = []
nd_accuracies_gem = []

# Queries 2, 15 and 18 are left out. Filtering (instead of list.remove) also works
# when fewer than 19 queries are loaded, and the list is built once rather than on
# every threshold.
excluded_queries = {2, 15, 18}
indices = [i for i in range(queries.shape[0]) if i not in excluded_queries]
query_count = len(indices)

# The LSH lookup does not depend on the threshold, so it is done once per query.
candidates_by_query = {query_index: do_query(queries[query_index]) for query_index in indices}

for threshold in np.arange(0, 15, 0.1):
  mean_precision = 0
  mean_recall = 0
  mean_accuracy = 0
  mean_query_ratio = 0
  mean_relevant_ratio = 0

  gemmean_precision = 0
  gemmean_recall = 0
  gemmean_accuracy = 0

  for query_index in indices:

    query_vector = queries[query_index]
    query_label = query_labels[query_index]
    candidates = candidates_by_query[query_index]
    filtered = filter_results(vectors, candidates, query_vector, threshold / 10).flatten()
    ground_truth = filter_results(vectors, np.arange(0, vectors.shape[0]), query_vector, threshold / 10).flatten()

    recall, query_ratio, relevant_ratio, fp, fn = evaluate_hash_candidates(candidates, filtered, query_vector, vectors, ground_truth=ground_truth)

    sc,ic = evaluate_result(filtered, labels, query_label)
    gsc,gic = evaluate_result(ground_truth, labels, query_label)

    precision, recall, accuracy = ic[0], ic[1], ic[2]
    mean_precision += precision
    mean_recall += recall
    mean_accuracy += accuracy
    mean_query_ratio += query_ratio
    mean_relevant_ratio += relevant_ratio

    gemprecision, gemrecall, gemaccuracy = gic[0], gic[1], gic[2]
    gemmean_precision += gemprecision
    gemmean_recall += gemrecall
    gemmean_accuracy += gemaccuracy

  nd_precisions.append(mean_precision / query_count)
  nd_recalls.append(mean_recall / query_count)
  nd_accuracies.append(mean_accuracy / query_count)

  nd_precisions_gem.append(gemmean_precision / query_count)
  nd_recalls_gem.append(gemmean_recall / query_count)
  nd_accuracies_gem.append(gemmean_accuracy / query_count)

  print("Results for threshold: ", threshold / 10)
  print("Recall:                ", mean_recall / query_count)
  print("Precision:             ", mean_precision / query_count)
  print("Accuracy:              ", mean_accuracy / query_count)
  print("gemRecall:             ", gemmean_recall / query_count)
  print("gemPrecision:          ", gemmean_precision / query_count)
  print("gemAccuracy:           ", gemmean_accuracy / query_count)
  print("Queried % of database: ", mean_query_ratio / query_count)
  print("True Positive Ratio:   ", mean_relevant_ratio / query_count)
  print("")

In [None]:
# Score the most recent result once more and split both returned tuples into named
# values. `filtered` and `query_label` are whatever the last evaluated
# (threshold, query) pair left behind, so this depends on the cell run before it.
sc_ev, ic_ev = evaluate_result(filtered, labels, query_label)
(sc_precision, sc_recall, sc_accuracy), (ic_precision, ic_recall, ic_accuracy) = sc_ev, ic_ev
print(sc_ev, ic_ev)

In [None]:
# Plot the averaged metrics collected by the most recently run evaluation cell.
# x_axis repeats the thresholds of that sweep (loop value / 10). The thresholds are
# compared with arccos angles, so the axis is an angular distance in radians.
x_axis = np.arange(0, 15, 0.1) / 10

curve_sets = (
  ("LSH candidates", nd_recalls, nd_precisions, nd_accuracies),
  ("Exhaustive search over all vectors", nd_recalls_gem, nd_precisions_gem, nd_accuracies_gem),
)

for plot_title, recalls, precisions, accuracies in curve_sets:
  fig, ax = plt.subplots(figsize=(7,5))
  ax.plot(x_axis, recalls, label="recall")
  ax.plot(x_axis, precisions, label="precision")
  ax.plot(x_axis, accuracies, label="accuracy")
  ax.legend(loc="center left")
  ax.set_title(plot_title)
  ax.set_xlabel('Angular distance threshold (radians)')
  plt.show()
