# Visual Genome Region Description Embeddings

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from visual_genome import api as vg
import urllib.request
from PIL import Image
import tensorflow as tf
import json
import nltk
import time
import torch
import re
import seaborn as sns
import pandas as pd
from nltk.cluster import KMeansClusterer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix, distance
from sklearn.manifold import TSNE
from tqdm import tqdm
import pickle
import os
from utils import *

from sklearn.metrics import silhouette_samples, silhouette_score

## SentenceTransformer

In [None]:
# Name of the pre-trained SentenceTransformer checkpoint to load.
# It is also reused further down to build cache and figure file names.
model_name='all-mpnet-base-v2'

# Possible values:
# 'all-mpnet-base-v2' : Best accuracy overall
# 'all-MiniLM-L12-v2' : Faster
# 'all-roberta-large-v1' : Better but slower

# Instantiate the sentence encoder used by the cells below.
model = SentenceTransformer(model_name)

In [None]:
def get_embeddings(model, region_sentences):
    """Encode ``region_sentences`` with ``model`` and return the result.

    Thin wrapper around ``model.encode``: whatever that call returns is
    handed back unchanged.
    """
    return model.encode(region_sentences)

## Read images and their descriptions


In [None]:
## Read images and their descriptions
import ast

image_regions = pd.read_csv("/home/maelic/Documents/PhD/ModelZoo/visual-genome-embeddings/create image regions/image_regions.csv")
# The "region_sentences" column is read back as text and has to be parsed into
# Python objects. ast.literal_eval only accepts literals, so unlike eval() the
# CSV content can never execute code.
image_regions["region_sentences"] = image_regions["region_sentences"].apply(ast.literal_eval)
# Plain list with one entry per image, consumed by the embedding cell.
regions = image_regions["region_sentences"].tolist()

In [None]:
# List the column labels of the freshly loaded table.
print(image_regions.keys())

# Get embeddings

In [None]:
# Cache file for the per-image embeddings; the model name is part of the file
# name, so every model gets its own pickle.
region_emb_path = "/home/maelic/Documents/PhD/MyModel/PhD_Commonsense_Enrichment/VG_refinement/clustering/sentence_embeddings/"+model_name+"_regions_embeddings.pkl"

if not os.path.exists(region_emb_path):
    # Width of one sentence embedding; needed to build a placeholder of the
    # right shape for images that have no sentences at all.
    emb_dim = model.get_sentence_embedding_dimension()
    values = []
    for row_pos, corpus in enumerate(tqdm(regions)):
        if corpus:
            # model.encode returns one row per sentence; average column-wise
            # so that each image ends up with a single vector.
            emb = np.mean(np.array(model.encode(corpus)), axis=0)
        else:
            # Previously the scalar 0 was stored here, which made the column
            # ragged and broke np.array(...) / the clusterer downstream.
            # NOTE(review): a zero vector has an undefined cosine distance;
            # consider dropping such rows before clustering.
            emb = np.zeros(emb_dim)
            print("No region sentences for row", row_pos, "- storing a zero vector")
        values.append(emb)

    # One embedding vector per image, aligned on the table's own index.
    image_regions['embeddings'] = pd.Series(values, index=image_regions.index)

    with open(region_emb_path, "wb") as fOut:
        pickle.dump(image_regions,fOut,protocol=pickle.HIGHEST_PROTOCOL)

else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(region_emb_path, "rb") as fIn:
        image_regions = pickle.load(fIn)

In [None]:
# List the column labels again, now that the table was either extended with
# 'embeddings' or replaced by the cached pickle.
print(image_regions.keys())

# Clustering

In [None]:
def clustering_question(images_regions, key, NUM_CLUSTERS):
    """Cluster the vectors in column ``key`` with NLTK's k-means (cosine distance).

    Args:
        images_regions: DataFrame with ``Image_id``, ``region_sentences`` and
            ``key`` columns.
        key: Name of the column holding one embedding vector per row.
        NUM_CLUSTERS: Number of clusters to fit.

    Returns:
        A ``(data, assigned_clusters)`` tuple. ``data`` is a copy of the three
        input columns plus ``cluster`` (assigned cluster index per row) and
        ``centroid`` (the mean vector of that cluster). ``assigned_clusters``
        is the raw result of ``KMeansClusterer.cluster``.
    """
    data = images_regions[['Image_id', 'region_sentences', key]].copy()

    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(data[key], assign_clusters=True)

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    # Fetch the centroids once rather than calling means() for every row.
    centroids = kclusterer.means()
    data['centroid'] = data['cluster'].apply(lambda cluster_idx: centroids[cluster_idx])

    return data, assigned_clusters

In [None]:
def clustering_question(images_regions, key, NUM_CLUSTERS):
    """Cluster the vectors in column ``key`` with NLTK's k-means (cosine distance).

    Args:
        images_regions: DataFrame with ``Image_id``, ``region_sentences`` and
            ``key`` columns.
        key: Name of the column holding one embedding vector per row.
        NUM_CLUSTERS: Number of clusters to fit.

    Returns:
        A ``(data, assigned_clusters)`` tuple. ``data`` is a copy of the three
        input columns plus ``cluster`` (assigned cluster index per row) and
        ``centroid`` (the mean vector of that cluster). ``assigned_clusters``
        is the raw result of ``KMeansClusterer.cluster``.
    """
    data = images_regions[['Image_id', 'region_sentences', key]].copy()

    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(data[key], assign_clusters=True)

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    # Fetch the centroids once rather than calling means() for every row.
    centroids = kclusterer.means()
    data['centroid'] = data['cluster'].apply(lambda cluster_idx: centroids[cluster_idx])

    return data, assigned_clusters

In [None]:
def distance_from_centroid(row, key='embeddings'):
    """Return the Euclidean distance between a row's vector and its centroid.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing ``row[key]``
            and ``row['centroid']``; the centroid must offer ``.tolist()``.
        key: Name of the entry holding the row's vector. Defaults to
            ``'embeddings'`` so the function can still be passed directly to
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The single distance as a float.
    """
    # distance_matrix works on 2-D inputs, hence the one-element lists; the
    # embedding and the centroid have different types, so the centroid is
    # converted with tolist().
    return distance_matrix([row[key]], [row['centroid'].tolist()])[0][0]

In [None]:
def nltk_inertia(feature_matrix, centroid):
    """Sum of squared differences between each row and its paired centroid.

    Row ``i`` of ``feature_matrix`` is compared with ``centroid[i]``; the
    per-row squared distances are added up and the total is returned.
    """
    n_rows = feature_matrix.shape[0]
    squared_errors = (
        np.sum((feature_matrix[idx] - centroid[idx]) ** 2)
        for idx in range(n_rows)
    )
    return sum(squared_errors)
    
def number_of_clusters(image_regions, key):
    """Draw the elbow curve for column ``key``.

    Runs ``clustering_question`` for every k from 2 to 30, records the sum of
    squared distances to the assigned centroids for each k, and plots that
    value against k.
    """
    candidate_ks = list(range(2,31))
    sse = []
    for k in tqdm(candidate_ks):
        clustered, _ = clustering_question(image_regions, key, k)
        features = clustered[key].to_numpy()
        centroids = clustered.centroid.to_numpy()
        sse.append(nltk_inertia(features, centroids))

    # Elbow plot: SSE as a function of k.
    plt.figure(figsize=(12, 6))
    plt.title('Elbow method for '+model_name)
    plt.plot(candidate_ks, sse, '-o')
    plt.xlabel('Number of clusters k')
    plt.ylabel('Sum of squared distance')
    plt.show()

#number_of_clusters(image_regions, 'embeddings')

In [None]:
def make_clusters(key, images_regions, n_clusters):
    """Cluster column ``key`` and attach each row's distance to its centroid.

    Args:
        key: Name of the column holding the embedding vectors.
        images_regions: DataFrame passed through to ``clustering_question``.
        n_clusters: Number of clusters to fit.

    Returns:
        The DataFrame produced by ``clustering_question`` with an extra
        ``distance_from_centroid`` column.
    """
    data, _ = clustering_question(images_regions, key, NUM_CLUSTERS = n_clusters)
    # distance_from_centroid reads the vector from 'embeddings', but `data`
    # only carries the column named `key`. Hand it an adapted row so that any
    # key works instead of raising KeyError for keys other than 'embeddings'.
    data['distance_from_centroid'] = data.apply(
        lambda row: distance_from_centroid(
            {'embeddings': row[key], 'centroid': row['centroid']}),
        axis=1)
    return data

## TSNE visualizations

In [None]:
def tsne_visual(embedding_clusters, key, n_clusters=0, save=False):
    """Project the vectors of column ``key`` to 2-D with t-SNE and plot them.

    Args:
        embedding_clusters: DataFrame with a ``key`` column of equally sized
            vectors and a ``cluster`` column used to colour the points.
        key: Name of the column holding the vectors.
        n_clusters: Only used in the file name of the saved figure.
        save: When true, write the figure below ``visualization/``.

    Returns:
        The 2-D coordinates returned by ``TSNE.fit_transform``.
    """
    # Plain 2-D ndarray; np.matrix is discouraged by NumPy and was converted
    # straight back to an array anyway.
    features = np.array(embedding_clusters[key].tolist())
    t_sne = TSNE(n_components=2)
    low_dim_data = t_sne.fit_transform(features)
    print('Lower dim data has shape',low_dim_data.shape)
    # Index the coordinates by cluster so the index can serve as the hue.
    tsne_df =  pd.DataFrame(low_dim_data, embedding_clusters['cluster'])
    plt.figure(figsize=(20,12))
    # x, y and hue are passed as vectors, so no `data=` source is needed.
    ax = sns.scatterplot(x=tsne_df[0], y=tsne_df[1], hue=tsne_df.index, palette = "viridis", s=80)
    ax.set_title('T-SNE BERT Embeddings')
    plt.draw()
    if save:
        # savefig does not create missing directories.
        os.makedirs("visualization", exist_ok=True)
        plt.savefig("visualization/"+model_name+"_tsne_clusters_"+str(n_clusters)+".png")
    return low_dim_data

## Similar image retrieval

In [None]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
def find_distances(images_regions, input_id, key, input_embedding = np.zeros(5)):
    """Rank all images by Euclidean distance to a reference embedding.

    Args:
        images_regions: DataFrame with an ``Image_id`` column and a ``key``
            column holding one vector per row.
        input_id: When greater than 0, the ``Image_id`` whose own vector is
            used as the reference; that image is left out of the result.
            Otherwise ``input_embedding`` is the reference.
        key: Name of the column holding the vectors.
        input_embedding: Reference vector used when ``input_id`` is not
            greater than 0.

    Returns:
        DataFrame with columns ``image_id`` and ``distances``, sorted by
        ascending distance. The same table is also written to
        ``distances.csv`` in the working directory.

    Raises:
        ValueError: If ``input_id`` is greater than 0 but not in the table.
    """
    if input_id > 0:
        matches = images_regions.loc[images_regions.Image_id == input_id, key]
        if matches.empty:
            raise ValueError(f"Image_id {input_id} not found in images_regions")
        reference = np.asarray(matches.values[0]).ravel()
    else:
        reference = np.asarray(input_embedding).ravel()

    # Walk ids and vectors in lockstep. The previous positional loop indexed a
    # filtered, label-indexed Series, which skipped the last row and broke on
    # any index other than 0..n-1.
    distances = []
    for image_id, embedding in zip(images_regions['Image_id'].tolist(),
                                   images_regions[key].tolist()):
        if image_id == input_id:      # avoid calculating distance with itself
            continue
        # euclidean distance between multidimensional vectors
        dist = distance.euclidean(reference, np.asarray(embedding).ravel())
        distances.append([image_id, dist])

    # store in df
    col_names = ['image_id', 'distances']
    distances_df = pd.DataFrame(distances, columns=col_names)
    distances_df = distances_df.sort_values(by='distances', ascending=True)
    distances_df.to_csv('distances.csv', index=False)
    return distances_df

In [None]:
# given image id (or a query embedding) retrieve its k most similar images

def retrieve_images(key, images_regions, input_id = -1, input_embedding = np.zeros(5), top_k=10):
    """Print and display the ``top_k`` images closest to a reference.

    Args:
        key: Name of the embedding column; forwarded to ``find_distances``.
        images_regions: DataFrame forwarded to ``find_distances``.
        input_id: When greater than 0, the reference image id; that image is
            displayed first. Otherwise ``input_embedding`` is the reference.
        input_embedding: Query vector used when ``input_id`` is not greater
            than 0.
        top_k: Number of results to return (previously fixed at 10).

    Returns:
        The first ``top_k`` rows of the table returned by ``find_distances``.
    """
    print('Images retrieved using method:', key)

    if input_id>0:
        print('Reference image:', input_id)
        # retrieve_image is not defined in this notebook; presumably it comes
        # from the `utils` star import and displays the image.
        retrieve_image(input_id)
    # find_distances ignores input_embedding whenever input_id > 0, so a
    # single call covers both modes.
    distances_df=find_distances(images_regions, input_id, key, input_embedding)
    top_images=distances_df.head(top_k)

    print("Top", top_k, "most similar images to image", input_id, "in Visual Genome:")
    for _, row in top_images.iterrows():
        im_id = int(row.image_id)
        print("Image id:", im_id, "Euclidean distance: %.4f" % (row.distances))

        # find similar images from api and show
        retrieve_image(im_id)

    return top_images

# Method: all-mpnet-base-v2

In [None]:
# Column of image_regions to cluster on, and the number of clusters to fit.
embeddings_method = "embeddings"
num_clusters = 9

# Cluster the embeddings; the result carries 'cluster', 'centroid' and
# 'distance_from_centroid' columns.
embedding_clusters=make_clusters(embeddings_method, image_regions, num_clusters)
print('Number of clusters:', num_clusters)
# Cluster index per row, as a NumPy array for the silhouette functions.
labels = embedding_clusters['cluster'].to_numpy()
#print(embedding_clusters.groupby('cluster').count())
# 2-D t-SNE coordinates; the scatter plot is also saved to disk (save=True).
low_embedding_clusters = tsne_visual(embedding_clusters, embeddings_method, n_clusters=num_clusters, save=True)

# NOTE(review): the silhouette is computed on the 2-D t-SNE projection, not on
# the original embeddings — confirm that this is intended.
silhouette_vals = silhouette_samples(low_embedding_clusters, labels)
avg_score = silhouette_score(low_embedding_clusters, labels)
print("Average silhouette score:", avg_score)

# plt.figure(figsize=(20,12))

# # Silhouette plot
# y_ticks = []
# y_lower, y_upper = 0, 0
# for i, cluster in enumerate(np.unique(labels)):
#     cluster_silhouette_vals = silhouette_vals[labels == cluster]
#     cluster_silhouette_vals.sort()
#     y_upper += len(cluster_silhouette_vals)
#     plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
#     plt.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
#     y_lower += len(cluster_silhouette_vals)

# avg_score = np.mean(silhouette_vals)
# plt.axvline(avg_score, linestyle='--', linewidth=2, color='green')
# plt.yticks([])
# plt.xlim([-0.1, 1])
# plt.xlabel('Silhouette coefficient values')
# plt.ylabel('Cluster labels')
# plt.title(f'Silhouette analysis using k = {i+1}', y=1.02)
# # title=f'Silhouette analysis using k = {i}, average score = {avg_score:.3f}'
# # plt.suptitle(title, fontsize=16, y=1.05)

In [None]:
# Relies on low_embedding_clusters, labels and silhouette_vals computed in the
# clustering cell above.
avg_score = silhouette_score(low_embedding_clusters, labels)
print("Average silhouette score:", avg_score)

plt.figure(figsize=(20,12))

# Silhouette plot
y_ticks = []  # never read afterwards
# Each cluster gets a contiguous band of horizontal bars; y_lower / y_upper
# track the band currently being drawn.
y_lower, y_upper = 0, 0
for i, cluster in enumerate(np.unique(labels)):
    # Boolean indexing returns a copy, so sorting leaves silhouette_vals intact.
    cluster_silhouette_vals = silhouette_vals[labels == cluster]
    cluster_silhouette_vals.sort()
    y_upper += len(cluster_silhouette_vals)
    plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
    # 1-based label placed at the vertical middle of the band.
    plt.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
    y_lower += len(cluster_silhouette_vals)

# Mean of the per-sample values, drawn as a dashed vertical reference line.
avg_score = np.mean(silhouette_vals)
plt.axvline(avg_score, linestyle='--', linewidth=2, color='green')
plt.yticks([])
plt.xlim([-0.1, 1])
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster labels')
# After the loop, i + 1 equals the number of distinct labels.
plt.title(f'Silhouette analysis using k = {i+1}', y=1.02)
# # title=f'Silhouette analysis using k = {i}, average score = {avg_score:.3f}'
# # plt.suptitle(title, fontsize=16, y=1.05)

In [None]:
# Column labels of the clustered table, then the number of rows per cluster.
print(embedding_clusters.keys())
print(embedding_clusters['cluster'].value_counts())

In [None]:
print(embedding_clusters.keys())
i=0  # not read anywhere in this cell
# Keep only the image id and its cluster index, and persist that mapping.
img_clusters = embedding_clusters[['Image_id','cluster']].copy()
print(img_clusters.keys())
print(img_clusters['cluster'].value_counts())
# Read back by the visualization cell further down.
img_clusters.to_csv('img_clusters.csv', index=False)

# Visualize image from cluster

In [None]:
def visualize_regions(image, regions):
    """Show ``image`` with a red box and caption for every region.

    Args:
        image: Image to display; passed straight to ``plt.imshow``.
        regions: Iterable of mappings with ``x``, ``y``, ``width``,
            ``height`` and ``phrase`` entries.
    """
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)

    plt.imshow(image)
    ax = plt.gca()
    for region in regions:
        # Unfilled bounding box of the region.
        ax.add_patch(Rectangle((region['x'], region['y']),
                               region['width'],
                               region['height'],
                               fill=False,
                               edgecolor='red',
                               linewidth=3))
        # Caption anchored at the box's (x, y) corner.
        ax.text(region['x'], region['y'], region['phrase'], style='italic', bbox={'facecolor':'white', 'alpha':0.7, 'pad':10})
    # tick_params expects booleans here; the string 'off' is truthy, so it
    # left the tick labels visible instead of hiding them.
    plt.tick_params(labelbottom=False, labelleft=False)
    plt.show()


def show_image_regions(vg_img_path, regions, image_id, max_regions=8):
    """Load an image from disk and display it with its first regions.

    Args:
        vg_img_path: Directory containing ``<image_id>.jpg`` files.
        regions: Iterable of mappings with an ``id`` and a ``regions`` list.
        image_id: Id of the image to show.
        max_regions: Number of regions to draw; kept small for readability
            (previously fixed at 8).

    Raises:
        ValueError: If no entry in ``regions`` has ``id == image_id``
            (this used to surface as an unbound-variable error).
    """
    image_file = os.path.join(vg_img_path, str(image_id) + '.jpg')
    # The context manager closes the file handle once the figure was shown.
    with Image.open(image_file) as image:
        entry = next((r for r in regions if r['id'] == image_id), None)
        if entry is None:
            raise ValueError(f"No region descriptions found for image {image_id}")
        # call with fewer regions for better visualization
        visualize_regions(image, entry['regions'][:max_regions])

In [None]:
# show image samples from selected cluster
import random

img_clusters = pd.read_csv('img_clusters.csv')
vg_img_path = '/home/maelic/Documents/PhD/Datasets/VisualGenome/VG_100K'
region_path = '/home/maelic/Documents/PhD/Datasets/VisualGenome/original_annotations/region_descriptions.json'
# Close the annotation file as soon as it has been parsed.
with open(region_path, 'r') as region_file:
    regions = json.load(region_file)
cluster_id = 8
nb_img = 5
cluster_images = img_clusters[img_clusters['cluster']==cluster_id]
print(len(cluster_images))
if len(cluster_images) > 0:
    for i in range(nb_img):
        # randrange excludes the upper bound; randint(0, len) could return
        # len itself and push iloc out of range.
        random_id = random.randrange(len(cluster_images))
        # Column 0 of img_clusters.csv is Image_id.
        show_image_regions(vg_img_path, regions, cluster_images.iloc[random_id,0])
else:
    print('No images in cluster', cluster_id)

In [13]:
# Export every image assigned to cluster 5.
# NOTE(review): presumably cluster 5 was identified as the indoor scenes in one
# particular clustering run; cluster indices can differ between runs — confirm.
indoor_vg = img_clusters[img_clusters['cluster']==5]
print(len(indoor_vg))
indoor_vg.to_csv('indoor_vg.csv', index=False)

17740


In [None]:
# NOTE(review): "embeddings_4" must exist as a column of image_regions; the
# embedding cell of this notebook creates 'embeddings' — confirm the name.
embeddings_method = "embeddings_4"
clusters_max = 21

score_list = {}

# Figures and the log file are all written below this directory.
os.makedirs("visualization", exist_ok=True)

for n_clusters in range(15, clusters_max):
    embedding_clusters=make_clusters(embeddings_method, image_regions, n_clusters)
    print('Number of clusters:', n_clusters)
    print(embedding_clusters.groupby('cluster').count())
    low_embedding_clusters = tsne_visual(embedding_clusters, embeddings_method, n_clusters=n_clusters, save=True)

    plt.figure(figsize=(20,12))

    # Silhouette values are computed on the 2-D t-SNE coordinates.
    embedding_samples = low_embedding_clusters
    labels = embedding_clusters['cluster'].to_numpy()
    silhouette_vals = silhouette_samples(embedding_samples, labels)
    avg_score = silhouette_score(embedding_samples, labels)
    print(f"For n_clusters = {n_clusters} The average silhouette_score is : {avg_score}")
    score_list[n_clusters] = {'score': avg_score, 'silhouette_vals': silhouette_vals}

    # Silhouette plot: one contiguous band of bars per cluster.
    y_lower, y_upper = 0, 0
    # The inner index has its own name; reusing the outer loop variable made
    # the title and the file name below report k - 1 instead of k.
    for cluster_pos, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        plt.text(-0.03, (y_lower + y_upper) / 2, str(cluster_pos + 1))
        y_lower += len(cluster_silhouette_vals)

    avg_score = np.mean(silhouette_vals)
    plt.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    plt.yticks([])
    plt.xlim([-0.1, 1])
    plt.xlabel('Silhouette coefficient values')
    plt.ylabel('Cluster labels')
    plt.title(f'Silhouette analysis using k = {n_clusters}', y=1.02)
    # title=f'Silhouette analysis using k = {i}, average score = {avg_score:.3f}'
    # plt.suptitle(title, fontsize=16, y=1.05)
    plt.draw()
    plt.savefig("visualization/"+model_name+"_silhouette_clusters_embeddings_"+str(n_clusters)+".png")

# write to log file:
# The file name previously used `key`, which is not defined at this point;
# the embedding column name is what identifies this sweep.
with open('visualization/'+model_name+'__clusters_log_'+embeddings_method+'.txt', 'w') as f:
    for k, value in score_list.items():
        f.write('%s: %s \n' % (k, value))

In [None]:
# Column labels of the table left over from the last clustering run.
print(embedding_clusters.keys())

## Similar image retrieval based on given image id


In [None]:
# Reference image id; values greater than 0 select the lookup-by-id mode.
input_id = 1
# embeddings_method keeps whatever value the most recently run cell gave it.
top_images = retrieve_images(embeddings_method, image_regions, input_id = input_id)

In [None]:
# Display the retrieved rows (image_id, distances).
top_images

## Similar image retrieval based on given user sentence


In [None]:
# Free-text query that is embedded and used as the reference below.
input_sentence = 'a cat is sleeping on the beach'

In [None]:
# NOTE(review): `model_4` is not defined in any cell visible here (only `model`
# is) — confirm that it comes from the `utils` star import, or use the model
# that produced the embedding column being searched.
input_embedding = get_embeddings(model_4, input_sentence)

In [None]:
# Check the shape of the query embedding.
input_embedding.shape

In [None]:
# input_id = -1 selects the query-embedding mode, so no image is excluded.
top_images_2 = retrieve_images(embeddings_method, image_regions, input_id = -1, input_embedding=input_embedding)

In [None]:
# Display the rows retrieved for the sentence query.
top_images_2