# Visual Genome Region Description Embeddings

In [1]:
import json
import os
import pickle
import urllib.request

import h5py
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from matplotlib.patches import Rectangle
from nltk.cluster import KMeansClusterer
from PIL import Image
from scipy.spatial import distance_matrix, distance
from sentence_transformers import SentenceTransformer, util
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from visual_genome import api as vg

2023-01-20 13:59:45.374521: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Init paths

In [2]:
# --- Visual Genome: original annotations and image folder ---
rel_path = '/home/maelic/Documents/PhD/Datasets/VisualGenome/original_annotations/relationships.json'
region_path = '/home/maelic/Documents/PhD/Datasets/VisualGenome/original_annotations/region_descriptions.json'
vg_img_path = '/home/maelic/Documents/PhD/Datasets/VisualGenome/VG_100K'

# --- VG80K HDF5 file from the project's data_tools folder ---
# NOTE(review): these are machine-specific absolute paths; adjust them before
# running the notebook on another machine.
h5_path = '/home/maelic/Documents/PhD/MyModel/PhD_Commonsense_Enrichment/VG_refinement/data_tools/VG80K/VG80K-SGG.h5'

In [3]:
# Read the relationship annotations from disk (path: rel_path).
# The context manager closes the file handle as soon as parsing is done;
# the previous `json.load(open(...))` left it open until garbage collection.
with open(rel_path, encoding="utf-8") as rel_file:
    relations_file = json.load(rel_file)

## SentenceTransformer

In [3]:
# Pre-trained sentence-embedding checkpoint. Possible values:
#   'all-mpnet-base-v2'    : Best accuracy overall
#   'all-MiniLM-L12-v2'    : Faster
#   'all-roberta-large-v1' : Better but slower
model_name = 'all-mpnet-base-v2'

# Encoder used for every embedding computed in this notebook.
model = SentenceTransformer(model_name)

## Read images and their descriptions


In [5]:
## Read images triplets
def _entity_label(entity):
    """Return the label of a relationship endpoint.

    Uses the first entry of 'names' when that key is present and falls back
    to 'name' otherwise.
    """
    if 'names' in entity:
        return entity['names'][0]
    return entity['name']


relationships = []
for idx, img in enumerate(relations_file):
    # One "<subject> <predicate> <object>" sentence per annotated relationship;
    # the predicate is lower-cased, subject and object labels are kept as-is.
    rel_img = [
        f"{_entity_label(rel['subject'])} {rel['predicate'].lower()} {_entity_label(rel['object'])}"
        for rel in img['relationships']
    ]
    # Images without any relationship are left out of the table.
    # NOTE(review): 'Image_id' is the position of the entry in relations_file,
    # not an id read from the annotation itself — confirm this is intended.
    if rel_img:
        relationships.append({'Image_id': idx, 'Triplets': rel_img})
relationships = pd.DataFrame(relationships)

In [6]:
# Sanity check: print the id of every image whose triplet list is empty,
# followed by how many such images were found.
# The counter is deliberately NOT called `sum`: rebinding that name in the
# notebook namespace hides the builtin, which later cells still call.
empty_count = 0
# Ids and triplets are paired positionally, so the check does not depend on
# the DataFrame index labels.
for image_id, triplets in zip(relationships['Image_id'], relationships['Triplets']):
    if len(triplets) == 0:
        empty_count += 1
        print(image_id)
print(empty_count)

0


# Get embeddings

In [4]:
region_emb_path = "/home/maelic/Documents/PhD/MyModel/PhD_Commonsense_Enrichment/VG_refinement/clustering/sentence_embeddings/"+model_name+"_triplets_embeddings.pkl"

if not os.path.exists(region_emb_path):
    # No cache for this model yet: encode the triplets of every image.
    values = []
    for corpus in tqdm(relationships["Triplets"].tolist()):
        if corpus:
            # One embedding row per triplet sentence ...
            emb = model.encode(corpus)
            # ... averaged column-wise, so each image ends up with a single
            # vector (its length depends on the chosen model).
            values.append(np.mean(np.array(emb), axis=0))
        else:
            # Placeholder for an image without triplets.
            # NOTE(review): a scalar 0 is not a vector and would break the
            # clustering cells; the cell that builds `relationships` skips
            # images without triplets, so this branch is presumably never hit.
            values.append(0)

    relationships['embeddings'] = pd.Series(values, index=relationships.index)

    # Create the cache folder first so the slow encoding above is not lost to
    # a FileNotFoundError when the folder does not exist yet.
    os.makedirs(os.path.dirname(region_emb_path), exist_ok=True)
    with open(region_emb_path, "wb") as fOut:
        pickle.dump(relationships, fOut, protocol=pickle.HIGHEST_PROTOCOL)

else:
    # SECURITY: unpickling can execute arbitrary code — only load cache files
    # that this notebook wrote itself.
    with open(region_emb_path, "rb") as fIn:
        relationships = pickle.load(fIn)

In [5]:
# Column labels of the (possibly cache-loaded) table.
print(relationships.columns)

Index(['Image_id', 'Triplets', 'embeddings'], dtype='object')


# Clustering

In [6]:
def clustering_question(triplets_embeddings, NUM_CLUSTERS):
    """Cluster the per-image embeddings with NLTK k-means under cosine distance.

    Parameters
    ----------
    triplets_embeddings : pandas.DataFrame
        Must provide the columns 'Image_id', 'Triplets' and 'embeddings'.
    NUM_CLUSTERS : int
        Number of clusters requested from KMeansClusterer.

    Returns
    -------
    data : pandas.DataFrame
        Copy of the three input columns plus 'cluster' (cluster index assigned
        to the row) and 'centroid' (mean vector of that cluster).
    assigned_clusters : list
        Cluster index of every row, in row order.

    Notes
    -----
    No `rng` is passed to KMeansClusterer, so the result is presumably not
    reproducible from one run to the next — TODO confirm / seed if needed.
    """
    # Work on a copy so the caller's frame is not modified.
    data = triplets_embeddings[['Image_id', 'Triplets', 'embeddings']].copy()

    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(data['embeddings'], assign_clusters=True)
    data['cluster'] = pd.Series(assigned_clusters, index=data.index)

    # Fetch the means once instead of calling kclusterer.means() for every row.
    means = kclusterer.means()
    data['centroid'] = data['cluster'].apply(lambda cluster_id: means[cluster_id])

    return data, assigned_clusters

def distance_from_centroid(row):
    """Return the distance between a row's embedding and its cluster centroid.

    `row` must expose 'embeddings' and 'centroid'; the value is the single
    entry of the 1x1 matrix produced by scipy's `distance_matrix`.
    """
    embedding_rows = [row['embeddings']]
    # The centroid is turned into a plain list because its type differs from
    # the embedding's.
    centroid_rows = [row['centroid'].tolist()]
    pairwise = distance_matrix(embedding_rows, centroid_rows)
    return pairwise[0][0]
    
def make_clusters(triplets_embeddings, n_clusters):
    """Cluster the embeddings and attach each row's distance to its centroid.

    Returns the frame produced by `clustering_question` with one extra
    column, 'distance_from_centroid'.
    """
    clustered, _ = clustering_question(triplets_embeddings, NUM_CLUSTERS=n_clusters)
    # Row-wise distance between every embedding and the centroid it was assigned to.
    clustered['distance_from_centroid'] = clustered.apply(distance_from_centroid, axis=1)
    return clustered

In [11]:
def nltk_inertia(feature_matrix, centroid):
    """Sum of squared differences between each sample and its centroid.

    Parameters
    ----------
    feature_matrix : sequence of array-like
        One feature vector per sample.
    centroid : sequence of array-like
        `centroid[i]` is the centroid associated with `feature_matrix[i]`.

    Returns
    -------
    float
        Total of the per-sample squared differences (0.0 for empty input).
    """
    # Accumulate manually rather than through the builtin `sum`: an earlier
    # cell of this notebook rebinds the name `sum` to an int, which would make
    # a `sum(...)` call fail here.
    total = 0.0
    for row_idx, features in enumerate(feature_matrix):
        total += np.sum((features - centroid[row_idx]) ** 2)
    return total
def number_of_clusters(image_regions, max_clusters=20):
    """Draw an elbow plot of the clustering error for k = 2 .. max_clusters.

    For every k the data is clustered with `clustering_question` and the sum
    of squared distances to the assigned centroids (`nltk_inertia`) is
    recorded; the curve is then shown with matplotlib. Nothing is returned.
    """
    candidate_ks = list(range(2, max_clusters + 1))

    sse = []
    for k in tqdm(candidate_ks):
        clustered, _ = clustering_question(image_regions, k)
        embeddings = clustered['embeddings'].to_numpy()
        centroids = clustered.centroid.to_numpy()
        sse.append(nltk_inertia(embeddings, centroids))

    # Error against k: the "elbow" suggests a reasonable number of clusters.
    plt.figure(figsize=(12, 6))
    plt.plot(candidate_ks, sse, '-o')
    plt.title('Elbow method for '+model_name)
    plt.xlabel('Number of clusters k')
    plt.ylabel('Sum of squared distance')
    plt.show()

number_of_clusters(relationships, max_clusters=20)


 16%|█▌        | 3/19 [00:43<04:15, 15.97s/it]

: 

: 

## TSNE visualizations

In [9]:
def tsne_visual(embedding_clusters, n_clusters=0, save=False, random_state=None):
    """Project the embeddings to 2-D with t-SNE and scatter-plot them by cluster.

    Parameters
    ----------
    embedding_clusters : pandas.DataFrame
        Needs the columns 'embeddings' (one vector per row) and 'cluster'.
    n_clusters : int, optional
        Only used in the name of the saved figure.
    save : bool, optional
        When True the figure is written to
        "visualization/triplets/<model_name>_tsne_clusters_<n_clusters>.png".
    random_state : int or None, optional
        Forwarded to TSNE; pass an int to get the same projection on every
        run. The default (None) keeps the previous, unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The 2-D t-SNE coordinates, one row per input row.
    """
    # Stack the per-row vectors into a regular 2-D array (np.matrix is no
    # longer recommended by NumPy and was converted back to an array anyway).
    features = np.asarray(embedding_clusters['embeddings'].tolist())
    t_sne = TSNE(n_components=2, random_state=random_state)
    low_dim_data = t_sne.fit_transform(features)
    print('Lower dim data has shape',low_dim_data.shape)
    # Index the 2-D points by cluster label so the label can drive the hue.
    tsne_df =  pd.DataFrame(low_dim_data, embedding_clusters['cluster'])
    plt.figure(figsize=(20,12))
    # x / y / hue are passed as vectors, so no `data=` source is needed
    # (a Series is not a valid data source for recent seaborn releases).
    ax = sns.scatterplot(x=tsne_df[0], y=tsne_df[1], hue=tsne_df.index, palette = "viridis", s=80)
    ax.set_title('T-SNE '+model_name+' Embeddings')
    plt.draw()
    if save:
        out_dir = "visualization/triplets/"
        # savefig does not create missing folders.
        os.makedirs(out_dir, exist_ok=True)
        plt.savefig(out_dir+model_name+"_tsne_clusters_"+str(n_clusters)+".png")
    return low_dim_data

## Similar image retrieval

In [10]:
%%javascript
// Sets the output-area auto-scroll threshold to 9999, presumably so long
// outputs are not collapsed into a scroll box.
// NOTE(review): relies on the `IPython` JavaScript global — confirm it exists
// in the notebook front-end in use.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [11]:
def find_distances(images_regions, input_id, input_embedding = np.zeros(5)):
    """Rank images by Euclidean distance to a reference embedding.

    Parameters
    ----------
    images_regions : pandas.DataFrame
        Needs the columns 'Image_id' and 'embeddings' (one vector per row).
    input_id : int
        When >= 0, the embedding of that image is the reference and the image
        itself is excluded from the ranking. A negative value means "use
        `input_embedding` instead".
    input_embedding : array-like, optional
        Reference vector used when `input_id` is negative. It must have as
        many elements as the stored embeddings; the default is only a
        placeholder.

    Returns
    -------
    pandas.DataFrame
        Columns 'image_id' and 'distances', sorted by ascending distance.

    Raises
    ------
    IndexError
        If `input_id` is >= 0 but absent from `images_regions`.

    Side effects
    ------------
    The sorted table is also written to 'distances.csv' in the current
    working directory.
    """
    if input_id >= 0:
        is_reference = images_regions.Image_id == input_id
        if not is_reference.any():
            raise IndexError(f"Image id {input_id} not found in images_regions")
        reference_embedding = images_regions.loc[is_reference, 'embeddings'].values[0]
        # Never compare the reference image with itself.
        corpus = images_regions.loc[~is_reference]
    else:
        reference_embedding = input_embedding
        corpus = images_regions

    reference_vector = np.asarray(reference_embedding).flatten()

    # Ids and embeddings are paired positionally. The previous label-based
    # loop over range(len(...)) on the filtered Series skipped the last image
    # and assumed a 0..n-1 index.
    image_ids = corpus['Image_id'].tolist()
    dists = [
        distance.euclidean(reference_vector, np.asarray(embedding).flatten())
        for embedding in corpus['embeddings']
    ]

    # store in df
    col_names = ['image_id', 'distances']
    distances_df = pd.DataFrame({'image_id': image_ids, 'distances': dists}, columns=col_names)
    distances_df = distances_df.sort_values(by='distances', ascending=True)
    distances_df.to_csv('distances.csv', index=False)
    return distances_df

In [12]:
# given image id (or a query embedding) retrieve its k most similar images

def retrieve_images(key, images_regions, input_id = -1, input_embedding = np.zeros(5), top_k = 10):
    """Print and display the `top_k` images closest to a reference.

    Parameters
    ----------
    key : any
        Not used by this function; kept so existing calls keep working.
    images_regions : pandas.DataFrame
        Table handed to `find_distances`.
    input_id : int, optional
        Id of the reference image. A negative value (the default) means the
        reference is `input_embedding`.
    input_embedding : array-like, optional
        Query vector used when `input_id` is negative.
    top_k : int, optional
        Number of neighbours to report (default 10, the former fixed value).

    Returns
    -------
    pandas.DataFrame
        The `top_k` first rows of the table returned by `find_distances`.

    Notes
    -----
    `retrieve_image` (singular) is called to show each image; it is not
    defined in the visible part of this notebook and must exist before this
    function is called.
    """
    # Image ids start at 0 in this notebook, so 0 is a valid reference id.
    if input_id >= 0:
        print('Reference image:', input_id)
        retrieve_image(input_id)
    # The embedding is always forwarded; find_distances decides which
    # reference to use.
    distances_df = find_distances(images_regions, input_id, input_embedding)
    top_images = distances_df.head(top_k)

    print("Top", top_k, "most similar images to image", input_id, "in Visual Genome:")
    for image_id, dist in zip(top_images['image_id'], top_images['distances']):
        im_id = int(image_id)
        print("Image id:", im_id, "Euclidean distance: %.4f" % (dist))

        # find similar images from api and show
        retrieve_image(im_id)

    return top_images

# Method: all-mpnet-base-v2

In [15]:
clusters_max = 21

# Every figure and the score log of this cell go to this folder.
out_dir = "visualization/triplets/"
os.makedirs(out_dir, exist_ok=True)

score_list = {}

for n_clusters in range(2, clusters_max):
    embedding_clusters = make_clusters(relationships, n_clusters)
    print('Number of clusters:', n_clusters)
    print(embedding_clusters.groupby('cluster').count())
    low_embedding_clusters = tsne_visual(embedding_clusters, n_clusters=n_clusters, save=True)

    # Silhouette values are measured on the 2-D t-SNE coordinates,
    # not on the original embeddings.
    embedding_samples = low_embedding_clusters
    labels = embedding_clusters['cluster'].to_numpy()
    silhouette_vals = silhouette_samples(embedding_samples, labels)
    # The average score is the mean of the per-sample values, so a second
    # (expensive) silhouette_score pass over the data is unnecessary.
    avg_score = np.mean(silhouette_vals)
    print(f"For n_clusters = {n_clusters} The average silhouette_score is : {avg_score}")
    score_list[n_clusters] = {'score': avg_score, 'silhouette_vals': silhouette_vals}

    # Silhouette plot: one horizontal band of sorted values per cluster.
    plt.figure(figsize=(20,12))
    y_lower, y_upper = 0, 0
    # The band index gets its own name: reusing the outer loop variable here
    # made the title and the file name below report the wrong k.
    for band_idx, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        plt.text(-0.03, (y_lower + y_upper) / 2, str(band_idx + 1))
        y_lower += len(cluster_silhouette_vals)

    plt.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    plt.yticks([])
    plt.xlim([-0.1, 1])
    plt.xlabel('Silhouette coefficient values')
    plt.ylabel('Cluster labels')
    plt.title(f'Silhouette analysis using k = {n_clusters}, average score = {avg_score:.3f}', y=1.02)
    plt.draw()
    plt.savefig(out_dir+model_name+"_silhouette_clusters_embeddings_"+str(n_clusters)+".png")

    # Show this iteration's figures, then release them: two 20x12 figures per
    # k would otherwise stay in memory for the whole sweep.
    plt.show()
    plt.close('all')

# write to log file:
with open(out_dir+model_name+'_clusters_score_log_.txt', 'w') as f:
    for key, value in score_list.items():
        f.write('%s: %s \n' % (key, value))

Number of clusters: 4
         Image_id  Triplets  embeddings  centroid  distance_from_centroid
cluster                                                                  
0           27513     27513       27513     27513                   27513
1           16752     16752       16752     16752                   16752
2           29979     29979       29979     29979                   29979
3           30608     30608       30608     30608                   30608
Lower dim data has shape (104852, 2)
For n_clusters = 4 The average silhouette_score is : 0.24654890596866608
Number of clusters: 5
         Image_id  Triplets  embeddings  centroid  distance_from_centroid
cluster                                                                  
0           26716     26716       26716     26716                   26716
1           12138     12138       12138     12138                   12138
2           26735     26735       26735     26735                   26735
3           23212     23212     

: 

: 

In [None]:
embeddings_method = "embeddings"
num_clusters = 9

# Cluster once with the chosen k and plot the t-SNE projection.
embedding_clusters = make_clusters(relationships, num_clusters)
print('Number of clusters:', num_clusters)
print(embedding_clusters.groupby('cluster').count())
low_embedding_clusters = tsne_visual(embedding_clusters, n_clusters=num_clusters, save=True)

# Silhouette values on the 2-D t-SNE coordinates.
embedding_samples = low_embedding_clusters
labels = embedding_clusters['cluster'].to_numpy()
silhouette_vals = silhouette_samples(embedding_samples, labels)
# Mean of the per-sample values; this avoids a second full silhouette pass.
# (The empty plt.figure() that used to be opened here drew nothing and was dropped.)
avg_score = np.mean(silhouette_vals)

In [None]:
print(embedding_clusters.keys())
# Peek at the first 11 images assigned to cluster 8: id, then triplet sentences.
# The sentences live in the 'Triplets' column; 'region_sentences' does not
# exist in this table and raised a KeyError.
cluster_members = embedding_clusters[embedding_clusters['cluster'] == 8].head(11)
for image_id, triplets in zip(cluster_members['Image_id'], cluster_members['Triplets']):
    print(image_id)
    print(triplets)

In [None]:
# Column labels of the clustered table.
print(embedding_clusters.columns)

## Similar image retrieval based on given image id


In [None]:
input_id = 1
# `image_regions` is never defined in this notebook; the table holding
# 'Image_id' and 'embeddings' is `relationships`.
top_images = retrieve_images(embeddings_method, relationships, input_id = input_id)

In [None]:
# Display the table returned by retrieve_images for the reference image.
top_images

## Similar image retrieval based on given user sentence


In [None]:
# Free-text query used for the sentence-based retrieval below.
input_sentence = "a cat is sleeping on the beach"

In [None]:
# Encode the query with the same model that produced the stored embeddings.
# `get_embeddings` and `model_4` are not defined anywhere in this notebook.
input_embedding = model.encode(input_sentence)

In [None]:
# Shape of the query embedding; it should match the size of the stored
# image embeddings for the distance computation to work.
input_embedding.shape

In [None]:
# `image_regions` is never defined in this notebook; the table holding
# 'Image_id' and 'embeddings' is `relationships`.
top_images_2 = retrieve_images(embeddings_method, relationships, input_id = -1, input_embedding=input_embedding)

In [None]:
# Display the table returned by retrieve_images for the query sentence.
top_images_2