In [12]:
from PIL import Image

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
from dgl.nn import GraphConv
from factual_scene_graph.parser.scene_graph_parser import SceneGraphParser
import json

parser = SceneGraphParser('lizhuang144/flan-t5-base-VG-factual-sg', device='cpu')

captions_text_graph = {
    "552666": [("people" , "is" , "group of") , ( "people" , "ride" , "horse" ) , ( "people" , "ride through" , "field" )],
    "687618": [( "man" , "stand in" ,"shirt" ) , ( "shirt" , "is" , "blue" )],
    "405058": [( "counters" , "is" , "wooden" ) , ( "kitchen" , "is" , "simple" ) , ( "kitchen" , "with" , "stove" )],
    "703860": [( "adult" , "ride" , "motorcycle" ) , ( "adult" , "ride with" , "child" ) , ( "adults" , "ride" , "motorcycle" )],
    "776132": [( "kite" , "fly over" , "sky" ) , ( "kite" , "is" , "yellow" ) , ( "sky" , "is" , "large" )],
    "287571": [( "skate boarder" , "trick on" , "picnic table" )],
    "67000": [( "elephant" , "inside" , "fence" ) , ( "fence" , "is" , "wire" ) , ( "people" , "on side of" , "fence" )],
    "137494": [( "slices" , "eat from" , "pizza" )],
    "427130": [( "dog" , "is" , "brown" ) , ( "dog" , "walk across" , "field" ) , ( "field" , "is" , "green" ) , ( "frisbee", "in" , "mouth" )],
    "285328": [( "men" , "at" , "stop light" ) , ( "men" , "is" , "2" ) , ( "men" , "on" , "motorcycles" )],
    "549270": [( "silverware" , "have" , "handle" )],
    "270030": [( "area" , "is" , "grassy" ) , ( "zebras" , "is" , "4" ) , ( "zebras" , "walk in" , "area" )],
    "377603": [( "cow" , "is" , "statue" ) , ( "dog" , "look at" , "cow" )],
    "22348": [( "field" , "next to" ," forest ") , ( "horses" , "graze in" , "field" ) , ( "horses" , "is" , "2" )],
    "196658": [( "boy" , "in" , "helmet" ) , ( "boy" , "ride" , "skateboard" )],
    "231576": [( "mobile phone" , "on" , "display area" ) , ( "mobile phone" , "with" , "writing" ) , ( "writing" , "is" , "Asian" )],
    "491426": [( "flowers" , "next to" , "vase" ) , ( "pictures" , "next to" , "vase" ) , ( "vase" , "is" , "beautiful" )],
    "455622": [( "pizza" , "is" , "piece" ) , ( "plate" , "have" , "fork" ) , ( "plate" , "have" , "pizza" )],
    "452624": [( "boy" , "hold" , "tennis racquet" ) , ( "boy" , "on" , "court" )],
    "587826": [( "man" , "in" , "parade" ) , ( "man" , "on" , "motorcycle" )]
}

class SceneGraphGNN(nn.Module):
    def __init__(self, in_feats, hidden_feats, num_classes):
        super(SceneGraphGNN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_feats)  # First GraphConv layer
        self.conv2 = GraphConv(hidden_feats, num_classes)  # Second GraphConv layer
        self.relu = nn.ReLU()  # Activation function (ReLU)

    def forward(self, g, features):
        # Graph Convolutional Layers
        h = self.conv1(g, features)  # Apply first graph convolution
        h = self.relu(h)  # Apply ReLU activation
        h = self.conv2(g, h)  # Apply second graph convolution
        return h  # Return the output features (node embeddings)
    
def process_scene_graph(scene_graph):
    """
    Convert a scene graph in textual form into nodes and edges.
    """
    nodes = set()
    edges = []
    # print("Raw scene_graph:", scene_graph)
    # Parse the scene graph
    for relation in scene_graph:
        # print("relation:", relation)
        # print("len(relation)",len(relation))
        if len(relation) < 3:
            continue
        for i in range(0,len(relation),3):
            src = relation[i].strip().lstrip("(").strip()
            rel = relation[i + 1].strip().lstrip('v:').strip()
            dst = relation[i + 2].strip().rstrip(")").strip()
            # print(f"Processed relation: src = {src}, rel = {rel}, dst = {dst}")
            nodes.add(src)
            nodes.add(dst)
            edges.append((src, rel, dst))

    return list(nodes), edges

def create_dgl_graph(nodes, edges):
    """
    Create a DGL graph from nodes and edges.
    """
    # Map nodes and relations to unique IDs
    # print("nodes:", nodes)
    # print("edges:", edges)
    node_to_id = {node: i for i, node in enumerate(nodes)}
    # print("node_to_id", node_to_id)
    relation_to_id = {rel: i for i, (_, rel, _) in enumerate(edges)}
    # print("relation_to_id:", relation_to_id)

    # Prepare source and destination node indices
    src_nodes = [node_to_id[src] for src, _, _ in edges]
    # print("src_nodes:", src_nodes)
    dst_nodes = [node_to_id[dst] for _, _, dst in edges]
    # print("dst_nodes:", dst_nodes)

    # Create the graph
    g = dgl.graph((src_nodes, dst_nodes), num_nodes=len(nodes))
    # print("g (before adding self-loops):", g)

    # Add self-loops to the graph
    g = dgl.add_self_loop(g)
    # print("g (after adding self-loops):", g)

    # Update edge features to match the new number of edges
    original_edge_features = [relation_to_id[rel] for _, rel, _ in edges]
    self_loop_features = [-1] * len(nodes)  # Assign default value for self-loops
    all_edge_features = original_edge_features + self_loop_features
    g.edata['relation_type'] = torch.tensor(all_edge_features, dtype=torch.int64)
    # print("g with edge data:", g)

    return g, node_to_id, relation_to_id

input_dim = 64  # Input node feature dimension
hidden_dim = 128  # Hidden layer dimension
num_classes = 512  # Number of output classes (adjust based on your task)

# Initialize the model
model = SceneGraphGNN(input_dim, hidden_dim, num_classes)

# Load checkpoint
checkpoint = torch.load("gnn_model/model_checkpoint.pth")

# Access model and optimizer states
model.load_state_dict(checkpoint['model_state_dict'])

model.eval()

captions_graph_embeddings = {}

for key, converted_scene_graph in captions_text_graph.items():
    nodes , edges = process_scene_graph(converted_scene_graph)
    g, node_to_id, relation_to_id= create_dgl_graph(nodes , edges)
    g.ndata['feat'] = torch.rand(len(nodes), 64)  # Random 10-dimensional features
    logits = model(g, g.ndata['feat'])
    captions_graph_embeddings[key] = logits[0]

  checkpoint = torch.load("gnn_model/model_checkpoint.pth")


In [None]:
captions_graph_embeddings = {}

for key, converted_scene_graph in captions_text_graph.items():
    nodes , edges = process_scene_graph(converted_scene_graph)
    g, node_to_id, relation_to_id= create_dgl_graph(nodes , edges)
    g.ndata['feat'] = torch.rand(len(nodes), 64)  # Random 10-dimensional features
    logits = model(g, g.ndata['feat'])
    captions_graph_embeddings[key] = logits[0]

  assert input.numel() == input.storage().size(), "Cannot convert view " \


In [38]:
len(captions_graph_embeddings['552666'])

512

In [18]:
import numpy as np
import torch
from pkg_resources import packaging
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
from tqdm import tqdm
from chromadb.config import Settings
from annoy import AnnoyIndex
import json
import numpy as np
# print("Torch version:", torch.__version__)

import clip

clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [105]:
model, preprocess = clip.load("ViT-B/32")
# model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [106]:
def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    preprocessed = preprocess(image)
    image_input = torch.tensor(preprocessed).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        image_features = model.encode_image(image_input).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)
    return image_features

def get_text_embedding(text, model=model, device=device):
    input_token = clip.tokenize(text).to(device)
    with torch.no_grad():
        text_features = model.encode_text(input_token).float()
    text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features

def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [107]:
with open('output.json', 'r') as file:
    images = json.load(file)
print(len(images))

20


In [108]:
print(images)

{'552666': ['000000445512', '000000068120', '000000439180', '000000424812', '000000322511', '000000560880', '000000441009', '000000031667', '000000450037', '000000474934', '000000329336', '000000492282', '000000373119'], '687618': ['000000143554', '000000084258', '000000296524', '000000032400', '000000240274', '000000515347', '000000042201', '000000333182', '000000262175'], '405058': ['000000301827', '000000545734', '000000482022', '000000575367', '000000048905', '000000206705', '000000412978', '000000059383', '000000097240', '000000298461'], '703860': ['000000269254', '000000191846', '000000012744', '000000559665', '000000187857', '000000480021', '000000005205', '000000031965', '000000349437', '000000487450', '000000046077'], '776132': ['000000462755', '000000182279', '000000276458', '000000502090', '000000082293', '000000509270', '000000382743'], '287571': ['000000256868', '000000262148', '000000391365', '000000208808', '000000125071', '000000110551', '000000465911', '000000532211', 

In [109]:
import os
from PIL import Image

# Specify the folder path
combined_dict = {}

for folder in images:
    folder_path = f'/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/{str(folder)}'
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            image_path = os.path.join(folder_path, filename)
            img = Image.open(image_path)
            # print(filename[18:-4])
            id = int(folder + filename[18:-4])
            # # print(image_path)
            # # (type(id))
            combined_dict[id] = image_path

In [110]:
embeddings_combined_dict = {}

for folder in images:
    folder_path = f'/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/{str(folder)}'
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            image_path = os.path.join(folder_path, filename)
            img = Image.open(image_path)
            id = int(folder + filename[18:-4])
            basic_embeddings = get_image_embedding(image_path)[0]
            gnn_embeddings = captions_graph_embeddings[folder]
            embeddings_combined_dict[id] = result = torch.cat((basic_embeddings.unsqueeze(0), gnn_embeddings.unsqueeze(0)), dim=1)

  image_input = torch.tensor(preprocessed).unsqueeze(0)  # Add batch dimension


In [111]:
alternate_keys = {}
start = 1
for i in embeddings_combined_dict:
    alternate_keys[i] = start
    start += 1
    
reverse_alternate_keys = {}

for i, j in alternate_keys.items():
    reverse_alternate_keys[j] = i

In [113]:
f = 1024 # Number of Dimensions
t = AnnoyIndex(f)
for i, j in embeddings_combined_dict.items():
    t.add_item(alternate_keys[i], j[0]) # Adding the key-value pair of the AnnoyIndex

t.build(f) # Building 1024 trees for AnnoyIndex, more the number of trees, more the memory consumed, better are the results of ANN algorithm
t.save('image-search-tree-gnn-rn50.ann') # Saving the AnnoyIndex for faster reading

  t = AnnoyIndex(f)


True

In [114]:
search_space = AnnoyIndex(1024)
search_space.load('./image-search-tree-gnn-rn50.ann')

  search_space = AnnoyIndex(1024)


True

In [115]:
def text_image_search(folder, query : str, num : int = 10):
    query_vector = get_text_embedding(query)[0]
    gnn_embeddings = captions_graph_embeddings[folder]
    query_embeddings = torch.cat((query_vector.unsqueeze(0), gnn_embeddings.unsqueeze(0)), dim=1)
    ans = search_space.get_nns_by_vector(query_embeddings[0], num)
    return ans

def recall_at_k(actual, predicted, k):
    actual_set = set(actual)
    top_k_predicted = set(predicted[:k])
    relevant_hits = len(actual_set & top_k_predicted)
    if len(actual_set) == 0:
        return 0.0
    return round(relevant_hits / len(actual_set), 2)

In [116]:
with open('new-captions.json', 'r') as file:
    captions = json.load(file)
    
with open('output.json', 'r') as file:
    actual_images = json.load(file)
    
answers = {}
finals = {1: [], 2: [], 5: [], 10: []}
for i, caption in captions.items():
    ans = text_image_search(query=caption, folder=i)
    predicted = []
    for p in ans:
        key = reverse_alternate_keys[p]
        predicted.append(key)
    ls = actual_images[i]
    actual = [int(i+k[-6:]) for k in ls]
    answers[i] = []
    for val in [1, 2, 5, 10]:
        temp = recall_at_k(actual, predicted, val)
        finals[val].append(temp)
        answers[i].append(temp)

In [117]:
ekdum_final = {}
for i, ls in finals.items():
    ekdum_final[i] = round(sum(ls) / len(ls), 5)
    # ekdum_final[i] = % .4f % ekdum_final[i]
print(ekdum_final)

{1: 0.112, 2: 0.203, 5: 0.439, 10: 0.719}


In [118]:
answers

{'552666': [0.0, 0.0, 0.23, 0.31],
 '687618': [0.11, 0.22, 0.56, 0.89],
 '405058': [0.1, 0.2, 0.5, 1.0],
 '703860': [0.09, 0.18, 0.27, 0.45],
 '776132': [0.14, 0.29, 0.71, 0.86],
 '287571': [0.1, 0.2, 0.5, 0.8],
 '67000': [0.17, 0.33, 0.67, 0.67],
 '137494': [0.06, 0.06, 0.19, 0.31],
 '427130': [0.11, 0.22, 0.56, 1.0],
 '285328': [0.12, 0.12, 0.25, 0.62],
 '549270': [0.17, 0.33, 0.67, 0.67],
 '270030': [0.12, 0.25, 0.5, 1.0],
 '377603': [0.33, 0.33, 0.33, 1.0],
 '22348': [0.1, 0.2, 0.5, 0.9],
 '196658': [0.1, 0.2, 0.4, 0.5],
 '231576': [0.17, 0.33, 0.67, 0.83],
 '491426': [0.08, 0.15, 0.38, 0.77],
 '455622': [0.0, 0.1, 0.2, 0.5],
 '452624': [0.08, 0.17, 0.42, 0.75],
 '587826': [0.09, 0.18, 0.27, 0.55]}