In [1]:
import sys
sys.path.insert(0, "/notebooks/pipenv")
sys.path.insert(0, "/notebooks/nebula3_database")
sys.path.insert(0, "/notebooks/nebula3_experiments")
sys.path.insert(0, "/notebooks/nebula3_videoprocessing")
sys.path.insert(0, "/notebooks/")
from PIL import Image
import requests
import visual_genome.local as vg
import json
import copy
import operator
import itertools
import subprocess

import numpy as np
import torch
import spacy
import nltk
import openai
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 
from sentence_transformers import SentenceTransformer
from database.arangodb import DatabaseConnector
from config import NEBULA_CONF
from vg_eval import VGEvaluation, get_sc_graph, spice_get_triplets, tuples_from_sg
from videoprocessing.vlm_factory import VlmFactory
from videoprocessing.vlm_interface import VlmInterface
from videoprocessing.vlm_implementation import VlmChunker

In [2]:
# Fetch the WordNet corpus for NLTK (it only prints a notice when already up to date).
nltk.download('wordnet')
# Shared spaCy pipeline; later cells reach it through the module-level name `nlp`.
nlp = spacy.load('en_core_web_lg')
# nlp.add_pipe("spacy_wordnet", after='tagger', config={'lang': nlp.lang})

# Read the OpenAI API key from the first line of the key file rather than hardcoding it.
with open('/storage/keys/openai.key','r') as f:
    OPENAI_API_KEY = f.readline().strip()
openai.api_key = OPENAI_API_KEY

# Notebook-wide constants.
VG_DATA = '/storage/vg_data'  # NOTE(review): not referenced in the visible cells
IPC_COLLECTION = 'ipc_relations_spice'  # NOTE(review): not referenced in the visible cells
RECALL_COLLECTION = 'ipc_recall_spice'  # NOTE(review): not referenced in the visible cells
GLOBAL_TOKENS_COLLECTION = 's3_global_tokens'  # collection queried for image ids and per-image global tokens
FS_GPT_MODEL = 'text-davinci-002'  # completion model used for the few-shot prompts
class PIPELINE:
    """Holds the Nebula configuration values together with an open database handle.

    Attributes set by the constructor:
        db_host:  value of ``NEBULA_CONF().get_database_host()``.
        database: value of ``NEBULA_CONF().get_playground_name()``.
        gdb:      the ``DatabaseConnector`` instance.
        db:       result of ``gdb.connect_db(database)``.
    """

    def __init__(self):
        """Read the configuration first, then connect to the configured database."""
        settings = NEBULA_CONF()
        self.db_host = settings.get_database_host()
        self.database = settings.get_playground_name()
        self.gdb = DatabaseConnector()
        self.db = self.gdb.connect_db(self.database)
# Module-level connection; get_all_s3_ids() runs its query through `pipeline.db`.
pipeline = PIPELINE()
def get_all_s3_ids():
    """Return the ``image_id`` of every document in the global tokens collection.

    Returns:
        list: the ids in the order the database cursor yields them.
    """
    query = 'FOR doc IN {} RETURN doc.image_id'.format(GLOBAL_TOKENS_COLLECTION)
    # Materialize the cursor so callers get a plain, re-iterable list.
    return list(pipeline.db.aql.execute(query))

# All image ids found in the global tokens collection; later split into train/test pools.
s3_ids = get_all_s3_ids()
# Evaluator used by the triplet-recall cell further down.
evaluator = VGEvaluation()
def flatten(lst):
    """Concatenate the items of every iterable in ``lst`` into one flat list (one level deep)."""
    flat = []
    for inner in lst:
        flat.extend(inner)
    return flat

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
class GTBaseGenerator:
    """Builds GPT prompts for images from their stored global tokens.

    The constructor opens a database connection (via ``PIPELINE``) and loads
    the paragraph annotations from ``/storage/ipc_data/paragraphs_v1.json``.
    Each prompt is filled with the caption, the top scene labels and the top
    object labels of one image, optionally followed by its reference paragraph.
    """

    def __init__(self):
        self.pipeline = PIPELINE()
        # Use a context manager so the file handle is closed once the JSON is parsed.
        with open('/storage/ipc_data/paragraphs_v1.json','r') as f:
            self.ipc_data = json.load(f)
        # Keys selecting which source to read inside the global_* fields.
        self.global_captioner = 'blip'
        self.global_tagger = 'blip'
        self.places_source = 'blip'
        # Placeholders, in order: caption, places, objects.
        self.global_prompt1 = '''Caption of image: {}
This image is taking place in: {}
Tags: This image is about {}
Describe this image in detail:'''

    def get_image_id_from_collection(self, id,collection=GLOBAL_TOKENS_COLLECTION):
        """Return the document stored for image ``id`` in ``collection``.

        If several documents match, their fields are merged into one dict and
        later documents overwrite earlier ones. Returns an empty dict when
        nothing matches.

        Note: ``id`` is interpolated unquoted into the AQL text.
        """
        results = {}
        query = 'FOR doc IN {} FILTER doc.image_id == {} RETURN doc'.format(collection,id)
        cursor = self.pipeline.db.aql.execute(query)
        for doc in cursor:
            results.update(doc)
        return results

    def get_structure(self, id):
        """Assemble the base document for image ``id``.

        Returns:
            dict with ``image_id``, ``url``, a shallow copy of every field of
            the global tokens document whose key starts with ``global``, and
            ``rois``: one entry per scene-graph object holding its ``objects``
            (names), ``attributes`` and ``bbox`` as ``[x1, y1, x2, y2]``.
            Returns ``None`` (after printing a notice) when the image has no
            global tokens document.
        """
        sg = get_sc_graph(id)
        global_doc = self.get_image_id_from_collection(id)
        if not global_doc:
            print("Couldn't find global tokens for id {}".format(id))
            return None
        rc_doc = {
            'image_id': id,
            'url': sg.image.url
        }
        for (k,v) in global_doc.items():
            if k.startswith('global'):
                rc_doc[k]=copy.copy(v)
        rc_doc['rois'] = [
            {
                'objects': obj.names,
                'attributes': obj.attributes,
                # Convert (x, y, width, height) to corner coordinates.
                'bbox': [obj.x, obj.y, obj.x+obj.width, obj.y+obj.height]
            }
            for obj in sg.objects
        ]
        return rc_doc

    def get_prompt(self, id, include_answer=False):
        """Render the prompt for image ``id``.

        Args:
            id: image id to look up.
            include_answer: when true, append a space and the reference
                paragraph for ``id`` from the loaded paragraph data, making
                the prompt a complete few-shot example.

        Returns:
            The prompt string, or ``None`` when the image has no global tokens.

        Raises:
            ValueError: ``include_answer`` is true and the paragraph data does
                not contain exactly one paragraph for ``id``.
        """
        base_doc = self.get_structure(id)
        if base_doc is None:
            return None
        caption = base_doc['global_captions'][self.global_captioner]
        all_objects = base_doc['global_objects'][self.global_tagger]
        all_places = base_doc['global_scenes'][self.places_source]
        # Only the first 8 object labels and first 3 scene labels go into the prompt.
        objects = '; '.join([x['label'] for x in all_objects[:8]])
        places = ' or '.join([x['label'] for x in all_places[:3]])
        prompt_before_answer = self.global_prompt1.format(caption,places,objects)
        if not include_answer:
            return prompt_before_answer
        answers = [x['paragraph'] for x in self.ipc_data if x['image_id']==id]
        if len(answers) != 1:
            # Same exception type as the bare unpacking it replaces, but naming the id.
            raise ValueError(
                "Expected exactly one paragraph for image id {}, found {}".format(id, len(answers))
            )
        return prompt_before_answer+" "+answers[0]
        
# Shared generator instance; it is the default `pgen` of few_shot_process_target_id.
base_gen = GTBaseGenerator()

def few_shot_process_target_id(fs_ids: list[int],target_id: int, vlm: VlmInterface, pgen=base_gen, **kwargs):
    """Generate paragraph descriptions for ``target_id`` with a few-shot GPT prompt.

    Args:
        fs_ids: image ids used as solved few-shot examples.
        target_id: image id whose description is requested.
        vlm: currently unused; completions are not re-ranked against the
            image inside this function.
        pgen: prompt generator exposing ``get_prompt(id, include_answer=...)``.
        **kwargs: forwarded to ``gpt_execute`` (for example ``n=5``). A
            caller-supplied ``model`` overrides ``FS_GPT_MODEL``.

    Returns:
        list[str]: the completions returned by ``gpt_execute``.
    """
    fs_prompt = generate_gpt_prompt(fs_ids, target_id=target_id, pgen=pgen)
    # setdefault avoids a duplicate-keyword TypeError when the caller passes model=...
    kwargs.setdefault('model', FS_GPT_MODEL)
    return gpt_execute(fs_prompt, **kwargs)


In [4]:
def gpt_execute(prompt_template, *args, **kwargs):
    """Send a prompt to the OpenAI completion endpoint and return the generated texts.

    Args:
        prompt_template: prompt text. It is treated as a ``str.format``
            template only when positional ``args`` are given; otherwise it is
            sent verbatim, so literal braces in an already-built prompt are safe.
        *args: values substituted into ``prompt_template``.
        **kwargs: forwarded to ``openai.Completion.create`` (for example
            ``model`` or ``n``). ``max_tokens`` defaults to 256 and may be
            overridden.

    Returns:
        list[str]: the whitespace-stripped text of every returned choice.
    """
    prompt = prompt_template.format(*args) if args else prompt_template
    # setdefault avoids a duplicate-keyword TypeError when the caller passes max_tokens=...
    kwargs.setdefault('max_tokens', 256)
    response = openai.Completion.create(prompt=prompt, **kwargs)
    return [choice['text'].strip() for choice in response['choices']]

def generate_gpt_prompt(ids, target_id=None, pgen=None):
    """Build a few-shot prompt: one solved example per id, then the open target.

    Args:
        ids: image ids rendered with their reference paragraph attached.
        target_id: optional image id rendered without an answer and appended
            last. Any value other than ``None`` (including 0) is used.
        pgen: prompt generator exposing ``get_prompt(id, include_answer=...)``.
            Defaults to the notebook-level ``base_gen``, resolved at call time
            so that defining this function does not construct a generator.

    Returns:
        str: the individual prompts joined with newlines.

    Raises:
        ValueError: ``pgen.get_prompt`` returned ``None`` for one of the ids.
    """
    if pgen is None:
        pgen = base_gen
    requests_ = [(example_id, True) for example_id in ids]
    if target_id is not None:
        requests_.append((target_id, False))
    prompts = []
    for image_id, with_answer in requests_:
        prompt = pgen.get_prompt(image_id, include_answer=with_answer)
        if prompt is None:
            # Fail with the offending id instead of a TypeError inside str.join.
            raise ValueError("No prompt could be built for image id {}".format(image_id))
        prompts.append(prompt)
    if target_id is not None:
        print("Target id is: {}".format(target_id))
    return '\n'.join(prompts)


In [14]:
def candidates_from_paragraph(paragraph):
    """List every sub-selection of two or more sentences of ``paragraph``.

    The paragraph is segmented with the pipeline's "senter" component. Each
    candidate keeps the sentences in their original order and joins them with
    single spaces. Candidates are ordered by sentence count, then by position.
    A paragraph with fewer than two sentences yields an empty list.
    """
    doc = nlp.get_pipe("senter")(nlp(paragraph))
    sentences = [str(sent) for sent in doc.sents]
    return [
        ' '.join(chosen)
        for size in range(2, len(sentences) + 1)
        for chosen in itertools.combinations(sentences, size)
    ]

In [25]:
# Wrap the "blip_itc" model from VlmFactory in a VlmChunker with chunk_size=5.
# NOTE(review): presumably the chunker scores texts in groups of five — confirm in VlmChunker.
vlm = VlmChunker(VlmFactory().get_vlm("blip_itc"), chunk_size=5)

Initializing model on GPU
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth


In [7]:
# Split the ids at index 900: the first 900 form the few-shot pool, the rest are candidate targets.
s3_train, s3_test = np.split(np.array(s3_ids),[900])

In [43]:
# Draw five distinct few-shot examples; the default replace=True could repeat an id in the prompt.
# NOTE(review): no random seed is set, so the sampled ids differ on every run.
train_ids = np.random.choice(s3_train,5,replace=False)
target_id = np.random.choice(s3_test)
# rc = generate_gpt_prompt(train_ids, target_id=target_id, pgen=base_gen)
# n=5 is forwarded to the completion call, requesting five completions for the target.
rc = few_shot_process_target_id(train_ids,target_id,vlm,n=5)
# Expand every completion into its multi-sentence sub-selections and pool them.
candidates = flatten([candidates_from_paragraph(x) for x in rc])

Target id is: 2404490


In [44]:
# Display the raw GPT completions generated for the sampled target image.
rc

['A counter with a wine bottle and a wine glass on it. The wine bottle is red and has a gold label on it. The label has black text on it. The wine glass is clear and has a stem. The stem is thin and has a small base. There is liquid in the glass. The liquid is red.',
 'A clear, wine bottle with a green label that reads Mouton Cadet 2014 - Bordeaux, France. There is a red wax seal over the cork. The wine glass is next to the bottle and is also clear. It is slightly taller than the bottle and has a stem. The glass is empty.',
 'There is a brown counter with a light colored granite counter top. There is a white bottle of wine on the counter with a glass of wine next to it. The cork is still in the bottle of wine. The glass of wine is half full. There is a label on the wine bottle. The label is gold and has black text.',
 'There is a dark wood counter. On the counter there is a wine bottle and a wine glass. The wine bottle is clasped with a silver colored ring. The wine glass is filled hal

In [45]:
# Look up the target image's scene graph to obtain its URL, then compare every candidate text with that image.
# NOTE(review): `scores` is presumably one similarity value per candidate — confirm in the VLM interface.
sg = get_sc_graph(target_id)
print(sg.image.url)
scores = vlm.compute_similarity_url(sg.image.url,candidates)
# scores = vlm.compute_similarity_url(sg.image.url,[rc[1]])

https://cs.stanford.edu/people/rak248/VG_100K_2/2404490.jpg


In [46]:
# Show the candidate whose score is the maximum.
candidates[np.argmax(scores)]
# rc

'On the counter there is a wine bottle and a wine glass. The wine glass is filled halfway with red wine.'

In [None]:
# Leave-one-out candidates: candidate i is the text with sentence i removed.
# NOTE(review): `sentences` is not defined at notebook level in the visible cells (it is only a
# local inside candidates_from_paragraph), so this cell fails on a fresh kernel. It must be a
# list of strings for `pop` and `' '.join` to work.
# [str(x) for x in sentences.sents]
cands = []
for i in range(len(sentences)):
    c = copy.copy(sentences)
    c.pop(i)
    cands.append(' '.join(c))
# vlm.compute_similarity_url(sg.image.url,' '.join(sentences[:-1]))
vlm.compute_similarity_url(sg.image.url,cands)

In [None]:
def get_2_urls():
    """Print the image URLs of two ids drawn at random from ``s3_ids``.

    Both scene graphs are loaded before anything is printed. Nothing is returned.
    """
    picked = np.random.choice(s3_ids,2)
    graphs = [get_sc_graph(image_id) for image_id in picked]
    for graph in graphs:
        print(graph.image.url)

In [None]:
# Compare the triplets extracted from a generated paragraph with the ground-truth scene-graph triplets.
# NOTE(review): `gpt_rc` is not defined in any visible cell, so this fails on a fresh kernel;
# possibly the completions list `rc` was intended — confirm before running.
sg = get_sc_graph(target_id)
print(sg.image.url)
gt_triplets = tuples_from_sg(sg)
pred_triplets = spice_get_triplets(gpt_rc[0])
recall = evaluator.recall_triplets_mean(gt_triplets,pred_triplets)
print("Mean (bert-based) total recall of ground truth triplets in ipc triplets is: {}".format(recall))


In [None]:
# Load the scene graph of one fixed image id for ad-hoc inspection (overwrites the earlier `sg`).
sg = get_sc_graph(2359808)

In [None]:
# Corner-format boxes (x1, y1, x2, y2) of the objects that are at least 50 wide and 50 high.
[(x.x, x.y, x.x+x.width, x.y+x.height) for x in sg.objects if x.width>=50 and x.height>=50]

In [None]:
# `senter` only exists as a local inside candidates_from_paragraph, so fetch the
# component here; otherwise this cell raises NameError on a fresh kernel.
senter = nlp.get_pipe("senter")
rc = senter(nlp("Hello world. Hello world again."))

In [None]:
# Materialize the sentence spans of the object assigned to `rc` in the previous cell.
list(rc.sents)