# Scene graph prediction (latest working version)

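**Expected input:** one record per video segment, in the format below. The only field this notebook adds is `scene_graph_captions`.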
```python
one_segment_CLIP_output = {
    "video_stem": str(Path(video_filepath).stem),
    "segment_id": str(Path(video_filepath).stem) + f"_{i}",
    "segment_index": i,
    "total_segments": len(whisper_segments),
    "segment_total_time": whisper_segments[i]['end'] - whisper_segments[i]['start'],
    "captions": whisper_segments[i]['caption'],
    "segment_start_time": whisper_segments[i]['start'],
    "segment_end_time": whisper_segments[i]['end'],
    "num_frames_per_segment": self.num_frames,
    "frame_embeddings": image_feature,
    "text_caption_embeddings": caption_feature,
    "segment_frames": segment_frame,
    "frame_embeddings_shape": image_feature.shape,          # trying the FLATTEN technique!
    "text_caption_embeddings_shape": caption_feature.shape,
    "segment_frames_shape": segment_frame.shape,
    "scene_graph_captions": "comma separated string (list of scene graph outputs)"  # <--- this is all we're adding
}
```

In [1]:
import yappi

import numpy as np
import cv2
from PIL import Image
import time
import glob
from faster_OpenPSG.predict import Predictor 
import os
import glob
import jsonlines
import json
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Run the scene-graph predictor over every segment frame of every video and
# append one JSONL record per video to `scene_output_path`.
parallel_dir      = 'parallel_15'
clip_input_dir    = f'/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/{parallel_dir}_clip_output/'
scene_output_path = f'/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/{parallel_dir}_scene_output.jsonl'
NUM_RELATIONS     = 10
FRAME_SHAPE       = (336, 336, 3)  # todo: ideally, get this from the stored shape property.

# init model
my_pred = Predictor()
my_pred.setup()

# iterate over videos (each file in clip_input_dir is loaded as one .npz archive)
for clip_npz_path in glob.glob(os.path.join(clip_input_dir, '*'), recursive = True):
  # Timer is restarted per video so the log line below reports this video's
  # duration next to this video's frame count.
  start_time = time.monotonic()
  object_list_of_str = []

  # NOTE(review): allow_pickle=True executes pickled payloads on load; only
  # point this at archives produced by our own pipeline.
  # The context manager closes the archive's file handle after each video.
  with np.load(clip_npz_path, allow_pickle=True) as np_loaded:
    total_segments = np_loaded['arr_0'].item()['total_segments']

    # iterate over segments; each 'arr_<i>' entry is a 0-d object array wrapping a dict
    for segment_index in range(total_segments):
      segment = np_loaded[f'arr_{segment_index}'].item()
      frame = segment['segment_frames'].reshape(FRAME_SHAPE)
      frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

      # curr_sro is a list of relation strings (e.g. "book on shelf").
      curr_sro = my_pred.predict(image=frame_rgb, num_rel=NUM_RELATIONS)

      # Debug aid: surface frames with a 'kissing' relation (printed once per frame).
      if any('kissing' in rel for rel in curr_sro):
        print(curr_sro)

      # One comma-separated string per segment.
      object_list_of_str.append(", ".join(curr_sro))

  # Append this video's list of per-segment strings as one JSONL record.
  # The list is deliberately pre-encoded with json.dumps: the inspection cell
  # that reads this file calls json.loads on every record.
  # NOTE(review): the record carries no video identifier, so rows can only be
  # matched back to videos by position in the glob order.
  with jsonlines.open(scene_output_path, mode = 'a') as writer:
      writer.write(json.dumps(object_list_of_str))

  print(f"⏰ Ran scene graph {len(object_list_of_str)} frames in {(time.monotonic()-start_time):.2f} seconds. Output to {scene_output_path}")

load checkpoint from local path: faster_OpenPSG/checkpoints/epoch_60.pth




[0.90953416 0.8543104  0.8264715  0.7525833  0.47158736 0.43570775
 0.4335805  0.4240688 ]
8
n_rel_topk 8
book on shelf, person in front of shelf, person in front of door-stuff, person wearing tie, door-stuff beside person, book beside shelf, door-stuff beside shelf, shelf beside person
[0.91427636 0.72285837 0.697936   0.6453563  0.58005834 0.49247465
 0.37187615 0.33688644 0.3202984 ]
9
n_rel_topk 9
door-stuff beside person, person in front of shelf, shelf beside person, person holding tie, person beside shelf, person wearing tie, person wearing tie, shelf beside shelf, shelf beside person
[0.89857644 0.6963436  0.6079165  0.5582388  0.45050624]
5
n_rel_topk 5
person in front of shelf, person in front of shelf, person wearing tie, shelf beside person, shelf beside person
[0.8536328  0.8374752  0.67497534 0.6437219  0.5806652  0.5766703
 0.45141283 0.42135707 0.3614172 ]
9
n_rel_topk 9
person in front of door-stuff, person in front of shelf, shelf beside person, person in front of she

In [5]:
import pprint
import jsonlines
import json
scene_output_path = '/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/parallel_15_scene_output.jsonl'
# Peek at the first record only. Each record was stored as a JSON-encoded
# string, so it needs a second decode to get the list of per-segment strings.
with jsonlines.open(scene_output_path, mode='r') as reader:
  obj = next(iter(reader), None)

if obj is not None:
  obj = json.loads(obj)
  pprint.pprint(obj)


['book on shelf, person in front of shelf, person in front of door-stuff, '
 'person wearing tie, door-stuff beside person, book beside shelf, door-stuff '
 'beside shelf, shelf beside person',
 'door-stuff beside person, person in front of shelf, shelf beside person, '
 'person holding tie, person beside shelf, person wearing tie, person wearing '
 'tie, shelf beside shelf, shelf beside person',
 'person in front of shelf, person in front of shelf, person wearing tie, '
 'shelf beside person, shelf beside person',
 'person in front of door-stuff, person in front of shelf, shelf beside '
 'person, person in front of shelf, person in front of door-stuff, person '
 'holding tie, person beside shelf, person wearing tie, shelf beside person',
 'person wearing tie, person beside shelf, person in front of shelf, shelf '
 'beside person',
 'person in front of umbrella, person in front of shelf, person holding '
 'umbrella, shelf beside person, umbrella beside person, shelf beside '
 'umbrella

# Performance tuning and old code

In [None]:
# Scratch cell for profiling with yappi: select the clock, start the profiler,
# then print function-level and thread-level statistics.
yappi.set_clock_type("cpu") # Use set_clock_type("wall") for wall time
yappi.start()

# NOTE(review): no workload runs between start() and the dumps below, and the
# profiler is never stopped in this cell. Presumably the code under test was
# meant to go here -- confirm before relying on these numbers.
yappi.get_func_stats().print_all()
yappi.get_thread_stats().print_all()

  
# NOTE(review): `path` is not referenced by any other visible cell; it looks
# like a leftover sample input for manual experiments.
path = '/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/parallel_15_clip_output/J-DG5O7gqdY_All4TubeVLOGS_15315_This Is Why We Had To Move To Florida.npz'



In [None]:
''' OLD OLD OLD'''

import jsonlines 
import cv2
import json_numpy
from PIL import Image

# Superseded JSONL-based variant of the scene-graph step, kept for reference.
# It reads records from INPUT_FILENAME, runs the predictor on the first frame
# of a record, and writes the record back with an added 'scene_graph_set' key.
# It relies on `my_pred` having been created by the scene-graph cell earlier
# in the notebook.
# NOTE(review): this rebinds NUM_RELATIONS, which the newer cell sets to 10.
NUM_RELATIONS = 5
INPUT_FILENAME = '/mnt/storage_ssd/parallel_10_clip_output.jsonl'
# INPUT_FILENAME = '/mnt/storage_ssd/parallel_14_clip_output.jsonl'

# NOTE(review): the same file is opened for reading AND appending, so every
# written record is appended to the input file itself. Use a separate output
# path if this cell is ever revived.
with jsonlines.open(INPUT_FILENAME) as reader, jsonlines.open(INPUT_FILENAME, mode='a') as writer:
    # A set (despite the name); it is created once, so relations accumulate
    # across records rather than being per-record.
    sro_list = set()
    for obj_idx, test_sample in enumerate(reader):
        # Reset on every record, so only index 0 of 'segment_frames' is ever used.
        num_frames_counter = 0
        if test_sample['segment_frames'][num_frames_counter] is not None:
            print(test_sample['segment_frames'])
            print(test_sample['segment_frames'][0])
            # Frames are stored as json_numpy-encoded strings; decode to an array.
            frame = json_numpy.loads(test_sample['segment_frames'][num_frames_counter])
            frame_rgb = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
            # display(Image.display(frame_rgb))
            print('loaded: ')
            print(frame_rgb.shape)
            
            curr_sro = my_pred.predict(image=frame_rgb, num_rel=NUM_RELATIONS)
            sro_list.update(set(curr_sro))
            print(sro_list)
            # Has no lasting effect: the counter is reset at the top of the loop.
            num_frames_counter += 1
        test_sample['scene_graph_set'] = list(sro_list)
        writer.write(test_sample)
        # NOTE(review): the unconditional `break` below ends the loop after the
        # first record, so this obj_idx check can never be the one that fires.
        if obj_idx >= 1:
            break
        break

# Embed with Flan-T5

In [None]:
# !pip install transformers

In [None]:
# NOTE(review): bare expression that just displays the class. T5EncoderModel is
# imported in a later cell, so on a fresh kernel this raises NameError when the
# notebook is run top to bottom -- looks like leftover interactive inspection.
T5EncoderModel

In [None]:
import torch
import accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel

# One checkpoint id shared by the tokenizer and the model so they always match.
flan_t5_checkpoint = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(flan_t5_checkpoint)

# Weights are loaded in float16, with device placement delegated to device_map="auto".
model = T5ForConditionalGeneration.from_pretrained(
    flan_t5_checkpoint,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
# Build a context + question prompt and generate an answer with the Flan-T5
# model loaded in the previous cell (`tokenizer`, `model`).
CONTEXT = "{Example: A Two-Bit Gray Code Counter}Let's begin with a two-bit Gray code counter with no inputs.As we mentioned in Notes Set 2.1, a Gray code is a cycle over allbit patterns of a certain length in which consecutive patterns differin exactly one bit.For simplicity, our first few examples are based on counters anduse the internal stateof the FSM as the output values.  You should already knowhow to design combinational logic for the outputs if it were necessary.The inputs to a counter, if any, are typically limited to functionssuch as starting and stopping the counter, controlling the counting direction, and resetting the counter to a particular state.A fully-specified transition diagram for a two-bit Gray code counter appears below.With no inputs, the states simply form a loop, withthe counter moving from one state to the next each cycle.Each state in the diagram is marked with the internal state value S_1S_0 (before the ``/'') and the output Z_1Z_0 (after the ``/''), which are always equal for this counter.Based on the transition diagram, we can fill in the K-maps for the next-state values S_1^+ and S_0^+ as shown to the right of thetransition diagram, then derive algebraic expressions in the usual way to obtainS_1^+=S_0 and S_0^+={{S_1}}.We then use the next-state logic to develop the implementationshown on the far right, completing our first counter design."
PROMPT = "Please answer this person's question accurately, clearly and concicely. Context: "
QUESTION = "Question: What are the inputs and outputs of a Gray code counter? "
input_text = PROMPT + CONTEXT + QUESTION + "Answer: "

# Send the token ids to wherever the model actually lives instead of assuming
# "cuda": the model was loaded with device_map="auto", so its device is not
# fixed by this notebook.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

outputs = model.generate(input_ids, max_length=1024)
# Drop special tokens (padding / end-of-sequence markers) so only the answer text is printed.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))