<a href="https://colab.research.google.com/github/HassanJoumaa/VinVL_Object_Detection/blob/main/prepare_vinvl_input.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## VINVL input preparation
* Requirements: [microsoft sg_benchmark](https://github.com/microsoft/scene_graph_benchmark/blob/main/INSTALL.md) + pandas + yaml
* To generate `predictions.tsv` using SG_benchmark, see [this issue](https://github.com/microsoft/scene_graph_benchmark/issues/7#issuecomment-819357369)

In [None]:
# Mount Google Drive at /content/drive; every input and output path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import ast
import json
import base64
import numpy as np
import json
# Show floats in fixed-point notation with 4 decimals so decoded feature values are easy to compare by eye.
np.set_printoptions(suppress=True, precision=4)

### To feed the data into VinVL's run_captioning.py on COCO, we need to generate:
* test.yaml file with:
    * test.labels.tsv: labels with confidence scores
        * image_id, [{'class': str, 'conf': float, 'rect': [x_tl, y_tl, x_br, y_br]}, {}, ..., {}]
    * test.feature.tsv: features extracted via sg_benchmark
        * image_id, {"num_boxes": n, "features": base64-encoded concatenation, per box, of the 2048 region features from sg_benchmark followed by 6 additional spatial features*}
    * other files in COCO's yaml (test.img.tsv, test.hw.tsv, captions.json) aren't used for inference



* 6 additional features (hypothesis): for each bounding box we have: 
    * [0]: x_top_left/image_w, 
    * [1]: y_top_left/image_h, 
    * [2]: x_bottom_right/image_w, 
    * [3]: y_bottom_right/image_h, 
    * [4]: box_height/image_h = (y_br - y_tl)/image_h = [3] - [1]
    * [5]: box_width/image_w = (x_br - x_tl)/image_w = [2] - [0]

### We need height and width of the pictures

In [None]:
HW_TSV = '/content/drive/MyDrive/projects/scene_graph_benchmark-main/tools/mini_tsv/data/train.hw.tsv'

# Headerless TSV: column 0 is the image id (used as the index), column 1 is a
# Python-literal list holding one {'height': ..., 'width': ...} dict per image.
hw_df = pd.read_csv(
    HW_TSV,
    sep='\t',
    header=None,
    index_col=0,
    converters={1: ast.literal_eval},
)
hw_df.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
img1,"[{'height': 801, 'width': 1200}]"
img2,"[{'height': 720, 'width': 1080}]"
img3,"[{'height': 787, 'width': 1400}]"
img4,"[{'height': 342, 'width': 512}]"
img5,"[{'height': 336, 'width': 500}]"


### We also need `predictions.tsv` with the bounding boxes, their classes, confidences and features

In [None]:
sg_tsv = '/content/drive/MyDrive/projects/scene_graph_benchmark-main/output/X152C5_test/inference/config.yaml/predictions.tsv'

# Headerless TSV: column 0 is the image id, column 1 is a JSON document.
df = pd.read_csv(
    sg_tsv,
    sep='\t',
    header=None,
    converters={1: json.loads},
)
# Keep only the list stored under the 'objects' key of each JSON document.
df[1] = df[1].apply(lambda payload: payload['objects'])

# Example: first detected object of the first image
df.loc[0, 1][0]

{'bbox_id': 0,
 'class': 'guitar',
 'conf': 0.9935479760169983,
 'feature': 'QAy2PuWTkT7I+8w9XK4aQG/b20AAAAAAgp8pPAAAAABr6xM+AAAAACvi3D6fqbY/AAAAAEWrOz7vnTc/AAAAAAAAAACIbVNAcyp0PQAAAADbPi5AiRvjPW3sJT8AAAAAAAAAAGkc8z0K/J49KDpnQHoqZT/SzF9AkuySPgAAAAD9rdlA/A0SP+oGJT+He58/AAAAAAAAAAAAAAAAykoFPwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPECT9KWXhALJwqPwAAAABObxY8wUuWORqwdj7e5ag/AAAAAAAAAAAEqKg/AAAAAOzCTz+S28BANQQ3QIxuoT8RigQ+ImnMPoLmWT60jFw/98TIP95akUDjKYo+SmCXPwAAAABba6I/8VSIQP9VBUAqD8A+AAAAAAGsmEAAAAAAiM1BPQAAAAC3H5ZASm5lPwAAAACG04U/AAAAAAAAAACKSkRA/fKoPQAAAABVWa0+AAAAAAAAAAAAAAAAAAAAAAAAAADbfIU/8yuUQGi7YUGG3x8+AAAAAAqTrT6SSWlAN1PlPqTogj1gA+M/yS95QLHan0AAAAAAk8McPzoQ3T8A4do+gPbFPgAAAAA7aIU/nO1BP4VU1D6NDDI/AAAAANauPj4AAAAAKgAbPbEKZkE3R1o/qx0zPfzY5T5enPk9o8k/PgAAAAAPqWc/RKbdQAAAAABX+HQ/e1GlP1ZvJz57u79AAAAAAAAAAAAKTvQ+z1hZPpsSAz8ACEQ7AAAAAB/pPz0AAAAA5tRhPZOioD717S4+dxgjQO5JGUAbzpA/xFuqPx4DAD6zzRRAP62PPnqI9D4AAAAAIufhPVlsij0AAAAA1iC+PQAAAACxYNU/AAAAAI/79T0AAAAAuVH3Pq3KE0Cic6M855adP1oUej4EUjxAAAAAAF5Ehj9

In [None]:
def generate_additional_features(rect,h,w):
    """Build the 6 positional features of one bounding box.

    Args:
        rect: box corners as [x_tl, y_tl, x_br, y_br] in pixels.
        h: image height in pixels.
        w: image width in pixels.

    Returns:
        float32 array of length 6: the four corners divided by (w, h, w, h)
        and clipped to [0, 1], followed by the normalised box height
        ([3] - [1]) and the normalised box width ([2] - [0]).
    """
    scale = np.array([w, h, w, h], dtype=np.float32)
    # Clip so boxes that spill over the image border stay inside [0, 1].
    norm = np.clip(rect / scale, 0, 1)
    box_h = norm[3] - norm[1]
    box_w = norm[2] - norm[0]
    return np.hstack((norm, [box_h, box_w])).astype(np.float32)

def generate_features(x):
    """Encode all box features of one image into a single base64 string.

    Args:
        x: one row of ``df``; ``x[0]`` is the image id (looked up in the
            module-level ``hw_df``) and ``x[1]`` is the list of per-box dicts,
            each providing a base64 ``'feature'`` buffer of float32 values
            and a ``'rect'``.

    Returns:
        dict with ``"features"`` (base64 text of the row-stacked float32
        matrix, one row per box: decoded feature vector followed by the 6
        positional features) and ``"num_boxes"``.
    """
    image_id, boxes = x[0], x[1]
    size = hw_df.loc[image_id, 1][0]
    img_h, img_w = size['height'], size['width']

    rows = []
    for box in boxes:
        region = np.frombuffer(base64.b64decode(box['feature']), np.float32)
        position = generate_additional_features(box['rect'], img_h, img_w)
        rows.append(np.hstack((region, position)).astype(np.float32))

    stacked = np.vstack(tuple(rows))
    encoded = base64.b64encode(stacked).decode("utf-8")
    return {"features": encoded, "num_boxes": len(boxes)}

def generate_labels(x):
    """Reduce the detections of one image to the label fields.

    Args:
        x: one row of ``df``; ``x[1]`` is the list of per-box dicts.

    Returns:
        A list with one dict per box holding the capitalised ``"class"``,
        the ``"conf"`` score and the ``"rect"``, in the original box order.
    """
    labels = []
    for detection in x[1]:
        labels.append({
            "class": detection['class'].capitalize(),
            "conf": detection['conf'],
            "rect": detection['rect'],
        })
    return labels

In [None]:
# Compute each output column row by row, then serialise it to JSON text
# so it can be written as a single TSV field.
df['feature'] = df.apply(generate_features, axis=1).apply(json.dumps)
df['label'] = df.apply(generate_labels, axis=1).apply(json.dumps)

In [None]:
# Preview the generated 'feature' and 'label' JSON columns next to the raw detections.
df.head()

Unnamed: 0,0,1,feature,label
0,img1,"[{'rect': [319.3331298828125, 376.935791015625...","{""features"": ""QAy2PuWTkT7I+8w9XK4aQG/b20AAAAAA...","[{""class"": ""Guitar"", ""conf"": 0.993547976016998..."
1,img2,"[{'rect': [121.45631408691406, 11.671216011047...","{""features"": ""ClsLPZUHCEAAAAAAb8EyP/WpvUAAAAAA...","[{""class"": ""Racket"", ""conf"": 0.832623183727264..."
2,img3,"[{'rect': [544.0774536132812, 378.101715087890...","{""features"": ""fYqJPgAAAAAAAAAAAAAAAA22nD+3OSg9...","[{""class"": ""Short"", ""conf"": 0.8272700309753418..."
3,img4,"[{'rect': [29.02312469482422, 176.618881225585...","{""features"": ""AAAAAAAAAAAAAAAAksjNOwAAAAAAAAAA...","[{""class"": ""Short"", ""conf"": 0.9363991022109985..."
4,img5,"[{'rect': [180.96417236328125, 100.34861755371...","{""features"": ""wrNuPwAAAAAAAAAA6jKOP76V40AAAAAA...","[{""class"": ""Shirt"", ""conf"": 0.8880357146263123..."


### Write to tsv + generate lineidx

In [None]:
import os

# Destination folder and file names for the generated label / feature TSVs.
OUTPUT_DIR = '/content/drive/MyDrive/vinvl_images_features/inference_test/'
LABEL_FILE = os.path.join(OUTPUT_DIR,'label.tsv')
FEATURE_FILE = os.path.join(OUTPUT_DIR,'feature.tsv')

# exist_ok=True keeps the cell safe to re-run and removes the window between
# the existence check and the creation; the message is still printed only
# when the folder was missing beforehand.
output_dir_missing = not os.path.exists(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)
if output_dir_missing:
    print(f"path to {OUTPUT_DIR} created")

In [None]:
% cd /content/drive/MyDrive/projects/scene_graph_benchmark-main/
!pwd

/content/drive/MyDrive/projects/scene_graph_benchmark-main
/content/drive/My Drive/projects/scene_graph_benchmark-main


In [None]:
from maskrcnn_benchmark.structures.tsv_file_ops import tsv_reader, tsv_writer

# Write one two-column TSV (image id, JSON payload) per generated column.
for column, target in (('label', LABEL_FILE), ('feature', FEATURE_FILE)):
    rows = df[[0, column]].values.tolist()
    tsv_writer(rows, target)

## Generate test.yaml for vinvl run_captioning

In [None]:
import yaml
import os.path as op

# The YAML must name the files that were actually written into OUTPUT_DIR.
# The feature file is 'feature.tsv' (see FEATURE_FILE), not 'features.tsv',
# so the names are derived from the path constants to keep them in sync.
yaml_dict = {"label": op.basename(LABEL_FILE),
             "feature": op.basename(FEATURE_FILE)}

with open(op.join(OUTPUT_DIR, 'vinvl_test_yaml.yaml'), 'w') as file:
    yaml.dump(yaml_dict, file)

### Misc: Read generated tsvs

In [None]:
# Read back the feature TSV through the same FEATURE_FILE constant used for
# writing, so this check cannot drift from the real output path.
# Expected shape: (number of images, 2).
pd.read_csv(FEATURE_FILE, header=None, sep='\t').shape

(5, 2)

### Encoding correctness check

In [None]:
# Reference value: pixel-space rect of the first box of the first image, as read from predictions.tsv.
df.loc[0,1][0]['rect']

[319.3331298828125, 376.935791015625, 604.7099609375, 507.1802673339844]

In [None]:
# Show only the first rows; displaying the bare frame would dump every
# base64 feature string once more than a handful of images are processed.
df.head()

Unnamed: 0,0,1,feature,label
0,img1,"[{'rect': [319.3331298828125, 376.935791015625...","{""features"": ""QAy2PuWTkT7I+8w9XK4aQG/b20AAAAAA...","[{""class"": ""Guitar"", ""conf"": 0.993547976016998..."
1,img2,"[{'rect': [121.45631408691406, 11.671216011047...","{""features"": ""ClsLPZUHCEAAAAAAb8EyP/WpvUAAAAAA...","[{""class"": ""Racket"", ""conf"": 0.832623183727264..."
2,img3,"[{'rect': [544.0774536132812, 378.101715087890...","{""features"": ""fYqJPgAAAAAAAAAAAAAAAA22nD+3OSg9...","[{""class"": ""Short"", ""conf"": 0.8272700309753418..."
3,img4,"[{'rect': [29.02312469482422, 176.618881225585...","{""features"": ""AAAAAAAAAAAAAAAAksjNOwAAAAAAAAAA...","[{""class"": ""Short"", ""conf"": 0.9363991022109985..."
4,img5,"[{'rect': [180.96417236328125, 100.34861755371...","{""features"": ""wrNuPwAAAAAAAAAA6jKOP76V40AAAAAA...","[{""class"": ""Shirt"", ""conf"": 0.8880357146263123..."


In [None]:
# Decode the first image's feature payload back into a (num_boxes, n_features) float32 matrix.
feature_payload = json.loads(df.loc[0, 'feature'])
feature_bytes = base64.b64decode(feature_payload['features'])
features_t = np.frombuffer(feature_bytes, np.float32).reshape(feature_payload['num_boxes'], -1)
features_t.shape

(47, 2054)

In [None]:
# Last 6 columns of the first box: the positional features appended by generate_additional_features.
features_t[0,-6:]

array([0.2661, 0.4706, 0.5039, 0.6332, 0.1626, 0.2378], dtype=float32)

In [None]:
def reverse_transform(feat,h=800,w=1200):
    """Map normalised box corners back to pixel coordinates.

    Args:
        feat: array whose first four entries are x_tl, y_tl, x_br, y_br
            expressed as fractions of the image size.
        h: image height in pixels (defaults to 800).
        w: image width in pixels (defaults to 1200).

    Returns:
        Array of the four corners multiplied by (w, h, w, h).
    """
    scale = np.array([w, h, w, h])
    corners = feat[:4]
    return corners * scale

In [None]:
# Use the real image size from hw_df instead of the 800x1200 defaults:
# the first image is 801 px high, so the defaults skew the recovered
# y-coordinates and make a correct encoding look wrong.
first_image_size = hw_df.loc[df.loc[0, 0], 1][0]
recovered_rect = reverse_transform(
    features_t[0, -6:],
    h=first_image_size['height'],
    w=first_image_size['width'],
)
# The round trip must reproduce the original rect up to float32 precision.
np.testing.assert_allclose(recovered_rect, df.loc[0, 1][0]['rect'], rtol=1e-4)
recovered_rect

array([319.3331, 376.4652, 604.71  , 506.5471])