In [1]:
import os
import os.path as osp
import numpy as np
import scipy.io as sio

import torch
import torchvision
from torchvision import transforms

from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pickle
from utils import get_infor_ori, get_infor, convert_part_box, \
                check_bbox, check_left

In [59]:
part_names = ["ankle", "knee", "hip", "hand", "shoulder", "head"]
part_names_in_text = ["foot", "leg", "hip", "hand", "arm", "head"]
part_names_in_file = ["ankle2", "knee2", "hip", "hand2", "shoulder2", "head"]

num_classes = [12, 10, 5, 31, 5, 13]
bbox_index = [[0,3], [1, 2], [4, 4], [6, 9], [7, 8], [5, 5]]

In [2]:
label_folder="../../data/ActionRecognition/"
Trainval_GT = pickle.load(open(label_folder + '/' + 'Trainval_GT_all_part.pkl', "rb"), encoding='bytes')

In [14]:
hoi_list_fp = osp.join(label_folder, "hico_hoi_list.txt")
hoi_list = []
with open(hoi_list_fp) as fp:
    list_act = fp.readlines()
    for act in list_act:
        act = act.strip()
        act = act.split()
        hoi_list.append(act)

In [64]:
part_state_fp = osp.join(label_folder, "Part_State_76.txt")
temp_part_state = []
with open(part_state_fp) as fp:
    list_act = fp.readlines()
    for act in list_act:
        act = act.strip()
        act = act.split()
        temp_part_state.append(act)

part_state = []
for part_index, part_name in enumerate(part_names):
    part_state.append([])
    
print(part_state)
for part_index, part_name in enumerate(part_names):
    print(part_name)
    print(part_names_in_text[part_index])
    for each_part in temp_part_state:
        if each_part[0][:-1] == part_names_in_text[part_index]:
            part_state[part_index].append(each_part)
            print(each_part)

[[], [], [], [], [], []]
ankle
foot
['foot:', 'stand', 'on']
['foot:', 'tread', 'on']
['foot:', 'walk', 'with']
['foot:', 'walk', 'to']
['foot:', 'run', 'with']
['foot:', 'run', 'to']
['foot:', 'dribble']
['foot:', 'kick']
['foot:', 'jump', 'down']
['foot:', 'jump', 'with']
['foot:', 'walk', 'away']
['foot:', 'no_interaction']
knee
leg
['leg:', 'walk', 'with']
['leg:', 'walk', 'to']
['leg:', 'run', 'with']
['leg:', 'run', 'to']
['leg:', 'jump', 'with']
['leg:', 'is', 'close', 'with']
['leg:', 'straddle']
['leg:', 'jump', 'down']
['leg:', 'walk', 'away']
['leg:', 'no_interaction']
hip
hip
['hip:', 'sit', 'on']
['hip:', 'sit', 'in']
['hip:', 'sit', 'beside']
['hip:', 'be', 'close', 'with']
['hip:', 'no_interaction']
hand
hand
['hand:', 'hold']
['hand:', 'carry']
['hand:', 'reach', 'for']
['hand:', 'touch']
['hand:', 'put', 'on']
['hand:', 'twist']
['hand:', 'wear']
['hand:', 'throw']
['hand:', 'throw', 'out']
['hand:', 'write', 'on']
['hand:', 'point', 'with']
['hand:', 'point', 'to']
['

In [88]:
Trainval_GT[1]

[[1,
  [155, 152, 154, 153],
  [207, 32, 426, 299],
  [58, 97, 571, 404],
  {'pvp55_hand2': {0, 22},
   'part_bbox': array([[  0.        , 170.0196991 , 185.0196991 , 299.98031616,
           314.98031616],
          [  0.        , 231.8497467 , 103.8497467 , 340.15026855,
           212.1502533 ],
          [  0.        , 299.84973145, 128.8497467 , 408.15026855,
           237.1502533 ],
          [  0.        , 234.0196991 , 172.0196991 , 363.98031616,
           301.98031616],
          [  0.        , 198.0196991 ,  77.0196991 , 327.98031616,
           206.9803009 ],
          [  0.        , 334.01968384,  -2.98030472, 463.98031616,
           126.9803009 ],
          [  0.        , 320.26477051, 127.26477051, 417.73522949,
           224.73522949],
          [  0.        , 267.84973145,  42.8497467 , 376.15026855,
           151.1502533 ],
          [  0.        , 325.84973145,  47.8497467 , 434.15026855,
           156.1502533 ],
          [  0.        , 342.26477051, 126.264770

In [92]:
caption_dict = dict()
for image_id in Trainval_GT.keys():
    caption_dict[image_id] = list()

for image_id in Trainval_GT.keys():
# for image_id in {1}:
    ground_truth = Trainval_GT[image_id]
    for hoi_index, hoi_instance in enumerate(ground_truth):
        Human_bbox, Object_bbox, original_action, P_boxes, gt_object, PVP0, PVP1, PVP2, PVP3, PVP4, PVP5 = get_infor_ori(ground_truth, hoi_index)
        is_human_bbox, human_bbox = check_bbox(Human_bbox)
        if not is_human_bbox:
            continue
        for action in original_action:
            caption = ""
            act = hoi_list[action]
            if act[2] == "no_interaction":
                continue
            use_act = act[2].replace("_", " ").split()
            caption += "A person "
            for _act in use_act:
                caption += _act + " "
            caption += act[1]+"."
            for part_index, (part_name, num_class) in enumerate(zip(part_names, num_classes)):
                bbox_index_1, bbox_index_2 = bbox_index[part_index]
                part_bbox_ = hoi_instance[4]['part_bbox'][None, :, :][0]
                part_act = ground_truth[hoi_index][4]['pvp76_' + part_names_in_file[part_index]]
                part_act = list(part_act)
                if bbox_index_1 != bbox_index_2:
                    bbox_1 = part_bbox_[bbox_index_1][1:5] #  Left part
                    bbox_2 = part_bbox_[bbox_index_2][1:5] #  Right part
                    is_bbox_1, bbox_1 = check_bbox(bbox_1)
                    is_bbox_2, bbox_2 = check_bbox(bbox_2)
                    if len(part_act) == 1:
                        if part_state[part_index][part_act[0]][-1] == "no_interaction":
                            continue
                        if is_bbox_1:
                            caption += " Left " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                        if is_bbox_2:
                            caption += " Right " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                    else:
                        if is_bbox_1 and is_bbox_2:
                            caption += " Left " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                            caption += " Right " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[1]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                        elif is_bbox1 and not is_bbox_2: 
                            caption += " Left " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                        elif is_bbox_2 and not is_bbox1: 
                            caption += " Right " + part_name + " "
                            for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                            caption += "."
                else:
                    if part_state[part_index][part_act[0]][-1] == "no_interaction":
                        continue
                    bbox = part_bbox_[bbox_index_1][1:5] #  Left part
                    is_bbox, bbox = check_bbox(bbox)
                    if is_bbox:
                        caption += part_name + " "
                        for word_index, word in enumerate(part_state[part_index][part_act[0]]):
                                if word_index == 0:
                                    continue
                                caption += word + " "
                        caption += "."
            temp_list_cap = caption.split(".")
            set_cap = set()
            for each_cap in temp_list_cap:
                if "no_interaction" in each_cap:
                    continue
                set_cap.add(each_cap)
            caption_dict[image_id].append(set_cap)


1
1
1
1
1
1
1
1
2
2
2
1
1
1
2
4
1
2
1
1
3
1
1
1
1
1
3
3
1
1
1
1
2
2
5
1
4
1
4
1
5
1
2
2
2
2
1
1
1
3
1
1
2
1
1
1
3
3
2
15
4
3
3
1
2
1
4
1
2
1
2
1
2
2
6
3
1
11
3
2
1
1
66
2
1
1
1
2
2
4
1
1
5
1
1
1
1
1
1
1
1
3
1
1
2
4
1
1
1
5
6
1
2
1
1
1
6
1
2
26
3
2
2
1
3
2
3
2
1
2
6
2
1
1
1
1
5
1
2
1
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
4
1
1
3
1
2
10
2
2
1
1
1
2
8
1
1
1
5
1
1
2
3
1
1
2
1
3
2
3
2
4
14
5
1
1
1
2
1
2
1
1
3
4
2
1
1
2
2
1
1
1
2
1
1
4
4
1
1
1
1
1
4
1
1
1
1
1
1
1
1
8
2
1
1
2
4
1
3
1
1
1
2
2
1
2
1
4
1
1
2
4
1
1
1
1
1
2
7
1
2
1
2
1
3
1
18
1
2
1
1
1
9
2
3
2
1
1
7
1
1
1
1
2
2
1
4
4
1
3
1
5
1
1
3
1
1
2
2
1
2
1
1
1
1
1
1
1
1
6
1
2
1
1
2
2
6
1
3
1
1
1
8
1
1
1
2
1
1
1
5
1
2
1
1
1
1
19
1
1
1
2
1
1
7
1
4
3
3
1
1
2
1
1
9
12
1
1
1
2
1
3
1
1
1
1
1
1
1
2
2
1
1
1
6
1
1
4
1
2
2
2
4
1
1
1
2
1
4
1
1
3
1
1
3
1
1
28
1
2
1
2
4
1
2
2
6
4
1
9
1
2
3
15
1
2
2
1
1
1
1
1
2
4
2
2
1
1
1
1
1
1
1
1
1
1
10
2
16
5
4
1
1
1
1
1
1
1
2
1
1
3
1
1
2
1
2
2
9
1
1
2
2
1
1
3
1
1
2
1
2
6
8
1
1
1
1
6
1
1
11
1
1
3
2


NameError: name 'is_bbox1' is not defined

In [93]:
def save_file(file_content, file_name):
    if file_name[:-6] != 'pkl':
        file_name = file_name + '.pkl'
    new_path = osp.join('./image_id', file_name)
    with open(new_path, 'wb') as handle:
        pickle.dump(file_content, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [95]:
import os
if not osp.isdir("image_id"):
    os.mkdir("image_id")
save_file(caption_dict, "caption_dict_full.pkl")