In [2]:
import json
import os
import random
import pandas as pd

### WikiHow

In [19]:
wikihow_dir = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/pkg_data/wikiHow'
# What is the overlap between CAE tasks and WikiHow?

with open(os.path.join(wikihow_dir, 'article_id_to_title.txt'), 'r') as f:
    lines = f.readlines()
    wikihow_tasks = [' '.join(l.split()[1:])for l in lines]    

### CAE

In [3]:
# data split ids
exp = '100_per'
exp_dir = f'/mount/arbeitsdaten/jp-silberer/ernie_vil/data/cae/single_result_verb_exp/42/txt_db/exp_{exp}'

# video segment ids
train_path = os.path.join(exp_dir, 'train_ids.json')
val_path = os.path.join(exp_dir, 'val_ids.json')
test_path = os.path.join(exp_dir, 'test_ids.json')

train_ids = json.load(open(train_path, 'r'))
val_ids = json.load(open(val_path, 'r'))
test_ids = json.load(open(test_path, 'r'))

In [11]:
data_by_vid_path = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/cae/subtitles/single_result_verbs_video_clips_by_vid.json'
cae_by_vid = json.load(open(data_by_vid_path, 'r'))

In [4]:
cae_path = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/cae/subtitles/cae.json'
cae_by_vidseg = json.load(open(cae_path, 'r'))

In [13]:
cae_vids = list(cae_by_vid.keys())

In [14]:
# sanity check 
cae_vidsegs = set(cae_by_vidseg.keys())

gather_vidsegs = set()
for vid in cae_by_vid.keys():
    for seg_id in cae_by_vid[vid].keys():
        vidseg_id = vid + '_' + seg_id
        gather_vidsegs.add(vidseg_id)
        
assert len(cae_vidsegs) == len(gather_vidsegs)

In [15]:
print(len(cae_vidsegs))
print(len(gather_vidsegs))

4190733
4190733


In [45]:
# write function cae_by_vid to cae_by_vidseg
# TODO: put in cae_dataset/prepare_cae.py
vidseg2data = {}

for vid in cae_by_vid.keys():
    for seg_id in cae_by_vid[vid].keys():
        vidseg_id = vid + '_' + seg_id
        info = {"vid": vid,
                "vid seg": cae_by_vid[vid][seg_id]['vid seg'],
                "time stamp": cae_by_vid[vid][seg_id]['time stamp'],
                "caption": cae_by_vid[vid][seg_id]['caption'],
                "domain": cae_by_vid[vid][seg_id]['domain'],
                "frames": cae_by_vid[vid][seg_id]['all_frames'],
                "verb": cae_by_vid[vid][seg_id]['verbs'][0],
                "nouns": cae_by_vid[vid][seg_id]['all_nouns'][0],
               }
        vidseg2data[vidseg_id] = info

In [12]:
howto100m_dir = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/HowTo100M'
howto_df = pd.read_csv(os.path.join(howto100m_dir, 'HowTo100M_v1.csv'))
task_ids_df = pd.read_csv(os.path.join(howto100m_dir, 'task_ids.csv'), delimiter='\t', names=['task_id', 'task_title'])

In [17]:
# get task_id to task_title information
taskid2title_df = pd.merge(howto_df, task_ids_df, on='task_id')

# get all cae
cae_taskid2title_df = taskid2title_df[taskid2title_df['video_id'].isin(cae_vids)]

In [22]:
cae_tasktitles = set(cae_taskid2title_df['task_title'].to_list())
set(wikihow_tasks).issubset(cae_tasktitles)

False

In [24]:
set(wikihow_tasks).intersection(cae_tasktitles)

set()

#### Prepare Task Table (for CAE test set)

In [None]:
# get a subset where the video ids are in the test set
cae_test_taskid2title_df = taskid2title_df[taskid2title_df['video_id'].isin(test_vids)]

In [20]:
taskid2title_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id,task_title
0,nVbIUDjzWY4,Cars & Other Vehicles,Motorcycles,27,52907,Paint a Motorcycle
1,rwmt7Cbuvfs,Cars & Other Vehicles,Motorcycles,99,52907,Paint a Motorcycle
2,HnTLh99gcxY,Cars & Other Vehicles,Motorcycles,35,52907,Paint a Motorcycle
3,RAidUDTPZ-k,Cars & Other Vehicles,Motorcycles,10,52907,Paint a Motorcycle
4,tYQoPHwNkho,Cars & Other Vehicles,Motorcycles,18,52907,Paint a Motorcycle


In [35]:
len(cae_test_taskid2title_df)

242360

In [34]:
len(cae_taskid2title_df)

338892

#### Comphrehensive analysis (sample 5 tasks across domains) : 

In [22]:
categories = set(cae_taskid2title_df['category_1'].to_list())
domain2tasktitlles = {}

for cat in categories:
    domain2tasktitlles[cat] = list(set(cae_taskid2title_df[cae_taskid2title_df['category_1']==cat]['task_title'].to_list()))

In [24]:
sampled_tasktitles = []
num = 5
total_tasks = 0
for cat in domain2tasktitlles.keys():
    print(cat+':', len(domain2tasktitlles[cat]))
    total_tasks += len(domain2tasktitlles[cat])
    if len(domain2tasktitlles[cat]) > num:
        sampled_tasktitles.extend(random.sample(domain2tasktitlles[cat], num))
print('total tasks:', total_tasks)

Health: 489
Family Life: 56
Personal Care and Style: 573
Sports and Fitness: 357
Cars & Other Vehicles: 1395
Education and Communications: 667
Home and Garden: 6844
Hobbies and Crafts: 8543
Computers and Electronics: 137
Pets and Animals: 930
Holidays and Traditions: 1775
Arts and Entertainment: 199
Food and Entertaining: 13397
total tasks: 35362


In [25]:
len(sampled_tasktitles)

65

In [29]:
# get all videos for selective task titles
sampled_vids = cae_taskid2title_df[cae_taskid2title_df['task_title'].isin(sampled_tasktitles)]['video_id'].to_list()
len(sampled_vids)

831

In [30]:
saved_path = '/mount/projekte/jp-silberer/hyuyang/paprika/S3D_HowTo100M/sampled_vids.txt'
with open(saved_path, 'w') as f:
    for vid in sampled_vids:
        f.write(vid+'\n')

#### For comparing PAPRIKA, get the same WikiHow Task Titles

In [10]:
# sample HowTo100M video id from paprika
sampled_vids = ['8JevEqO_iS0', 'L8kRaQ-IwpM', 'O_sC9gyEER0', 'c0VlxPBSMDs', 'jRkJTww8HR0', 'kHb7POoH_gc']

In [18]:
# none of the provided videos are in CAE dataset
in_cae = []
for vid in sampled_vids:
    if vid in cae_by_vid:
        in_cae.append(vid)
print(in_cae)

[]


In [21]:
paprika_df = taskid2title_df[taskid2title_df['video_id'].isin(sampled_vids)]

In [22]:
paprika_titles = set(paprika_df['task_title'].to_list())

In [23]:
paprika_titles

{'Do a Hair Mask for Frizzy Hair',
 'Grill Tri Tip',
 'Make Pine Needle Tea',
 'Make an Envelope Advent Calendar'}

#### For comparing PAPRIKA, get the subset of CAE test set that share the same WikiHow Task Titles

In [8]:
# find the video id that has a long sequence of video segments
def get_vid2vidseg(test_set):
    test_vid2vidseg = {}
    for vid_seg_id, vid_info in test_set.items():
        vid_id = vid_info['vid']
        if vid_id not in test_vid2vidseg:
            test_vid2vidseg[vid_id] = []
            test_vid2vidseg[vid_id].append(vid_seg_id)
        else:
            test_vid2vidseg[vid_id].append(vid_seg_id)
    return test_vid2vidseg

In [9]:
test_vid2vidseg = get_vid2vidseg(test_set)

In [31]:
# add number of video segment information
cae_test_df = cae_test_taskid2title_df.copy()
num_vid_seg = [len(test_vid2vidseg.get(vid_id)) for vid_id in cae_test_taskid2title_df['video_id'].to_list()]
cae_test_df['num_vid_seg_in_test_set'] = num_vid_seg


# get a subset where the video ids has the paprika titles
subset_cae_test_df = cae_test_df[cae_test_df['task_title'].isin(paprika_titles)]


In [32]:
subset_cae_test_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id,task_title,num_vid_seg_in_test_set
245678,X3HyjpVl-aw,Holidays and Traditions,Christmas,100,106071,Make an Envelope Advent Calendar,4
245680,UNjtGiec1z8,Holidays and Traditions,Christmas,102,106071,Make an Envelope Advent Calendar,1
245681,ExMBGoSUInY,Holidays and Traditions,Christmas,103,106071,Make an Envelope Advent Calendar,3
245701,vVfXpqT-VzA,Holidays and Traditions,Christmas,101,106071,Make an Envelope Advent Calendar,2
245716,KuL4p4P1xbY,Holidays and Traditions,Christmas,112,106071,Make an Envelope Advent Calendar,7


In [None]:
#### Quick test: For comparing PAPRIKA on the task title "Grill Tri Tip". 

In [36]:
# get the video ids for the task title "Grill Tri Tip"
task_title = 'Grill Tri Tip'
video_ids = cae_taskid2title_df[cae_taskid2title_df['task_title'].isin([task_title])]['video_id'].to_list()
video_ids

['kzaCC0imIDw',
 'b7dPM26SIBU',
 '6T1jb4ekdGA',
 'Aj3FDRfWdQk',
 '0lHyjDnVGew',
 'ATTK3Z3N0OQ',
 'Zcc2lwRd4dU',
 'HMt9LgYKkCU',
 'rXNtObw3s6Q',
 'mlRFibSI_uw',
 '1-exqyHuliQ',
 'kXsUu6eMjnc']

In [40]:
# get the vid seg ids
selected_vidseg = []
for vid in video_ids:
    vid_segs = cae_by_vid.get(vid)
    for vid_seg in vid_segs.keys():
        vid_seg_id = vid + '_' + vid_seg
        selected_vidseg.append(vid_seg_id)
print(selected_vidseg[:5])
print(len(selected_vidseg))

['kzaCC0imIDw_30', 'kzaCC0imIDw_128', 'kzaCC0imIDw_121', 'kzaCC0imIDw_8', 'kzaCC0imIDw_13']
236


In [44]:
saved_path = '/mount/projekte/jp-silberer/hyuyang/paprika/S3D_HowTo100M/vid_segs.txt'
with open(saved_path, 'w') as f:
    for vid in selected_vidseg:
        f.write(vid+'\n')

#### Get Task Table with More Unseen Video Segments (for CAE test set)

In [38]:
cae_procedural_test_df = cae_test_df[cae_test_df['num_vid_seg_in_test_set']>5]

In [39]:
cae_procedural_test_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id,task_title,num_vid_seg_in_test_set
19,tTx8mid-aq4,Cars & Other Vehicles,Motorcycles,116,52907,Paint a Motorcycle,6
56,69sLb-hKrUo,Cars & Other Vehicles,Motorcycles,124,52907,Paint a Motorcycle,12
63,3PvbY30DMP8,Cars & Other Vehicles,Motorcycles,125,52907,Paint a Motorcycle,6
66,-qhlLvJFjxM,Cars & Other Vehicles,Motorcycles,11,52907,Paint a Motorcycle,8
258,NVKln6dCeyo,Cars & Other Vehicles,Cars,55,71845,Polish an Aftermarket Header,11


In [46]:
def to_dataframe(model_path):
    results = json.load(open(model_path, 'r'))
    df = pd.DataFrame.from_dict(results).T
    return df

#### Get Result Verb Dense Domains

In [19]:
# result verb dense domains
domains = ['Food and Entertaining', 'Sports and Fitness', 'Cars & Other Vehicles', 
                       'Pets and Animals', 'Home and Garden', 'Hobbies and Crafts']

food = cae_procedural_test_df[cae_procedural_test_df['category_1']=='Food and Entertaining']
home = cae_procedural_test_df[cae_procedural_test_df['category_1']=='Home and Garden']
craft = cae_procedural_test_df[cae_procedural_test_df['category_1']=='Hobbies and Crafts']

In [73]:
food.head()
print(len(food))
print(len(home))
print(len(craft))

10953
4084
4657


### Steps for checking action step sequences

- [x] Randomly take 1 taskid from result verb dense domains (check if the title requires procedural knowledge)
- [x] Find task id to video ids 
    - (Nonseq: for each video id, find WikiHow corresponding title id)
- [x] Get the sequence of actions of CAE, model prediction, and original Howto100M (only on verbs)

In [20]:
def get_random_title(domain_df):
    task_titles = domain_df['task_title'].to_list()
    title = random.choice(task_titles)
    return title
    
def get_top_rank(title_df, n=5):
    top_vids = []
    if len(title_df) < n:
        n = len(title_df)
    
    rank2vid = {}
    ranks = title_df['rank'].to_list()
    vids =  title_df['video_id'].to_list()

    ranks, videos = zip(*sorted(zip(ranks, vids)))
    return videos[:n]

def get_action_object_sequence(data_by_vid, vid):
    act_obj_seq = []
    vid_seg_seq = []
    vid_seg_dict = data_by_vid.get(vid)
    if vid_seg_dict is None:
        return act_obj_seq
    
    for idx, (k, v) in enumerate(vid_seg_dict.items()):
        step = f'step_action{idx}:'
        verb = vid_seg_dict[k]['verbs'][0]
        nouns = vid_seg_dict[k]['all_nouns'][0]
        vid_seg_id = vid + '_' + str(vid_seg_dict[k]['vid seg'])
        
        act_obj_seq.append(step + '(' + verb + ', ' + '[' + ', '.join(nouns) + ']' + ')')
        vid_seg_seq.append(vid_seg_id)
    
    return act_obj_seq, vid_seg_seq

# def get_task_action_sequence(task_name, data_by_vid, vid):
#     # TODO: how to get the video title name
#     act_obj_seq, vid_seg_seq = get_action_object_sequence(data_by_vid, vid)
#     return act_obj_seq, vid_seg_seq

In [21]:
def get_prediction_seq(pred_df, vid_seg_seq):
    preds = []
    for idx, vid_seg_id in enumerate(vid_seg_seq):
        step = f'step_action{idx}:'
        try:
            pred_verb = pred_df.loc[vid_seg_id, 'prediction']
            preds.append(step + pred_verb)
        except KeyError:
            preds.append('N/A')
    return preds    

#### GET CAE pretrained model prediction files 

In [47]:
# all model results on MULTIMODAL inference setting
root = '/mount/arbeitsdaten/jp-silberer/ernie_vil/model/cae'

VL_full_model_path = os.path.join(root, 'pretrain_cae_mam_verb_random_joint_100_val_10/MAP/100_per/results_test/results_97500_text_visual_all.json')
VL_full_df = to_dataframe(VL_full_model_path)
# VL_full_df.reset_index(inplace=True)

VL_full_sub_model_path = os.path.join(root, 'pretrain_cae_mam_verb_random_joint_sub_100_val_10/MAP/100_per/results_test/results_92500_text_visual_all.json')
VL_full_sub_df = to_dataframe(VL_full_sub_model_path)

VL_multi_model_path = os.path.join(root, 'pretrain_cae_mam_mem_100/MAP/100_per/results_test/results_69500_text_visual_all.json')
VL_multi_df = to_dataframe(VL_multi_model_path)

### Conditioned on the sampled task titles of Paprika

In [55]:
# paprika_titles
t = get_random_title(subset_cae_test_df)
t_df = subset_cae_test_df[subset_cae_test_df['task_title']==t]
t2vids = get_top_rank(t_df, n=10)

print('WikiHow Task Name:', t)
for vid in t2vids:
    print('video id:', vid)
    act_obj_seq, vid_seg_seq = get_action_object_sequence(cae_by_vid, vid)
    print('CAE Reference Action - Object Sequences:')
    print(act_obj_seq)
#     print(vid_seg_seq)
    print('\n')
    
    print('MAM_VL (input: multimodal):')
    print(get_prediction_seq(VL_full_df, vid_seg_seq))
    print('\n')
    
    print('MULTI-VL (input: multimodal):')
    print(get_prediction_seq(VL_full_sub_df, vid_seg_seq))
    print('\n')
    
    print('MAM_L (input: unimodal):')
    print(get_prediction_seq(VL_multi_df, vid_seg_seq))
    print('\n')

WikiHow Task Name: Make an Envelope Advent Calendar
video id: zc0TCgLuJRU
CAE Reference Action - Object Sequences:
['step_action0:(make, [variation, envelope])', 'step_action1:(make, [origami])', 'step_action2:(cut, [])', 'step_action3:(roll, [it, we])', 'step_action4:(roll, [edge, it])', 'step_action5:(roll, [it])', 'step_action6:(slide, [tip, they])', 'step_action7:(put, [card, what])', 'step_action8:(put, [piece])', 'step_action9:(pinch, [side])', 'step_action10:(set, [])']


MAM_VL (input: multimodal):
['step_action0:put', 'N/A', 'N/A', 'step_action3:make', 'step_action4:throw', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']


MULTI-VL (input: multimodal):
['step_action0:put', 'N/A', 'N/A', 'step_action3:put', 'step_action4:put', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']


MAM_L (input: unimodal):
['step_action0:put', 'N/A', 'N/A', 'step_action3:make', 'step_action4:throw', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']


video id: PQl6A18q09Y
CAE Reference Action - Object Sequences:
['step_

### Conditioned on a randomly selected task title across domains

#### food domain

In [110]:
# food domain
food_t = get_random_title(food)
food_t = 'Make Parsley Pesto'
food_t_df = food[food['task_title']==food_t]
food_t2vids = get_top_rank(food_t_df, n=10)

print('WikiHow Task Name:', food_t)
for vid in food_t2vids:
    print('video id:', vid)
    act_obj_seq, vid_seg_seq = get_action_object_sequence(cae_by_vid, vid)
    print('CAE Reference Action - Object Sequences:')
    print(act_obj_seq)
#     print(vid_seg_seq)
    print('\n')
    
    print('MAM_VL (input: multimodal):')
    print(get_prediction_seq(VL_full_df, vid_seg_seq))
    print('\n')
    
    print('MULTI-VL (input: multimodal):')
    print(get_prediction_seq(VL_full_sub_df, vid_seg_seq))
    print('\n')
    
    print('MAM_L (input: unimodal):')
    print(get_prediction_seq(VL_multi_df, vid_seg_seq))
    print('\n')



WikiHow Task Name: Make Parsley Pesto
video id Nv_acGiCu0M
CAE Reference Action - Object Sequences:
['step_action0:(make, [])', 'step_action1:(make, [puree, one, it, it])', 'step_action2:(make, [vinaigrette, salad])', 'step_action3:(break, [shell, they])', 'step_action4:(squeeze, [flavor, lemon, lemon])', 'step_action5:(blend, [thing, leave])', 'step_action6:(mix, [parsley])', 'step_action7:(mix, [meat, salad, bread, stick, risotto])', 'step_action8:(peel, [parsley, garlic])', 'step_action9:(shred, [product, cheese, it])', 'step_action10:(confuse, [the])']


MAM_VL (input: multimodal):
['N/A', 'N/A', 'N/A', 'step_action3:stick', 'step_action4:peel', 'step_action5:rinse', 'step_action6:spread', 'step_action7:wrapped', 'step_action8:going', 'N/A', 'N/A']


MULTI-VL (input: multimodal):
['N/A', 'N/A', 'N/A', 'step_action3:are', 'step_action4:put', 'step_action5:cut', 'step_action6:turn', 'step_action7:made', 'step_action8:going', 'N/A', 'N/A']


MAM_L (input: unimodal):
['N/A', 'N/A', 'N/

#### craft domain

In [49]:
# craft domain
craft_t = get_random_title(craft)
craft_t = 'Make a Ladder Golf Game'
craft_t_df = craft[craft['task_title']==craft_t]
craft_t2vids = get_top_rank(craft_t_df, cae_by_vid, n=3)
get_action_object_sequence(cae_by_vid, vid)


Make a Ladder Golf Game
DkjiWqflqpk
['make', 'glue', 'build', 'build', 'build', 'build', 'cut', 'grill', 'paint', 'spray', 'dry']
mmMWOC_CNDo
['make', 'throw', 'throw', 'throw', 'pull', 'wrap', 'wrap', 'wrap', 'set', 'shoot', 'shoot', 'shoot']
MhF9XU-9rZU
['throw', 'throw', 'throw', 'knock', 'knock', 'fix', 'set', 'spin']


#### home domain

In [50]:
home_t = get_random_title(home)
home_t = 'Remove Beer Stains from Fabric'
home_t_df = home[home['task_title']==home_t]
home_t2vids = get_top_rank(home_t_df, cae_by_vid, n=3)
get_action_object_sequence(cae_by_vid, vid)

Remove Beer Stains from Fabric
Qa7P5qKf1e4
['turn', 'squeeze', 'squeeze', 'set', 'soak']
wk_Vmywr-p0
['make', 'build', 'mix', 'mix', 'wash', 'wash']
i0WG4ZXODS4
['break', 'tape', 'rub', 'wash', 'collect']


### CAE pretrained model Action Sequences

'put'