In [1]:
import pickle
import numpy as np
import json
import os
from collections import defaultdict
import collections
import pandas as pd

In [2]:
# Common root of the packaged datasets; every directory below is derived from it.
pkg_data_root = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/pkg_data'

# WikiHow corpora: the partial subset and the full dump.
wikihow_part_dir = pkg_data_root + '/wikiHow/partial'
wikihow_full_dir = pkg_data_root + '/wikiHow/full'

# HowTo100M and CAE dataset directories.
howto_dir = pkg_data_root + '/howto100m'
cae_dir = pkg_data_root + '/cae'

# Howto100M/CAE tasktitle files

### Howto100m

In [18]:
# Video-level metadata of HowTo100M (video id, categories, rank, task id).
howto_df = pd.read_csv(os.path.join(howto_dir, 'HowTo100M_v1.csv'))
# task_ids.csv is read as tab-separated; because explicit column names are passed,
# pandas treats the first line of the file as data rather than as a header.
task_ids_df = pd.read_csv(os.path.join(howto_dir, 'task_ids.csv'), delimiter='\t', names=['task_id', 'task_title'])

In [5]:
# Preview the first rows of the HowTo100M video metadata.
howto_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id
0,nVbIUDjzWY4,Cars & Other Vehicles,Motorcycles,27,52907
1,CTPAZ2euJ2Q,Cars & Other Vehicles,Motorcycles,35,109057
2,rwmt7Cbuvfs,Cars & Other Vehicles,Motorcycles,99,52907
3,HnTLh99gcxY,Cars & Other Vehicles,Motorcycles,35,52907
4,EyP3HVhg1u0,Cars & Other Vehicles,Motorcycles,95,52906


In [11]:
# Number of (task_id, task_title) rows read from task_ids.csv.
print(len(task_ids_df))

123520


In [19]:
# Attach the task title to every video row by joining the two tables on task_id
# (default inner join: rows without a matching task_id on both sides are dropped).
howto_taskid2title_df = howto_df.merge(task_ids_df, on='task_id')
howto_taskid2title_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id,task_title
0,nVbIUDjzWY4,Cars & Other Vehicles,Motorcycles,27,52907,Paint a Motorcycle
1,rwmt7Cbuvfs,Cars & Other Vehicles,Motorcycles,99,52907,Paint a Motorcycle
2,HnTLh99gcxY,Cars & Other Vehicles,Motorcycles,35,52907,Paint a Motorcycle
3,RAidUDTPZ-k,Cars & Other Vehicles,Motorcycles,10,52907,Paint a Motorcycle
4,tYQoPHwNkho,Cars & Other Vehicles,Motorcycles,18,52907,Paint a Motorcycle


In [20]:
# task_id -> task title, taken from the merged table (a repeated task_id keeps its last row).
howto_taskid2title = dict(zip(howto_taskid2title_df['task_id'], howto_taskid2title_df['task_title']))
# Reverse lookup keyed by the 'How to '-prefixed title, so the keys use the same
# 'How to ...' form as the WikiHow titles they are compared against later.
# Titles shared by several task ids collapse to a single entry (the last id wins).
howto_title2taskid = {'How to ' + title: taskid for taskid, title in howto_taskid2title.items()}

In [21]:
# Unique 'How to ...' task names of HowTo100M (iterating a dict yields its keys).
howto_tasknames = set(howto_title2taskid)
len(howto_tasknames)

25312

### CAE

In [6]:
# Load the CAE clip annotations; the context manager guarantees the file handle is closed.
with open(os.path.join(cae_dir, 'single_result_verbs_video_clips_by_vid.json'), 'r') as f:
    cae_by_vid = json.load(f)
# The top-level keys are used as video ids when filtering the HowTo100M tables below.
cae_vids = list(cae_by_vid.keys())

In [10]:
# Keep only the HowTo100M metadata rows whose video_id also appears in CAE.
cae_meta_df = howto_df[howto_df['video_id'].isin(cae_vids)]

In [12]:
# Persist the CAE subset of the metadata; index=False omits the DataFrame index from the file.
with open(os.path.join(cae_dir,'cae_video_meta.csv'), 'w') as f:
    cae_meta_df.to_csv(f, index=False)

In [7]:
# Restrict the merged (video, task title) table to the videos that are part of CAE.
cae_taskid2title_df = howto_taskid2title_df[howto_taskid2title_df['video_id'].isin(cae_vids)]
cae_taskid2title_df.head()

Unnamed: 0,video_id,category_1,category_2,rank,task_id,task_title
3,RAidUDTPZ-k,Cars & Other Vehicles,Motorcycles,10,52907,Paint a Motorcycle
5,DUxVMAebfrM,Cars & Other Vehicles,Motorcycles,131,52907,Paint a Motorcycle
9,UHA1Q5JK3_A,Cars & Other Vehicles,Motorcycles,113,52907,Paint a Motorcycle
19,tTx8mid-aq4,Cars & Other Vehicles,Motorcycles,116,52907,Paint a Motorcycle
28,BYZcEXxdTjU,Cars & Other Vehicles,Motorcycles,103,52907,Paint a Motorcycle


In [14]:
cae_tasktitles = set(cae_taskid2title_df['task_title'])
# All HowTo100M task titles (without the 'How to ' prefix), so both sets use the same form.
howto100m_tasktitles = set(howto_taskid2title.values())
print('CAE unique tasks:', len(cae_tasktitles))
# The set difference holds titles that exist in HowTo100M but have no CAE video.
print('HowTo100M tasks not in CAE:', len(howto100m_tasktitles - cae_tasktitles))

CAE unique tasks: 25258
Not in Howto100M 54


## Domains - Tasktitles Files

In [7]:
def get_domain2tasktitles(cae_taskid2title_df):
    """Group the distinct task titles of a metadata table by top-level category.

    Args:
        cae_taskid2title_df: DataFrame with a 'category_1' and a 'task_title' column.

    Returns:
        dict mapping each distinct 'category_1' value to a list of the distinct
        'task_title' values found in rows of that category. The order inside
        each list comes from a set and is therefore not meaningful.
    """
    domain_column = cae_taskid2title_df['category_1']
    return {
        domain: list(set(cae_taskid2title_df[domain_column == domain]['task_title'].to_list()))
        for domain in set(domain_column.to_list())
    }

def get_tasktitles2domain(domain2tasktitlles):
    """Invert a domain -> titles mapping into a 'How to <title>' -> domain mapping.

    Args:
        domain2tasktitlles: dict mapping a domain to an iterable of task titles
            (titles without the 'How to ' prefix).

    Returns:
        dict keyed by 'How to ' + title. If a title is listed under several
        domains, the domain visited last wins.
    """
    return {
        'How to ' + title: domain
        for domain, titles in domain2tasktitlles.items()
        for title in titles
    }


# Build both lookup directions for the CAE subset:
# domain -> task titles, and 'How to <title>' -> domain.
domain2tasktitlles = get_domain2tasktitles(cae_taskid2title_df)
tasktitles2domain = get_tasktitles2domain(domain2tasktitlles)

# WikiHow files

In [8]:
# Applicable for Global
def get_taskid2stepid(wikihow):
    """Assign a running global step id to every (article, step position) pair.

    Step ids start at 0 and increase in article order, then in step order
    within each article.

    Args:
        wikihow: sequence of articles indexed 0..len-1; each article is a
            sequence of step records that expose a 'headline' string.

    Returns:
        Tuple ``(article_po_to_step_id, step_id_to_headline)``:
        - article_po_to_step_id: dict mapping (article_id, step index) to the
          global step id.
        - step_id_to_headline: dict mapping the global step id to the step
          headline with leading/trailing newline characters stripped.
    """
    # Plain dicts: the previous factory-less defaultdict() behaved exactly like a dict.
    article_po_to_step_id = {}
    step_id_to_headline = {}

    step_id = 0
    for article_id in range(len(wikihow)):
        article = wikihow[article_id]
        for article_step_idx in range(len(article)):
            article_po_to_step_id[(article_id, article_step_idx)] = step_id
            step_id_to_headline[step_id] = article[article_step_idx]['headline'].strip("\n")
            step_id += 1
    return article_po_to_step_id, step_id_to_headline

## Full WikiHow version (num_tasks: 215,365)

In [9]:
# Load the full WikiHow dump (wikihowAll.csv) into a DataFrame.
wikihow_all_df = pd.read_csv(os.path.join(wikihow_full_dir, 'wikihowAll.csv'))

In [10]:
# Drop every row that has a missing value in any column, then collect the remaining titles.
wikihow_full_df = wikihow_all_df.dropna()
wikihow_full_tasknames = wikihow_full_df['title'].to_list()
print(len(wikihow_full_tasknames))
wikihow_full_df.head()

214294


Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


## Partial WikiHow version (num_tasks: 1053, used by Paprika)

In [6]:
# Inspect the already extracted step features.
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open(os.path.join(wikihow_part_dir, 'step_headlines/s3d_text_feats', 'step_embeddings.pickle'), 'rb') as f:
    wikihow_step_feats = pickle.load(f)

In [7]:
# The WikiHow version Paprika uses
with open(os.path.join(wikihow_part_dir, 'step_label_text.json'), 'r') as f:
    wikihow = json.load(f)

with open(os.path.join(wikihow_part_dir, 'step_label.json'), 'r') as f:
    wikihow_steps = json.load(f)

# Read the tab-separated "<article id>\t<title>" table once and derive both
# lookup directions from the same parsed rows.
with open(os.path.join(wikihow_part_dir, 'article_id_to_title.txt'), 'r') as f:
    _article_id_title_rows = [line.rstrip().split('\t') for line in f]

article_id_to_wikihow_taskname = {int(row[0]): row[1] for row in _article_id_title_rows}
wikihow_taskname_to_article_id = {row[1]: int(row[0]) for row in _article_id_title_rows}

In [8]:
# Task names of the partial WikiHow subset (keys of the title -> article id lookup).
wikihow_tasknames = list(wikihow_taskname_to_article_id.keys())
print(len(wikihow_tasknames))

1053


### Misc.

In [18]:
# Percentage of HowTo100M task names ('How to ...' form) that are literally equal
# to a title in the full and in the partial WikiHow version, respectively.
print('?% of HowTo100M tasks has exact overlap with')
print('wikihow FULL:', len(howto_tasknames & set(wikihow_full_tasknames))/len(howto_tasknames)*100)
print('wikihow PARTIAL:', len(howto_tasknames & set(wikihow_tasknames))/len(howto_tasknames)*100)

?% of HowTo100M tasks has exact overlap with
wikihow FULL: 67.4581226295828
wikihow PARTIAL: 2.8168457648546146


In [19]:
# Task names shared verbatim between HowTo100M and each WikiHow version.
joint_full = howto_tasknames & set(wikihow_full_tasknames)
joint_partial = howto_tasknames & set(wikihow_tasknames)

In [21]:
# HowTo100M task ids of the titles that also exist verbatim in WikiHow
# (a set can be iterated directly, so no intermediate list is needed).
howto_joint_partial_taskid = [howto_title2taskid.get(title) for title in joint_partial]
howto_joint_full_taskid = [howto_title2taskid.get(title) for title in joint_full]

In [22]:
# TO USE: for evaluating the clustering quality in task similarities
# Peek at the first few matched HowTo100M task ids.
howto_joint_partial_taskid[:3]

[74650, 44831, 95132]

### Task 2 Steps (Global Graph)

In [10]:
# Look up the step headlines of one article by its title.
# .get returns None for an unknown title, in which case the indexing below fails.
a_id = wikihow_taskname_to_article_id.get('How to Plant Parsnips')
wikihow_steps[a_id]

['Prepare the site.',
 'Lay a piece of string on the soil as a guide to make sure you dig the row in a straight line.',
 'Use a cane, or other tool, to dig a narrow hole that you will place the seeds in.',
 'Using a hoe, gently cover the seeds with soil.',
 'If the soil is dry, water the seeds in.',
 'To protect your plants from late frosts in Spring, build a frame to cover the drills.']

In [14]:
# Global step numbering for the partial WikiHow subset:
# (article id, step position) -> step id, and step id -> headline.
article_po_to_step_id, step_id_to_headline = get_taskid2stepid(wikihow)

### Node 2 Steps

In [16]:
# node2step & step2node (defaultdict)
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open(os.path.join(wikihow_part_dir, 'node2step.pickle'), 'rb') as f:
    node2step = pickle.load(f)

with open(os.path.join(wikihow_part_dir, 'step2node.pickle'), 'rb') as f:
    step2node = pickle.load(f)

In [17]:
# Sizes of the two mappings; the trailing numbers are the values noted by the author.
num_steps = len(step2node) # 10,588
# TODO: quality check if the clustering of steps makes sense
num_nodes = len(node2step) # 10,388

# (Outdated) Obtain the video subset for analysis
- select 5 tasks across 13 domains, in total 65 task titles, resulting in 831 videos
- ?% of CAE tasks have exact overlap with
    - wikihow FULL 67.49
    - wikihow PARTIAL 2.82

In [8]:
def get_videoid2taskname(video_meta_df, task_ids_df):
    """Build video-id lookups for the task name and the task id.

    Args:
        video_meta_df: DataFrame with a 'video_id' and a 'task_id' column.
        task_ids_df: DataFrame whose first column holds task ids and whose
            second column holds the matching task titles.

    Returns:
        Tuple ``(video_id_to_task_name, video_id_to_task_id)`` where the task
        name is the title prefixed with 'How to '.

    Raises:
        KeyError: if a video refers to a task id missing from ``task_ids_df``.
    """
    # .iloc selects the columns by position explicitly. Plain integer keys on a
    # label-indexed row (row[0]) rely on a deprecated positional fallback.
    task_id_to_task_name_original_map = dict(
        zip(task_ids_df.iloc[:, 0], task_ids_df.iloc[:, 1]))

    video_ids = video_meta_df['video_id']
    task_ids = video_meta_df['task_id']

    video_id_to_task_id = dict(zip(video_ids, task_ids))
    video_id_to_task_name = {
        video_id: 'How to ' + task_id_to_task_name_original_map[task_id]
        for video_id, task_id in zip(video_ids, task_ids)
    }

    return video_id_to_task_name, video_id_to_task_id

# Build the lookups for every video listed in the HowTo100M metadata.
video_id_to_task_name, video_id_to_task_id = get_videoid2taskname(howto_df, task_ids_df)

In [10]:
# One sampled video id per line; trailing whitespace (including the newline) is removed.
saved_path = '/mount/projekte/jp-silberer/hyuyang/paprika/S3D_HowTo100M/sampled_vids.txt'
with open(saved_path, 'r') as f:
    sampled_vids = [line.rstrip() for line in f]

In [11]:
# Distinct task ids covered by the sampled videos (None would appear for an unknown video id).
sampled_taskids = {video_id_to_task_id.get(vid) for vid in sampled_vids}
len(sampled_taskids)

65

In [46]:
# sampled videos across domains
# Count the sampled CAE videos per top-level category (category_1), most frequent first.
domains = cae_taskid2title_df[cae_taskid2title_df['video_id'].isin(sampled_vids)]['category_1'].to_list()
collections.Counter(domains).most_common()

[('Food and Entertaining', 208),
 ('Home and Garden', 139),
 ('Hobbies and Crafts', 126),
 ('Cars & Other Vehicles', 56),
 ('Arts and Entertainment', 49),
 ('Pets and Animals', 45),
 ('Education and Communications', 41),
 ('Sports and Fitness', 40),
 ('Family Life', 36),
 ('Holidays and Traditions', 26),
 ('Personal Care and Style', 23),
 ('Health', 23),
 ('Computers and Electronics', 19)]

# PKG

## Decoding PKG helper functions

In [3]:
def find_node_transitions(adj_matrix, start_node):
    """Enumerate paths from ``start_node`` to sink nodes with a depth-first search.

    A node is followed by every neighbor whose matrix entry is truthy and that
    is not already on the current path, so no path visits a node twice. A path
    is recorded only when its last node has an all-zero row (no outgoing
    edges at all).

    Args:
        adj_matrix: square array; ``adj_matrix[i][j]`` being truthy means there
            is an edge from node i to node j.
        start_node: index of the node every path starts from.

    Returns:
        List of paths, each a list of node indices beginning with ``start_node``.
    """
    on_path = [False] * adj_matrix.shape[0]
    found_paths = []
    current_path = []

    def explore(node):
        on_path[node] = True
        current_path.append(node)

        outgoing = adj_matrix[node]
        if not np.any(outgoing):
            # Sink reached: store a copy, the working list keeps being mutated.
            found_paths.append(list(current_path))

        for neighbor, weight in enumerate(outgoing):
            if weight and not on_path[neighbor]:
                explore(neighbor)

        # Backtrack so the node can be reused on a different path.
        current_path.pop()
        on_path[node] = False

    explore(start_node)
    return found_paths

def find_node_transitions_with_end_node(adj_matrix, start_node, end_node):
    """Enumerate depth-first paths from ``start_node`` that stop at ``end_node``.

    A path is recorded when it reaches ``end_node`` or a sink (a node whose
    row is all zero). The search does not continue past ``end_node``, so no
    recorded path contains nodes after it.

    Args:
        adj_matrix: square array; a truthy ``adj_matrix[i][j]`` is an edge i -> j.
        start_node: index of the node every path starts from.
        end_node: index of the node at which a path is considered complete.

    Returns:
        List of paths (lists of node indices). Paths that end in a sink other
        than ``end_node`` are included as well.
    """
    num_nodes = adj_matrix.shape[0]  # Use the number of rows from the shape of the matrix
    transitions = []
    visited = [False] * num_nodes

    def dfs(node, transition):
        visited[node] = True
        transition.append(node)

        if node == end_node or not np.any(adj_matrix[node]):
            transitions.append(transition[:])  # Add a copy of the transition to the list

        # Only expand nodes other than the end node; expanding the end node
        # would record paths that run through and beyond it.
        if node != end_node:
            for neighbor, connected in enumerate(adj_matrix[node]):
                if connected and not visited[neighbor]:
                    dfs(neighbor, transition)

        visited[node] = False  # Free the node for other paths
        transition.pop()  # Remove the last node to backtrack and explore other options

    dfs(start_node, [])

    return transitions

def find_node_transitions_with_threshold(adj_matrix, start_node, end_node, threshold=0.9):
    """Enumerate depth-first paths to ``end_node`` using only strong edges.

    An edge i -> j is followed only if ``adj_matrix[i][j] > threshold`` and j
    is not already on the current path. A path is recorded when it reaches
    ``end_node`` or a node whose row is all zero. The search does not continue
    past ``end_node``.

    Note that the sink test looks at the raw row: a node whose edges are all
    non-zero but below the threshold is a dead end that is not recorded.

    Args:
        adj_matrix: square array of edge scores.
        start_node: index of the node every path starts from.
        end_node: index of the node at which a path is considered complete.
        threshold: an edge is used only if its score is strictly greater.

    Returns:
        List of paths (lists of node indices).
    """
    num_nodes = adj_matrix.shape[0]  # Use the number of rows from the shape of the matrix
    transitions = []
    visited = [False] * num_nodes

    def dfs(node, transition):
        visited[node] = True
        transition.append(node)

        if node == end_node or not np.any(adj_matrix[node]):
            transitions.append(transition[:])  # Add a copy of the transition to the list

        # Do not expand the end node, otherwise paths running beyond it get recorded.
        if node != end_node:
            for neighbor, connected in enumerate(adj_matrix[node]):
                if connected > threshold and not visited[neighbor]:
                    dfs(neighbor, transition)

        visited[node] = False  # Free the node for other paths
        transition.pop()  # Remove the last node to backtrack and explore other options

    dfs(start_node, [])

    return transitions


def find_node_transitions_with_depth(adj_matrix, start_node, depth_count):
    """Enumerate depth-first paths from ``start_node`` that are cut at a maximum depth.

    The start node has depth 0. A path is recorded, and not extended further,
    as soon as its last node is a sink (all-zero row) or sits at depth
    ``depth_count`` or deeper.

    Args:
        adj_matrix: square array; a truthy ``adj_matrix[i][j]`` is an edge i -> j.
        start_node: index of the node every path starts from.
        depth_count: number of edges after which a path is cut.

    Returns:
        List of paths (lists of node indices).
    """
    # TODO: it would be better to take the score into account
    on_path = [False] * adj_matrix.shape[0]
    collected = []

    def walk(node, path, depth):
        on_path[node] = True
        path.append(node)

        if not np.any(adj_matrix[node]) or depth >= depth_count:
            # Terminal for this search: keep a copy and do not descend further.
            collected.append(list(path))
        else:
            for neighbor, weight in enumerate(adj_matrix[node]):
                if weight and not on_path[neighbor]:
                    walk(neighbor, path, depth + 1)

        # Backtrack in both cases so other branches can reuse the node.
        on_path[node] = False
        path.pop()

    walk(start_node, [], depth=0)
    return collected

def find_node_transitions_within_range(adj_matrix, start_node, node_range=None):
    """Enumerate depth-first paths to sink nodes, restricted to a set of allowed nodes.

    Args:
        adj_matrix: square array; a truthy ``adj_matrix[i][j]`` is an edge i -> j.
        start_node: index of the node every path starts from (it is not
            required to be part of ``node_range``).
        node_range: iterable of node indices a path may step onto. ``None``
            (the default) means no restriction.

    Returns:
        List of paths (lists of node indices). A path is recorded only when its
        last node has an all-zero row; a node whose neighbors are all outside
        ``node_range`` is a dead end that is not recorded.
    """
    num_nodes = adj_matrix.shape[0]  # Use the number of rows from the shape of the matrix
    transitions = []
    visited = [False] * num_nodes
    # A set gives constant-time membership tests; None disables the filter.
    allowed = None if node_range is None else set(node_range)

    def dfs(node, transition):
        visited[node] = True
        transition.append(node)

        if not np.any(adj_matrix[node]):
            transitions.append(transition[:])  # Add a copy of the transition to the list
        else:
            for neighbor, connected in enumerate(adj_matrix[node]):
                if not connected or visited[neighbor]:
                    continue
                if allowed is not None and neighbor not in allowed:
                    continue
                dfs(neighbor, transition)

        visited[node] = False
        transition.pop()  # Remove the last node to backtrack and explore other options

    dfs(start_node, [])

    return transitions


### Quick Test:

In [10]:
# Quick test
# Example adjacency matrix as a NumPy array
# (7 nodes, binary edges; nodes 4 and 6 have all-zero rows, i.e. no outgoing edges)
adj_matrix = np.array([
    [0, 1, 0, 0, 0, 0, 0],
    [1, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 0, 0]
])

# Weighted variant of the 7-node graph above.
# NOTE(review): this value is immediately replaced by the 4x4 matrix assigned to the
# same name below, so it is never used by the calls in this cell.
score_adj_matrix = np.array([
    [0, 1, 0, 0, 0, 0, 0],
    [1, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0.5, 1, 0.7],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 0, 0]
])

# Dense 4x4 score matrix; this is the matrix actually passed to the call below.
score_adj_matrix = np.array([
    [0.         , 0.96685672,  0.96839132, 0.85574313],
    [0.97175242, 0.         , 0.988693,   0.90284238],
    [0.95158719, 1.         , 0.      ,   0.87694331],
    [0.84444039, 0.90408969, 0.87373856, 0.        ]
])

start_node = 0
end_node = 3
# NOTE(review): node 4 does not exist in the 4x4 matrix; node_range is only used
# by the commented-out within-range call at the end of this cell.
node_range = [1, 2, 3, 4]

# transitions = find_node_transitions(score_adj_matrix, start_node)
# print("Node Transitions with Start Node:", transitions)

# FIX ME, 6 should not appear
# transitions = find_node_transitions_with_end_node(score_adj_matrix, start_node, end_node)
# print("Node Transitions with Start and End Node:", transitions)

# Only edges with a score strictly above 0.9 are followed.
transitions = find_node_transitions_with_threshold(score_adj_matrix, start_node, end_node, threshold=0.9)
print("Node Transitions with Start and End Node + Threshold:", transitions)


# transitions = find_node_transitions_with_depth(adj_matrix, start_node, depth_count=2)
# print("Node Transitions with Depth Count:", transitions)

# # FIX ME, [3, 6] should not happen
# transitions = find_node_transitions_within_range(score_adj_matrix, start_node, node_range=node_range)
# print("Node Transitions within Node Range:", transitions)

node 0
neighbor 1
connected 0.96685672
node 1
neighbor 2
connected 0.988693
node 2
neighbor 3
connected 0.90284238
node 3
neighbor 2
connected 0.96839132
node 2
neighbor 1
connected 1.0
node 1
neighbor 3
connected 0.90284238
node 3
Node Transitions with Start and End Node + Threshold: [[0, 1, 3], [0, 2, 1, 3]]


## Helper Functions For Global 

In [63]:
# Get Step Sequences from Node Transitions
def get_step_sequence_by_node_sequence(node_sequences, node2step, step_id_to_headline=None):
    """Translate node sequences into sequences of the steps behind each node.

    Args:
        node_sequences: iterable of node-id sequences (e.g. decoded transitions).
        node2step: mapping from a node id to the steps attached to that node.
        step_id_to_headline: optional mapping from step id to headline. When
            given, ``node2step[node]`` must be an iterable of step ids and each
            id is replaced by its headline. When omitted, the ``node2step``
            values are returned unchanged.

    Returns:
        One list per input sequence, holding one entry per node: either
        ``node2step[node]`` itself or the list of headlines of its steps.
    """
    muliple_step_sequences = []
    for trans_seq in node_sequences:
        if step_id_to_headline is None:
            mapped_step_sequence = [node2step[n] for n in trans_seq]
        else:
            mapped_step_sequence = [
                [step_id_to_headline[s] for s in node2step[n]] for n in trans_seq
            ]
        muliple_step_sequences.append(mapped_step_sequence)
    return muliple_step_sequences

def get_step_headlines_sequence_by_step_ids(step_sequences, step_id_to_headline):
    """Replace every step id in a collection of step sequences by its headline.

    Args:
        step_sequences: iterable of step-id sequences.
        step_id_to_headline: mapping from step id to headline text.

    Returns:
        List with one list of headlines per input sequence, in the same order.

    Raises:
        KeyError: if a step id has no entry in ``step_id_to_headline``.
    """
    # Collect into the list that is returned; the previous version appended to
    # an undefined name and could never return any headlines.
    muliple_step_headlines = []
    for trans_seq in step_sequences:
        headlines = [step_id_to_headline[s] for s in trans_seq]
        muliple_step_headlines.append(headlines)
    return muliple_step_headlines

# Define the start node
def get_start_end_node_of_task_on_global(task_name):
    """Return the graph nodes of the first and the last step of a WikiHow task.

    Relies on notebook-level state: ``wikihow_taskname_to_article_id``,
    ``wikihow``, ``article_po_to_step_id`` (from ``get_taskid2stepid``) and
    ``step2node``.

    Args:
        task_name: WikiHow task title, e.g. 'How to Plant Parsnips'.

    Returns:
        Tuple ``(start_node_id, end_node_id)``.

    Raises:
        KeyError: if the task title, or one of its steps, is unknown.
        ValueError: if the article has no steps.
    """
    # This is for global graph
    a_id = wikihow_taskname_to_article_id[task_name]
    num_steps = len(wikihow[a_id])
    if num_steps == 0:
        raise ValueError(f'Article {a_id} ({task_name!r}) has no steps')
    # Step ids are looked up by (article id, step position); a separate
    # article -> steps table does not exist at notebook level.
    start_step_id = article_po_to_step_id[(a_id, 0)]
    end_step_id = article_po_to_step_id[(a_id, num_steps - 1)]
    start_node_id = step2node[start_step_id]
    end_node_id = step2node[end_step_id]
    return start_node_id, end_node_id

## Helper Functions For Local

In [4]:
def get_step_id_to_step_headlines_on_local(wikihow_step, taskids):
    """Number the step headlines of the given tasks consecutively from 0.

    Used for the local graph, where steps are not clustered.

    Args:
        wikihow_step: collection indexable by task id, yielding that task's
            step headlines in order.
        taskids: task ids whose steps are numbered, in this order.

    Returns:
        dict mapping the local step id to its headline; the steps of the first
        task come first, followed by those of the second task, and so on.
    """
    ordered_headlines = [
        headline
        for taskid in taskids
        for headline in wikihow_step[taskid]
    ]
    return dict(enumerate(ordered_headlines))

def get_start_end_step_of_task_on_local(wikihow_step, taskids):
    """Compute the first and last local step id of every task.

    Local step ids are assigned consecutively from 0 in the order of
    ``taskids``. Used for the local graph, where steps are not clustered.

    Args:
        wikihow_step: collection indexable by task id, yielding that task's steps.
        taskids: task ids in the order in which their steps are numbered.

    Returns:
        dict mapping each task id to ``(first_step_id, last_step_id)``, both
        inclusive. A task without steps gets a last id one below its first id.
    """
    spans = {}
    next_free_id = 0
    for taskid in taskids:
        num_steps = len(wikihow_step[taskid])
        spans[taskid] = (next_free_id, next_free_id + num_steps - 1)
        next_free_id += num_steps
    return spans

## Tweaking Hyperparameters
- [ ] graph_find_matched_steps_criteria: topK

- [ ] graph_find_matched_steps_for_segments_thresh: 10

- [ ] graph_find_matched_steps_for_segments_topK: 1

- [ ] edge_min_aggconf: 10000

In [8]:
# Inspect the range of the products of similarity-score extremes between
# consecutive segments, to judge the edge-confidence hyperparameters.
import glob
vids = ['Luv3PTWdGyA', 'a9i7z2SHMR4', '87OGxcyQz6c', 'l_CzB-WJ0iQ', 'eAtmjjf8JsE']
max_transition_values = []
min_transition_values = []

# The score directory is the same for every video, so build it once.
sim_score_path = os.path.join(cae_dir, 'subtitles/sim_scores/local/')

for vid in vids:
    # NOTE(review): sorted() orders file names lexicographically; if segment
    # indices are not zero-padded, 'segment_10' sorts before 'segment_2' and
    # the pairs below are not truly consecutive. Confirm the naming scheme.
    sim_score_paths_of_segments_this_video = sorted(glob.glob(os.path.join(sim_score_path, vid, 'segment_*.npy')))

    # Load each segment once and keep only its largest and smallest score.
    segment_extremes = []
    for segment_path in sim_score_paths_of_segments_this_video:
        segment_sim_scores = np.load(segment_path)
        segment_extremes.append((segment_sim_scores.max(), segment_sim_scores.min()))

    # Pair every segment with its successor.
    for (max_pre, min_pre), (max_suc, min_suc) in zip(segment_extremes, segment_extremes[1:]):
        max_transition_values.append(max_pre * max_suc)
        min_transition_values.append(min_pre * min_suc)

In [13]:
# Smallest product of the maximum scores of two consecutive segments over the inspected videos.
min(max_transition_values)

7668452.177084541

# Local Graph

## CAE_subtitles vs. HowTo100M_subtitles

### Decode Node Transitions

In [5]:
# Topic id -> members of that topic; each member is unpacked as (source, task_id) further below.
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open(os.path.join(cae_dir, f'task_titles/topic2task.pickle'), 'rb') as f:
    topic2task = pickle.load(f)

In [33]:
# Issues: step_lookup_table 
# Example local graph
# example topics = [11, 101, 133, 147, 258]
topic = 101
topk = 3
agg = 1000
# Load the local graph of this topic for the chosen topK / aggregation setting.
cae_local_pkg = np.load(os.path.join(cae_dir, f'subtitles/graph_output/local/topic_{topic}/PKG-criteria_topK-threshold_10-topK_{topk}-agg_{agg}.npy'))
# print(cae_local_pkg.shape)
print('Cluster:', topic)
# NOTE(review): .get returns None for an unknown topic, which makes the comprehension below fail.
clustered_taskids = topic2task.get(topic)
# Only the WikiHow members of the topic are decoded.
wikihow_taskids = [task_id for (source, task_id) in clustered_taskids if source == 'wikihow']
tasknames = [article_id_to_wikihow_taskname.get(taskid) for taskid in wikihow_taskids]
print(tasknames)


# get a dictionary of local step id (starting from 0) to step headlines
# rewrite below function to adapt to topic2step.get(topic)
# topic0:{step_lookup_table.append((task_id, c_idx))}
local_step_id_2_step_headlines = get_step_id_to_step_headlines_on_local(wikihow_steps, wikihow_taskids)
# print('Oracle step transition:', local_step_id_2_step_headlines)
local_taskids2start_step_end_step = get_start_end_step_of_task_on_local(wikihow_steps, wikihow_taskids)


for taskid, taskname in zip(wikihow_taskids, tasknames):
    print('Task title:', taskname)
    
    # Local step ids are used directly as node indices of the local graph.
    # NOTE(review): this presumes the graph rows follow the same step order as
    # wikihow_taskids (see the "Issues: step_lookup_table" remark above) — confirm.
    start_node, end_node = local_taskids2start_step_end_step.get(taskid)
    print('start node:', start_node)
    print('end node:', end_node)
    # The oracle transition is the article's own step order.
    oracle_node_trans = list(range(start_node, end_node+1))
    print(f'Oracle transition:', oracle_node_trans)
    oracle_step_trans = [local_step_id_2_step_headlines.get(step_id) for step_id in oracle_node_trans]
    print(oracle_step_trans)

    # TODO: threshold -->
    cae_node_trans = find_node_transitions_with_threshold(cae_local_pkg, start_node, end_node, threshold=0.9)
    print('Total node transitions found:', len(cae_node_trans))
    
    # Print every decoded transition together with its step headlines.
    for idx, cae_node_tran in enumerate(cae_node_trans):
        cae_step_trans = [local_step_id_2_step_headlines.get(step_id) for step_id in cae_node_tran]
        print(f'Found step transition {idx}:', cae_node_tran)
        print(cae_step_trans)


Cluster: 11
['How to Make a Blood Sugar Stabilising Breakfast']
Task title: How to Make a Blood Sugar Stabilising Breakfast
start node: 0
end node: 3
Oracle transition: [0, 1, 2, 3]
['Soak all the ingredients together (except the milk) in a bowl of water.', 'Add the ingredients to the cup of milk on a stove.', 'Cook until the oats are mushy and the nuts are soft.', 'Pour into a bowl.']
Total node transitions found: 5
Found step transition 0: [0, 1, 2, 3]
['Soak all the ingredients together (except the milk) in a bowl of water.', 'Add the ingredients to the cup of milk on a stove.', 'Cook until the oats are mushy and the nuts are soft.', 'Pour into a bowl.']
Found step transition 1: [0, 1, 3]
['Soak all the ingredients together (except the milk) in a bowl of water.', 'Add the ingredients to the cup of milk on a stove.', 'Pour into a bowl.']
Found step transition 2: [0, 2, 1, 3]
['Soak all the ingredients together (except the milk) in a bowl of water.', 'Cook until the oats are mushy and

In [34]:
# Show the raw edge scores of the loaded local graph.
print(cae_local_pkg)

[[0.         0.99785013 0.99709511 0.925869  ]
 [0.99808304 0.         0.99920809 0.92926998]
 [0.99702714 1.         0.         0.91296639]
 [0.92377298 0.9249839  0.9103892  0.        ]]


## (No need) Define task similarities
- [x] Sanity check on whether the exact task title of HowTo100M is clustered into the same topic with WikiHow

In [None]:
# Check on the topics that have more overlap between wikihow and howto100m
# NOTE(review): get_task_titles_feats is defined in a later cell, so this cell raises
# NameError on a fresh top-to-bottom run; its return value is also discarded here.
get_task_titles_feats(howto_dir)

### Helper Function:

In [13]:
from sklearn.cluster import AgglomerativeClustering
import time


def get_task_titles_feats(input_dir):
    """Load the pickled task-title embeddings stored below ``input_dir``.

    Args:
        input_dir: dataset directory containing
            'task_titles/s3d_text_feats/task_title_embeddings.pickle'.

    Returns:
        Whatever object the pickle file contains.

    Note:
        pickle.load can execute arbitrary code; only use it on trusted files.
    """
    feats_path = os.path.join(
        input_dir, 'task_titles/s3d_text_feats/task_title_embeddings.pickle')
    with open(feats_path, 'rb') as f:
        return pickle.load(f)


def merge_task_title_feats(wikihow_tasktitle_feat_dict, howto100m_tasktitle_feat_dict):
    """Stack the title features of both sources and remember where each row came from.

    WikiHow entries come first, followed by the HowTo100M entries, each in the
    iteration order of its dict.

    Args:
        wikihow_tasktitle_feat_dict: dict mapping a WikiHow article id to its feature array.
        howto100m_tasktitle_feat_dict: dict mapping a HowTo100M task id to its feature array.

    Returns:
        Tuple ``(merged_task_id_2_original_task_id, merged_tasktitle_feats)``:
        the first maps the merged index to ``(source, original id)`` with
        source 'wikihow' or 'howto100m'; the second is the concatenation of
        all feature arrays along axis 0.
    """
    merged_task_id_2_original_task_id = defaultdict()
    merged_feats = []

    sources = (
        ('wikihow', wikihow_tasktitle_feat_dict),
        ('howto100m', howto100m_tasktitle_feat_dict),
    )
    for source_name, feat_dict in sources:
        for original_id, tasktitle_feat in feat_dict.items():
            # The next merged index equals the number of features collected so far.
            merged_task_id_2_original_task_id[len(merged_feats)] = (source_name, original_id)
            merged_feats.append(tasktitle_feat)

    merged_tasktitle_feats = np.concatenate(merged_feats, axis=0)

    return merged_task_id_2_original_task_id, merged_tasktitle_feats


def get_topics_by_clustering_tasktitles(wikihow_tasktitle_feats:dict, 
                                        howto100m_tasktitle_feats:dict, 
                                        n_clusters,
                                        task_clustering_linkage,
                                        task_clustering_distance_thresh,
                                        task_clustering_affinity
                                       ):
    """Cluster WikiHow and HowTo100M task titles jointly into topics.

    Args:
        wikihow_tasktitle_feats: dict mapping WikiHow article id to its title feature array.
        howto100m_tasktitle_feats: dict mapping HowTo100M task id to its title feature array.
        n_clusters: number of clusters, or None when a distance threshold is used.
        task_clustering_linkage: linkage criterion passed to AgglomerativeClustering.
        task_clustering_distance_thresh: linkage distance at or above which
            clusters are not merged.
        task_clustering_affinity: distance metric name (e.g. 'cosine').

    Returns:
        Tuple ``(topic2task, wikihow_task2topic, howto100m_task2topic)``:
        topic id -> list of ``(source, original id)`` members, and for each
        source the mapping original id -> topic id.
    """
    start_time = time.time()
    assert wikihow_tasktitle_feats is not None
    assert howto100m_tasktitle_feats is not None

    merged_task_id_2_original_task_id, merged_task_feats = \
        merge_task_title_feats(wikihow_tasktitle_feats, howto100m_tasktitle_feats)

    clustering_kwargs = dict(n_clusters=n_clusters,
                             linkage=task_clustering_linkage,
                             distance_threshold=task_clustering_distance_thresh)
    try:
        # Current scikit-learn names the distance argument `metric`.
        clustering = AgglomerativeClustering(metric=task_clustering_affinity, **clustering_kwargs)
    except TypeError:
        # Older scikit-learn releases only know the former name `affinity`.
        clustering = AgglomerativeClustering(affinity=task_clustering_affinity, **clustering_kwargs)
    clustering = clustering.fit(merged_task_feats)
    
    # distance_threshold:
    #   The linkage distance threshold above which, clusters will not be merged.
    num_nodes = clustering.n_clusters_

    topic2task, wikihow_task2topic, howto100m_task2topic = defaultdict(), defaultdict(), defaultdict()

    for cluster_id in range(num_nodes):
        cluster_members = np.where(clustering.labels_ == cluster_id)[0]
        topic2task[cluster_id] = [merged_task_id_2_original_task_id[task_id] for task_id in cluster_members]
        for task_id in cluster_members:
            original_tasktitle_source,  original_task_id = merged_task_id_2_original_task_id[task_id]
            if original_tasktitle_source == 'wikihow':
                wikihow_task2topic[original_task_id] = cluster_id
            else:
                howto100m_task2topic[original_task_id] = cluster_id
                
    print(('finding task similarity between {} wikihow tasks and {} howto100m tasks took {} seconds'.
           format(len(wikihow_tasktitle_feats), len(howto100m_tasktitle_feats), time.time() - start_time)))
    
    return topic2task, wikihow_task2topic, howto100m_task2topic

In [24]:
# Title features of the partial WikiHow subset and of all HowTo100M tasks.
wikihow_part_task_feats = get_task_titles_feats(wikihow_part_dir)
howto_task_feats = get_task_titles_feats(howto_dir)
# Keep only the HowTo100M tasks whose title exactly matches a partial-WikiHow title.
# NOTE(review): .get yields None for an id without stored features — confirm none
# are missing before the features are concatenated for clustering.
selected_howto_task_feats = {task_id: howto_task_feats.get(task_id) for task_id in howto_joint_partial_taskid}

print('exact title match', len(selected_howto_task_feats))

713


In [34]:
# Cluster the partial-WikiHow titles together with the exactly matching HowTo100M titles.
# With n_clusters=None the number of clusters follows from the distance threshold.
topic2task, wikihow_task2topic, howto_task2topic = get_topics_by_clustering_tasktitles(
                                                   wikihow_part_task_feats,
                                                   selected_howto_task_feats,
                                                   n_clusters=None,
                                                   task_clustering_linkage='average',
                                                   task_clustering_distance_thresh=0.5,
                                                   task_clustering_affinity='cosine')



finding task similarity between 1053 wikihow tasks and 713 howto100m tasks took 0.431593656539917 seconds


In [51]:
# Spot check: title of a single HowTo100M task id.
howto_taskid2title[29176]

'Change a Sway Bar Link'

In [39]:
print('WikiHow Tasks: (Partial)', len(wikihow_task2topic))
print('HowTo Tasks: (Partial)', len(howto_task2topic))
print('Num of topics:', len(topic2task))
print('\n')

# Count the topics in which at least one task name occurs more than once, i.e.
# a HowTo100M title landed in the same topic as the identical WikiHow title.
exact_map = 0
for key, tasks in topic2task.items():
    pure_tasknames = []
    for source, original_id in tasks:
        if source == 'wikihow':
            # wikihow partial
            pure_tasknames.append(article_id_to_wikihow_taskname.get(original_id))
        elif source == 'howto100m':
            # howto100m titles are stored without the 'How to ' prefix
            pure_tasknames.append('How to ' + howto_taskid2title.get(original_id))

    if len(set(pure_tasknames)) < len(pure_tasknames):
        exact_map += 1

# Show the result; without this line the computed count is never displayed.
print(exact_map)


WikiHow Tasks: (Partial) 1053
HowTo Tasks: (Partial) 713
Num of topics: 589


443


### For Quick Test: (1) wikihow_part- howto100m/cae sampled tasktitles  

In [15]:
# get the sampled tasktitles embeddings for Howto100M & CAE
howto_task_feats = get_task_titles_feats(howto_dir)
selected_howto_task_feats = {task_id: howto_task_feats.get(task_id) for task_id in sampled_taskids}
# Validate before opening the file: mode 'wb' truncates an existing pickle immediately.
# NOTE(review): this only checks the number of keys; .get stores None for a task id
# without features, which this assert does not catch.
assert len(selected_howto_task_feats) == 65

saved_path = os.path.join(howto_dir, 'task_titles/s3d_text_feats/sampled_task_title_embeddings.pickle')
with open(saved_path, 'wb') as f:
    pickle.dump(selected_howto_task_feats, f)


cae_task_feats = get_task_titles_feats(cae_dir)
selected_cae_task_feats = {task_id: cae_task_feats.get(task_id) for task_id in sampled_taskids}
assert len(selected_cae_task_feats) == 65

# Save the CAE features under cae_dir. Writing them to the HowTo100M path would
# overwrite the HowTo100M file saved just above.
saved_path = os.path.join(cae_dir, 'task_titles/s3d_text_feats/sampled_task_title_embeddings.pickle')
with open(saved_path, 'wb') as f:
    pickle.dump(selected_cae_task_feats, f)

# Global Graph

## CAE_subtitles vs. HowTo100M_subtitles

In [64]:
# pkg (based on node id)
# Both graphs are loaded with the same topK/aggregation setting so they can be compared.
howto_pkg = np.load(os.path.join(howto_dir, 'subtitles/graph_output/PKG-criteria_topK-threshold_10-topK_3-agg_1000.npy'))
cae_pkg = np.load(os.path.join(cae_dir, 'subtitles/graph_output/PKG-criteria_topK-threshold_10-topK_3-agg_1000.npy'))

# np.matrix views of the arrays; the path-search helpers are called with the plain arrays.
cae_pkg_matrix = np.asmatrix(cae_pkg)
howto_pkg_matrix = np.asmatrix(howto_pkg)

In [32]:
# Both graphs must have the same number of rows.
assert len(cae_pkg_matrix)==len(howto_pkg_matrix)

In [46]:
# Display the CAE graph matrix (NumPy abbreviates large arrays).
cae_pkg_matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
# Number of entries with a positive weight in the CAE graph.
len(np.argwhere(cae_pkg_matrix>0))

9537

In [48]:
# Number of entries with a positive weight in the HowTo100M graph.
len(np.argwhere(howto_pkg_matrix>0))

536807

### Decode Node Transitions from PKG 

In [92]:
import sys

# Raise Python's recursion ceiling to 10000.
# NOTE(review): presumably required by the transition search used below - confirm.
sys.setrecursionlimit(10000)

In [90]:
# For every task title in `in_wikihow`, decode the node transitions found in the
# CAE PKG from the task's start node and print a short preview of them.
# The original loop printed at most three transitions per task; that cap is kept.
MAX_TRANSITIONS_SHOWN = 3

for t in in_wikihow:
    print('Task title:', t)
    print('Domain:', tasktitles2domain.get(t))

    # end_node is returned by the helper but is not needed for this preview.
    start_node, end_node = get_start_end_node_of_task(t)
    cae_node_trans = find_node_transitions(cae_pkg, start_node=start_node)
    print('Total node transistions found', len(cae_node_trans))
    cae_step_trans = get_step_sequence_by_node_sequence(cae_node_trans, node2step, step_id_to_headline)

    # One loop covers both the "few" and the "many" case: zip stops at the shorter
    # sequence and the explicit cap stops after MAX_TRANSITIONS_SHOWN entries.
    for shown, (node_tran, step_tran) in enumerate(zip(cae_node_trans, cae_step_trans)):
        if shown >= MAX_TRANSITIONS_SHOWN:
            break
        # Map each node id of the transition to the step headlines grouped under it.
        output = dict(zip(node_tran, step_tran))
        print(output)

    # TODO: decode the HowTo100M transitions as well, e.g.
    #     howto100m_trans = find_topk_node_transitions(howto_pkg, start_node=start_node, depth_count=10)
    #     print(len(howto100m_trans))


Task title: How to Install a Speedometer
Domain: Cars & Other Vehicles
Total node transistions found 285
{4684: ["Read your vehicle's owner's manual."], 9801: ['Find an appropriate speedometer.'], 154: ['Gather the appropriate tools and materials.', 'Gather the necessary tools.', 'Gather the necessary tools.'], 10: ['Disconnect the battery.', 'Disconnect the battery cables.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the positive battery cable.', 'Disconnect your vehicle’s battery.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the battery.', 'Disconnect the battery leads.', "Disconnect the negative power cable from the car's battery.", 'Disconnect the battery ground terminal.', 'Disconnect the battery.', 'Disconnect the negative battery cable.', 'Disconnect the negative terminal of your battery.', 'Disconnect the negative battery cable;', 'Disconnect the batter

In [89]:
# Inspect the first element stored for node 10 in node2step.
# NOTE(review): presumably the step ids grouped under that node - confirm the layout of node2step.
node2step[10][0]

array([  530,   906,  1003,  1148,  1788,  1837,  2469,  4438,  5060,
        5529,  6703,  7281,  7870,  8065,  8136,  8226,  8428,  9047,
        9698, 10101, 10126, 10209])

# (Outdated) Analysis of the PKG constructed based on cross-modal alignment (6 videos)

### Case A: irrelevant task id 

In [151]:
# Get action-step sequences based on task id.
# Case A: non-relevant task id
example_task_name = 'How to Plant Parsnips'
example_task_id = wikihow_task2taskid[example_task_name]
# Use the `wikihow_` spelling of the lookup table, consistent with the Case B cell;
# the previous `wikhow_...` spelling refers to a differently named variable.
example_a_id = wikihow_taskname_to_article_id[example_task_name]

# Step ids of the article (step id transitions); the first one selects the start node.
step_seq = article_to_step_id[example_a_id]
start_step_id = step_seq[0]
start_node_id = step2node[start_step_id]

node_trans = find_node_transitions(pkg, start_node=start_node_id)
print(node_trans)

# Display the step headlines for every decoded node transition.
get_step_sequence_by_node_sequence(node_trans, node2step, step_id_to_headline)

[[3059, 7793, 7260, 1631, 7054, 3334]]


[[['Prepare the site.'],
  ['Lay a piece of string on the soil as a guide to make sure you dig the row in a straight line.'],
  ['Use a cane, or other tool, to dig a narrow hole that you will place the seeds in.'],
  ['Using a hoe, gently cover the seeds with soil.'],
  ['If the soil is dry, water the seeds in.'],
  ['To protect your plants from late frosts in Spring, build a frame to cover the drills.']]]

### Case B: relevant task id 

In [20]:
# videoid2taskid: raw HowTo100M metadata tables used to map video ids to task titles.
howto_meta_dir = '/mount/arbeitsdaten/jp-silberer/ernie_vil/data/HowTo100M'
video_meta_csv = pd.read_csv(
    os.path.join(howto_meta_dir, 'HowTo100M_v1.csv'),
)
# task_ids.csv is tab-separated and read without a header row, so its columns are labelled 0 and 1.
task_ids_csv = pd.read_csv(
    os.path.join(howto_meta_dir, 'task_ids.csv'),
    sep='\t',
    header=None,
)
    
def get_videoid2taskname(video_meta_csv, task_ids_csv):
    """Map every video id to the title of the task it belongs to.

    Args:
        video_meta_csv: DataFrame with 'video_id' and 'task_id' columns.
        task_ids_csv: DataFrame whose first column holds task ids and whose
            second column holds task titles (e.g. read with ``header=None``).

    Returns:
        dict mapping video id -> task title. If a video id occurs in several
        rows, the last row wins.

    Raises:
        KeyError: if a row of ``video_meta_csv`` refers to a task id that is
            not listed in ``task_ids_csv``.
    """
    # Column-wise iteration instead of DataFrame.iterrows(): iterrows builds a
    # Series per row, which is very slow on a table with many rows.
    # Positional access (iloc) works for header-less and for named columns alike.
    task_id_to_task_name_original_map = dict(
        zip(task_ids_csv.iloc[:, 0], task_ids_csv.iloc[:, 1])
    )

    # Plain dict indexing keeps the KeyError for unknown task ids.
    return {
        video_id: task_id_to_task_name_original_map[task_id]
        for video_id, task_id in zip(video_meta_csv['video_id'], video_meta_csv['task_id'])
    }

# Build the video id -> task title lookup from the two metadata tables loaded above.
video_id_to_task_name = get_videoid2taskname(video_meta_csv, task_ids_csv)

In [19]:
# The sampled videos and the task title each one maps to
# (None for a video id that is missing from the lookup).
sample_videos = [
    '8JevEqO_iS0',
    'L8kRaQ-IwpM',
    'O_sC9gyEER0',
    'c0VlxPBSMDs',
    'jRkJTww8HR0',
    'kHb7POoH_gc',
]
sample_task_names = list(map(video_id_to_task_name.get, sample_videos))
sample_task_names

['Grill Tri Tip',
 'Grill Tri Tip',
 'Do a Hair Mask for Frizzy Hair',
 'Make an Envelope Advent Calendar',
 'Make Pine Needle Tea',
 'Grill Tri Tip']

In [88]:
# Task names that contain the word 'Grill'.
similar_tasks = [task_name for task_name in tasknames if 'Grill' in task_name]
similar_tasks

['How to Grill Tri Tip']

In [26]:
# Case B: 'Grill Tri Tip' is the task title of several of the sampled videos.
example_task_name = 'How to Grill Tri Tip'
example_task_id = wikihow_task2taskid[example_task_name]
example_a_id = wikihow_taskname_to_article_id[example_task_name]

# Step ids of the article (step id transitions); the first one selects the start node.
step_seq = article_to_step_id[example_a_id]
start_step_id = step_seq[0]
# Alternative start step tried during debugging:
# start_step_id = 4939
start_node_id = step2node[start_step_id]
# print(start_node_id)
node_trans = find_node_transitions(pkg, start_node=start_node_id)
print('node transition', node_trans)

# Print the node2step entry of every node in the decoded transitions.
print('step transition', [node2step[n] for n_seq in node_trans for n in n_seq])


get_step_sequence_by_node_sequence(node_trans, node2step, step_id_to_headline)


node transition [[1956, 1190, 1451, 619, 937, 1896]]
step transition [array([9463]), array([9464]), array([9465]), array([9466, 9473]), array([9474]), array([9475])]


[[['Head to your grocery store.'],
  ['Prep the roast.'],
  ['Use a dry rub.'],
  ['Give it a rest.', 'Give it another rest.'],
  ['Slice against the grain.'],
  ['Serve with your favorite sides.']]]

In [108]:
# Indices of the non-zero entries of wikihow_step2task[4939]
# (4939 is the alternative start step id noted in the Case B cell).
idx = np.nonzero(wikihow_step2task[4939]!=0)[0]
print(idx)
# 556 is the index printed above, hard-coded here.
# NOTE(review): consider indexing with idx[0] so the lookup follows the computed value.
wikihow_taskid2task[556]

[556]


'How to Smoke Beef Ribs1'

### Q. Why are there no other sequence transitions?

In [None]:
# Check whether node_trans is in G_Howto100M; if yes, the order aligns with wikiHow's step headlines.
# If not, the alignment between the video segments and the step headlines is poor (check how S3D is trained).
# Look at the similarity scores of 8JevEqO_iS0, L8kRaQ-IwpM, kHb7POoH_gc to get some insights.


In [8]:
# Step-level transition graphs; both files are read from the CAE graph_output folder.
G_howto100m = np.load(
    os.path.join(cae_dir, 'subtitles/graph_output/G_howto100m-criteria_topK-threshold_10-topK_3-agg_1000.npy')
)
G_wikihow = np.load(
    os.path.join(cae_dir, 'subtitles/graph_output/G_wikihow-criteria_topK-threshold_10-topK_3.npy')
)

In [65]:
from scipy.sparse import csr_matrix

# Sparse (CSR) copies of the two step-level graphs.
G_wikihow_csr = csr_matrix(G_wikihow)
G_howto100m_csr = csr_matrix(G_howto100m)

In [75]:
# Step-level edges of G_wikihow with a positive score, as (source step, target step) rows.
G_wikihow_step_trans = np.argwhere(G_wikihow > 0)
# Select the scores with the same `> 0` test as the edges above. np.nonzero(G_wikihow)
# would also pick up negative or NaN entries, so the scores would no longer line up
# one-to-one with G_wikihow_step_trans.
index = np.nonzero(G_wikihow > 0)
G_wikihow_conf = G_wikihow[index]

# Lift every step-level edge to a node-level edge via step2node.
formed_edges_in_nodes = [
    (step2node[src_step], step2node[dst_step])
    for src_step, dst_step in G_wikihow_step_trans
]

True


In [78]:
# Is the node-level edge 937 -> 1896 (the last hop of the 'Grill Tri Tip' node transition)
# among the edges formed from G_wikihow?
print((937, 1896) in formed_edges_in_nodes)

True


In [18]:
# Step-level edges of G_howto100m with a positive score, as (source step, target step) rows.
G_howto100m_step_trans = np.argwhere(G_howto100m > 0)
# Use the same `> 0` test for the scores: they are zipped with the edges below, and
# np.nonzero(G_howto100m) would include negative or NaN entries and pair scores with
# the wrong edges.
index = np.nonzero(G_howto100m > 0)
# The misspelled variable name is kept because other cells may refer to it.
G_howoto100m_conf = G_howto100m[index]

# Print every edge with the headlines of its two steps and its confidence score.
for conf, formed_edge_in_steps in zip(G_howoto100m_conf, G_howto100m_step_trans):
    src_step, dst_step = formed_edge_in_steps
    print('step transition:', formed_edge_in_steps)
    print('step transition in headlines:', step_id_to_headline[src_step], step_id_to_headline[dst_step])
    print('confidence score:', conf)

step transition: [ 759 7090]
step transition in headlines: Meanwhile, get the fermentors. Inspect the clean roof for cracks along the edges of the EPDM barrier where the self leveling sealant is starting to lift up or become degraded in any way.
confidence score: 0.8663709492554348
step transition: [ 759 7093]
step transition in headlines: Meanwhile, get the fermentors. Apply this sealant along the cracked or potential pin holed area that you have just cleaned from any dirt and grime.
confidence score: 0.8671828954962901
step transition: [ 759 9266]
step transition in headlines: Meanwhile, get the fermentors. Crack all the grains (Meaning should lightly grind up in coffee grinder, or roll over with a heavy rolling pin while in sleeve).
confidence score: 0.8734952834486645
step transition: [1498 3599]
step transition in headlines: Clean the hub, outer bearing and inner bearing with parts cleaner or a brush and pan of kerosene or gasoline. Make a nesting box out of a kitchen cabinet.
con

In [None]:
# TODO: find a way to build videoid2stepid; having it would help to find a step sequence starting from a given video id.
# Maybe do not set a threshold on the PKG edge transitions.