!pip install zss==1.2.0

In [1]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2


# if cannot import the modules, add the parent directory to system path might help

import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/'
sys.path.append(parent_dir)

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, save

import pandas as pd
import numpy as np
import spacy
import copy
import re
import tqdm
import zss

'''useful for displaying dictionary
import pprint
pprint.pprint(recipe_inst)
'''
from gensim.models import KeyedVectors
# Load word vectors stored in binary word2vec format.
gensim_model = KeyedVectors.load_word2vec_format('../data/vocab.bin', binary = True)
# Keys view of the model vocabulary; used for membership checks by the cells below.
# NOTE(review): `KeyedVectors.vocab` appears to have been removed in gensim 4
# (replaced by `key_to_index`) -- confirm the gensim version this notebook targets.
vocabulary = gensim_model.vocab.keys()
# gensim_model.get_vector('cook')
from ete3 import Tree

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


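Tree edit distance is computed with the `zss` package, an implementation of the Zhang-Shasha algorithm: https://github.com/timtadh/zhang-shasha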

### Build a basic tree and display the similarity

In [2]:
def scoring_function(a, b):
    """Unit label cost: 0 when the two labels compare equal, 1 otherwise."""
    return 0 if a == b else 1
    
class Node(object):
    """Labelled tree node carrying a node type and an ordered list of children.

    The static accessors take the node as an explicit argument so they can be
    passed around as plain callables (e.g. as the child/label getters of a
    tree-distance routine).
    """

    def __init__(self, label, nodetype):
        # The label is stored as given; no type check is enforced.
        self.label = label
        self.nodetype = nodetype
        self.children = []

    @staticmethod
    def get_children(node):
        """Return the list of children of ``node``."""
        return node.children

    @staticmethod
    def get_label(node):
        """Return the label of ``node``."""
        return node.label

    @staticmethod
    def get_nodetype(node):
        """Return the node type of ``node``."""
        return node.nodetype

    def addkid(self, node, before=False):
        """Attach ``node`` as a child and return ``self`` so calls can be chained.

        With ``before=True`` the child becomes the first child; otherwise it is
        placed after the existing children.
        """
        position = 0 if before else len(self.children)
        self.children.insert(position, node)
        return self
    
# Two one-level trees that differ only in the label of their root.
A = Node("A1", "action")
A.addkid(Node("I", "ingredient"))
B = Node("A2", "action")
B.addkid(Node("I", "ingredient"))

# Edit distance with a 0/1 label cost supplied by scoring_function.
dist = zss.simple_distance(
    A, B, Node.get_children, Node.get_label, scoring_function
)
print('---tree edit distance:---')
print(dist)
print()

# Nodes are read through the static accessors on Node.
print('---examples of accessing the nodes---')
print(Node.get_label(A))
first_children_of_A = Node.get_children(A)[0]
print(Node.get_label(first_children_of_A))

---tree edit distance:---
1.0

---examples of accessing the nodes---
A1
I
time: 26 ms


### Build recipe tree and display the similarity

In [51]:
def make_nodes(sentence):
    """Build one action node with its ingredient children.

    ``sentence`` is a mapping with a ``'word'`` entry (the action label) and an
    ``'ingredient'`` entry (an iterable of ingredient labels). Ingredients are
    attached in the order given.
    """
    action_node = Node(label=sentence['word'], nodetype='action')
    ingredient_nodes = (
        Node(label=name, nodetype='ingredient') for name in sentence['ingredient']
    )
    for ingredient_node in ingredient_nodes:
        action_node.addkid(ingredient_node)
    return action_node

def build_tree(recipe_inst):
    """Chain the steps of a recipe into a single tree and return its root.

    ``recipe_inst`` is a non-empty list of step mappings, e.g.::

        [{'word': 'verb1', 'ingredient': ['A1', 'B1']},
         {'word': 'verb2', 'ingredient': ['A2', 'B2']}]

    The first step becomes the root. Every following step is inserted as the
    FIRST child of the step before it, ahead of that step's ingredients.
    """
    root = make_nodes(recipe_inst[0])
    cursor = root
    for sentence in recipe_inst[1:]:
        step_node = make_nodes(sentence)
        cursor.addkid(step_node, before=True)
        # Descend so the next step hangs under the one just inserted.
        cursor = step_node
    return root

def strdist(a, b):
    """Unit edit cost between two labels: 0 if they are equal, else 1."""
    if a == b:
        return 0
    return 1

def cosine_distance(vector_a, vector_b):
    """Return ``1 - cos(vector_a, vector_b)``.

    The cosine is the dot product divided by the product of the two Euclidean
    norms; no special handling is applied when a norm is zero.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return 1 - np.dot(vector_a, vector_b) / norm_product
    
def wordvec_dist(a, b):
    """Cosine distance between the word vectors of labels ``a`` and ``b``.

    Both labels must be present in ``vocabulary`` (checked with ``assert``).
    Identical labels short-circuit to 0 without looking up any vector.
    """
    assert a in vocabulary
    assert b in vocabulary
    if a == b:
        return 0
    return cosine_distance(gensim_model.get_vector(a), gensim_model.get_vector(b))
    
def tree_distance(tree1, tree2):
    """Tree edit distance between two ``Node`` trees, computed by ``zss.distance``.

    Inserting or removing a node is priced by ``strdist`` against the empty
    string; relabelling a node is priced by ``wordvec_dist`` of the two labels.
    """
    def insert_cost(node):
        return strdist('', Node.get_label(node))

    def remove_cost(node):
        return strdist(Node.get_label(node), '')

    def update_cost(a, b):
        return wordvec_dist(Node.get_label(a), Node.get_label(b))

    return zss.distance(
        tree1,
        tree2,
        Node.get_children,
        insert_cost=insert_cost,
        remove_cost=remove_cost,
        update_cost=update_cost,
    )

time: 59.1 ms


### Draw the tree

In [8]:
def draw_tree(recipe_inst):
    """Print an ASCII rendering of a recipe as an ete3 tree and return the tree.

    ``recipe_inst`` is a list of step mappings, e.g.::

        [{'word': 'heated', 'ingredient': ['rice', 'banana']},
         {'word': 'boil', 'ingredient': ['apple', 'dish']}]

    Each step becomes a node tagged ``type='action'`` whose ingredient children
    are tagged ``type='ingredient'``; each step is attached beneath the
    previously attached step, after that step's ingredients.

    Sorting the ingredients is intentionally not done here: the original author
    noted that it does not improve the tree edit distance.
    """
    output = Tree()
    attach_point = output
    for step in recipe_inst:
        action = Tree(name=step['word'])
        action.add_feature('type', 'action')
        if not step['ingredient']:
            # NOTE(review): a step without ingredients is never attached, so it
            # is missing from the drawing -- confirm this omission is intended.
            continue
        for ingredient in step['ingredient']:
            leaf_node = action.get_tree_root().add_child(name=ingredient)
            leaf_node.add_feature('type', 'ingredient')
        attach_point = attach_point.add_child(action)
    print(output.get_ascii(show_internal=True))
    return output
# Sample three-step recipe used to demonstrate draw_tree.
recipe_inst = [
    {'word': 'heated', 'ingredient': ['rice', 'banana', 'cookie', 'dishes']},
    {'word': 'boil', 'ingredient': ['apple', 'banana', 'cookie', 'dish']},
    {'word': 'rince', 'ingredient': ['apple', 'banana', 'cookie', 'dish']},
]
example_tree = draw_tree(recipe_inst)


         /-rice
        |
        |--banana
        |
        |--cookie
        |
-- /heated-dishes
        |
        |    /-apple
        |   |
        |   |--banana
        |   |
        |   |--cookie
         \boil
            |--dish
            |
            |     /-apple
            |    |
            |    |--banana
             \rince
                 |--cookie
                 |
                  \-dish
time: 25.1 ms


### Example

In [9]:
# Reference recipe: two steps with two ingredients each.
recipe_inst = [
    {'word': 'heated', 'ingredient': ['rice', 'banana']},
    {'word': 'boil', 'ingredient': ['cookie', 'dish']},
]
tree1 = build_tree(recipe_inst)
example_tree1 = draw_tree(recipe_inst)

# Variant: ingredients of step 1 swapped, step 2 relabelled and one ingredient short.
recipe_inst = [
    {'word': 'heated', 'ingredient': ['banana', 'rice']},
    {'word': 'boiled', 'ingredient': ['cookie']},
]
tree2 = build_tree(recipe_inst)
example_tree2 = draw_tree(recipe_inst)

tree_distance(tree1, tree2)


         /-rice
        |
-- /heated-banana
        |
        |    /-cookie
         \boil
             \-dish

         /-banana
        |
-- /heated-rice
        |
         \boiled-cookie


3.8127352595329285

time: 57.3 ms


In [27]:
# Reference recipe: two steps with two ingredients each.
recipe_inst = [
    {'word': 'heated', 'ingredient': ['rice', 'banana']},
    {'word': 'boil', 'ingredient': ['cookie', 'dish']},
]
tree1 = build_tree(recipe_inst)
example_tree1 = draw_tree(recipe_inst)

# Variant: same action labels, ingredients of step 1 swapped, step 2 one ingredient short.
recipe_inst = [
    {'word': 'heated', 'ingredient': ['banana', 'rice']},
    {'word': 'boil', 'ingredient': ['cookie']},
]
tree2 = build_tree(recipe_inst)
example_tree2 = draw_tree(recipe_inst)

tree_distance(tree1, tree2)


         /-rice
        |
-- /heated-banana
        |
        |    /-cookie
         \boil
             \-dish

         /-banana
        |
-- /heated-rice
        |
         \boil-cookie


3.0

time: 55.7 ms


In [30]:
# Cost charged when one of these labels is relabelled as the other in tree_distance.
wordvec_dist('rice', 'banana')

1.0272054951637983

time: 46.6 ms


### Conduct on real data

In [16]:
# Load the recipe dictionary; later cells index it by integer recipe id.
# NOTE(review): presumably unpickles the file -- only load pickles from a trusted source.
dic = load(dir_save = '../big_data/dic_20190927.pickle')

exist ../big_data/dic_20190927.pickle
time: 4.26 s


In [17]:
### STEP2 load and clean the generation

def reverse(text):
    """Clean a text fragment before it is handed to the parser.

    Applied in order:
      1. drop every parenthesised span, parentheses included;
      2. drop a whitespace character that sits directly before one of ``? . ! , "``
         when that mark is followed by whitespace or the end of the text;
      3. collapse runs of spaces into one space.
    The result is returned with leading and trailing whitespace stripped.
    """
    substitutions = (
        (r'\([^)]*\)', ''),
        (r'\s([?.!,"](?:\s|$))', r'\1'),
        (' +', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def reverse_list(listoftext):
    """Clean every text with ``reverse`` and keep only the non-empty results, in order."""
    cleaned = (reverse(text) for text in listoftext)
    return [text for text in cleaned if text]

def load_dir_data(filename):
    """Load generated recipe fields from a directory tree into the global ``dic``.

    Every file found under ``filename`` (walked recursively) is expected to be
    named ``<id><kind>`` followed by a four-character suffix, where ``<id>`` is
    an integer key of ``dic`` and ``<kind>`` selects the field to fill:

    * ``'d'`` -> ``dic[id]['generated_instr']``: text split on ``'.'`` and cleaned
      with ``reverse_list``;
    * ``'i'`` -> ``dic[id]['generated_ingred']``: text split on ``'$'`` and cleaned
      with ``reverse_list``;
    * ``'t'`` -> ``dic[id]['generated_name']``: raw text.

    Files whose names do not follow this pattern (unknown kind or non-integer
    id, e.g. editor or OS metadata files) are skipped instead of aborting the
    whole load with a ``ValueError``.

    Parameters
    ----------
    filename : str
        Path of the directory to scan.

    Returns
    -------
    list of int
        Sorted, de-duplicated ids for which at least one field was loaded;
        an empty list when ``filename`` is not a directory.

    Raises
    ------
    KeyError
        If a file refers to an id that is not present in ``dic``.
    """
    # kind character -> (target key in dic[id], parser applied to the raw text)
    handlers = {
        'd': ('generated_instr', lambda raw: reverse_list(raw.split('.'))),
        'i': ('generated_ingred', lambda raw: reverse_list(raw.split('$'))),
        't': ('generated_name', lambda raw: raw),
    }
    loaded_ids = set()
    if os.path.isdir(filename):
        print('load', filename)
        for (dirpath, _, fnames) in os.walk(filename):
            for fname in fnames:
                # Slicing (rather than indexing) yields '' for names that are too short.
                kind = fname[-5:-4]
                if kind not in handlers:
                    continue
                try:
                    recipe_id = int(fname[:-5])
                except ValueError:
                    continue

                path = os.path.join(dirpath, fname)
                with open(path, 'r') as fp:
                    raw_text = fp.read()

                field, parse = handlers[kind]
                dic[recipe_id][field] = parse(raw_text)
                loaded_ids.add(recipe_id)

    return sorted(loaded_ids)

# Load the generations stored under this directory into `dic`;
# `ls` receives the sorted recipe ids returned by load_dir_data.
filename = '../../to_gpt2/generation_28k_sorted/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_28k_sorted/
time: 226 ms


In [18]:
class instr2tree:
    """Turn instruction sentences into a flat list of action/ingredient records.

    Each record has the form ``{'word': <action lemma>, 'ingredient': [<noun lemmas>]}``.
    Only lemmas present in the word-vector vocabulary are kept.
    """

    def __init__(self):
        self.spacy = spacy.load('en_core_web_lg')
        # Stored as a set: `leaf` tests every verb and noun-chunk root for
        # membership, which would be a linear scan per lookup on a list.
        self.vocabulary = set(vocabulary)

    def sents2tree(self, sents):
        """Concatenate the records of all sentences into one list.

        A record with an empty ``'word'`` contributes its ingredients to the
        previously collected record instead of being added on its own.
        Sentences for which ``leaf`` yields nothing are ignored.
        """
        tree = []
        for sent in sents:
            for record in self.leaf(sent) or ():
                if record['word']:
                    tree.append(record)
                else:
                    # `leaf` is written to always fill in 'word', so this branch
                    # is not expected to run while `tree` is still empty.
                    tree[-1]['ingredient'] += record['ingredient']
        return tree

    def leaf(self, sent):
        """Parse one sentence into action/ingredient records.

        Verbs are tokens tagged ``VERB``; nouns are the roots of the noun
        chunks, excluding the ``-PRON-`` lemma. Both are restricted to lemmas
        found in ``self.vocabulary``.

        Returns
        -------
        list of dict or None
            ``None`` when the sentence has neither a usable verb nor a usable
            noun; otherwise one record per verb, or a single record headed by
            the first noun when there is no verb.
        """
        doc = self.spacy(sent)
        verbs = [(token.i, token.lemma_) for token in doc
                 if token.pos_ == 'VERB' and token.lemma_ in self.vocabulary]
        nouns = [(chunk.root.i, chunk.root.lemma_) for chunk in doc.noun_chunks
                 if chunk.root.lemma_ not in ['-PRON-'] and chunk.root.lemma_ in self.vocabulary]

        if not verbs and not nouns:
            return None

        # No noun: every verb becomes a record without ingredients.
        if not nouns:
            return [{'word': lemma, 'ingredient': []} for _, lemma in verbs]

        # No verb: the first noun stands in as the action word.
        if not verbs:
            return [{'word': nouns[0][1], 'ingredient': [lemma for _, lemma in nouns[1:]]}]

        # Merge the two position-ordered lists. Example:
        #   verbs = [(0, 'v1'), (1, 'v2'), (6, 'v3'), (8, 'v4'), (10, 'v5')]
        #   nouns = [(2, 'n1'), (3, 'n2'), (4, 'n3'), (7, 'n4'), (9, 'n5'), (12, 'n6')]
        # Nouns located before verb k+1 are attached to verb k; nouns that
        # precede the first verb therefore also end up on the first verb.
        output, pending, vidx, nidx = [], [], 1, 0
        while vidx < len(verbs):
            if nidx < len(nouns) and nouns[nidx] < verbs[vidx]:
                pending.append(nouns[nidx][1])
                nidx += 1
            else:
                output.append({'word': verbs[vidx - 1][1], 'ingredient': pending})
                vidx += 1
                pending = []
        # Whatever nouns remain belong to the last verb.
        pending.extend(lemma for _, lemma in nouns[nidx:])
        output.append({'word': verbs[vidx - 1][1], 'ingredient': pending})
        return output

time: 27.5 ms


In [19]:
# Build the parser once (the constructor loads the spaCy model) and try it on
# the same sentence twice; the cell displays the resulting list of records.
treemaker = instr2tree()
treemaker.sents2tree(['stir together ketchup and mustard on a plate and mix them.',
                     'stir together ketchup and mustard on a plate and mix them.'])

[{'word': 'stir', 'ingredient': ['ketchup', 'mustard', 'plate']},
 {'word': 'mix', 'ingredient': []},
 {'word': 'stir', 'ingredient': ['ketchup', 'mustard', 'plate']},
 {'word': 'mix', 'ingredient': []}]

time: 10.2 s


In [20]:
# For every loaded recipe: parse the reference directions and the generated
# instructions, store their tree edit distance, and store a brevity ratio
# (predicted node count / reference node count) capped at 1.
for i in tqdm.tqdm(ls):
    dic[i]['tree_true'], dic[i]['tree_pred'] = [], []
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'])

    tree1 = build_tree(dic[i]['tree_true'])
    tree2 = build_tree(dic[i]['tree_pred'])
    dic[i]['#tree_dist'] = tree_distance(tree1, tree2)

    # Node count of a tree: one action node plus its ingredients, per record.
    len_true = sum(len(sent['ingredient']) + 1 for sent in dic[i]['tree_true'])
    len_pred = sum(len(sent['ingredient']) + 1 for sent in dic[i]['tree_pred'])
    BP = len_pred / len_true
    if BP < 1:
        dic[i]['#BP'] = BP
    else:
        dic[i]['#BP'] = 1

100%|██████████| 500/500 [02:48<00:00,  3.68it/s]

time: 2min 48s





In [21]:
### 28k full
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '#' in col]].mean()
temp

#tree_dist    42.738515
#BP            0.817516
dtype: float64

time: 2.36 s


In [14]:
#(df2.iloc[ls]['#tree_dist']/df2.iloc[ls]['#BP']).mean()

time: 26.5 ms


In [22]:
# Score the 333k generations: tree edit distance and capped brevity ratio per recipe.
filename = '../../to_gpt2/generation_333k_sorted/'
ls = load_dir_data(filename)
for i in tqdm.tqdm(ls): 
    dic[i]['tree_true'],dic[i]['tree_pred'] = [],[]
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'])
    tree1 = build_tree(dic[i]['tree_true'])
    tree2 = build_tree(dic[i]['tree_pred'])
    dic[i]['#tree_dist'] = tree_distance(tree1, tree2)
    # Node count of a tree: one action node plus its ingredients, per record.
    len_true = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_true']])
    len_pred = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_pred']])
    BP = len_pred/len_true
    dic[i]['#BP'] = BP if BP < 1 else 1
### 333k full
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '#' in col]].mean()
temp

load ../../to_gpt2/generation_333k_sorted/


100%|██████████| 500/500 [02:42<00:00,  3.09it/s]


#tree_dist    41.158559
#BP            0.800247
dtype: float64

time: 2min 45s


In [23]:
# Score the 28k generations using only the first two generated sentences.
filename = '../../to_gpt2/generation_28k_sorted/'
ls = load_dir_data(filename)
for i in tqdm.tqdm(ls): 
    dic[i]['tree_true'],dic[i]['tree_pred'] = [],[]
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    # Prediction is truncated to its first two sentences.
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'][:2])
    tree1 = build_tree(dic[i]['tree_true'])
    tree2 = build_tree(dic[i]['tree_pred'])
    dic[i]['#tree_dist'] = tree_distance(tree1, tree2)
    # Node count of a tree: one action node plus its ingredients, per record.
    len_true = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_true']])
    len_pred = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_pred']])
    BP = len_pred/len_true
    dic[i]['#BP'] = BP if BP < 1 else 1
### 28k, first two generated sentences only
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '#' in col]].mean()
temp

load ../../to_gpt2/generation_28k_sorted/


100%|██████████| 500/500 [01:14<00:00,  6.51it/s]


#tree_dist    40.422150
#BP            0.263393
dtype: float64

time: 1min 17s


In [24]:
# Score the 333k generations using only the first two generated sentences.
filename = '../../to_gpt2/generation_333k_sorted/'
ls = load_dir_data(filename)
for i in tqdm.tqdm(ls): 
    dic[i]['tree_true'],dic[i]['tree_pred'] = [],[]
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    # Prediction is truncated to its first two sentences.
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'][:2])
    tree1 = build_tree(dic[i]['tree_true'])
    tree2 = build_tree(dic[i]['tree_pred'])
    dic[i]['#tree_dist'] = tree_distance(tree1, tree2)
    # Node count of a tree: one action node plus its ingredients, per record.
    len_true = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_true']])
    len_pred = sum([len(sent['ingredient']) + 1 for sent in dic[i]['tree_pred']])
    BP = len_pred/len_true
    dic[i]['#BP'] = BP if BP < 1 else 1
### 333k, first two generated sentences only
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '#' in col]].mean()
temp

load ../../to_gpt2/generation_333k_sorted/


100%|██████████| 500/500 [01:13<00:00,  6.90it/s]


#tree_dist    39.816640
#BP            0.274905
dtype: float64

time: 1min 16s


In [67]:
def avg_embedding(tree):
    """Mean word vector over all labels of a recipe record list.

    ``tree`` is a list of records such as::

        [{'word': 'stir', 'ingredient': ['ketchup', 'mustard', 'plate']},
         {'word': 'stir', 'ingredient': ['']}]

    Non-empty ingredient labels come first, followed by every action word;
    labels that are not in ``vocabulary`` are left out of the average.
    """
    ingredient_labels = [ing for line in tree for ing in line['ingredient'] if ing]
    action_labels = [line['word'] for line in tree]
    vectors = [
        gensim_model.get_vector(label)
        for label in ingredient_labels + action_labels
        if label in vocabulary
    ]
    return np.array(vectors).mean(axis=0)

time: 46.6 ms


In [69]:
# 28k generations: cosine distance between the averaged embeddings of the
# reference record list and the predicted record list of each recipe.
filename = '../../to_gpt2/generation_28k_sorted/'
ls = load_dir_data(filename)
for i in tqdm.tqdm(ls): 
    dic[i]['tree_true'],dic[i]['tree_pred'] = [],[]
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'])
    dic[i]['@cosine dist'] = cosine_distance(avg_embedding(dic[i]['tree_true']), avg_embedding(dic[i]['tree_pred']))

df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '@' in col]].mean()
temp

load ../../to_gpt2/generation_28k_sorted/


100%|██████████| 500/500 [01:32<00:00,  5.65it/s]


@cosine dist    0.221484
dtype: float64

time: 1min 35s


In [70]:
# 333k generations: cosine distance between the averaged embeddings of the
# reference record list and the predicted record list of each recipe.
filename = '../../to_gpt2/generation_333k_sorted/'
ls = load_dir_data(filename)
for i in tqdm.tqdm(ls): 
    dic[i]['tree_true'],dic[i]['tree_pred'] = [],[]
    dic[i]['tree_true'] = treemaker.sents2tree(dic[i]['directions'])
    dic[i]['tree_pred'] = treemaker.sents2tree(dic[i]['generated_instr'])
    dic[i]['@cosine dist'] = cosine_distance(avg_embedding(dic[i]['tree_true']), avg_embedding(dic[i]['tree_pred']))

df2 = pd.DataFrame.from_dict(dic, orient = 'index')
# `ls` holds recipe ids, i.e. index labels of df2, so rows are selected with
# .loc; positional .iloc only matches when the ids happen to equal row positions.
temp = df2.loc[ls, [col for col in df2.columns if '@' in col]].mean()
temp

load ../../to_gpt2/generation_333k_sorted/


100%|██████████| 500/500 [01:30<00:00,  5.61it/s]


@cosine dist    0.229657
dtype: float64

time: 1min 33s
