In [1]:
from trains import Task
task = Task.init(project_name="HelenaIngrF1", task_name="default")
logger = Task.current_task().get_logger()

TRAINS Task: overwriting (reusing) task id=87bcc6ea17524420bea9f1333d57b2da
2019-09-27 16:35:54,744 - trains.Task - INFO - No repository found, storing script code instead
TRAINS Monitor: GPU monitoring is not available, run "pip install gpustat"
TRAINS results page: files_server: http://10.0.106.144:5909/projects/d7092e97e1174fb4bc1d441040d57553/experiments/87bcc6ea17524420bea9f1333d57b2da/output/log


In [11]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2


# if cannot import the modules, add the parent directory to system path might help

import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/'
sys.path.append(parent_dir)

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, save
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from collections import Counter
import spacy
import copy
import re
random_seed = 2019

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.3 µs
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 49.2 ms


In [2]:
dic = load(dir_save = '../big_data/dic_20190830.pickle')
ls = [i for i,v in dic.items() if len(v['ingredients'])>1]
print('drop %d recipes with less than 2 ingredients' %(len(dic)-len(ls)))
ls = [i for i in ls if len(dic[i]['directions'])>1]
print('furthur drop %d recipes with less than 2 instructions' %(len(dic)-len(ls)))
desc = [i for i in ls if len(dic[i]['description'])<1]
print('drop %d recipes with no description' %(len(desc)))
print('now we are using recipe54k %d' % len(ls))

exist ../big_data/dic_20190830.pickle
drop 46 recipes with less than 2 ingredients
furthur drop 1026 recipes with less than 2 instructions
drop 0 recipes with no description
now we are using recipe54k 54076
time: 2.99 s


In [4]:
### STEP2 load and clean the generation

def reverse(text):
    '''
    Important data cleaning before NY times parser
    '''
    # replace things in brace
    text = re.sub(r'\([^)]*\)', '', text)

    # remove space before punct
    text = re.sub(r'\s([?.!,"](?:\s|$))', r'\1', text)

    # remove consecutive spaces
    text = re.sub(' +',' ',text).strip()
    return text

def reverse_list(listoftext):
    output=[]
    for text in listoftext:
        rev = reverse(text)
        if rev:
            output.append(rev)
    return output

def load_dir_data(filename):
    ls = []
    if os.path.isdir(filename):
        print('load', filename)
        # Directory
        for (dirpath, _, fnames) in os.walk(filename):
            for fname in fnames:
                path = os.path.join(dirpath, fname)
                with open(path, 'r') as fp:
                    raw_text = fp.read()
                    
                # if it contains instr
                if fname[-5] == 'd':
                    dic[int(fname[:-5])]['generated_instr'] = reverse_list(raw_text.split('.'))

                # if it contains ingred
                if fname[-5] == 'i':
                    dic[int(fname[:-5])]['generated_ingred'] = reverse_list(raw_text.split('$'))
                    
                # if it contains name
                if fname[-5] == 't':
                    dic[int(fname[:-5])]['generated_name'] = raw_text
                ls.append(int(fname[:-5]))# only interested in instr
                    
    return sorted(list(set(ls)))

time: 45.9 ms


In [12]:
### STEP3 sent to the NYtimes
### assign indices to each ingredient <---> NYtimes
class ny_ingredients:
    def __init__(self, fields):
        # this function will take the global variable ls and dic
        # static & reuseable
        self.ny_ingred = '../../NYtime-parser2/ingred.txt'
        self.ny_result = '../../NYtime-parser2/result.json'
        
        # spacy
        self.spacy = spacy.load('en_core_web_lg')
        self.fields = fields #['ingredients', 'generated_ingred']

    def to_ny(self):
        '''
        using global variables dic and ls
        '''
        to_write = []
        for i, v in dic.items():
            if i in ls:
                # assing index
                for field in self.fields:
                    line_ids = []
                    for line in v[field]:
                        reversed_line = reverse(line)
                        if line in to_write:
                            ny_id = to_write.index(reversed_line)
                        else:
                            ny_id = len(to_write)
                            to_write.append(reversed_line)
                        line_ids.append(ny_id)
                    dic[i]['ny_%s'%(field)] = line_ids

        # save the file to the folder under NYtime-parser2
        save(filename = self.ny_ingred, 
             to_write = '\n'.join(to_write),
             overwrite = True, 
             print_=True)

        self.to_write = to_write
        
    # step 3
    def to_ingred(self):
        '''
        using global variables dic and ls
        '''
        ny_result = pd.read_json(self.ny_result)
        to_write = []
        for i, v in dic.items():
            if i in ls:
                # assing index
                for field in self.fields:
                    temp = [ny_result.loc[ny_id]['name'] for ny_id in v['ny_%s'%(field)]]
                    exact, root = self.extract(temp)
                    dic[i]['ny_%s'%(field)] = {'ny':temp, 'exact':exact, 'root':root}
                    
    def extract(self, ny_ingred):
        '''
        Args: ny_ingred: a list of ingredient names
        '''
        phrases_to_sentences = ' '.join(['Mix the %s and water.'%ingr for ingr in ny_ingred])
        doc = self.spacy(phrases_to_sentences)
        exact_match, root_match = [],[]
        for chunk in doc.noun_chunks:
            if chunk.text != 'water':
                root_lemma = [token.lemma_ for token in doc if token.text == chunk.root.text][0]
                exact_match.append(chunk.lemma_.replace('the ',''))
                root_match.append(root_lemma)
        return exact_match, root_match

time: 44.8 ms


In [5]:
for i, v in dic.items():
    if i in ls:
        for field in ['name', 'ingredients', 'directions']:
            dic[i][field] = reverse_list(v[field])

time: 40.9 s


### send the ground truth to ny parser

In [6]:
### start                    
ny_ingr = ny_ingredients(fields = ['ingredients'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 4min 13s


In [7]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 23min 31s


In [8]:
# for the field ingredients
save_pickle(obj = dic, filename='../big_data/dic_20190927.pickle', overwrite=False)

time: 4.22 s


### send the generated text to ny parser

In [17]:
dic = load(dir_save = '../big_data/dic_20190927.pickle')

exist ../big_data/dic_20190927.pickle
time: 3.52 s


In [18]:
filename = '../../to_gpt2/generation_333k_sorted/'
ls = load_dir_data(filename)
#filename = '../../to_gpt2/generation_1221k_top0.95_new/'
#ls = load_dir_data(filename)

load ../../to_gpt2/generation_333k_sorted/
time: 205 ms


In [19]:
ny_ingr = ny_ingredients(fields = ['generated_ingred'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 10.2 s


In [20]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 10.6 s


In [21]:
class metrics:
    def __init__(self, list_true, list_pred):
        self.y_true = [word for word in list_true if word != 'nan']
        self.y_pred = [word for word in list_pred if word != 'nan']
        self.y_true_string = ' %s '%(' '.join(self.y_true))
        self.y_pred_string = ' %s '%(' '.join(self.y_pred))
        # from collections import Counter
        
    # frequency weighted
    def f1_freq(self):
        precision = self.precision_freq()
        recall = self.recall_freq()
        try:
            f1 = 2*precision*recall/(precision + recall)
        except ZeroDivisionError:
            f1 = 0
        return f1
    
    def precision_freq(self):
        return self.scoring(self.y_pred, self.y_true)
    
    def recall_freq(self):
        return self.scoring(self.y_true, self.y_pred)
    
    def scoring(self, n1, n2):
        n1c, n2c = copy.deepcopy(n1), copy.deepcopy(n2)
        score = 0
        for word in n1c:
            if word in n2c:
                score +=1
                n2c.remove(word)
        if len(n1c): 
            return score/len(n1c)
        
    # without frequency weighted
    def precision(self):
        if len(self.y_pred): 
            return len(set(self.y_true) & set(self.y_pred))/len(set(self.y_pred))
    def recall(self):
        if len(self.y_true):
            return len(set(self.y_true) & set(self.y_pred))/len(set(self.y_true))
    def f1(self):
        precision = self.precision()
        recall = self.recall()
        try:
            f1 = 2*precision*recall/(precision + recall)
        except ZeroDivisionError:
            f1 = 0
        return f1
    
    # return scores in a dict    
    def all_recall(self, name):
        output = {}
        output['recall_%s'%(name)] = self.recall()
        output['recall_freq_%s'%(name)] = self.recall_freq()
        return output
    
    def all_precision(self, name):
        output = {}
        output['precision_%s'%(name)] = self.precision()
        output['precision_freq_%s'%(name)] = self.precision_freq()
        return output
    
    # frequency weighted
    def ngram_scoring(self, n1, n2):
        '''
        n1 = [' ddd d ',' der ',' w ',' w ']
        n2 = ' ddd d mnnm,n,m der ddd d w ow '
        '''
        n1c, n2c = copy.deepcopy([' %s ' % word for word in n1]), copy.deepcopy(n2)
        true_counter = Counter(n1)
        denomintater = sum(true_counter.values())
        score = 0
        for word in true_counter:
            occurrence = n2.count(word)
            occurrence = occurrence if occurrence < true_counter[word] else true_counter[word]
            score += occurrence
        return score/sum(true_counter.values())
    
    def ngram_recall_freq(self):
        return self.ngram_scoring(self.y_true, self.y_pred_string)
    
    def ngram_precision_freq(self):
        return self.ngram_scoring(self.y_pred, self.y_true_string)
    
    #  unweighted
    def ngram_recall(self):
        return np.mean([True if word in self.y_pred_string else False for word in set(self.y_true)])

    def ngram_precision(self):
        return np.mean([True if word in self.y_true_string else False for word in set(self.y_pred)])
    
    def all_ngram_recall(self, name):
        output = {}
        output['recall_ngram_%s'%(name)] = self.ngram_recall()
        output['recall_ngram_freq_%s'%(name)] = self.ngram_recall_freq()
        return output
    
    def all_ngram_precision(self, name):
        output = {}
        output['precision_ngram_%s'%(name)] = self.ngram_precision()
        output['precision_ngram_freq_%s'%(name)] = self.ngram_precision_freq()
        return output

time: 38.6 ms


In [22]:
class spacy_sentences(ny_ingredients):
    def __init__(self, fields):
        # spacy
        self.spacy = spacy.load('en_core_web_lg')
        
        for field in fields:
            assert field in dic[ls[0]].keys()
        self.fields = fields # list ['directions']
        
    def sent(self, listofsent):
        doc = self.spacy(' '.join(listofsent))
        return [token.lemma_ for token in doc]
    def lemma(self):
        '''
        using the global variables ls and dic
        '''
        for i, v in dic.items():
            if i in ls:
                for field in self.fields:
                    temp = self.sent(v[field])
                    dic[i]['lemma_%s'%(field)]=temp

time: 23.9 ms


In [23]:
sp_insr = spacy_sentences(['directions','generated_ingred', 'generated_instr', 'ingredients'])

time: 9.53 s


In [24]:
sp_insr.lemma()

time: 41.5 s


In [25]:
for i, v in tqdm.tqdm(dic.items()):
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['lemma_directions'])
        dic[i].update(score.all_ngram_recall(name='@baseline_exact'))
        score = metrics(v['ny_ingredients']['exact'], v['lemma_generated_ingred'])
        dic[i].update(score.all_ngram_recall(name='@test_exact'))
        score = metrics(v['ny_ingredients']['exact'], v['lemma_generated_instr'])
        dic[i].update(score.all_recall(name='@test2_exact'))
        
        score = metrics(v['ny_ingredients']['root'], v['lemma_directions'])
        dic[i].update(score.all_recall(name='@baseline_root'))
        score = metrics(v['ny_ingredients']['root'], v['lemma_generated_ingred'])
        dic[i].update(score.all_recall(name='@test_root'))
        score = metrics(v['ny_ingredients']['root'], v['lemma_generated_instr'])
        dic[i].update(score.all_recall(name='@test2_root'))
        
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls].mean()
#logger.report_text(str(temp))

100%|██████████| 55102/55102 [00:00<00:00, 78100.32it/s]


time: 3.42 s


In [26]:
# 333k
print(str(temp))

recall_ngram_@baseline_exact         0.670009
recall_ngram_freq_@baseline_exact    0.671239
recall_ngram_@test_exact             0.644259
recall_ngram_freq_@test_exact        0.644269
recall_@test2_exact                  0.356363
recall_freq_@test2_exact             0.357149
recall_@baseline_root                0.918265
recall_freq_@baseline_root           0.915307
recall_@test_root                    0.772434
recall_freq_@test_root               0.768093
recall_@test2_root                   0.777106
recall_freq_@test2_root              0.770032
dtype: float64
time: 47.8 ms


In [322]:
# 28k
print(str(temp))

recall_ngram_@baseline_exact         0.670009
recall_ngram_freq_@baseline_exact    0.671239
recall_ngram_@test_exact             0.681311
recall_ngram_freq_@test_exact        0.680254
recall_@test2_exact                  0.363954
recall_freq_@test2_exact             0.362827
recall_@baseline_root                0.918265
recall_freq_@baseline_root           0.915307
recall_@test_root                    0.803410
recall_freq_@test_root               0.795026
recall_@test2_root                   0.776585
recall_freq_@test2_root              0.767976
dtype: float64
time: 31.4 ms


In [27]:
for i, v in tqdm.tqdm(dic.items()):
    if i in ls:
        score = metrics(v['lemma_directions'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_precision(name='#test_exact'))
        score = metrics(v['lemma_ingredients'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_precision(name='#exact'))
        score = metrics(v['lemma_directions'], v['ny_ingredients']['exact'])
        dic[i].update(score.all_ngram_precision(name='#baseline_exact'))
        
        score = metrics(v['lemma_directions'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_ngram_precision(name='#test_root'))
        score = metrics(v['lemma_ingredients'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_ngram_precision(name='#root'))
        score = metrics(v['lemma_directions'], v['ny_ingredients']['root'])
        dic[i].update(score.all_ngram_precision(name='#baseline_root'))

df2 = pd.DataFrame.from_dict(dic, orient = 'index')
temp = df2[[col for col in df2.columns if '#' in col]].iloc[ls].mean()

100%|██████████| 55102/55102 [00:00<00:00, 79626.79it/s]


time: 3.94 s


In [28]:
# 333k
print(str(temp))

precision_ngram_#test_exact             0.675705
precision_ngram_freq_#test_exact        0.608270
precision_ngram_#exact                  0.769029
precision_ngram_freq_#exact             0.672073
precision_ngram_#baseline_exact         0.670009
precision_ngram_freq_#baseline_exact    0.671239
precision_ngram_#test_root              0.869120
precision_ngram_freq_#test_root         0.773899
precision_ngram_#root                   0.876402
precision_ngram_freq_#root              0.745420
precision_ngram_#baseline_root          0.919592
precision_ngram_freq_#baseline_root     0.916711
dtype: float64
time: 42.7 ms


In [319]:
# 28k
print(str(temp))

precision_ngram_#test_exact             0.682268
precision_ngram_freq_#test_exact        0.631267
precision_ngram_#exact                  0.778827
precision_ngram_freq_#exact             0.703531
precision_ngram_#baseline_exact         0.670009
precision_ngram_freq_#baseline_exact    0.671239
precision_ngram_#test_root              0.877475
precision_ngram_freq_#test_root         0.800930
precision_ngram_#root                   0.887265
precision_ngram_freq_#root              0.781157
precision_ngram_#baseline_root          0.919592
precision_ngram_freq_#baseline_root     0.916711
dtype: float64
time: 30.9 ms


### Calculate the BLEU score

In [6]:
def add_space(line):
    # add space before punct
    line = re.sub('([.,!?()])', r' \1 ', line)
    line = re.sub('\s{2,}', ' ', line)
    return line

to_write = {'truth_t':'', 'truth_i':'', 'truth_d':'',
            'pred_t':'', 'pred_i':'', 'pred_d':''
           }
for i, v in dic.items():
    if i in ls:
        to_write['truth_t'] += add_space(' '.join(v['name'])) + '\n'
        to_write['truth_i'] += add_space(' $ '.join(v['ingredients']))+ ' $ \n'
        to_write['truth_d'] += add_space(' '.join(v['directions'])) + ' . \n'
        to_write['pred_t'] += add_space(v['generated_name']) + '\n'
        to_write['pred_i'] += add_space(' $ '.join(v['generated_ingred'])) + ' $ \n'
        to_write['pred_d'] += add_space(' . '.join(v['generated_instr'])) + ' . \n'
        
for k, v in to_write.items():
    save('../../to_gpt2/generation_333k_sorted_%s.txt'%(k), v ,overwrite = True)

saved ../../to_gpt2/generation_333k_sorted_truth_t.txt
saved ../../to_gpt2/generation_333k_sorted_truth_i.txt
saved ../../to_gpt2/generation_333k_sorted_truth_d.txt
saved ../../to_gpt2/generation_333k_sorted_pred_t.txt
saved ../../to_gpt2/generation_333k_sorted_pred_i.txt
saved ../../to_gpt2/generation_333k_sorted_pred_d.txt
time: 534 ms


In [7]:
cd tools

/data/yueliu/RecipeAnalytics_201906/AA6/NLG_notebooks/tools
time: 49 ms


In [8]:
!perl multi-bleu.perl ../../../to_gpt2/generation_333k_sorted_truth_t.txt < ../../../to_gpt2/generation_333k_sorted_pred_t.txt

BLEU = 5.80, 34.3/10.7/3.5/0.9 (BP=1.000, ratio=1.002, hyp_len=1988, ref_len=1985)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 203 ms


In [9]:
!perl multi-bleu.perl ../../../to_gpt2/generation_333k_sorted_truth_i.txt < ../../../to_gpt2/generation_333k_sorted_pred_i.txt

BLEU = 27.12, 60.6/37.7/21.6/11.0 (BP=1.000, ratio=1.030, hyp_len=27837, ref_len=27015)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 582 ms


In [10]:
!perl multi-bleu.perl ../../../to_gpt2/generation_333k_sorted_truth_d.txt < ../../../to_gpt2/generation_333k_sorted_pred_d.txt

BLEU = 12.83, 55.1/22.4/9.6/4.6 (BP=0.840, ratio=0.852, hyp_len=51323, ref_len=60267)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 955 ms


### ROUGE

In [None]:
!pip install rouge 

In [326]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_t.txt ../../to_gpt2/generation_28k_sorted/pred_t.txt --avg

{
  "rouge-1": {
    "f": 0.3494049357953457,
    "p": 0.34968650793650813,
    "r": 0.3886880952380953
  },
  "rouge-2": {
    "f": 0.11146750909357371,
    "p": 0.11326428571428578,
    "r": 0.12848968253968257
  },
  "rouge-l": {
    "f": 0.3112737859663617,
    "p": 0.33570476190476206,
    "r": 0.37238015873015884
  }
}
time: 470 ms


In [327]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_i.txt ../../to_gpt2/generation_28k_sorted/pred_i.txt --avg

{
  "rouge-1": {
    "f": 0.658054120038043,
    "p": 0.6195348040467759,
    "r": 0.7168047848154568
  },
  "rouge-2": {
    "f": 0.43489691600028596,
    "p": 0.41772517944310966,
    "r": 0.46327416881162253
  },
  "rouge-l": {
    "f": 0.36508606093440943,
    "p": 0.35378023860347446,
    "r": 0.408473446063344
  }
}
time: 2.21 s


In [325]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_d.txt ../../to_gpt2/generation_28k_sorted/pred_d.txt --avg

{
  "rouge-1": {
    "f": 0.5010280412638892,
    "p": 0.4857465311056858,
    "r": 0.5544363885970187
  },
  "rouge-2": {
    "f": 0.19623533217129435,
    "p": 0.19298545738851539,
    "r": 0.21920300656555444
  },
  "rouge-l": {
    "f": 0.4349484306791758,
    "p": 0.4443676911244383,
    "r": 0.5080808218109966
  }
}
time: 8.48 s
