In [1]:
from trains import Task
task = Task.init(project_name="HelenaIngrF1", task_name="default")
logger = Task.current_task().get_logger()

TRAINS Task: overwriting (reusing) task id=87bcc6ea17524420bea9f1333d57b2da
2019-09-27 16:35:54,744 - trains.Task - INFO - No repository found, storing script code instead
TRAINS Monitor: GPU monitoring is not available, run "pip install gpustat"
TRAINS results page: files_server: http://10.0.106.144:5909/projects/d7092e97e1174fb4bc1d441040d57553/experiments/87bcc6ea17524420bea9f1333d57b2da/output/log


In [29]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2


# if cannot import the modules, add the parent directory to system path might help

import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/'
sys.path.append(parent_dir)

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, save
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from collections import Counter
import spacy
import copy
import re
random_seed = 2019

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 13.6 µs
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 50.2 ms


In [2]:
dic = load(dir_save = '../big_data/dic_20190830.pickle')
ls = [i for i,v in dic.items() if len(v['ingredients'])>1]
print('drop %d recipes with less than 2 ingredients' %(len(dic)-len(ls)))
ls = [i for i in ls if len(dic[i]['directions'])>1]
print('furthur drop %d recipes with less than 2 instructions' %(len(dic)-len(ls)))
desc = [i for i in ls if len(dic[i]['description'])<1]
print('drop %d recipes with no description' %(len(desc)))
print('now we are using recipe54k %d' % len(ls))

exist ../big_data/dic_20190830.pickle
drop 46 recipes with less than 2 ingredients
furthur drop 1026 recipes with less than 2 instructions
drop 0 recipes with no description
now we are using recipe54k 54076
time: 2.99 s


In [4]:
### STEP2 load and clean the generation

def reverse(text):
    '''
    Important data cleaning before NY times parser
    '''
    # replace things in brace
    text = re.sub(r'\([^)]*\)', '', text)

    # remove space before punct
    text = re.sub(r'\s([?.!,"](?:\s|$))', r'\1', text)

    # remove consecutive spaces
    text = re.sub(' +',' ',text).strip()
    return text

def reverse_list(listoftext):
    output=[]
    for text in listoftext:
        rev = reverse(text)
        if rev:
            output.append(rev)
    return output

def load_dir_data(filename):
    ls = []
    if os.path.isdir(filename):
        print('load', filename)
        # Directory
        for (dirpath, _, fnames) in os.walk(filename):
            for fname in fnames:
                path = os.path.join(dirpath, fname)
                with open(path, 'r') as fp:
                    raw_text = fp.read()
                    
                # if it contains instr
                if fname[-5] == 'd':
                    dic[int(fname[:-5])]['generated_instr'] = reverse_list(raw_text.split('.'))

                # if it contains ingred
                if fname[-5] == 'i':
                    dic[int(fname[:-5])]['generated_ingred'] = reverse_list(raw_text.split('$'))
                    
                # if it contains name
                if fname[-5] == 't':
                    dic[int(fname[:-5])]['generated_name'] = raw_text
                ls.append(int(fname[:-5]))# only interested in instr
                    
    return sorted(list(set(ls)))

time: 45.9 ms


In [12]:
### STEP3 sent to the NYtimes
### assign indices to each ingredient <---> NYtimes
class ny_ingredients:
    def __init__(self, fields):
        # this function will take the global variable ls and dic
        # static & reuseable
        self.ny_ingred = '../../NYtime-parser2/ingred.txt'
        self.ny_result = '../../NYtime-parser2/result.json'
        
        # spacy
        self.spacy = spacy.load('en_core_web_lg')
        self.fields = fields #['ingredients', 'generated_ingred']

    def to_ny(self):
        '''
        using global variables dic and ls
        '''
        to_write = []
        for i, v in dic.items():
            if i in ls:
                # assing index
                for field in self.fields:
                    line_ids = []
                    for line in v[field]:
                        reversed_line = reverse(line)
                        if line in to_write:
                            ny_id = to_write.index(reversed_line)
                        else:
                            ny_id = len(to_write)
                            to_write.append(reversed_line)
                        line_ids.append(ny_id)
                    dic[i]['ny_%s'%(field)] = line_ids

        # save the file to the folder under NYtime-parser2
        save(filename = self.ny_ingred, 
             to_write = '\n'.join(to_write),
             overwrite = True, 
             print_=True)

        self.to_write = to_write
        
    # step 3
    def to_ingred(self):
        '''
        using global variables dic and ls
        '''
        ny_result = pd.read_json(self.ny_result)
        to_write = []
        for i, v in dic.items():
            if i in ls:
                # assing index
                for field in self.fields:
                    temp = [ny_result.loc[ny_id]['name'] for ny_id in v['ny_%s'%(field)]]
                    exact, root = self.extract(temp)
                    dic[i]['ny_%s'%(field)] = {'ny':temp, 'exact':exact, 'root':root}
                    
    def extract(self, ny_ingred):
        '''
        Args: ny_ingred: a list of ingredient names
        '''
        phrases_to_sentences = ' '.join(['Mix the %s and water.'%ingr for ingr in ny_ingred])
        doc = self.spacy(phrases_to_sentences)
        exact_match, root_match = [],[]
        for chunk in doc.noun_chunks:
            if chunk.text != 'water':
                root_lemma = [token.lemma_ for token in doc if token.text == chunk.root.text][0]
                exact_match.append(chunk.lemma_.replace('the ',''))
                root_match.append(root_lemma)
        return exact_match, root_match

time: 44.8 ms


In [5]:
for i, v in dic.items():
    if i in ls:
        for field in ['name', 'ingredients', 'directions']:
            dic[i][field] = reverse_list(v[field])

time: 40.9 s


### send the ground truth to ny parser

In [6]:
### start                    
ny_ingr = ny_ingredients(fields = ['ingredients'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 4min 13s


In [7]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 23min 31s


In [8]:
# for the field ingredients
save_pickle(obj = dic, filename='../big_data/dic_20190927.pickle', overwrite=False)

time: 4.22 s


### send the generated text to ny parser

In [30]:
dic = load(dir_save = '../big_data/dic_20190927.pickle')

exist ../big_data/dic_20190927.pickle
time: 3.92 s


In [48]:
filename = '../../to_gpt2/generation_28k_sorted/'
ls = load_dir_data(filename)
#filename = '../../to_gpt2/generation_1221k_top0.95_new/'
#ls = load_dir_data(filename)

load ../../to_gpt2/generation_28k_sorted/
time: 236 ms


In [59]:
ny_ingr = ny_ingredients(fields = ['generated_ingred'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 11.6 s


In [60]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 11.8 s


In [45]:
from utils.evaluation import metrics

time: 46.6 ms


In [46]:
for i, v in tqdm.tqdm(dic.items()):
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_recall(name='@recall_exact'))
        dic[i].update(score.all_ngram_precision(name='@precision_exact'))
        score = metrics(v['ny_ingredients']['root'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_recall(name='@recall_root'))
        dic[i].update(score.all_precision(name='@precision_root'))        
        
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls].mean()
# 333k
print(str(temp))

100%|██████████| 55102/55102 [00:00<00:00, 105638.31it/s]


time: 4.11 s


In [51]:
# 28k
for i, v in tqdm.tqdm(dic.items()):
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_recall(name='@recall_exact'))
        dic[i].update(score.all_ngram_precision(name='@precision_exact'))
        score = metrics(v['ny_ingredients']['root'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_recall(name='@recall_root'))
        dic[i].update(score.all_precision(name='@precision_root'))        
        
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls].mean()
#logger.report_text(str(temp))
print(str(temp))

100%|██████████| 55102/55102 [00:00<00:00, 107341.56it/s]


recall_ngram_@recall_exact               0.654547
recall_ngram_freq_@recall_exact          0.653133
precision_ngram_@precision_exact         0.750958
precision_ngram_freq_@precision_exact    0.675934
recall_@recall_root                      0.761633
recall_freq_@recall_root                 0.754487
precision_@precision_root                0.833324
precision_freq_@precision_root           0.728754
dtype: float64
time: 3.45 s


In [58]:
filename = '../../to_gpt2/generation_28k_train/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_28k_train/
time: 212 ms


In [61]:
for i, v in tqdm.tqdm(dic.items()):
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_recall(name='@recall_exact'))
        dic[i].update(score.all_ngram_precision(name='@precision_exact'))
        score = metrics(v['ny_ingredients']['root'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_recall(name='@recall_root'))
        dic[i].update(score.all_precision(name='@precision_root'))        
        
df2 = pd.DataFrame.from_dict(dic, orient = 'index')
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls].mean()
#logger.report_text(str(temp))
print(str(temp))

100%|██████████| 55102/55102 [00:00<00:00, 104246.71it/s]


recall_ngram_@recall_exact               0.671040
recall_ngram_freq_@recall_exact          0.667939
precision_ngram_@precision_exact         0.752466
precision_ngram_freq_@precision_exact    0.676208
recall_@recall_root                      0.758691
recall_freq_@recall_root                 0.752314
precision_@precision_root                0.831289
precision_freq_@precision_root           0.726411
dtype: float64
time: 3.81 s


In [111]:
filename = '../../to_gpt2/generation_28k_sorted/'
ls = load_dir_data(filename)

time: 20.6 ms


In [116]:
cd '/data/yueliu/RecipeAnalytics_201906/AA6/NLG_notebooks'

/data/yueliu/RecipeAnalytics_201906/AA6/NLG_notebooks
time: 28.3 ms


### Calculate the BLEU score

In [118]:
def add_space(line):
    # add space before punct
    line = re.sub('([.,!?()])', r' \1 ', line)
    line = re.sub('\s{2,}', ' ', line)
    return line

to_write = {'truth_t':'', 'truth_i':'', 'truth_d':'',
            'pred_t':'', 'pred_i':'', 'pred_d':''
           }
for i, v in dic.items():
    if i in ls:
        to_write['truth_t'] += add_space(' '.join(v['name'])) + '\n'
        to_write['truth_i'] += add_space(' $ '.join(v['ingredients']))+ ' $ \n'
        to_write['truth_d'] += add_space(' '.join(v['directions'])) + ' . \n'
        to_write['pred_t'] += add_space(v['generated_name']) + '\n'
        to_write['pred_i'] += add_space(' $ '.join(v['generated_ingred'])) + ' $ \n'
        to_write['pred_d'] += add_space(' . '.join(v['generated_instr'])) + ' . \n'
        
for k, v in to_write.items():
    save('../../to_gpt2/generation_20191008_inv_%s.txt'%(k), v ,overwrite = True)

saved ../../to_gpt2/generation_20191008_inv_truth_t.txt
saved ../../to_gpt2/generation_20191008_inv_truth_i.txt
saved ../../to_gpt2/generation_20191008_inv_truth_d.txt
saved ../../to_gpt2/generation_20191008_inv_pred_t.txt
saved ../../to_gpt2/generation_20191008_inv_pred_i.txt
saved ../../to_gpt2/generation_20191008_inv_pred_d.txt
time: 38.2 ms


In [105]:
cd tools

/data/yueliu/RecipeAnalytics_201906/AA6/NLG_notebooks/tools
time: 37 ms


In [102]:
!perl multi-bleu.perl ../../../to_gpt2/generation_28k_train_truth_t.txt < ../../../to_gpt2/generation_28k_train_pred_t.txt

BLEU = 5.51, 34.4/11.0/2.9/0.9 (BP=0.976, ratio=0.976, hyp_len=1952, ref_len=2000)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 337 ms


In [106]:
!perl multi-bleu.perl ../../../to_gpt2/generation_28k_train_truth_d.txt < ../../../to_gpt2/generation_28k_train_pred_d.txt

BLEU = 13.95, 53.2/22.0/9.8/4.9 (BP=0.906, ratio=0.910, hyp_len=57317, ref_len=62953)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 1.12 s


In [107]:
!perl multi-bleu.perl ../../../to_gpt2/generation_28k_train_truth_i.txt < ../../../to_gpt2/generation_28k_train_pred_i.txt

BLEU = 30.68, 64.7/41.7/25.0/13.3 (BP=0.997, ratio=0.997, hyp_len=28097, ref_len=28172)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 712 ms


### ROUGE

In [None]:
!pip install rouge 

In [326]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_t.txt ../../to_gpt2/generation_28k_sorted/pred_t.txt --avg

{
  "rouge-1": {
    "f": 0.3494049357953457,
    "p": 0.34968650793650813,
    "r": 0.3886880952380953
  },
  "rouge-2": {
    "f": 0.11146750909357371,
    "p": 0.11326428571428578,
    "r": 0.12848968253968257
  },
  "rouge-l": {
    "f": 0.3112737859663617,
    "p": 0.33570476190476206,
    "r": 0.37238015873015884
  }
}
time: 470 ms


In [327]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_i.txt ../../to_gpt2/generation_28k_sorted/pred_i.txt --avg

{
  "rouge-1": {
    "f": 0.658054120038043,
    "p": 0.6195348040467759,
    "r": 0.7168047848154568
  },
  "rouge-2": {
    "f": 0.43489691600028596,
    "p": 0.41772517944310966,
    "r": 0.46327416881162253
  },
  "rouge-l": {
    "f": 0.36508606093440943,
    "p": 0.35378023860347446,
    "r": 0.408473446063344
  }
}
time: 2.21 s


In [325]:
!rouge -f ../../to_gpt2/generation_28k_sorted/truth_d.txt ../../to_gpt2/generation_28k_sorted/pred_d.txt --avg

{
  "rouge-1": {
    "f": 0.5010280412638892,
    "p": 0.4857465311056858,
    "r": 0.5544363885970187
  },
  "rouge-2": {
    "f": 0.19623533217129435,
    "p": 0.19298545738851539,
    "r": 0.21920300656555444
  },
  "rouge-l": {
    "f": 0.4349484306791758,
    "p": 0.4443676911244383,
    "r": 0.5080808218109966
  }
}
time: 8.48 s
