In [2]:
import os
import re
import numpy as np
import transformers
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, AutoTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Defining some utility functions

In [13]:
def get_ingredients_list(recipe):
    '''
    Extract the input ingredients of a raw recipe as a list of strings.

    The ingredients are read from the text located between the "<INPUT_START>"
    and "<INPUT_END>" special tokens, where they are separated by "<NEXT_INPUT>".
    '''
    start_token = "<INPUT_START>"
    begin = recipe.find(start_token) + len(start_token)
    end = recipe.find("<INPUT_END>")

    # Collapse every run of whitespace inside the ingredients span to one space
    span_words = recipe[begin:end].strip().split()
    ingredients_sequence = " ".join(span_words)

    # One entry per "<NEXT_INPUT>"-separated chunk, trimmed of surrounding spaces
    cleaned = []
    for chunk in ingredients_sequence.split("<NEXT_INPUT>"):
        cleaned.append(chunk.strip())
    return cleaned


def print_raw_recipe(full_raw_recipe):
    '''
    Print a raw recipe (containing the special tokens) as readable markdown.

    The title comes first as a level-1 header, followed by the input
    ingredients (one back-quoted entry per line), the ingredient list (bullets)
    and the numbered instructions. A recipe without a "<TITLE_START>" token
    (e.g. a truncated generation) is printed without a title line instead of
    raising an IndexError.
    '''
    markdown = re.sub("<RECIPE_(START|END)>", "", full_raw_recipe)
    recipe_n_title = markdown.split("<TITLE_START>")
    if len(recipe_n_title) > 1:
        # Trim the spaces that surround the title text inside its tokens
        title = "# " + recipe_n_title[1].replace("<TITLE_END>", "").strip() + " #\n"
    else:
        title = ""
    markdown = recipe_n_title[0].replace("<INPUT_START>", "## Input ingredients ##\n`").replace("<INPUT_END>", "`\n")
    markdown = markdown.replace("<NEXT_INPUT>", "`\n`").replace("<INGR_START>","## Ingredients ##\n* ").replace("<NEXT_INGR>","\n* ").replace("<INGR_END>", "\n")
    markdown = markdown.replace("<INSTR_START>", "## Instructions ##\n1) ")

    # Number the remaining instructions; the first one was labelled "1)" above
    count = 2
    while "<NEXT_INSTR>" in markdown:
        markdown = markdown.replace("<NEXT_INSTR>", f"\n{count}) ", 1)
        count += 1

    markdown = markdown.replace("<INSTR_END>", "\n")
    # Remove the spaces left in front of section headers. The anchor has to be
    # "^" with re.MULTILINE: a pattern starting with "$" can never be followed
    # by spaces and "#", so it never matched anything.
    markdown = re.sub("^ +#", "#", markdown, flags=re.MULTILINE)
    # Remove the spaces between the back-quotes and the ingredient they wrap
    markdown = re.sub("( +`|` +)", "`", markdown)
    print('\n' + title + markdown)

## Creating the dataset

In [4]:
# Parent of the current working directory, normalized
local_path = os.path.normpath(os.sep.join((os.getcwd(), os.pardir)))

# Text files read below, both located in the same results sub-directory
sample_path = "".join((local_path, "/results/2023-07-20_12-25-38/sample_gpt2.txt"))
finetuned_path = "".join((local_path, "/results/2023-07-20_12-25-38/finetuned_gpt2.txt"))
print(sample_path, finetuned_path, sep="\n")

/home/hazot/code/recipe-generation-project/results/2023-07-20_12-25-38/sample_gpt2.txt
/home/hazot/code/recipe-generation-project/results/2023-07-20_12-25-38/finetuned_gpt2.txt


In [5]:
data_dir = "data"

# One independent, initially empty list of raw recipes per dataset key
data = {key: [] for key in ("sample", "finetuned", "vanilla")}

In [6]:
with open(sample_path, 'r') as f:
    content = f.readlines()

# Keep lines 0, 2, 4, ... of the file with their newline removed. With an odd
# number of lines the final line is left out (only len(content) // 2 entries are kept).
data["sample"] = [content[idx].replace('\n', '') for idx in range(0, 2 * (len(content) // 2), 2)]

In [7]:
with open(finetuned_path, 'r') as f:
    content = f.readlines()

# Keep lines 0, 2, 4, ... of the file with their newline removed. With an odd
# number of lines the final line is left out (only len(content) // 2 entries are kept).
data["finetuned"] = [content[idx].replace('\n', '') for idx in range(0, 2 * (len(content) // 2), 2)]

In [8]:
# Number of recipes loaded for each dataset, one count per line
print(len(data['sample']), len(data['finetuned']), sep="\n")

100
1000


In [9]:
# Display the first entry of data['finetuned'] in readable form
print_raw_recipe(data['finetuned'][0])


#  Pina Colada Cookie Cookies With Coconut Glaze   #
 ## Input ingredients ##
`sweet rice flour`
`coconut`
`coconut milk`
`cooking oil`
`Coating`
`brown sugar`
`coconut milk`
`coconut milk`
`powdered sugar`
`flour`
`ground cinnamon`
`baking powder`
`baking soda`
`salt`
`vegetable shortening`
`coconut`
`sugar`
`water`
`light corn syrup`
`coconut milk`
`egg whites`
 ## Ingredients ##
*  Chocolate Glaze: 
*  1 cup sweet rice flour 
*  1 cup coconut flakes (recommended: Rollit) 
*  3/4 cup coconut milk, plus 
*  1 tablespoon cooking oil, plus 
*  Coating: 
*  1/2 cup packed light brown sugar 
*  1/2 cup coconut milk 
*  1/2 cup coconut milk 
*  1 1/4 cups powdered sugar 
*  1 cup all-purpose flour 
*  1 teaspoon ground cinnamon 
*  1 teaspoon baking powder 
*  1 teaspoon baking soda 
*  1/2 teaspoon salt 
*  3/4 cup vegetable shortening 
*  1 cup sweetened flaked coconut 
*  1/2 cup sugar 
*  1/4 cup water 
*  3 tablespoons light corn syrup 
*  2 tablespoons coconut milk 
*  2 large egg w

## Cosine similarity

In [None]:
# Load the tokenizer saved in the local checkpoint directory (path is relative to local_path)
tokenizer = AutoTokenizer.from_pretrained(local_path + '/checkpoints/gpt2/checkpoint-gpt2/')

In [None]:
# Output of tokenizer.encode for every recipe of each dataset, in dataset order
sample_tensor = list(map(tokenizer.encode, data['sample']))
finetuned_tensor = list(map(tokenizer.encode, data['finetuned']))

In [None]:
# For each sample recipe, keep the best cosine similarity against its group of
# finetuned recipes, then average those best scores over all samples.
# NOTE(review): the vectors compared are raw token-id sequences zero-padded to a
# common length — confirm this is the intended representation.
generations_per_sample = 10  # sample k is compared with finetuned_tensor[k*10 : k*10 + 10]

avg = 0.0
for k, rec1 in enumerate(sample_tensor):
    best = 0.0
    for i in range(generations_per_sample):
        rec2 = finetuned_tensor[k * generations_per_sample + i]

        # Zero-pad *copies* to a common length. Extending rec1/rec2 in place would
        # append token id 0 to the encodings stored in sample_tensor and
        # finetuned_tensor, corrupting them for any later cell.
        width = max(len(rec1), len(rec2))
        vec1 = list(rec1) + [0] * (width - len(rec1))
        vec2 = list(rec2) + [0] * (width - len(rec2))

        # cosine_similarity returns a 1x1 matrix; extract the scalar so that
        # best/avg stay plain floats instead of nested arrays
        cos = float(cosine_similarity([vec1], [vec2])[0, 0])
        best = max(best, cos)
    avg += best

avg = avg / len(sample_tensor)
print("avg: ", avg)

## Language check

In [None]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')
#tool.disable_spellchecking()
# Check one recipe and drop the matches reported by the whitespace rule
results = tool.check(data["finetuned"][0])
results_filtered = [result for result in results if result.ruleId!='WHITESPACE_RULE' ]

In [None]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')
#tool.disable_spellchecking()

dataset = "finetuned"

# Running total of the matches that are not whitespace-rule hits
avg = 0
for rec in tqdm(data[dataset], desc="Iteration", disable=False, position=0, leave=True):
    results = tool.check(rec)
    results_filtered = list(filter(lambda match: match.ruleId != 'WHITESPACE_RULE', results))
    avg += len(results_filtered)

# Mean number of remaining matches per recipe
print(avg / len(data[dataset]))

## Readability

* textstat.smog_index(test_data)
* textstat.flesch_kincaid_grade(test_data)
* textstat.coleman_liau_index(test_data)
* textstat.automated_readability_index(test_data)
* textstat.dale_chall_readability_score(test_data)
* textstat.difficult_words(test_data)
* textstat.linsear_write_formula(test_data)
* textstat.gunning_fog(test_data)
* textstat.text_standard(test_data)

In [None]:
import numpy as np
import textstat
from scipy import stats

In [None]:
dataset = "finetuned"

# textstat.flesch_reading_ease value of every recipe in the selected dataset
ret = [textstat.flesch_reading_ease(rec) for rec in data[dataset]]

print(np.mean(ret), np.median(ret), stats.mode(ret))

In [None]:
dataset = "sample"

# textstat.smog_index value of every recipe in the selected dataset
ret = [textstat.smog_index(rec) for rec in data[dataset]]

print(np.mean(ret), np.median(ret), stats.mode(ret))

In [None]:
dataset = "finetuned"

# textstat.gunning_fog value of every recipe in the selected dataset
ret = [textstat.gunning_fog(rec) for rec in data[dataset]]

print(np.mean(ret), np.median(ret), stats.mode(ret))

In [None]:
dataset = "sample"

# textstat.dale_chall_readability_score value of every recipe in the selected dataset
ret = [textstat.dale_chall_readability_score(rec) for rec in data[dataset]]

print(np.mean(ret), np.median(ret), stats.mode(ret))

## Translation metrics (BLEU, GLEU, WER)

In [None]:
import nltk
import nltk.translate.bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction

import nltk.translate.gleu_score as gleu
import nltk.translate.meteor_score as meteor
from jiwer import wer, mer

In [None]:
def wer_count(hyp, ref, print_matrix=False):
    '''
    Count the edits (insertions, deletions, substitutions) needed to turn one
    sequence into the other, i.e. their Levenshtein distance.

    hyp, ref: indexable sequences (lists of words, or strings for a
        character-level count). Either may be empty.
    print_matrix: when True, print the dynamic-programming table.
    Returns the distance as an int.
    '''
    N = len(hyp)
    M = len(ref)
    # L[i, j] is the distance between the first i items of hyp and the first j
    # items of ref, so the table needs one extra row and column for the empty
    # prefixes; without them the last item of each sequence is never compared.
    L = np.zeros((N + 1, M + 1), dtype=int)
    L[:, 0] = np.arange(N + 1)
    L[0, :] = np.arange(M + 1)
    for i in range(1, N + 1):
        for j in range(1, M + 1):
            deletion = L[i-1, j] + 1
            insertion = L[i, j-1] + 1
            # Prefix lengths i, j correspond to the items at index i-1, j-1
            sub = 1 if hyp[i-1] != ref[j-1] else 0
            substitution = L[i-1, j-1] + sub
            L[i, j] = min(deletion, insertion, substitution)
    if print_matrix:
        print(L)
    return int(L[N, M])

def bleu_score(recipe, refer):
    '''
    Smoothed (method4) sentence-level BLEU of `recipe` against the references in `refer`.

    recipe: the hypothesis, either a string or a list of tokens.
    refer: iterable of references, each a string or a list of tokens.

    NLTK's sentence_bleu works on token sequences; a plain string would be
    iterated character by character and yield a character-level score, so
    strings are split on whitespace first. Inputs that are already tokenized
    are passed through unchanged.
    '''
    hyp = recipe.split() if isinstance(recipe, str) else recipe
    refs = [ref.split() if isinstance(ref, str) else ref for ref in refer]
    smoothie = SmoothingFunction().method4
    return bleu.sentence_bleu(refs, hyp, smoothing_function=smoothie)

def gleu_score(recipe, refer):
    '''
    Sentence-level GLEU of `recipe` against the references in `refer`.

    recipe: the hypothesis, either a string or a list of tokens.
    refer: iterable of references, each a string or a list of tokens.

    NLTK's sentence_gleu works on token sequences; a plain string would be
    iterated character by character and yield a character-level score, so
    strings are split on whitespace first. Inputs that are already tokenized
    are passed through unchanged.
    '''
    hyp = recipe.split() if isinstance(recipe, str) else recipe
    refs = [ref.split() if isinstance(ref, str) else ref for ref in refer]
    return gleu.sentence_gleu(refs, hyp)

def wer_score(recipe, refer):
    '''
    Lowest value of jiwer's `wer(reference, recipe)` over the references in `refer`.

    The search starts from 99999, which is therefore also the value returned
    when `refer` is empty.
    '''
    candidates = [99999]
    candidates.extend(wer(ref, recipe) for ref in refer)
    return min(candidates)

In [None]:
# BLEU of each sample recipe against the finetuned recipes at positions k*10 .. k*10 + 9
ret = [
    bleu_score(rec1, data["finetuned"][k*10: k*10 + 10])
    for k, rec1 in enumerate(tqdm(data["sample"]))
]

np.mean(ret)

In [None]:
# GLEU of each sample recipe against the finetuned recipes at positions k*10 .. k*10 + 9
ret = [
    gleu_score(rec1, data["finetuned"][k*10: k*10 + 10])
    for k, rec1 in enumerate(tqdm(data["sample"]))
]

np.mean(ret)

In [None]:
# Lowest WER of each sample recipe over the finetuned recipes at positions k*10 .. k*10 + 9
ret = [
    wer_score(rec1, data["finetuned"][k*10: k*10 + 10])
    for k, rec1 in enumerate(tqdm(data["sample"]))
]

np.mean(ret)

# Ingredients evaluations

#### Test if all the ingredients from the input list are used inside the generated instructions

In [None]:
def get_instructions(recipe):
    '''
    Return the instructions of a raw recipe as one string.

    The text between "<INSTR_START>" and "<INSTR_END>" is whitespace-normalized,
    cut at every "<NEXT_INSTR>" token, and the trimmed steps are joined back
    together with single spaces.
    '''
    start_token = "<INSTR_START>"
    begin = recipe.find(start_token) + len(start_token)
    end = recipe.find("<INSTR_END>")

    # Collapse every run of whitespace inside the instructions span to one space
    instruction_sequence = " ".join(recipe[begin:end].split())

    # Drop the step separators, keeping the steps in their original order
    steps = (step.strip() for step in instruction_sequence.split("<NEXT_INSTR>"))
    return " ".join(steps)

In [67]:
def get_ingredient_coverage_in_instructions(recipes):
    '''
    For each raw recipe, compute the fraction of its distinct input ingredients
    that appear in its instructions (case-insensitive, literal substring match).

    Returns a list with one ratio per recipe, in the same order as `recipes`.
    '''
    results = []
    for recipe in recipes:
        instructions = get_instructions(recipe).lower()

        # Lower-case before de-duplicating so that e.g. "Sugar" and "sugar" are
        # counted once; dict.fromkeys keeps the first-seen order.
        ingredients = list(dict.fromkeys(
            ingredient.lower() for ingredient in get_ingredients_list(recipe)
        ))

        # Literal substring test: ingredient names are plain text, not regex
        # patterns. Passed to re.search, characters such as "(" or "+" raise
        # re.error or silently change what is matched.
        count = sum(1 for ingredient in ingredients if ingredient in instructions)

        results.append(count / len(ingredients))
    return results

In [None]:
res = get_ingredient_coverage_in_instructions(data['finetuned'])
# Mean coverage, then the index of the recipe with the lowest coverage
print(np.mean(res), np.argmin(res), sep="\n")

##### test

In [95]:
# Pick one recipe (fifth from the end of data['finetuned']) to inspect by hand
recipe = data['finetuned'][-5]
recipe

'<RECIPE_START> <INPUT_START> margarine <NEXT_INPUT>  cooking oats <NEXT_INPUT>  sugar <NEXT_INPUT>  cinnamon <NEXT_INPUT>  applesauce <NEXT_INPUT>  whipped cream<INPUT_END> <INGR_START> 1 stick margarine <NEXT_INGR> 2 c. quick cooking oats <NEXT_INGR> 1/2 c. sugar <NEXT_INGR> 1/2 tsp. cinnamon <NEXT_INGR> 1 (12 oz.) can applesauce <NEXT_INGR> 1 c. whipped cream <INGR_END> <INSTR_START> Mix first 4 ingredients; set aside. <NEXT_INSTR> Combine apple sauce and 1 cup whipped cream. <NEXT_INSTR> Combine all ingredients and chill. <INSTR_END> <TITLE_START> Mountain Oatmeal Pie <TITLE_END> <RECIPE_END>'

In [96]:
# Input ingredients parsed from the selected recipe
get_ingredients_list(recipe)

['margarine',
 'cooking oats',
 'sugar',
 'cinnamon',
 'applesauce',
 'whipped cream']

In [97]:
# Readable rendering of the selected recipe
print_raw_recipe(recipe)


#  Mountain Oatmeal Pie   #
 ## Input ingredients ##
`margarine`
`cooking oats`
`sugar`
`cinnamon`
`applesauce`
`whipped cream`
 ## Ingredients ##
*  1 stick margarine 
*  2 c. quick cooking oats 
*  1/2 c. sugar 
*  1/2 tsp. cinnamon 
*  1 (12 oz.) can applesauce 
*  1 c. whipped cream 
 ## Instructions ##
1)  Mix first 4 ingredients; set aside. 
2)  Combine apple sauce and 1 cup whipped cream. 
3)  Combine all ingredients and chill. 
 


In [98]:
# Keep the parsed input ingredients for the manual coverage check
ingredients = get_ingredients_list(recipe)
ingredients

['margarine',
 'cooking oats',
 'sugar',
 'cinnamon',
 'applesauce',
 'whipped cream']

In [99]:
# Keep the instructions text (steps joined into one string) for the manual coverage check
instructions = get_instructions(recipe)
instructions

'Mix first 4 ingredients; set aside. Combine apple sauce and 1 cup whipped cream. Combine all ingredients and chill.'

In [101]:
ingredients_cleared = list(dict.fromkeys(ingredients))  # remove duplicate ingredients to remove bias
instructions_lower = instructions.lower()
for ingr in ingredients_cleared:
    # Lower-case both sides, as get_ingredient_coverage_in_instructions does, and
    # escape the ingredient so regex metacharacters in it are matched literally
    x = re.search(re.escape(ingr.lower()), instructions_lower)
    print(x)

None
None
None
None
None
<re.Match object; span=(66, 79), match='whipped cream'>


#### Test if all the ingredients from the ingredients list are used inside the generated instructions

#### Test if there are any duplicate ingredients in the input list

#### Test if there are any duplicate ingredients in the ingredient list

#### Test if all the ingredients found in the generated instructions are mentioned in the input list

#### Test if all the ingredients found in the generated instructions are mentioned in the ingredients list