In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# !pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
# !pip install matplotlib numpy pandas tqdm nltk

# for separating ingredients vs non-ingredients
# NOTE: if using Windows to run this, need to download GNU Wget
# !wget -c https://raw.githubusercontent.com/williamLyh/RecipeWithPlans/main/ingredient_set.json -O ingredient_set.json

In [3]:
import os
import math
import re
import string
import numpy as np
import pandas as pd
import random
import json
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR, MultiStepLR, CosineAnnealingWarmRestarts
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate import meteor

from data import *
from encoder_decoder import *
from train import *
from eval import *
from utils import *

# required for bleu
# nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm


---

In [4]:
# --- Global configuration ----------------------------------------------------
SEED = 31989101        # single seed shared by every RNG seeded in reset_rng()
HIDDEN_SIZE = 256      # passed as hidden_size to the encoder/decoder constructors
MAX_INGR_LEN = 150     # fixed from assignment
MAX_RECIPE_LEN = 600   # recipe length cap passed to preprocessing and evaluation
DROPOUT = 0.1          # passed as dropout to the attention decoder

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")


def reset_rng():
    """Re-seed the PyTorch, NumPy and Python `random` generators with SEED."""
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(SEED)


reset_rng()

# Widen pandas' column display so full ingredient lists and instructions are readable.
pd.set_option('display.max_colwidth', 2000)

print(f"Using device: {DEVICE}")

Using device: cuda


In [5]:
# Dataset location, and the flag forwarded to preprocessing and the vocabulary.
data_root = "./Cooking_Dataset"
add_intermediate_tag = False


def _read_split(csv_name):
    """Read one split's CSV from data_root, keeping only the two columns used here."""
    return pd.read_csv(os.path.join(data_root, csv_name), usecols=['Ingredients', 'Recipe'])


train_df_orig = _read_split("train.csv")
dev_df_orig = _read_split("dev.csv")
test_df_orig = _read_split("test.csv")

In [6]:
# Preprocess the training split; the call prints the sample counts before/after.
train_df = preprocess_data(
    train_df_orig,
    max_ingr_len=MAX_INGR_LEN,
    max_recipe_len=MAX_RECIPE_LEN,
    add_intermediate_tag=add_intermediate_tag,
)

Number of data samples before preprocessing: 101340
Number of data samples after preprocessing: 100637 (99.306%)


In [7]:
# Preprocess the dev split with the same limits and tag setting as the training split.
dev_df = preprocess_data(
    dev_df_orig,
    max_ingr_len=MAX_INGR_LEN,
    max_recipe_len=MAX_RECIPE_LEN,
    add_intermediate_tag=add_intermediate_tag,
)

Number of data samples before preprocessing: 797
Number of data samples after preprocessing: 793 (99.498%)


In [8]:
# Preprocess the test split with the same limits and tag setting as the training split.
test_df = preprocess_data(
    test_df_orig,
    max_ingr_len=MAX_INGR_LEN,
    max_recipe_len=MAX_RECIPE_LEN,
    add_intermediate_tag=add_intermediate_tag,
)

Number of data samples before preprocessing: 778
Number of data samples after preprocessing: 774 (99.486%)


In [9]:
# Build the vocabulary from the training split only (dev/test are not passed to populate).
vocab = Vocabulary(add_intermediate_tag=add_intermediate_tag)
vocab.populate(train_df)
# Last expression of the cell: display the resulting vocabulary size.
vocab.n_unique_words

100%|██████████| 100637/100637 [00:03<00:00, 31340.65it/s]


44315

In [10]:
# Wrap each preprocessed split in a RecipeDataset that shares the same vocabulary;
# the dev and test datasets are constructed with train=False.
train_ds = RecipeDataset(train_df, vocab)
dev_ds = RecipeDataset(dev_df, vocab, train=False)
test_ds = RecipeDataset(test_df, vocab, train=False)

## Encoder-Decoder (Base)

In [11]:
embedding_size = 300

# Baseline encoder: sized to the full vocabulary, with the PAD word's index
# handed over as padding_value.
encoder = EncoderRNN(
    vocab.n_unique_words,
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    padding_value=vocab.word2index(PAD_WORD),
).to(DEVICE)

# Baseline decoder: two fewer output classes than vocabulary entries. In the
# training script the decoder is always fed a non-end token, so it never needs
# to generate padding, and it should never generate "<UNKNOWN>" either.
decoder = DecoderRNN(
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    output_size=vocab.n_unique_words - 2,
).to(DEVICE)

In [12]:
# Optimisation settings for the baseline (no-attention) run.
initial_lr = 1e-3
min_lr = 1e-5
n_epochs = 30
batch_size = 128

# One Adam optimizer per network, both starting from initial_lr.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=initial_lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=initial_lr)

# Active schedule: cosine annealing from initial_lr down to min_lr with T_max = n_epochs.
enc_scheduler = CosineAnnealingLR(encoder_optimizer, T_max=n_epochs, eta_min=min_lr)
dec_scheduler = CosineAnnealingLR(decoder_optimizer, T_max=n_epochs, eta_min=min_lr)
# Alternative schedules kept for reference (disabled):
# enc_scheduler = CosineAnnealingWarmRestarts(encoder_optimizer, T_0=math.ceil(len(train_ds) / 128),
#                                             verbose=True, eta_min=min_lr)
# dec_scheduler = CosineAnnealingWarmRestarts(decoder_optimizer, T_0=math.ceil(len(train_ds) / 128),
#                                             verbose=True, eta_min=min_lr)
# enc_scheduler = MultiStepLR(encoder_optimizer, milestones=[15], gamma=0.1)
# dec_scheduler = MultiStepLR(decoder_optimizer, milestones=[15], gamma=0.1)

# Run name; passed to both train() and save_log().
identifier = "adam_without_intermediate_tags_wd0_lr1e-3"

epoch_losses, log = train(
    encoder,
    decoder,
    encoder_optimizer,
    decoder_optimizer,
    train_ds,
    n_epochs=n_epochs,
    vocab=vocab,
    decoder_mode="basic",
    batch_size=batch_size,
    enc_lr_scheduler=enc_scheduler,
    dec_lr_scheduler=dec_scheduler,
    dev_ds=dev_ds,
    identifier=identifier,
    verbose_iter_interval=50,
)

save_log(identifier, log, encoder_optimizer, decoder_optimizer, enc_scheduler, dec_scheduler)

Starting epoch 1/30, enc lr scheduler: [0.001], dec lr scheduler: [0.001]
(Epoch 0, iter 50/787) Average loss so far: 7.192
(Epoch 0, iter 100/787) Average loss so far: 6.000
(Epoch 0, iter 150/787) Average loss so far: 5.805


In [None]:
# Save the baseline encoder/decoder under the training run's identifier with a "_last" suffix.
save_model(encoder, decoder, "adam_without_intermediate_tags_wd0_lr1e-3_last")

## Encoder-Decoder (Attention)

In [None]:
embedding_size = 300

# Encoder for the attention model, constructed exactly like the baseline encoder.
encoder_attn = EncoderRNN(
    vocab.n_unique_words,
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    padding_value=vocab.word2index(PAD_WORD),
).to(DEVICE)

# Attention decoder, used here in place of the plain DecoderRNN. As for the
# baseline, the output size leaves out two vocabulary entries: in the training
# script the decoder is always fed a non-end token and thus never needs to
# generate padding, and it should never generate "<UNKNOWN>".
decoder_attn = AttnDecoderRNN(
    embedding_size,
    hidden_size=HIDDEN_SIZE,
    output_size=vocab.n_unique_words - 2,
    padding_val=vocab.word2index(PAD_WORD),
    dropout=DROPOUT,
).to(DEVICE)

In [None]:
# Optimisation settings for the attention model: plain SGD with a step decay.
initial_lr=0.8
min_lr = 0.01  # only referenced by the disabled cosine schedulers below
n_epochs = 30
batch_size=128
encoder_attn_optimizer = optim.SGD(encoder_attn.parameters(), lr=initial_lr)
decoder_attn_optimizer = optim.SGD(decoder_attn.parameters(), lr=initial_lr)
# enc_attn_scheduler = CosineAnnealingLR(encoder_attn_optimizer, T_max=n_epochs, eta_min=min_lr)
# dec_attn_scheduler = CosineAnnealingLR(decoder_attn_optimizer, T_max=n_epochs, eta_min=min_lr)
# Scale the learning rate by gamma=0.1 once the scheduler reaches milestone 15.
enc_attn_scheduler = MultiStepLR(encoder_attn_optimizer, milestones=[15], gamma=0.1)
dec_attn_scheduler = MultiStepLR(decoder_attn_optimizer, milestones=[15], gamma=0.1)

# Train on train_ds, the training dataset built earlier in this notebook. This
# call previously passed `recipe_ds`, a name that is never assigned in the
# notebook, so the cell could not run on a fresh kernel.
# NOTE(review): the baseline training cell unpacks train() as
# `epoch_losses, log`; confirm whether this call should unpack the same way.
epoch_losses = train(encoder_attn, decoder_attn, encoder_attn_optimizer, decoder_attn_optimizer, train_ds,
                     n_epochs=n_epochs, vocab=vocab, decoder_mode="attention", batch_size=batch_size,
                     enc_lr_scheduler=enc_attn_scheduler, dec_lr_scheduler=dec_attn_scheduler,
                     verbose_iter_interval=10)

## Encoder-Decoder (Extension: pretrained embeddings)

In [None]:
# Build the pretrained-embedding lookup from the GloVe file in the working directory.
# NOTE(review): presumably maps words to 300-d vectors (embedding_size is 300 below) -- confirm.
pretrained_embedding_dict = create_pretrained_embedding_dict("./glove.840B.300d.txt")

In [None]:
embedding_size = 300

# Encoder that additionally receives the pretrained embedding dict and the vocabulary.
encoder_pretrained_embed = EncoderRNN(
    input_size=vocab.n_unique_words,
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    padding_value=vocab.word2index(PAD_WORD),
    pretrained_embedding_dict=pretrained_embedding_dict,
    vocab=vocab,
).to(DEVICE)

# Matching decoder. The output size leaves out two vocabulary entries: in the
# training script the decoder is always fed a non-end token and thus never needs
# to generate padding, and it should never generate "<UNKNOWN>".
decoder_pretrained_embed = DecoderRNN(
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    output_size=vocab.n_unique_words - 2,
    pretrained_embedding_dict=pretrained_embedding_dict,
    vocab=vocab,
).to(DEVICE)

In [None]:
# Optimisation settings for the pretrained-embedding model: plain SGD with a step decay.
initial_lr=0.8
min_lr = 0.01  # only referenced by the disabled cosine schedulers below
n_epochs = 20
batch_size=128
encoder_pretrained_embed_optimizer = optim.SGD(encoder_pretrained_embed.parameters(), lr=initial_lr)
decoder_pretrained_embed_optimizer = optim.SGD(decoder_pretrained_embed.parameters(), lr=initial_lr)
# enc_scheduler = CosineAnnealingLR(encoder_pretrained_embed_optimizer, T_max=n_epochs, eta_min=min_lr)
# dec_scheduler = CosineAnnealingLR(decoder_pretrained_embed_optimizer, T_max=n_epochs, eta_min=min_lr)
# Scale the learning rate by gamma=0.2 once the scheduler reaches milestone 15.
enc_pretrained_embed_scheduler = MultiStepLR(encoder_pretrained_embed_optimizer, milestones=[15], gamma=0.2)
dec_pretrained_embed_scheduler = MultiStepLR(decoder_pretrained_embed_optimizer, milestones=[15], gamma=0.2)

# Train on train_ds, the training dataset built earlier in this notebook. This
# call previously passed `recipe_ds`, a name that is never assigned in the
# notebook, so the cell could not run on a fresh kernel.
# NOTE(review): the baseline training cell unpacks train() as
# `epoch_losses, log`; confirm whether this call should unpack the same way.
epoch_losses = train(encoder_pretrained_embed, decoder_pretrained_embed,
                     encoder_pretrained_embed_optimizer, decoder_pretrained_embed_optimizer, train_ds,
                     n_epochs=n_epochs, vocab=vocab, decoder_mode="basic", batch_size=batch_size,
                     enc_lr_scheduler=enc_pretrained_embed_scheduler, dec_lr_scheduler=dec_pretrained_embed_scheduler,
                     verbose_iter_interval=10)

In [None]:
# NOTE(review): this cell repeats the setup of the preceding pretrained-embedding
# training cell. Running both trains the same encoder_pretrained_embed /
# decoder_pretrained_embed objects a second time with freshly created optimizers
# and schedulers -- confirm that is intended, or drop one of the two cells.
initial_lr=0.8
min_lr = 0.01  # only referenced by the disabled cosine schedulers below
n_epochs = 20
batch_size=128
encoder_pretrained_embed_optimizer = optim.SGD(encoder_pretrained_embed.parameters(), lr=initial_lr)
decoder_pretrained_embed_optimizer = optim.SGD(decoder_pretrained_embed.parameters(), lr=initial_lr)
# enc_scheduler = CosineAnnealingLR(encoder_pretrained_embed_optimizer, T_max=n_epochs, eta_min=min_lr)
# dec_scheduler = CosineAnnealingLR(decoder_pretrained_embed_optimizer, T_max=n_epochs, eta_min=min_lr)
# Scale the learning rate by gamma=0.2 once the scheduler reaches milestone 15.
enc_pretrained_embed_scheduler = MultiStepLR(encoder_pretrained_embed_optimizer, milestones=[15], gamma=0.2)
dec_pretrained_embed_scheduler = MultiStepLR(decoder_pretrained_embed_optimizer, milestones=[15], gamma=0.2)

# Train on train_ds, the training dataset built earlier in this notebook. This
# call previously passed `recipe_ds`, a name that is never assigned in the
# notebook, so the cell could not run on a fresh kernel.
epoch_losses = train(encoder_pretrained_embed, decoder_pretrained_embed,
                     encoder_pretrained_embed_optimizer, decoder_pretrained_embed_optimizer, train_ds,
                     n_epochs=n_epochs, vocab=vocab, decoder_mode="basic", batch_size=batch_size,
                     enc_lr_scheduler=enc_pretrained_embed_scheduler, dec_lr_scheduler=dec_pretrained_embed_scheduler,
                     verbose_iter_interval=10)

---

### Run testing

In [None]:
# decoder_attn = AttnDecoderRNN(embedding_size=embedding_size, hidden_size=HIDDEN_SIZE,
#                               output_size=vocab.n_unique_words-2, 
#                               padding_val=vocab.word2index(PAD_WORD)).to(DEVICE)

In [None]:
# dataloader = DataLoader(recipe_ds, batch_size=4, shuffle=True, collate_fn=pad_collate(vocab))
# ingredients, recipes, ing_lens, rec_lens = next(iter(dataloader))

In [None]:
# rec_lens

In [None]:
# initial_lr=0.8
# min_lr = 0.01
# n_epochs = 30
# batch_size=128
# encoder_optimizer = optim.SGD(encoder.parameters(), lr=initial_lr)
# decoder_optimizer = optim.SGD(decoder.parameters(), lr=initial_lr)

In [None]:
# train_iter(ingredients, recipes, ing_lens, rec_lens, encoder, decoder_attn, 
#            encoder_optimizer, decoder_optimizer, criterion=nn.NLLLoss(),
#            decoder_mode="attention", vocab=vocab)

---

## Evaluation

In [24]:
# Load saved weights into the baseline encoder/decoder before evaluating.
# NOTE(review): the checkpoint name says "with_intermediate_tags", whereas this
# notebook sets add_intermediate_tag=False and trains under the identifier
# "adam_without_intermediate_tags_wd0_lr1e-3" -- confirm the checkpoint matches
# the vocabulary and model built above.
load_model(encoder, decoder, "adam_with_intermediate_tags_wd0_lr1e-3_ep_9")

In [25]:
# Run the encoder/decoder over the test dataset and keep both returned lists.
# NOTE(review): `eval` here is presumably the project function brought in by
# `from eval import *`, which shadows Python's built-in eval() -- confirm.
all_decoder_outs, all_gt_recipes = eval(encoder, decoder, test_ds, vocab,
                                        max_recipe_len=MAX_RECIPE_LEN)

100%|██████████| 190/190 [00:15<00:00, 11.88it/s]


In [27]:
# Spot check: show the ground-truth token list at index 10.
all_gt_recipes[10]

['<RECIPE_START>',
 'mix',
 'the',
 'crushed',
 'graham',
 'crackers',
 ',',
 '2',
 'tsp',
 '<RECIPE_STEP>',
 'of',
 'cinnamon',
 ',',
 'and',
 'the',
 'brown',
 'sugar',
 'or',
 'substitute',
 'together',
 '<RECIPE_STEP>',
 'stir',
 'in',
 'the',
 'melted',
 'margarine',
 'and',
 'set',
 'the',
 'mixture',
 'to',
 'one',
 'side',
 '<RECIPE_STEP>',
 'mix',
 'the',
 'sliced',
 'apples',
 ',',
 '5',
 'tsp',
 '<RECIPE_STEP>',
 'of',
 'cinnamon',
 ',',
 'corn',
 'starch',
 ',',
 'and',
 '1',
 'cup',
 'of',
 'brown',
 'sugar',
 'or',
 'substitute',
 'together',
 'and',
 'pour',
 'into',
 'a',
 '13',
 "''",
 'x',
 '9',
 "''",
 'pan',
 '<RECIPE_STEP>',
 'sprinkle',
 'on',
 'the',
 'topping',
 'and',
 'pat',
 'down',
 '<RECIPE_STEP>',
 'bake',
 '35',
 'minutes',
 'in',
 'an',
 'oven',
 'preheated',
 'to',
 '350',
 'degrees',
 '<RECIPE_STEP>',
 'let',
 'cool',
 'and',
 'enjoy',
 '!',
 '!',
 '<RECIPE_END>']

In [29]:
# Spot check: show the decoder's token list at the same index (10) for comparison.
all_decoder_outs[10]

['<RECIPE_START>',
 'combine',
 'all',
 'ingredients',
 'except',
 'nuts',
 '<RECIPE_STEP>',
 'in',
 'a',
 'large',
 'bowl',
 ',',
 'combine',
 'butter',
 ',',
 'sugar',
 ',',
 'and',
 'vanilla',
 '<RECIPE_STEP>',
 'mix',
 'well',
 '<RECIPE_STEP>',
 'pour',
 'into',
 'greased',
 '9',
 "''",
 'x',
 '13',
 "''",
 'baking',
 'pan',
 '<RECIPE_STEP>',
 'bake',
 'at',
 '350',
 'degrees',
 'for',
 '30',
 'minutes',
 '<RECIPE_STEP>',
 'cool',
 '<RECIPE_STEP>',
 '<RECIPE_END>']

In [26]:
# BLEU of the decoder outputs against the ground-truth recipes (project helper).
calc_bleu(all_gt_recipes, all_decoder_outs)

0.04852470977830739

In [None]:
# METEOR of the decoder outputs against the ground-truth recipes (project helper).
# NOTE(review): split_gt=False presumably because the ground truth is already a
# list of tokens here -- confirm against calc_meteor.
calc_meteor(all_gt_recipes, all_decoder_outs, split_gt=False)

---

## Metric Sample

In [None]:
# Ingredient list and the regex derived from it (used by the ingredient metrics below).
all_ings = get_all_ingredients("./ingredient_set.json")
all_ings_regex = get_ingredients_regex(all_ings)

# The metric sample file yields the input ingredients, the gold recipe and a generated recipe.
(
    metric_sample_ings,
    metric_sample_gold_recipe,
    metric_sample_generated_recipe,
) = load_metric_sample("./metric_sample.txt")

In [None]:
# Ingredient-coverage metrics for the metric sample's generated recipe.
prop_inp_ings, n_extra_ings = get_prop_input_num_extra_ingredients(
    metric_sample_ings,
    metric_sample_generated_recipe,
    all_ings_regex,
    verbose=True,
    metric_sample=True,
)
print(f"\nproportion of input ingredients: {prop_inp_ings}\nnumber of extra ingredients: {n_extra_ings}")

In [None]:
# BLEU and METEOR for the single metric sample; each helper receives its own
# one-element lists and is called with split_gen=True.
bleu_score = calc_bleu(
    [metric_sample_gold_recipe],
    [metric_sample_generated_recipe],
    split_gen=True,
)
meteor_score = calc_meteor(
    [metric_sample_gold_recipe],
    [metric_sample_generated_recipe],
    split_gen=True,
)
print(f"BLEU score: {bleu_score}, METEOR score: {meteor_score}")