In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the project .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import math
import re
import string
import numpy as np
import pandas as pd
import random
import json
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR, MultiStepLR, CosineAnnealingWarmRestarts
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate import meteor

from data import *
from encoder_decoder import *
from train import *
from eval import *
from utils import *

# required for METEOR (nltk's meteor scorer uses the WordNet corpus); BLEU does not need it
# nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Global experiment configuration shared by the cells below.
# SEED is consumed by reset_rng(); HIDDEN_SIZE and DROPOUT are passed to the
# encoder/decoder constructors; the two MAX_*_LEN limits are passed to
# preprocess_data() (and MAX_RECIPE_LEN also to eval()).
SEED = 31989101
HIDDEN_SIZE = 256
MAX_INGR_LEN = 150 # fixed from assignment
MAX_RECIPE_LEN = 600
DROPOUT = 0.1
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
# NOTE(review): `torch` is not imported by name in the import cell; it is
# presumably re-exported by one of the `from ... import *` lines — consider
# adding an explicit `import torch`.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## ensuring reproducibility
def reset_rng():
    """Re-seed the global PyTorch, NumPy and Python RNGs with ``SEED``."""
    # The three generators are independent; each one receives the same seed.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(SEED)

# Seed every RNG once up front so the cells below start from a known state.
reset_rng()

# to easily read ingredients and instructions
# (raises pandas' column display width to 2000 characters)
pd.set_option('display.max_colwidth', 2000)

# Report which device was selected above.
print(f"Using device: {DEVICE}")

Using device: cuda


In [4]:
# Load the train/dev/test CSV splits, run the same preprocessing on each,
# build the vocabulary from the training split, and wrap every split in a
# RecipeDataset.
data_root = "./Cooking_Dataset"
add_intermediate_tag = False

# Arguments shared by all three splits, defined once so the splits cannot
# drift apart by accident.
csv_columns = ['Ingredients', 'Recipe']
preprocess_kwargs = dict(
    max_ingr_len=MAX_INGR_LEN,
    max_recipe_len=MAX_RECIPE_LEN,
    add_intermediate_tag=add_intermediate_tag,
)

# Raw splits: only the two columns used downstream are read.
train_df_orig = pd.read_csv(os.path.join(data_root, "train.csv"), usecols=csv_columns)
dev_df_orig = pd.read_csv(os.path.join(data_root, "dev.csv"), usecols=csv_columns)
test_df_orig = pd.read_csv(os.path.join(data_root, "test.csv"), usecols=csv_columns)

# Preprocessed splits (kept in train/dev/test order so the printed sample
# counts appear in the same order as before).
train_df = preprocess_data(train_df_orig, **preprocess_kwargs)
dev_df = preprocess_data(dev_df_orig, **preprocess_kwargs)
test_df = preprocess_data(test_df_orig, **preprocess_kwargs)

# The vocabulary is populated from the training split only.
vocab = Vocabulary(add_intermediate_tag=add_intermediate_tag)
vocab.populate(train_df)

train_ds = RecipeDataset(train_df, vocab)
dev_ds_val_loss = RecipeDataset(dev_df, vocab, train=True) # used for getting validation loss
dev_ds_val_met = RecipeDataset(dev_df, vocab, train=False) # used for getting validation BLEU, and other metrics
test_ds = RecipeDataset(test_df, vocab, train=False)

Number of data samples before preprocessing: 101340
Number of data samples after preprocessing: 100637 (99.306%)
Number of data samples before preprocessing: 797
Number of data samples after preprocessing: 793 (99.498%)
Number of data samples before preprocessing: 778
Number of data samples after preprocessing: 774 (99.486%)


100%|██████████| 100637/100637 [00:06<00:00, 15209.62it/s]


In [5]:
# Build the attention encoder/decoder pair and move both to DEVICE.
# NOTE(review): these sizes presumably have to match the checkpoint loaded in
# the next cell — confirm before changing them.
embedding_size=300
encoder_attn = EncoderRNN(vocab.n_unique_words, embedding_size=embedding_size, hidden_size=HIDDEN_SIZE, padding_value=vocab.word2index(PAD_WORD)).to(DEVICE)
# in the training script, decoder is always fed a non-end token and thus never needs to generate padding
# also it should never generate "<UNKNOWN>"
# (previous non-attention decoder, kept for reference:)
# decoder = DecoderRNN(embedding_size=embedding_size,hidden_size=HIDDEN_SIZE, output_size=vocab.n_unique_words-2).to(DEVICE)
# NOTE(review): the remark above motivates `n_unique_words-2`, but the
# attention decoder below is built with `n_unique_words-1` — confirm which
# output size is intended.
decoder_attn = AttnDecoderRNN(embedding_size, hidden_size=HIDDEN_SIZE, output_size=vocab.n_unique_words-1, padding_val=vocab.word2index(PAD_WORD), 
                              dropout=DROPOUT).to(DEVICE)

In [6]:
# Presumably restores saved weights into the encoder/decoder built above.
# NOTE(review): judging by its name, the checkpoint is the attention model
# trained with Adam, no intermediate tags, weight decay 0, lr 1e-3, epoch 26
# — confirm against the training logs.
load_model(encoder_attn, decoder_attn, "attn_adam_without_intermediate_tags_wd0_lr1e-3_ep_26")

In [10]:
# Run the model over the test set in "attention" decoder mode and collect
# three parallel results: generated outputs, ground-truth recipes, and
# ground-truth ingredients.
# NOTE(review): `eval` here is the project function (presumably brought in by
# `from eval import *`), which shadows Python's builtin `eval` for the rest of
# the notebook.
all_decoder_outs, all_gt_recipes, all_gt_ingredients = eval(encoder_attn, decoder_attn, test_ds, vocab, batch_size=128, decoder_mode="attention",
                                        max_recipe_len=MAX_RECIPE_LEN)

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:06<00:00,  1.04it/s]


In [14]:
# Sanity check: the count shown should equal the number of preprocessed test samples.
len(all_decoder_outs)

774

In [21]:
def ingredients_idxs_to_lst(ing_list, vocabulary=None, pad_word=None):
    """Convert index-encoded ingredient sequences back into plain-text strings.

    Args:
        ing_list: iterable of index sequences, one sequence per sample.
        vocabulary: object exposing ``index2word[idx]`` and
            ``word2index(word)``. Defaults to the notebook-level ``vocab``.
        pad_word: padding token whose index is dropped from every sequence.
            Defaults to the notebook-level ``PAD_WORD``.

    Returns:
        list[str]: one space-joined string per input sequence, in input
        order, with padding entries removed.
    """
    if vocabulary is None:
        vocabulary = vocab
    if pad_word is None:
        pad_word = PAD_WORD

    # Look the padding index up once instead of once per token.
    pad_idx = vocabulary.word2index(pad_word)

    return [
        " ".join(vocabulary.index2word[i] for i in ingredients if i != pad_idx)
        for ingredients in ing_list
    ]

In [22]:
# Turn the ground-truth ingredient index sequences into plain-text strings (padding removed).
ingredient_txts = ingredients_idxs_to_lst(all_gt_ingredients)

In [25]:
# Join each generated token sequence into a single space-separated string.
generated_recipes_concat = [" ".join(tokens) for tokens in all_decoder_outs]

In [8]:
# BLEU of the generated outputs against the ground-truth recipes
# (ground truth first, generated second; see calc_bleu for the exact variant).
# NOTE(review): this cell's execution count (In [8]) is lower than that of the
# eval cell that produces its inputs (In [10]), so the displayed score may be
# stale — re-run the notebook top to bottom to confirm it.
calc_bleu(all_gt_recipes, all_decoder_outs)

0.04975297680717382

In [9]:
# Presumably loads the list of known ingredient names from the JSON file — confirm in get_all_ingredients.
all_ingredients_lst = get_all_ingredients("./ingredient_set.json")

In [27]:
# Build the two regexes consumed by the ingredient metric in the next cell,
# both derived from the same ingredient list.
# NOTE(review): judging by the helper names, one matches valid ingredient
# mentions and the other matches invalid ones — confirm in the helpers.
all_ingredient_regex = get_ingredients_regex(all_ingredients_lst)
invalid_ingredient_regex = get_invalid_ingredients_regex(all_ingredients_lst)

In [31]:
# Compare the input ingredient texts with the generated recipe texts; the call returns a pair of numbers.
# NOTE(review): judging by the function name, the pair is (proportion of input
# ingredients that appear in the generated recipe, number of extra ingredients
# not in the input) — confirm the exact definitions and averaging in the helper.
get_prop_input_num_extra_ingredients(ingredient_txts, generated_recipes_concat,
                                     all_ingredient_regex, invalid_ingredient_regex)

774it [00:00, 932.46it/s] 


(0.3339937848423436, 2.292267365661861)