In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import random

from data import *
from encoder_decoder import *
from train import *

In [3]:
# --- experiment configuration -------------------------------------------
SEED = 31989101        # shared seed for every random number generator
HIDDEN_SIZE = 256      # passed as hidden_size to the encoder and decoder
MAX_INGR_LEN = 150     # passed as max_ingr_len to preprocess_data

# NOTE(review): `torch` is not imported explicitly in the import cell; it
# presumably arrives through one of the star imports -- consider adding
# `import torch` there.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def reset_rng():
    """Re-seed the PyTorch, NumPy and stdlib ``random`` generators with SEED."""
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(SEED)


# Seed everything once up front so the cells below start from a known RNG state.
reset_rng()

print(f"Using device: {DEVICE}")

Using device: cpu


In [4]:
# Raise pandas' per-column display width to 2000 characters so that long
# ingredient lists and recipe instructions can be read without truncation.
pd.options.display.max_colwidth = 2000

In [5]:
# Directory that holds the train/dev/test CSV splits.
data_root = "./Cooking_Dataset"

# Read each split, keeping only the Ingredients and Recipe columns.
train_df_orig = pd.read_csv(
    os.path.join(data_root, "train.csv"),
    usecols=["Ingredients", "Recipe"],
)
dev_df_orig = pd.read_csv(
    os.path.join(data_root, "dev.csv"),
    usecols=["Ingredients", "Recipe"],
)
test_df_orig = pd.read_csv(
    os.path.join(data_root, "test.csv"),
    usecols=["Ingredients", "Recipe"],
)

In [6]:
# Preprocess the raw training frame into a new frame (the raw one is kept).
# MAX_INGR_LEN is forwarded as max_ingr_len -- presumably a cap on ingredient
# length; confirm in preprocess_data. In the recorded run this step kept
# 99036 of 101340 samples.
train_df = preprocess_data(train_df_orig, max_ingr_len=MAX_INGR_LEN)

Number of data samples before preprocessing: 101340
Number of data samples after preprocessing: 99036 (97.726%)


In [7]:
# Build the vocabulary from the preprocessed training frame only, then show
# vocab.n_unique_words as the cell result (44683 in the recorded run).
vocab = Vocabulary()
vocab.populate(train_df)
vocab.n_unique_words

100%|██████████| 99036/99036 [00:42<00:00, 2325.28it/s]


44683

In [8]:
# Pair the preprocessed training frame with the vocabulary in a RecipeDataset;
# this object is what gets handed to train() further down.
recipe_ds = RecipeDataset(train_df, vocab)

In [9]:
# Encoder sized to the full vocabulary; the index of PAD_WORD is supplied as
# the padding value.
encoder = EncoderRNN(
    vocab.n_unique_words,
    hidden_size=HIDDEN_SIZE,
    padding_value=vocab.word2index[PAD_WORD],
).to(DEVICE)

# In the training script the decoder is always fed a non-end token, so it never
# has to generate padding -- hence one output class fewer than the vocabulary.
# NOTE(review): dropping the last class only excludes padding if PAD_WORD holds
# the highest index (n_unique_words - 1) -- confirm against Vocabulary.
decoder = DecoderRNN(
    hidden_size=HIDDEN_SIZE,
    output_size=vocab.n_unique_words - 1,
).to(DEVICE)

In [10]:
# Run training for 5 epochs and keep whatever train() returns as epoch_losses.
# TODO(review): the saved output of this cell is a TypeError complaining about
# a missing 'vocab' argument, although vocab is passed by keyword here -- the
# output looks stale (or is raised by a nested call); re-run on a fresh kernel
# to confirm before sharing the notebook.
epoch_losses = train(
    encoder,
    decoder,
    recipe_ds,
    n_epochs=5,
    vocab=vocab,
    batch_size=4,
    learning_rate=0.01,
    verbose_iter_interval=1,
)

TypeError: train() missing 1 required positional argument: 'vocab'