In [97]:
import torch
from torch.utils.data import Dataset
import os
import glob
import re
# from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
from tqdm import trange
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import Vocab, vocab
from torchtext.data.utils import get_tokenizer
import csv
from collections import Counter

### Config

In [98]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BS = 128

In [99]:
def text_to_recipe_processing(line):
    title = re.findall(r'Title: (.*)', line)
    ingredients = re.findall(r'ingredients: (.*)', line)
    steps = re.findall(r'ingredients: .*\n([\s\S]*)', line)
    try:
        title = title[0]
        title = re.sub(r'[^a-zA-Z0-9_ ]', '', title)
        ingredients = ingredients[0].replace('''\t''', " ")
        ingredients = re.sub(r'[^a-zA-Z0-9_ ]', '', ingredients)
        steps = steps[0].replace('''\n''', " ")
        steps = re.sub(r'[^a-zA-Z0-9_ ]', '', steps)
    except:
        return None
    return (str(title + " " + ingredients), str(steps))

def process_rawtext(path):
    print("Processing text data from {}".format(path))
    recipes = []
    files = glob.glob(path + "/*.txt")
    for file in files:
        lines = open(file, encoding='utf-8').read().strip().split("END RECIPE")
        for l in lines:
            recipe = text_to_recipe_processing(l)
            if recipe is not None:
                recipes.append(recipe)
    return recipes

def write_to_tsv(destination, recipe_list):
    with open(destination, 'w',  newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        for recipe in recipe_list:
            writer.writerow(recipe)

def build_language(data_path):
    all_ingredients = []
    all_recipes = []
    for path in data_path: #manual path list:
        recipes = process_rawtext(path)
        for r in recipes:
            all_ingredients.append(r[0])
            all_recipes.append(r[1])
    ingredient_lang = Language("ingredients")
    recipe_lang = Language("recipes")
    for ing in all_ingredients:
        ingredient_lang.addSentence(ing)
    for rec in all_recipes:
        recipe_lang.addSentence(rec)
    return ingredient_lang, recipe_lang

In [100]:
train_recipes = process_rawtext("Cooking_Dataset/train")
write_to_tsv("Dataset/train.tsv", train_recipes)
test_recipes = process_rawtext("Cooking_Dataset/test")
write_to_tsv("Dataset/test.tsv", test_recipes)
dev_recipes = process_rawtext("Cooking_Dataset/dev")
write_to_tsv("Dataset/dev.tsv", dev_recipes)

Processing text data from Cooking_Dataset/train
Processing text data from Cooking_Dataset/test
Processing text data from Cooking_Dataset/dev


In [101]:
tk = get_tokenizer(tokenizer=None)
# ct = Counter()
# for recipe in train_recipes:
#     ct.update(tk(recipe[0]))
# src_vocab = vocab(ct, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

# count = Counter()
# for recipe in train_recipes:
#     count.update(tk(recipe[1]))
# trg_vocab = vocab(ct, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))


In [102]:
class Language:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "<BOS>", 2: "<EOS>"}
        self.n_words = 3  # Count PAD, SOS, and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def __len__(self):
        return self.n_words

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def stoi(self, word):
        return self.word2index[word]
    
    def itos(self, ndx):
        return self.index2word[ndx]

def build_language(data_path):
    all_ingredients = []
    all_recipes = []
    for path in data_path: #manual path list:
        recipes = process_rawtext(path)
        for r in recipes:
            all_ingredients.append(r[0])
            all_recipes.append(r[1])
    ingredient_lang = Language("ingredients")
    recipe_lang = Language("recipes")
    for ing in all_ingredients:
        ingredient_lang.addSentence(ing)
    for rec in all_recipes:
        recipe_lang.addSentence(rec)
    return ingredient_lang, recipe_lang


dataset_path = ["Cooking_Dataset/test", "Cooking_Dataset/dev", "Cooking_Dataset/train"]
ingredient_vocabulary, recipe_vocabulary = build_language(dataset_path)


Processing text data from Cooking_Dataset/test
Processing text data from Cooking_Dataset/dev
Processing text data from Cooking_Dataset/train


In [103]:

print(ingredient_vocabulary.stoi("hungry"))

5159


In [104]:
# source_f = Field(tokenize = 'spacy', init_token = '<SOS>', eos_token = '<EOS>', lower=True)
# target_f = Field(tokenize = 'spacy', init_token = '<SOS>', eos_token = '<EOS>', lower=True)
# train_data = TabularDataset(path="Dataset/train.tsv", format="tsv", fields=[('source', source_f),('target', target_f)])
# source_f.build_vocab(train_data, min_freq = 2)
# target_f.build_vocab(train_data, min_freq = 2)

# iterator = Iterator(train_data, batch_size = BS, device=DEVICE, shuffle=True)

class RecipeDataset(Dataset):
    def __init__(self, path):
        self.all_recipes = []
        file = open(path, 'r')
        lines = file.readlines()
        for l in lines:
            items = l.split("""\t""")
            self.all_recipes.append(items)

    def __len__(self):
        return len(self.all_recipes)
    
    def __getitem__(self, index):
        ing = self.all_recipes[index][0]
        step = self.all_recipes[index][1]
        return ing, step

src_transform = lambda x: [ingredient_vocabulary.stoi('<BOS>')] + [ingredient_vocabulary.stoi(token) for token in tk(x)] + [ingredient_vocabulary.stoi('<EOS>')]
trg_transform = lambda x: [recipe_vocabulary.stoi('<BOS>')] + [recipe_vocabulary.stoi(token) for token in tk(x)] + [recipe_vocabulary.stoi('<EOS>')]

def collate_fn(batch):
    source_list, target_list = [], [] 
    for source, target in batch: 
        src_process = torch.tensor(src_transform(source)) 
        source_list.append(src_process) 
        trg_process = torch.tensor(trg_transform(target))
        target_list.append(trg_process)
    return pad_sequence(source_list, padding_value=3.0), pad_sequence(target_list, padding_value=3.0)

recipe_data = RecipeDataset("Dataset/train.tsv")
print(recipe_data[1])
trainloader = DataLoader(recipe_data, BS, shuffle=True, collate_fn=collate_fn)
devloader = DataLoader(recipe_data, BS, shuffle=True, collate_fn=collate_fn)

('brendas nectarineorange bavarian cream 2    envelopes unflavored gelatin 2 c  heavy cream divided 1 12 23 c  sugar divided 13  13 12 34 c  orange juice fresh if sweet 12 ts vanilla 2    egg yolks 12 c  blue berries optional but 4 lg nectarines divided highly recommended', 'brenda s nectarineorange bavarian cream  in heavy saucepan  mix gelatin and 13 cup sugar  stir in orange juice and egg yolks and blend well  let stand one minute  stir over low heat until gelatin is completely dissolved and mixture thickens slightly about 57 minutes  remove from heat  slice 2 nectarines and puree in blender or processor with the remaining 13 cup sugar  stir puree into gelatin mixture  chill  stirring occasionally  only until mixture mounds slightly when dropped from a spoon  whip 112 cups cream until stiff  fold in fruitgelatin mixture  pour into 5 cup ring or other mold  chill until firm  unmold on pretty platter for finishing  slice remaining 2 nectarines and whip remaining 12 cup cream until sti

In [105]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
    def forward(self, src):
        # src : [sen_len, batch_size]
        embedded = self.dropout(self.embedding(src))
        
        # embedded : [sen_len, batch_size, emb_dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [sen_len, batch_size, hid_dim * n_directions]
        # hidden = [n_layers * n_direction, batch_size, hid_dim]
        # cell = [n_layers * n_direction, batch_size, hid_dim]
        return hidden, cell
    

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=self.n_layers, dropout=dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        # input = [batch_size]
        # hidden = [n_layers * n_dir, batch_size, hid_dim]
        # cell = [n_layers * n_dir, batch_size, hid_dim]
        
        input = input.unsqueeze(0)
        # input : [1, ,batch_size]
        
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch_size, emb_dim]
        
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq_len, batch_size, hid_dim * n_dir]
        # hidden = [n_layers * n_dir, batch_size, hid_dim]
        # cell = [n_layers * n_dir, batch_size, hid_dim]
        
        # seq_len and n_dir will always be 1 in the decoder
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch_size, output_dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            'hidden dimensions of encoder and decoder must be equal.'
        assert encoder.n_layers == decoder.n_layers, \
            'n_layers of encoder and decoder must be equal.'
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [sen_len, batch_size]
        # trg = [sen_len, batch_size]
        # teacher_forcing_ratio : the probability to use the teacher forcing.
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        # first input to the decoder is the <sos> token.
        input = trg[0, :]
        for t in range(1, trg_len):
            # insert input token embedding, previous hidden and previous cell states 
            # receive output tensor (predictions) and new hidden and cell states.
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            # replace predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            # decide if we are going to use teacher forcing or not.
            teacher_force = random.random() < teacher_forcing_ratio
            
            # get the highest predicted token from our predictions.
            top1 = output.argmax(1)
            # update input : use ground_truth when teacher_force 
            input = trg[t] if teacher_force else top1
            
        return outputs

In [106]:
# First initialize our model.
INPUT_DIM = len(ingredient_vocabulary)
OUTPUT_DIM = len(recipe_vocabulary)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

TRG_PAD_IDX = 0

criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [107]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        # trg = [sen_len, batch_size]
        # output = [trg_len, batch_size, output_dim]
        output = model(src, trg)
        output_dim = output.shape[-1]
        
        # transfrom our output : slice off the first column, and flatten the output into 2 dim.
        output = output[1:].view(-1, output_dim) 
        trg = trg[1:].view(-1)
        # trg = [(trg_len-1) * batch_size]
        # output = [(trg_len-1) * batch_size, output_dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        
        for i, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            output = model(src, trg, 0) # turn off teacher forcing.
            
            # trg = [sen_len, batch_size]
            # output = [sen_len, batch_size, output_dim]
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [108]:
for example, ex2 in trainloader:
    print(ex2)


KeyError: '<BOS>'

In [None]:
N_EPOCHS = 10

CLIP = 1

best_valid_loss = float('inf')

for epoch in trange(N_EPOCHS):
    
    
    train_loss = train(model, trainloader, optimizer, criterion, CLIP)
    print("Progress <3 ")
    valid_loss = evaluate(model, devloader, criterion)
    
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}")

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: Token watery not found and default index is not set