In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import json

In [9]:
# Load the Epicurious recipe dump into memory as a list of recipe records.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform's locale encoding, which may not be UTF-8.
with open(
    "../data/Epicurious-Recipes/full_format_recipes.json", "r", encoding="utf-8"
) as json_data:
    data = json.load(json_data)

In [10]:
# One training string per recipe: the title, a " | " separator, then every
# direction step joined with single spaces. Records whose title or directions
# are absent or None are left out.
filtered_data = [
    "Recipe for " + recipe["title"] + " | " + " ".join(recipe["directions"])
    for recipe in data
    if recipe.get("title") is not None and recipe.get("directions") is not None
]

In [11]:
# Show the first formatted recipe string as a quick sanity check.
filtered_data[0]

'Recipe for Lentil, Apple, and Turkey Wrap  | 1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool. 2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper. 3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.'

In [12]:
from tensorflow.data import Dataset
from tensorflow.keras import layers
import string
import re

In [13]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation character in ``s`` with spaces.

    Each character from ``string.punctuation`` is wrapped in a space on both
    sides, after which runs of spaces are collapsed to a single space.

    Args:
        s: The text to process.

    Returns:
        The text with each punctuation mark separated from its neighbours by
        exactly one space (a trailing space remains if ``s`` ends with
        punctuation).
    """
    # The punctuation set must be escaped before it goes inside [...]:
    # unescaped, its "\]" is read as an escaped "]", so the backslash itself
    # would never be matched, and "-" would only work by accident of a range.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


# Apply the punctuation padding to every formatted recipe string.
text_data = list(map(pad_punctuation, filtered_data))

In [14]:
# Wrap the padded strings in a tf.data pipeline. Batching comes before
# shuffling here, so the shuffle step operates on whole batches of 32 strings
# rather than on individual recipes.
text_ds = (
    Dataset.from_tensor_slices(text_data)
    .batch(32)
    .shuffle(1000)
)

In [15]:
# Text-to-integer tokeniser: lower-cases its input, caps the vocabulary at
# 10,000 entries and emits sequences of 200 + 1 token ids.
vectorize_layer = layers.TextVectorization(
    max_tokens=10000,
    standardize="lower",
    output_mode="int",
    # One more than 200 so that dropping the first or the last id
    # (as prepare_inputs does) leaves 200 positions.
    output_sequence_length=200 + 1,
)

In [16]:
# Fit the vectoriser on the batched recipe text, then keep its vocabulary
# (a list indexed by token id) for inspection.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [18]:
# List the first ten vocabulary entries next to their token ids.
best_vocab = vocab[:10]
for i, word in enumerate(best_vocab):
    print(f"{i} {word}")

0 
1 [UNK]
2 .
3 ,
4 and
5 to
6 in
7 the
8 with
9 a


In [17]:
# Run one padded recipe string through the vectoriser and print the resulting
# token ids. Id 0 is the empty-string entry of the vocabulary, which is what
# fills the tail of the printed array.
example = text_data[3]
encoded_example = vectorize_layer(example)
print(encoded_example.numpy())

[  26   16 2783   13 2783    6  265  252   54   27   17   37    6   78
   30   56   20   29   13   75   17    2   18  115   22  130   10  861
    4  473    5   91    3   19   32   12    2   18  249    4 1413  345
    2   69   10  288    5   36   23   32   52    3   19   36   12    2
   18  182    8  104   22   84    5   69    2   88  213    8   24    4
   33    2   18  213    5   56  475  265   31    2  153   17    5  134
    3   49    3    4   70   10  213   38  183  102    3   19  334   12
    2   87  308  445  384    3   40  213    5  219    4  940    8  167
    5  186  152    2  116  531    3   15  410  639    3    4  247  175
   25   54    6   56    2  547   17    5   75    4   69   10   54   38
  288    4  426    3   19  118   12    2   63    5  132    8   24    4
   33    2   64   11  213  982   28   66   14   32  280    2  107   54
   20    4  276  213    3   88    8   45   11  142  639    3    4   68
    8  152  589  195    2    0    0    0    0    0    0    0    0    0
    0 

In [19]:
def prepare_inputs(text):
    """Turn a batch of recipe strings into shifted (input, target) id pairs.

    The batch is given a trailing axis and vectorised; the resulting token ids
    are then sliced into two views that are offset by one position.

    Args:
        text: A batch of strings, as yielded by ``text_ds``.

    Returns:
        A tuple ``(x, y)``: ``x`` is every token id except the last in each
        row, ``y`` is every token id except the first.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids[:, :-1], token_ids[:, 1:]


# Build the training dataset by applying the split to every batch.
train_ds = text_ds.map(prepare_inputs)