In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import re
import nltk
from collections import Counter
from sklearn.model_selection import train_test_split

PAD_IDX = 0
UNK_IDX = 1
#BATCH_SIZE = 32

Get Data 

In [25]:
# the data is the output of "Consolidated Data Cleaning"
data_all = pd.read_csv('cleaned_recipe_data.csv')

In [26]:
data_all.columns

Index(['Unnamed: 0', 'external_id', 'title', 'subtitle', 'carbs', 'fat',
       'protein', 'chef_id', 'calories', 'cooking_tips', 'story',
       'ingredients_display', 'step_one', 'step_two', 'step_three',
       'step_four', 'step_five', 'step_six', 'recipe_tags',
       'tag_cuisine_indian', 'tag_cuisine_nordic', 'tag_cuisine_european',
       'tag_cuisine_asian', 'tag_cuisine_mexican',
       'tag_cuisine_latin-american', 'tag_cuisine_french',
       'tag_cuisine_italian', 'tag_cuisine_african',
       'tag_cuisine_mediterranean', 'tag_cuisine_american',
       'tag_cuisine_middle-eastern'],
      dtype='object')

In [27]:
data_intruction = data_all[['step_one','step_two', 'step_three', 'step_four', 'step_five', 'step_six']]

In [28]:
data_cuisine_tags = data_all[['tag_cuisine_indian', 'tag_cuisine_nordic', 'tag_cuisine_european',
       'tag_cuisine_asian', 'tag_cuisine_mexican',
       'tag_cuisine_latin-american', 'tag_cuisine_french',
       'tag_cuisine_italian', 'tag_cuisine_african',
       'tag_cuisine_mediterranean', 'tag_cuisine_american',
       'tag_cuisine_middle-eastern']]

In [29]:
data_intruction.step_one[1083]

'Preheat oven to 425°F. Pat chicken dry with paper towel and season all over with .5 teaspoon salt and black pepper as desired. Heat 1 tablespoon olive oil in a medium pan over medium-high heat. When oil is shimmering, add chicken and sear until cooked through and no longer pink, about 6 minutes per side. Transfer chicken to a plate or cutting board, and set aside to rest. Wipe pan clean and reserve for cooking salsa.'

Tokenization

In [30]:
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

In [137]:
# lowercase and remove punctuation
def tokenizer(sent):
    #print(sent)
    if pd.isnull(sent):
        words = []
    else:
        tokens = word_tokenize(sent)
        # convert to lower case
        tokens = [w.lower() for w in tokens]
        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
    return words

In [129]:
def tokenize_dataset(step_n):
    """returns tokenization for each step, training set tokenizatoin"""
    token_dataset = []
    for sample in step_n:
        tokens = tokenizer(sample)
        token_dataset.append(tokens)
    return token_dataset

def all_tokens_list(train_data):
    """returns all tokens of instruction (all steps) for creating vocabulary"""
    all_tokens = []
    for columns in train_data.columns:
        for sample in train_data[columns]:
            all_tokens += sample[:]
    return all_tokens

In [74]:
# tokenize each steps
data_instruction_tokenized = pd.DataFrame()
for steps in data_intruction.columns:
    data_instruction_tokenized[steps] = tokenize_dataset(data_intruction[steps])
    print(steps, 'has been tokenized.')

step_one has been tokenized.
step_two has been tokenized.
step_three has been tokenized.
step_four has been tokenized.
step_five has been tokenized.
step_six has been tokenized.


In [72]:
data_instruction_tokenized.head()

Unnamed: 0,step_one,step_two,step_three,step_four,step_five,step_six
0,"[preheat, oven, to, place, butter, in, a, smal...","[on, a, baking, sheet, toss, carrots, green, b...","[while, vegetables, roast, mince, garlic, and,...","[pat, steaks, dry, with, paper, towel, and, se...","[once, roasted, remove, vegetables, from, oven...","[once, steaks, have, rested, find, the, direct..."
1,"[in, a, small, pot, combine, rice, cups, water...","[rinse, bell, pepper, and, halve, lengthwise, ...","[pat, steaks, dry, with, paper, towel, and, se...","[while, steaks, sear, heat, teaspoons, canola,...","[return, pan, from, steaks, to, medium, heat, ...","[once, rested, cut, steaks, against, the, grai..."
2,"[in, a, small, pot, combine, rice, cups, water...","[while, rice, cooks, halve, bok, choy, lengthw...","[heat, sesame, oil, in, a, medium, nonstick, p...","[add, cooked, rice, and, tablespoon, canola, o...","[pat, pork, chops, dry, with, a, paper, towel,...","[once, rested, cut, pork, into, slices, then, ..."
3,"[preheat, oven, to, in, a, small, pot, combine...","[rinse, all, produce, halve, cucumber, lengthw...","[on, half, of, baking, sheet, toss, chickpeas,...","[while, chickpeas, and, tomatoes, roast, place...","[while, feta, bakes, in, a, large, bowl, whisk...","[divide, quinoa, between, serving, bowls, then..."
4,"[preheat, oven, to, pat, chicken, dry, with, p...","[while, chicken, cooks, halve, lime, cut, half...","[return, pan, from, chicken, to, mediumhigh, h...","[stack, tortillas, wrap, in, foil, and, place,...","[stir, chipotle, paste, and, teaspoon, salt, i...","[divide, warmed, tortillas, between, serving, ..."


In [122]:
assert (data_instruction_tokenized.shape[0] == data_cuisine_tags.shape[0])

Split train, validation, test sets

In [161]:
RANDOM_STATE = 42
X_train, test_data, y_train, test_tags = train_test_split(data_instruction_tokenized, data_cuisine_tags, test_size=0.1, random_state=RANDOM_STATE)
train_data, val_data, train_tags, val_tags = train_test_split(X_train, y_train, test_size=0.1, random_state=RANDOM_STATE)

 All tokens from training set

In [130]:
# form all tokens list
all_train_tokens = all_tokens_list(train_data)

Let's decide which tag to predict for trail

In [17]:
data_cuisine_tags.sum()/data_cuisine_tags.shape[0]

tag_cuisine_indian            0.023525
tag_cuisine_nordic            0.000399
tag_cuisine_european          0.012360
tag_cuisine_asian             0.182217
tag_cuisine_mexican           0.013557
tag_cuisine_latin-american    0.094896
tag_cuisine_french            0.077352
tag_cuisine_italian           0.233254
tag_cuisine_african           0.003987
tag_cuisine_mediterranean     0.076555
tag_cuisine_american          0.273525
tag_cuisine_middle-eastern    0.046252
dtype: float64

Choose tag: tag_cuisine_american, which 27.3525% are 1 

Build vocabulary and indexing 

In [133]:
len(list(set(all_train_tokens)))

3489

In [134]:
token_counter = Counter(all_train_tokens)

In [95]:
#token_counter.most_common

In [90]:
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = len(list(set(all_train_tokens)))):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [135]:
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = len(list(set(all_train_tokens))))

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 49 ; token while
Token while; token id 49


Reconstruct data strcuture for datasets

In [158]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """returns [[[step1 indices],[step2 indices],...,[step6 indices]],[],[],...]"""
    recipie_indices_data = []
    for recipie in tokens_data.iterrows():
        step_indices_data = []
        for step in recipie[1]:
            index_list = [token2id[token] if token in token2id else UNK_IDX for token in step]
            step_indices_data.append(index_list)
        recipie_indices_data.append(step_indices_data)
    return recipie_indices_data

train_data_indices = token2index_dataset(train_data)
val_data_indices = token2index_dataset(val_data)
test_data_indices = token2index_dataset(test_data)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 2031
Val dataset size is 226
Test dataset size is 251


In [203]:
MAX_SENTENCE_LENGTH = 100

class IntructionDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, tags_list):
        """
        
        @param data_list: list of recipie tokens 
        @param target_list: list of single tag, i.e. 'tag_cuisine_american'

        """
        self.data_list = data_list
        self.tags_list = tags_list
        assert (len(self.data_list) == len(self.tags_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call recipie[i]
        """
        recipie = self.data_list[key][:MAX_SENTENCE_LENGTH]
        step1_idx = recipie[0][:MAX_SENTENCE_LENGTH]
        step2_idx = recipie[1][:MAX_SENTENCE_LENGTH]
        step3_idx = recipie[2][:MAX_SENTENCE_LENGTH]
        step4_idx = recipie[3][:MAX_SENTENCE_LENGTH]       
        step5_idx = recipie[4][:MAX_SENTENCE_LENGTH]
        step6_idx = recipie[5][:MAX_SENTENCE_LENGTH]
        label = self.tags_list[key]
        return [[step1_idx, step2_idx, step3_idx, step4_idx, step5_idx, step6_idx], 
                [len(step1_idx),len(step2_idx), len(step3_idx),len(step4_idx), len(step5_idx),len(step6_idx)], 
                label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        label_list.append(datum[-1])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_list = []
        for i, step in enumerate(datum[0]):
            padded_vec = np.pad(np.array(step), 
                                    pad_width=((0, MAX_SENTENCE_LENGTH-datum[1][i])), 
                                    mode="constant", constant_values=0)
            padded_list.append(padded_vec)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [206]:
# Build train, valid and test dataloaders
BATCH_SIZE = 10

train_dataset = IntructionDataset(train_data_indices, list(train_tags['tag_cuisine_american']))
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

val_dataset = IntructionDataset(val_data_indices, list(val_tags['tag_cuisine_american']))
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

test_dataset = IntructionDataset(test_data_indices, list(test_tags['tag_cuisine_american']))
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [207]:
for i, (data, lengths, labels) in enumerate(train_loader):
    print (data)
    print(lengths)
    print (labels)
    break

tensor([[   7,  297,  105,   17,   74,   20,   14,    2,  120,   15,    3,   23,
            5,  572,    2,  780,    2,   39,    3,   52,   85,   59,   17, 1359,
           80,  550,  952,  245,   43,  525,  100,  550,    5,  360,   69,  346,
          587,   34,  780,  140,    2,  447,   53,  155,   31,   77,  385,  360,
         2509,    5,  296,  655,    2,   34, 1359,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [  55,  152, 3088,  177,    3,  114,   96,   90,    5,  996,  181,  510,
          121,   11,  465,  746,    2,  131,    6,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 