# Personalized Food Recipes

## Import Food.com Data

In [1]:
# Imports for Project
import pandas as pd
from tensorflow import keras
import numpy as np

In [2]:
# Load data
df_interactions = pd.read_csv('RAW_interactions.csv')
df_recipes = pd.read_csv('RAW_recipes.csv')

In [3]:
# Print interactions
df_interactions[:1]

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...


In [4]:
# Rename id column of recipec to recipe_id for further joining interactions
df_recipes = df_recipes.rename(columns = {'id':'recipe_id'})
df_recipes[:1]

Unnamed: 0,name,recipe_id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [5]:
# Joining df_interactions and df_recipes 
df_merged = pd.merge(df_interactions, df_recipes, on='recipe_id')
df_merged[:1]

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,white bean green chile pepper soup,495,1533,2002-09-21,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9


In [6]:
# Check shape
df_merged.shape

(1132367, 16)

In [7]:
# Check for NaNs
df_merged.isnull().sum()

user_id               0
recipe_id             0
date                  0
rating                0
review              169
name                  1
minutes               0
contributor_id        0
submitted             0
tags                  0
nutrition             0
n_steps               0
steps                 0
description       23510
ingredients           0
n_ingredients         0
dtype: int64

In [8]:
# Delete NaNs
df_merged.dropna(subset=['review'], inplace=True)
df_merged.shape

(1132198, 16)

In [9]:
# Substract Data to smaller Form
df_compact = df_merged[['user_id', 'name', 'recipe_id', 'ingredients', 'n_ingredients', 'n_steps', 'nutrition','review']]
df_compact[:1]

Unnamed: 0,user_id,name,recipe_id,ingredients,n_ingredients,n_steps,nutrition,review
0,38094,white bean green chile pepper soup,40893,"['great northern beans', 'yellow onion', 'dice...",9,4,"[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",Great with a salad. Cooked on top of stove for...


In [10]:
# Check Data Types
df_compact.dtypes

user_id           int64
name             object
recipe_id         int64
ingredients      object
n_ingredients     int64
n_steps           int64
nutrition        object
review           object
dtype: object

## Clean Data

In [11]:
# Restrict to recipes with at least 3 steps
df_n_steps_restricted = df_compact[df_compact.n_steps >=3]
df_n_steps_restricted.shape

(1092089, 8)

In [12]:
# Restrit to at least 4 and no more than 20 ingredients
df_n_ingredients_restricted =df_n_steps_restricted[df_n_steps_restricted.n_ingredients >=4]
df_n_ingredients_restricted =df_n_steps_restricted[df_n_steps_restricted.n_ingredients <=20]
df_n_ingredients_restricted.shape


(1084934, 8)

In [13]:
# Sort Data per User_ID
df_sorted = df_n_ingredients_restricted.sort_values(by=['user_id'])
df_sorted.shape

(1084934, 8)

In [14]:
# Create new DataFrame with Views having 5 or more Reviews
df_cleaned = df_sorted.groupby('user_id').filter(lambda x: len(x)>=5)
df_cleaned.shape

(831736, 8)

In [15]:
# Create new DataFrame with necessary columns
df = df_cleaned[['name', 'ingredients', 'nutrition']]
df[:5]

Unnamed: 0,name,ingredients,nutrition
892328,famous walper spinach salad,"['fresh spinach', 'whipping cream', 'white vin...","[289.2, 42.0, 2.0, 8.0, 11.0, 77.0, 2.0]"
784943,beef casserole and dumplings,"['steak', 'onion', 'carrots', 'parsnip', 'flou...","[280.4, 17.0, 15.0, 38.0, 23.0, 23.0, 11.0]"
326034,bacon and cheese egg appetizers,"['hardboiled egg', 'mayonnaise', 'crisp bacon'...","[101.9, 11.0, 2.0, 8.0, 15.0, 13.0, 0.0]"
217851,super stuffed taters,"['baking potatoes', 'milk', 'butter', 'salt', ...","[364.7, 36.0, 5.0, 26.0, 18.0, 74.0, 10.0]"
253068,marna s killer rib rub,"['brown sugar', 'paprika', 'onion powder', 'ga...","[113.1, 1.0, 85.0, 36.0, 3.0, 0.0, 9.0]"


In [16]:
# Split train and test data
df_train = df.sample(frac=0.8, random_state=42)
idx_train = df_train.index.tolist()
idx_test = list(set(df.index.tolist())-set(df_train.index.tolist()))
df_test = df.loc[idx_test]
df_train

Unnamed: 0,name,ingredients,nutrition
860527,carrot relish,"['carrots', 'granulated sugar', 'fresh lemon j...","[79.2, 0.0, 57.0, 2.0, 1.0, 0.0, 6.0]"
929230,sticken chicken,"['chicken thighs', 'flour', 'garlic salt', 'pa...","[512.8, 33.0, 145.0, 48.0, 56.0, 30.0, 16.0]"
533373,tortilla omelet wrap,"['eggs', 'half-and-half', 'salt and pepper', '...","[706.6, 64.0, 13.0, 35.0, 80.0, 96.0, 13.0]"
489237,chicken breast with white wine and mushroom cr...,"['chicken breasts', 'olive oil', 'salt', 'pepp...","[486.2, 55.0, 6.0, 6.0, 67.0, 80.0, 1.0]"
271805,crock pot normandy pork with apples shallots ...,"['pork shoulder', 'apples', 'shallots', 'balsa...","[658.8, 73.0, 29.0, 8.0, 64.0, 98.0, 8.0]"
...,...,...,...
850437,grilled tofu with grapefruit and avocado salsa,"['firm tofu', 'red grapefruits', 'navel orange...","[208.0, 13.0, 75.0, 8.0, 9.0, 6.0, 10.0]"
1105848,broccoli and tofu with spicy peanut sauce,"['firm tofu', 'broccoli', 'peanut oil', 'onion...","[457.5, 40.0, 90.0, 68.0, 47.0, 26.0, 13.0]"
574647,fresh peach brown sugar pie,"['pie crusts', 'sugar', 'brown sugar', 'flour'...","[3014.6, 222.0, 875.0, 99.0, 63.0, 222.0, 137.0]"
131251,tea marbled eggs,"['eggs', 'black tea', 'cinnamon sticks', 'star...","[79.9, 7.0, 4.0, 16.0, 13.0, 7.0, 0.0]"


## Text Processing for RNN

In [17]:
# Generate text sequences
import re

def generate_sequences_single_text(text, train_len):
    
    # Create empty list that will contain all text sequences from one review
    sequences = []

    # Create a list that contains the review as lower case tokens
    words = [word.lower() for word in text.split() if re.match('[a-zA-Z]', word)]
    
    for i in range(train_len, len(words)):

        # Create a text sequence with sliding window
        seq = words[i-train_len:i]

        # Add sequence to list
        sequences.append(seq)
        
    return sequences

def generate_sequences(data, train_len):
    text_sequences = []
    
    for d in list(data):
    
        # Use function generate_sequences on text
        seq = generate_sequences_single_text(d, train_len)

        if len(seq) > train_len: 

            # Add list of sequences to text_sequences and text_labels
            text_sequences += seq
    return text_sequences

In [18]:
train_len = 10+1 # Länge der Inputsequenz + 1 Label
train_sequences = generate_sequences(df_train['ingredients'], train_len)
test_sequences = generate_sequences(df_test['ingredients'], train_len)

## Create Model

In [19]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.models import Sequential

In [20]:
# Tokenize the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sequences)

In [21]:
# Convert the text sequences to numerical sequences
train_sequences = tokenizer.texts_to_sequences(train_sequences)
test_sequences = tokenizer.texts_to_sequences(test_sequences)

In [22]:
# Pad the numerical sequences to have equal length
train_padded = pad_sequences(train_sequences, maxlen=train_len, padding='pre')
test_padded = pad_sequences(test_sequences, maxlen=train_len, padding='pre')

In [23]:
# Define the input shape of the model
input_shape = (train_len, )

In [24]:
# Define the model
model = Sequential()

In [25]:
# Add an embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=train_len))

In [26]:
# Add a bidirectional LSTM layer

model.add(Bidirectional(LSTM(128, return_sequences=False)))

In [27]:
# Add a dense layer with sigmoid activation function
model.add(Dense(1, activation='sigmoid'))

In [28]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Store sequences as sublists of original
train_labels = []
for seq in train_sequences:
    train_labels.append(seq[-1])
    
test_labels = []
for seq in test_sequences:
    test_labels.append(seq[-1])

train_sequences = [seq[:-1] for seq in train_sequences]
test_sequences = [seq[:-1] for seq in test_sequences]

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [30]:
# Train the model
model.fit(train_padded, train_labels, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e0e13a2ee0>