In [59]:
import pandas as pd
import re
from datetime import datetime, timedelta
import nltk
import unidecode
import string
import ast
from nltk.stem import WordNetLemmatizer

In [60]:
# Read in .csv files of recipes
jamie_df = pd.read_csv('Jamie_Oliver_Recipes.csv')
lau_df = pd.read_csv('Lau_Recipes.csv')

In [61]:
jamie_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Time,URL
0,0,Asparagus stir-fry,"['350 g firm tofu', '2 teaspoons groundnut oil...",4,Not too tricky,248,,https://www.jamieoliver.com/recipes/vegetable-...
1,1,Sweet potato fishcakes,"['500 g potatoes', '500 g sweet potatoes', '2 ...",4,Not too tricky,423,1 hour 10 minutes,https://www.jamieoliver.com/recipes/fish-recip...
2,2,Spring chicken stew,"['olive oil', '2 rashers of higher-welfare smo...",4,Not too tricky,549,1 hour 10 minutes,https://www.jamieoliver.com/recipes/chicken-re...
3,3,Chicken goujons,"['6 wholemeal pittas', '4 x 120 g free-range s...",4,Not too tricky,520,23 minutes,https://www.jamieoliver.com/recipes/chicken-re...
4,4,Sweet & sour stir-fry,"['100 g fine rice noodles', '1 x 227 g tin of ...",2,Super easy,492,21 minutes,https://www.jamieoliver.com/recipes/vegetable-...


In [62]:
lau_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Prep_time,Total_time,URL
0,0,Lo Mein (鷄肉撈麵),"['1 oz dried shiitake mushroom', '2 oz seafood...",4 servings,25 min,55 min,https://www.madewithlau.com/recipes/lo-mein-101
1,1,Egg Drop Soup (蛋花湯),"['5 oz shrimp (26/30 size)', '0.25 tsp white p...",4 servings,20 min,10 min,https://www.madewithlau.com/recipes/egg-drop-soup
2,2,Pipa Tofu (琵琶豆腐),"['1 lb medium firm tofu', '4 oz shrimp', '1 oz...",4 servings,35 min,45 min,https://www.madewithlau.com/recipes/pipa-tofu
3,3,Chow Mein: A Chinese Chef's Masterclass (鷄肉炒麵),"['18 oz thick chow mein noodles', '2 oz red on...",4 servings,20 min,35 min,https://www.madewithlau.com/recipes/chow-mein
4,4,Lemon Chicken (檸檬雞),"['14 oz chicken breast', '3 whole lemon', '1 d...",4 servings,35 min,50 min,https://www.madewithlau.com/recipes/lemon-chicken


I want to combine the 2 dataframes together to clean them at the same time, but first I need to decide on whether to use "Prep_time" and/or "Total_time" from the Lau dataframe.

In [63]:
# Define a function that converts every time value into minutes integers
def time_parser(time_str):
    hours_match = re.search(r"(\d+)\s*hour", time_str) # RegEx to capture hours
    minutes_match = re.search(r"(\d+)\s*min", time_str) # RegEx to capture minutes

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0

    time_delta = timedelta(hours=hours, minutes=minutes) # Convert to timedelta
    base_date = datetime(1900, 1, 1) # Need a 'default date' to convert to strftime later
    return base_date + time_delta

In [64]:
lau_df_copy = lau_df.copy() # Make copy of lau df

# Parse both times
lau_df_copy['Prep_time'] = lau_df_copy['Prep_time'].apply(time_parser)
lau_df_copy['Total_time'] = lau_df_copy['Total_time'].apply(time_parser)

In [65]:
# Convert to strftime
lau_df_copy['Total_time'] = lau_df_copy['Total_time'].dt.strftime('%H:%M')
lau_df_copy['Prep_time'] = lau_df_copy['Prep_time'].dt.strftime('%H:%M')

I noticed one of the records had a higher prep time than total time, which doesn't make sense. I assume the website made a mistake and switched the 2. I will change the total_time to the correct time.

In [66]:
lau_df_copy[lau_df_copy['Prep_time']>lau_df_copy['Total_time']]

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Prep_time,Total_time,URL
1,1,Egg Drop Soup (蛋花湯),"['5 oz shrimp (26/30 size)', '0.25 tsp white p...",4 servings,00:20,00:10,https://www.madewithlau.com/recipes/egg-drop-soup


In [67]:
wrong = lau_df_copy['Prep_time'] > lau_df_copy['Total_time']
lau_df_copy.loc[wrong, 'Total_time'] = lau_df_copy['Prep_time']
lau_df_copy

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Prep_time,Total_time,URL
0,0,Lo Mein (鷄肉撈麵),"['1 oz dried shiitake mushroom', '2 oz seafood...",4 servings,00:25,00:55,https://www.madewithlau.com/recipes/lo-mein-101
1,1,Egg Drop Soup (蛋花湯),"['5 oz shrimp (26/30 size)', '0.25 tsp white p...",4 servings,00:20,00:20,https://www.madewithlau.com/recipes/egg-drop-soup
2,2,Pipa Tofu (琵琶豆腐),"['1 lb medium firm tofu', '4 oz shrimp', '1 oz...",4 servings,00:35,00:45,https://www.madewithlau.com/recipes/pipa-tofu
3,3,Chow Mein: A Chinese Chef's Masterclass (鷄肉炒麵),"['18 oz thick chow mein noodles', '2 oz red on...",4 servings,00:20,00:35,https://www.madewithlau.com/recipes/chow-mein
4,4,Lemon Chicken (檸檬雞),"['14 oz chicken breast', '3 whole lemon', '1 d...",4 servings,00:35,00:50,https://www.madewithlau.com/recipes/lemon-chicken
...,...,...,...,...,...,...,...
120,120,Steamed Eggs (蒸蛋),"['4 egg', '2 cup warm water', '2 stalk scallio...",4 servings,00:05,00:15,https://www.madewithlau.com/recipes/steamed-egg
121,121,Ginger Fried Rice (薑炒飯),"['2 oz ginger', '3 oz chicken', '3 egg', '1 ts...",4 servings,00:15,00:30,https://www.madewithlau.com/recipes/ginger-fri...
122,122,Mapo Tofu With Chicken (雞肉麻婆豆腐),"['1 lb tofu (any level of firmness is fine)', ...",4 servings,00:10,00:20,https://www.madewithlau.com/recipes/mapo-tofu-...
123,123,Rainbow Chicken Stir Fry (七彩炒雞),"['10 oz chicken breast', '0.50 red bell pepper...",4 servings,00:25,00:35,https://www.madewithlau.com/recipes/rainbow-ch...


In [68]:
lau_df_copy.drop(columns = 'Prep_time', inplace=True) # Take out 'Prep_time'

In [69]:
lau_df_copy.rename(columns = {'Total_time': 'Time'}, inplace=True) # Just use 'Total_time' and rename it to 'Time'

In [70]:
lau_df_copy.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Time,URL
0,0,Lo Mein (鷄肉撈麵),"['1 oz dried shiitake mushroom', '2 oz seafood...",4 servings,00:55,https://www.madewithlau.com/recipes/lo-mein-101
1,1,Egg Drop Soup (蛋花湯),"['5 oz shrimp (26/30 size)', '0.25 tsp white p...",4 servings,00:20,https://www.madewithlau.com/recipes/egg-drop-soup
2,2,Pipa Tofu (琵琶豆腐),"['1 lb medium firm tofu', '4 oz shrimp', '1 oz...",4 servings,00:45,https://www.madewithlau.com/recipes/pipa-tofu
3,3,Chow Mein: A Chinese Chef's Masterclass (鷄肉炒麵),"['18 oz thick chow mein noodles', '2 oz red on...",4 servings,00:35,https://www.madewithlau.com/recipes/chow-mein
4,4,Lemon Chicken (檸檬雞),"['14 oz chicken breast', '3 whole lemon', '1 d...",4 servings,00:50,https://www.madewithlau.com/recipes/lemon-chicken


Clean jamie_df:

In [71]:
jamie_df_copy = jamie_df.copy() # Make a copy of jamie_df

In [72]:
jamie_df_copy.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Time,URL
0,0,Asparagus stir-fry,"['350 g firm tofu', '2 teaspoons groundnut oil...",4,Not too tricky,248,,https://www.jamieoliver.com/recipes/vegetable-...
1,1,Sweet potato fishcakes,"['500 g potatoes', '500 g sweet potatoes', '2 ...",4,Not too tricky,423,1 hour 10 minutes,https://www.jamieoliver.com/recipes/fish-recip...
2,2,Spring chicken stew,"['olive oil', '2 rashers of higher-welfare smo...",4,Not too tricky,549,1 hour 10 minutes,https://www.jamieoliver.com/recipes/chicken-re...
3,3,Chicken goujons,"['6 wholemeal pittas', '4 x 120 g free-range s...",4,Not too tricky,520,23 minutes,https://www.jamieoliver.com/recipes/chicken-re...
4,4,Sweet & sour stir-fry,"['100 g fine rice noodles', '1 x 227 g tin of ...",2,Super easy,492,21 minutes,https://www.jamieoliver.com/recipes/vegetable-...


In [73]:
jamie_df_copy.isnull().sum() # Check missing data

Unnamed: 0      0
Title           0
Ingredients     0
Servings        0
Difficulty      0
Calories        0
Time           12
URL             0
dtype: int64

In [74]:
jamie_df_copy.Time.fillna('0 minutes', inplace=True) # Fill in missing data with '0 minutes' so it can be parsed

In [75]:
# Parse time and convert to strftime
jamie_df_copy['Time'] = jamie_df_copy['Time'].apply(time_parser)
jamie_df_copy['Time'] = jamie_df_copy['Time'].dt.strftime('%H:%M')

In [76]:
jamie_df_copy.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Time,URL
0,0,Asparagus stir-fry,"['350 g firm tofu', '2 teaspoons groundnut oil...",4,Not too tricky,248,00:00,https://www.jamieoliver.com/recipes/vegetable-...
1,1,Sweet potato fishcakes,"['500 g potatoes', '500 g sweet potatoes', '2 ...",4,Not too tricky,423,01:10,https://www.jamieoliver.com/recipes/fish-recip...
2,2,Spring chicken stew,"['olive oil', '2 rashers of higher-welfare smo...",4,Not too tricky,549,01:10,https://www.jamieoliver.com/recipes/chicken-re...
3,3,Chicken goujons,"['6 wholemeal pittas', '4 x 120 g free-range s...",4,Not too tricky,520,00:23,https://www.jamieoliver.com/recipes/chicken-re...
4,4,Sweet & sour stir-fry,"['100 g fine rice noodles', '1 x 227 g tin of ...",2,Super easy,492,00:21,https://www.jamieoliver.com/recipes/vegetable-...


In [77]:
combined_df = pd.concat([jamie_df_copy,lau_df_copy]) # Combine the 2 dataframes

In [78]:
combined_df_copy = combined_df.copy() # Make a copy

In [79]:
combined_df_copy.isnull().sum() # Check missing data

Unnamed: 0       0
Title            0
Ingredients      0
Servings         0
Difficulty     125
Calories       125
Time             0
URL              0
dtype: int64

Lau recipes did not contain difficulty or calories, which is why they are missing data here. Let's fill them in with 'unknown'

In [80]:
combined_df_copy.fillna('unknown', inplace=True)
combined_df_copy.reset_index(inplace=True, drop=True)
combined_df_copy.drop(columns='Unnamed: 0', inplace=True)

NLP for Ingredients column:
1. Get rid of puncutation and non-alphabet letters
2. Lemmatize words to reduce them to their base form
3. Get rid of weights and measures (found a list of standard words from Wikipedia - https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement)
4. Get rid of stop words and other most common words that aren't ingredients


In [81]:
nlp_df = combined_df_copy.copy()

In [82]:
# Function to parse ingredients
def ingredient_parser(ingredients):

    # List of measure words
    measures = ['teaspoon', 't', 'tsp.', 'tsp', 'tablespoon', 'T', 'tbl.', 'tb', 'tbsp.','tbsp', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter', 'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo']

    translator = str.maketrans('', '', string.punctuation) # Get rid of punctuations using maketrans
    lemmatizer = WordNetLemmatizer() # Set lemmatizer
    new_ingred_list = [] # Empty list for parsed ingredients
    ingred_list = ast.literal_eval(ingredients) # Get list from ingredients column

    # Loop through each ingredient
    for i in ingred_list:
        i.translate(translator) # Get rid of punctuation
        items = re.split(' |-', i) # Split with where there is a space or hyphenated words
        items = [word for word in items if word.isalpha()] # Get rid of non alphabet words
        items = [word.lower() for word in items] # Make everything lower case
        items = [unidecode.unidecode(word) for word in items] # Unidecode each word
        items = [lemmatizer.lemmatize(word) for word in items] # Lemmatize each word
        items = [word for word in items if word not in measures] # Take out measure words
        new_ingred_list.append(' '.join(items)) # Append to list

    return ' '.join(new_ingred_list) # Return new parsed ingredients

In [83]:
# Example before parse
nlp_df.Ingredients[1020]

"['1 lb fresh rice noodle', '5 oz shrimp (the amount is up to you, but 41/50 is the preferred size)', '6 oz bean sprouts', '3 oz onion', '1 tsp cornstarch', '0.50 oz green onion', '2 tbsp light soy sauce', '1 tsp chicken bouillon', 'sesame oil (to taste)', '1 tbsp vegetable oil']"

In [84]:
# After parse
ingredient_parser(nlp_df.Ingredients[1020])

'fresh rice noodle shrimp amount is up to but is the preferred bean sprout onion cornstarch green onion light soy sauce chicken bouillon sesame oil vegetable oil'

In [85]:
# Apply to 'Ingredients' column
nlp_df['Ingredients_parsed'] = nlp_df['Ingredients'].apply(lambda x: ingredient_parser(x))
nlp_df.head()

Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Time,URL,Ingredients_parsed
0,Asparagus stir-fry,"['350 g firm tofu', '2 teaspoons groundnut oil...",4,Not too tricky,248.0,00:00,https://www.jamieoliver.com/recipes/vegetable-...,firm tofu groundnut oil sesame seed cornflour ...
1,Sweet potato fishcakes,"['500 g potatoes', '500 g sweet potatoes', '2 ...",4,Not too tricky,423.0,01:10,https://www.jamieoliver.com/recipes/fish-recip...,potato sweet potato red pepper extra virgin ol...
2,Spring chicken stew,"['olive oil', '2 rashers of higher-welfare smo...",4,Not too tricky,549.0,01:10,https://www.jamieoliver.com/recipes/chicken-re...,olive oil rasher higher welfare smoked streaky...
3,Chicken goujons,"['6 wholemeal pittas', '4 x 120 g free-range s...",4,Not too tricky,520.0,00:23,https://www.jamieoliver.com/recipes/chicken-re...,wholemeal pitta free range skinless chicken br...
4,Sweet & sour stir-fry,"['100 g fine rice noodles', '1 x 227 g tin of ...",2,Super easy,492.0,00:21,https://www.jamieoliver.com/recipes/vegetable-...,fine rice noodle tin pineapple chunk juice cor...


In [86]:
# Get the 200 most common words using nltk.FreqDist()
vocabulary = nltk.FreqDist()
for ingredients in nlp_df['Ingredients_parsed']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')

fresh;1505
oil;1350
olive;963
a;831
red;781
garlic;642
onion;627
clove;607
bunch;604
or;586
and;546
leaf;522
chilli;496
pepper;474
sauce;463
large;458
extra;424
tomato;416
lemon;402
salt;401
sprig;376
ground;370
handful;364
small;363
free;362
dried;330
chicken;322
cheese;307
white;298
wine;294
virgin;293
from;287
chopped;281
black;267
sustainable;264
seed;263
range;263
vinegar;260
for;247
higher;244
welfare;244
coriander;242
peeled;236
egg;229
piece;223
tin;217
water;215
green;212
finely;212
sugar;209
flour;207
to;204
ginger;201
freshly;199
quality;192
ripe;186
butter;185
parsley;184
sea;184
few;183
stock;182
soy;177
source;175
potato;173
carrot;172
flat;165
bean;164
lime;163
the;163
thyme;163
rice;161
smoked;158
organic;158
sesame;153
vegetable;152
rosemary;144
spring;144
sliced;139
plus;136
fillet;136
yoghurt;135
plain;135
stick;134
cornstarch;133
picked;132
mint;131
parmesan;128
mixed;127
your;120
pork;120
bay;119
optional;118
celery;117
serve;116
basil;116
mustard;113
cumin;113
bab

In [87]:
# Get a list of the 250 most common words
common_words = []
for word, _ in vocabulary.most_common(250):
    common_words.append(word)
print(common_words)

['fresh', 'oil', 'olive', 'a', 'red', 'garlic', 'onion', 'clove', 'bunch', 'or', 'and', 'leaf', 'chilli', 'pepper', 'sauce', 'large', 'extra', 'tomato', 'lemon', 'salt', 'sprig', 'ground', 'handful', 'small', 'free', 'dried', 'chicken', 'cheese', 'white', 'wine', 'virgin', 'from', 'chopped', 'black', 'sustainable', 'seed', 'range', 'vinegar', 'for', 'higher', 'welfare', 'coriander', 'peeled', 'egg', 'piece', 'tin', 'water', 'green', 'finely', 'sugar', 'flour', 'to', 'ginger', 'freshly', 'quality', 'ripe', 'butter', 'parsley', 'sea', 'few', 'stock', 'soy', 'source', 'potato', 'carrot', 'flat', 'bean', 'lime', 'the', 'thyme', 'rice', 'smoked', 'organic', 'sesame', 'vegetable', 'rosemary', 'spring', 'sliced', 'plus', 'fillet', 'yoghurt', 'plain', 'stick', 'cornstarch', 'picked', 'mint', 'parmesan', 'mixed', 'your', 'pork', 'bay', 'optional', 'celery', 'serve', 'basil', 'mustard', 'cumin', 'baby', 'fennel', 'unsalted', 'plum', 'natural', 'paprika', 'mushroom', 'fat', 'milk', 'oyster', 'ski

Remove stop words and other most common words in the ingredients that are useless to the model. For example 'salt' and 'oil', we can assume everyone looking for a recipe has salt and oil in their kitchen.
We got the 250 most common words in the ingredients, however, some of these words include ingredients that are actually useful for the model so I manually took out these ingredients:
['onion', 'tomato', 'lemon', 'chicken', 'wine', 'cheese', 'egg', 'ginger', 'coriander', 'carrot', 'rice', 'butter', 'oyster', 'potato', 'lime', 'thyme', 'rosemary', 'pork', 'yoghurt', 'mushroom', 'mint', 'celery', 'parmesan', 'basil', 'mustard', 'cumin', 'fennel', 'milk', 'paprika', 'beef', 'pea', 'spinach', 'honey', 'shallot', 'shrimp', 'bacon', 'cinnamon', 'oregano', 'noodle', 'cabbage', 'lamb', 'coconut', 'lettuce', 'sausage', 'broccoli', 'nutmeg', 'leek', 'salmon', 'cheddar', 'sage', 'turmeric', 'rocket', 'anchovy', 'prawn', 'breadcrumb', 'tofu', 'avocado', 'courgette', 'cucumber', 'chickpea', 'ketchup', 'feta', 'apple', 'chestnut', 'pancetta', 'dill', 'ciabatta', 'watercress', 'peanut', 'cayenne', 'pasta']

In [88]:
# Make a list of actual ingredients so we don't take them out
actual_ingreds = ['garlic','onion', 'tomato', 'lemon', 'chicken', 'wine', 'cheese', 'egg', 'ginger', 'coriander', 'carrot', 'rice', 'butter', 'oyster', 'potato', 'lime', 'thyme', 'rosemary', 'pork', 'yoghurt', 'mushroom', 'mint', 'celery', 'parmesan', 'basil', 'mustard', 'cumin', 'fennel', 'milk', 'paprika', 'beef', 'pea', 'spinach', 'honey', 'shallot', 'shrimp', 'bacon', 'cinnamon', 'oregano', 'noodle', 'cabbage', 'lamb', 'coconut', 'lettuce', 'sausage', 'broccoli', 'nutmeg', 'leek', 'salmon', 'cheddar', 'sage', 'turmeric', 'rocket', 'anchovy', 'prawn', 'breadcrumb', 'tofu', 'avocado', 'courgette', 'cucumber', 'chickpea', 'ketchup', 'feta', 'apple', 'chestnut', 'pancetta', 'dill', 'ciabatta', 'watercress', 'peanut', 'cayenne', 'pasta']
# Filter through 'actual_ingreds' list
new_common_words = [word for word in common_words if word not in actual_ingreds]

In [89]:
# Check length of new list
len(new_common_words)

178

In [90]:
# Function that removes stop words/most common words
def remove_stop_words(ingredients):
    ingred_list = ingredients.split()
    new_ingred_list = [i for i in ingred_list if i not in new_common_words]
    return ' '.join(new_ingred_list)

In [91]:
# Make a final ingredients column that has remove_stop_words() applied to it
nlp_df['Ingredients_final'] = nlp_df['Ingredients_parsed'].apply(lambda x: remove_stop_words(x))
nlp_df.head()

Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Time,URL,Ingredients_parsed,Ingredients_final
0,Asparagus stir-fry,"['350 g firm tofu', '2 teaspoons groundnut oil...",4,Not too tricky,248.0,00:00,https://www.jamieoliver.com/recipes/vegetable-...,firm tofu groundnut oil sesame seed cornflour ...,tofu cornflour wine garlic ginger beansprouts ...
1,Sweet potato fishcakes,"['500 g potatoes', '500 g sweet potatoes', '2 ...",4,Not too tricky,423.0,01:10,https://www.jamieoliver.com/recipes/fish-recip...,potato sweet potato red pepper extra virgin ol...,potato potato chipotle tabasco boned cucumber ...
2,Spring chicken stew,"['olive oil', '2 rashers of higher-welfare smo...",4,Not too tricky,549.0,01:10,https://www.jamieoliver.com/recipes/chicken-re...,olive oil rasher higher welfare smoked streaky...,bacon rosemary onion carrot potato pearl barle...
3,Chicken goujons,"['6 wholemeal pittas', '4 x 120 g free-range s...",4,Not too tricky,520.0,00:23,https://www.jamieoliver.com/recipes/chicken-re...,wholemeal pitta free range skinless chicken br...,pitta chicken basil egg garlic parmesan cheese...
4,Sweet & sour stir-fry,"['100 g fine rice noodles', '1 x 227 g tin of ...",2,Super easy,492.0,00:21,https://www.jamieoliver.com/recipes/vegetable-...,fine rice noodle tin pineapple chunk juice cor...,rice noodle pineapple chunk cornflour cashew o...


In [92]:
# Save to .csv file
nlp_df.to_csv('Cleaned_data.csv')