Code takes ingredients from three datasets and combines them

In [2]:
import pandas as pd
import re
import ast
import numpy as np

In [14]:
df1 = pd.read_csv('Data/recipes_ingredients.csv')
df2 = pd.read_csv('Data/recipes.csv')

In [15]:
df2 = df2[['RecipeId', 'RecipeIngredientParts']]
df2.head(10)

Unnamed: 0,RecipeId,RecipeIngredientParts
0,38,"c(""blueberries"", ""granulated sugar"", ""vanilla ..."
1,39,"c(""saffron"", ""milk"", ""hot green chili peppers""..."
2,40,"c(""sugar"", ""lemons, rind of"", ""lemon, zest of""..."
3,41,"c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""..."
4,42,"c(""plain tomato juice"", ""cabbage"", ""onion"", ""c..."
5,43,"c(""graham cracker crumbs"", ""sugar"", ""butter"", ..."
6,44,"c(""chicken"", ""butter"", ""flour"", ""milk"", ""celer..."
7,45,"c(""sugar"", ""margarine"", ""egg"", ""flour"", ""salt""..."
8,46,"c(""rice vinegar"", ""haeo"")"
9,47,"c(""butter"", ""brown sugar"", ""granulated sugar"",..."


In [18]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 2 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   RecipeId               522517 non-null  int64 
 1   RecipeIngredientParts  522517 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.0+ MB


In [16]:
df1 = df1[['id', 'ingredients']] 
df1.head(10)

Unnamed: 0,id,ingredients
0,71247,"[""cherry pie filling"", ""condensed milk"", ""melt..."
1,76133,"[""corned beef chopped"", ""sauerkraut cold water..."
2,503816,"[""unsalted butter"", ""vegetable oil"", ""all - pu..."
3,418749,"[""orange cake mix"", ""instant vanilla pudding"",..."
4,392934,"[""butter"", ""brown sugar"", ""granulated sugar"", ..."
5,532245,"[""chicken breasts cutlets"", ""petite carrots"", ..."
6,489452,"[""teriyaki sauce"", ""pork chops""]"
7,126368,"[""flour""]"
8,306467,"[""olive oil"", ""yellow onion"", ""celery rib"", ""c..."
9,318331,"[""biscuit mix"", ""granulated sugar"", ""butter"", ..."


In [19]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500471 entries, 0 to 500470
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   RecipeId     500471 non-null  int64 
 1   ingredients  500471 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.6+ MB


In [17]:
df1 = df1.rename(columns = {"id": "RecipeId"})
test_df = pd.merge(df1, df2, how = "inner", on = "RecipeId")
test_df.head(10)
test_df.to_csv('ingredients.csv')

In [20]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498317 entries, 0 to 498316
Data columns (total 3 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   RecipeId               498317 non-null  int64 
 1   ingredients            498317 non-null  object
 2   RecipeIngredientParts  498317 non-null  object
dtypes: int64(1), object(2)
memory usage: 11.4+ MB


From here I searched for \ in the csv and removed any I could find as well as whatever was contained inside them.

In [4]:
# Format the column so that c() is replaced with [] and return it as a list
# Additionally call remove_unwanted
def clean_ingredients(ingredient_str):
    cleaned_str = ingredient_str.replace("c(", "[").replace(")", "]")

    if cleaned_str.startswith("[") and cleaned_str.endswith("]"):
        ingredients = ast.literal_eval(cleaned_str)
        cleaned_ingredients = remove_unwanted(ingredients)
        return cleaned_ingredients
    else:
        return [] 

In [5]:
# Remove words that may get in the way of combining
def remove_unwanted(ingredients):

    unwanted_words = ["chopped", "cold water", "shredded"]
    
    cleaned = []
    for ingredient in ingredients:
        for word in unwanted_words:
            ingredient = re.sub(r'\b' + re.escape(word) + r'\b', '', ingredient, flags = re.IGNORECASE)
        ingredient = re.sub(r'\s+', ' ', ingredient).strip() 
        cleaned.append(ingredient)
    
    return cleaned

In [6]:
# Combine the two columns of ingredients
def combine_ingredients(row):
    
    left_ingredients = clean_ingredients(row['ingredients']) if row['ingredients'] else []
    right_ingredients = clean_ingredients(row['RecipeIngredientParts']) or []
    
    combined = list(set(left_ingredients + right_ingredients))
    return combined

In [7]:
test_df = pd.read_csv('Data/ingredients.csv')
test_df['Raw_Ingredients'] = test_df.apply(combine_ingredients, axis = 1)
test_df = test_df[['RecipeId', 'Raw_Ingredients']]

In [8]:
test_df.head(10)

Unnamed: 0,RecipeId,Raw_Ingredients
0,71247,"[condensed milk, margarine, melted margarine, ..."
1,76133,"[corned beef, sauerkraut, swiss cheese, butter..."
2,503816,"[salt, sugar, all - purpose flour, vegetable o..."
3,418749,"[orange gelatin, instant vanilla pudding, oran..."
4,392934,"[salt, granulated sugar, butter, chocolate chi..."
5,532245,"[chicken breasts cutlets, carrots, water, marg..."
6,489452,"[pork chops, teriyaki sauce]"
7,126368,"[sugar, milk, salt, flour]"
8,306467,"[dry red wine, salt, parmesan cheese, celery r..."
9,318331,"[granulated sugar, butter, biscuit mix, milk, ..."


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498317 entries, 0 to 498316
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   RecipeId         498317 non-null  int64 
 1   Raw_Ingredients  498317 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.6+ MB


In [10]:
df3 = pd.read_csv('Data/RAW_recipes.csv')
df3.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [11]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [12]:
df3 = df3[['id', 'ingredients']]
df3 = df3.rename(columns = {"id": "RecipeId"})
test_df = pd.merge(test_df, df3, how = "left", on = "RecipeId")

In [13]:
test_df.head()

Unnamed: 0,RecipeId,Raw_Ingredients,ingredients
0,71247,"[condensed milk, margarine, melted margarine, ...",
1,76133,"[corned beef, sauerkraut, swiss cheese, butter...","['corned beef', 'thousand island dressing', 's..."
2,503816,"[salt, sugar, all - purpose flour, vegetable o...",
3,418749,"[orange gelatin, instant vanilla pudding, oran...",
4,392934,"[salt, granulated sugar, butter, chocolate chi...",


In [14]:
# Combine the two columns of ingredients
def combine_ingredients_2(raw_ingredients_str, ingredients_str):

    def parse_ingredients(ingredients_str):
       
        if isinstance(ingredients_str, list):
            return [ingredient.strip(" '\"") for ingredient in ingredients_str]
  
        elif isinstance(ingredients_str, str):
            ingredients_str = ingredients_str.strip("[]") 
            ingredients_list = [ingredient.strip(" '\"") for ingredient in ingredients_str.split(",")]
            return ingredients_list
        else:
            return []
    
    raw_ingredients = parse_ingredients(raw_ingredients_str)
    ingredients = parse_ingredients(ingredients_str)
    
    combined_ingredients = list(set(raw_ingredients + ingredients)) 
    return list(combined_ingredients)

In [15]:
test_df['Cleaned_Ingredients'] = test_df.apply(lambda row: combine_ingredients_2(row["Raw_Ingredients"], row["ingredients"]), axis = 1)
test_df = test_df[['RecipeId','Raw_Ingredients', 'Cleaned_Ingredients']]
test_df.to_csv('cleaned_ingredients.csv')

In [16]:
ingredients_df = pd.read_csv('cleaned_ingredients.csv')

In [17]:
ingredients_df.head()

Unnamed: 0.1,Unnamed: 0,RecipeId,Raw_Ingredients,Cleaned_Ingredients
0,0,71247,"['condensed milk', 'margarine', 'melted margar...","['condensed milk', 'margarine', 'self-rising f..."
1,1,76133,"['corned beef', 'sauerkraut', 'swiss cheese', ...","['corned beef', 'sauerkraut', 'swiss cheese', ..."
2,2,503816,"['salt', 'sugar', 'all - purpose flour', 'vege...","['salt', 'sugar', 'all - purpose flour', 'vege..."
3,3,418749,"['orange gelatin', 'instant vanilla pudding', ...","['orange gelatin', 'instant vanilla pudding', ..."
4,4,392934,"['salt', 'granulated sugar', 'butter', 'chocol...","['salt', 'granulated sugar', 'vanilla', 'choco..."


In [18]:
ingredients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498317 entries, 0 to 498316
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           498317 non-null  int64 
 1   RecipeId             498317 non-null  int64 
 2   Raw_Ingredients      498317 non-null  object
 3   Cleaned_Ingredients  498317 non-null  object
dtypes: int64(2), object(2)
memory usage: 15.2+ MB


In [19]:
# Drop single ingredient recipes (There aren't any)
ingredients_df = ingredients_df[ingredients_df['Cleaned_Ingredients'].apply(lambda x: len(x) > 1)].reset_index(drop = True)
ingredients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498317 entries, 0 to 498316
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           498317 non-null  int64 
 1   RecipeId             498317 non-null  int64 
 2   Raw_Ingredients      498317 non-null  object
 3   Cleaned_Ingredients  498317 non-null  object
dtypes: int64(2), object(2)
memory usage: 15.2+ MB
