In [11]:
import pandas as pd

In [12]:
df = pd.read_csv('../data/recipes.csv')

df.drop(columns=['AuthorId', 'TotalTime', 'AggregatedRating', 'ReviewCount', 'RecipeYield'], inplace=True)

df

Unnamed: 0,RecipeId,Name,AuthorName,CookTime,PrepTime,DatePublished,Description,Images,RecipeCategory,Keywords,...,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,Dancer,PT24H,PT45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"",...",...,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,elly9812,PT25M,PT4H,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"",...",...,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,Stephen Little,PT5M,PT30M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"",...",...,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,Cyclopz,PT20M,PT24H,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""We...",...,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,Duckie067,PT30M,PT20M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""...",...,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,"c(""Mix everything together and bring to a boil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,541379,Meg's Fresh Ginger Gingerbread,rdsxc,PT35M,PT1H,2020-12-22T15:27:00Z,Make and share this Meg's Fresh Ginger Gingerb...,character(0),Dessert,"""< 4 Hours""",...,12.5,7.6,54.4,278.2,48.5,0.8,22.8,3.9,8.0,"c(""Preheat oven to 350&deg;F Grease an 8x8 cak..."
522513,541380,Roast Prime Rib au Poivre with Mixed Peppercorns,Denver cooks,PT3H,PT30M,2020-12-22T15:32:00Z,"White, black, green, and pink peppercorns add ...","""https://img.sndimg.com/food/image/upload/w_55...",Very Low Carbs,"c(""High Protein"", ""High In..."", ""< 4 Hours"")",...,172.4,71.4,433.8,766.3,3.2,0.7,0.1,117.0,8.0,"c(""Position rack in center of oven and preheat..."
522514,541381,Kirshwasser Ice Cream,Jonathan F.,PT3H,PT1H,2020-12-22T15:33:00Z,Make and share this Kirshwasser Ice Cream reci...,character(0),Ice Cream,"c(""Dessert"", ""< 4 Hours"")",...,117.2,72.6,470.9,192.5,33.9,0.0,17.3,12.8,6.0,"c(""heat half and half and heavy cream to a sim..."
522515,541382,Quick & Easy Asian Cucumber Salmon Rolls,CLUBFOODY,,PT15M,2020-12-22T22:11:00Z,"Extremely quick and easy to make, these are gr...","""https://img.sndimg.com/food/image/upload/w_55...",Canadian,"c(""< 15 Mins"", ""Easy"")",...,0.6,0.1,2.9,100.5,0.3,0.0,0.1,2.4,,"c(""In a small bowl, combine mayo and wasabi pa..."


In [13]:
import re


def extract_minutes(duration: str) -> int:
    """
    Converts a ISO 8601 duration pattern to minutes.
    Only considers hours and minutes.
    If string is null, returns 0 minutes.

    :param duration: the duration in ISO 8601 duration pattern format
    :return: the duration in minutes
    """

    if duration == 'NaN' or pd.isna(duration) or pd.isnull(duration):
        return 0

    match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?', duration)

    hours = int(match.group(1)) if match.group(1) else 0
    minutes = int(match.group(2)) if match.group(2) else 0

    return hours * 60 + minutes


def parse_list(list_string: str) -> list[str]:
    """
    Converts the c("a", "b", "c") format (R vector) of the lists in the dataset to a python list.
    Handles missing data and edge cases.

    :param list_string: string list of items
    :return: a list of item strings
    """
    if type(list_string) == float:
        return []

    arr = [x.strip().replace('c("', '').replace('")', '') for x in re.split(r'"\s*,\s*"', list_string)]

    if arr[0].strip() == '':
        del arr[0]
    elif arr[0][0] == '[':
        arr[0] = arr[0][2:]

    if arr[-1].strip() == '':
        del arr[-1]
    elif arr[-1][-1] == ']':
        arr[-1] = arr[-1][:-2]

    return arr


def parse_image_list(list_string: str) -> list[str]:
    """
    Converts the image array format in the dataset to a python list.
    Handles missing data and edge cases.

    :param list_string: string list of images
    :return: a list containing the urls of the images
    """

    if list_string == 'character(0)' or list_string == 'NaN' or type(list_string) == float:
        return []

    if list_string[0] == '"':
        return [re.search(r'"(.*)"', list_string).group(1)]

    return parse_list(list_string)


# Composing the new properties

df['CookTime'] = df['CookTime'].map(extract_minutes)
df['PrepTime'] = df['PrepTime'].map(extract_minutes)
df['Keywords'] = df['Keywords'].map(lambda x: re.findall(r'"(.*?)"', str(x)))
df['RecipeCategory'] = df['RecipeCategory']
df['RecipeInstructions'] = df['RecipeInstructions'].map(parse_list)
df['Images'] = df['Images'].map(parse_image_list)

df

Unnamed: 0,RecipeId,Name,AuthorName,CookTime,PrepTime,DatePublished,Description,Images,RecipeCategory,Keywords,...,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,Dancer,1440,45,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...",...,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39,Biryani,elly9812,25,240,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...",...,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...
2,40,Best Lemonade,Stephen Little,5,30,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...",...,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41,Carina's Tofu-Vegetable Kebabs,Cyclopz,20,1440,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...",...,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,"[Drain the tofu, carefully squeezing out exces..."
4,42,Cabbage Soup,Duckie067,30,20,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...",...,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,"[Mix everything together and bring to a boil.,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,541379,Meg's Fresh Ginger Gingerbread,rdsxc,35,60,2020-12-22T15:27:00Z,Make and share this Meg's Fresh Ginger Gingerb...,[],Dessert,[< 4 Hours],...,12.5,7.6,54.4,278.2,48.5,0.8,22.8,3.9,8.0,[Preheat oven to 350&deg;F Grease an 8x8 cake ...
522513,541380,Roast Prime Rib au Poivre with Mixed Peppercorns,Denver cooks,180,30,2020-12-22T15:32:00Z,"White, black, green, and pink peppercorns add ...",[https://img.sndimg.com/food/image/upload/w_55...,Very Low Carbs,"[High Protein, High In..., < 4 Hours]",...,172.4,71.4,433.8,766.3,3.2,0.7,0.1,117.0,8.0,[Position rack in center of oven and preheat t...
522514,541381,Kirshwasser Ice Cream,Jonathan F.,180,60,2020-12-22T15:33:00Z,Make and share this Kirshwasser Ice Cream reci...,[],Ice Cream,"[Dessert, < 4 Hours]",...,117.2,72.6,470.9,192.5,33.9,0.0,17.3,12.8,6.0,[heat half and half and heavy cream to a simme...
522515,541382,Quick & Easy Asian Cucumber Salmon Rolls,CLUBFOODY,0,15,2020-12-22T22:11:00Z,"Extremely quick and easy to make, these are gr...",[https://img.sndimg.com/food/image/upload/w_55...,Canadian,"[< 15 Mins, Easy]",...,0.6,0.1,2.9,100.5,0.3,0.0,0.1,2.4,,"[In a small bowl, combine mayo and wasabi past..."


In [14]:
# This dataset contains more detailed ingredients, we will combine the 2 datasets
df_ingredients = pd.read_csv('../data/recipes_ingredients.csv')

df_ingredients = df_ingredients[['id', 'ingredients_raw', 'ingredients', 'serving_size']]

df_ingredients = df_ingredients.sort_values(by='id', ascending=True)

df_ingredients = df_ingredients.drop_duplicates(subset='id', keep='first')

In [15]:
def parse_ingredient_string(ingredient_string: str):
    """
    Converts an ingredient given as string to an object with the following properties:

    'name': The name of the ingredient;
    'quantity': The quantity of the ingredient in the given unit (optional);
    'unit': The unit of measure for the ingredient. Example: cup, tablespoon, lb, g etc. (optional);
    'unitQuantity': For ingredients like 2 (12 ounce) cans, represents the number of ounces (optional);
    'unitUnit': For ingredients like 2 (12 ounce) cans, represents the ounce (optional);

    :param ingredient_string: the string of the ingredient
    :return: an ingredient object
    """
    original = ingredient_string

    try:
        ingredient = {}

        quantity_match = re.search(r"^[0-9/-]+(?: [0-9/-]+)*", ingredient_string)

        if quantity_match:
            ingredient['quantity'] = quantity_match.group(0).strip()
            ingredient_string = re.sub(re.escape(ingredient['quantity']) + r'\s*', '', ingredient_string, count=1)

        unit_type_match = re.search(r"^(\((.*?)\))", ingredient_string)

        if unit_type_match:
            unit_type = unit_type_match.group(2)

            ingredient_string = re.sub(re.escape(unit_type_match.group(1)) + r'\s*', '', ingredient_string, count=1)

            unit_type_quantity_match = re.search(r"^[0-9/-]+(?: [0-9/-]+)*", unit_type)
            ingredient['unitQuantity'] = unit_type_quantity_match.group(0).strip()
            ingredient['unitUnit'] = re.sub(re.escape(ingredient['unitQuantity']) + r'\s*', '', unit_type,
                                            count=1)

        unit_match = re.search(r'^([^\s]+?)\s{2,}', ingredient_string)

        if unit_match:
            ingredient['unit'] = unit_match.group(1).strip()

            ingredient_string = re.sub(re.escape(ingredient['unit']) + r'\s*', '', ingredient_string, count=1)

        ingredient['name'] = re.sub(r'\s+', ' ', ingredient_string).strip()

        if ',' in ingredient['name']:
            ingredient_name, description = ingredient['name'].split(',', maxsplit=1)

            ingredient['name'] = ingredient_name.strip()
            ingredient['description'] = description.strip()

        return ingredient

    except AttributeError:
        if ',' in original:
            ingredient_name, description = original.split(',', maxsplit=1)

            return {
                'name': ingredient_name.strip(),
                'description': description.strip()
            }

        return {'name': original.strip()}


def parse_ingredients(row):
    parsed_ingredients_raw = parse_list(row['ingredients_raw'])

    return [parse_ingredient_string(ing) for ing in parsed_ingredients_raw]


df_ingredients['ingredients'] = df_ingredients.apply(parse_ingredients, axis=1)

df_ingredients

Unnamed: 0,id,ingredients_raw,ingredients,serving_size
406799,38,"[""4 cups blueberries, fresh or frozen "",""...","[{'quantity': '4', 'unit': 'cups', 'name': 'bl...",1 (225 g)
2594,39,"[""1 tablespoon saffron"",""4 teaspoons ...","[{'quantity': '1', 'unit': 'tablespoon', 'name...",1 (799 g)
87430,40,"[""1 1/2 cups sugar"",""1 tablespoon lem...","[{'quantity': '1 1/2', 'unit': 'cups', 'name':...",1 (212 g)
213276,41,"[""12 ounces extra firm tofu, water-packed...","[{'quantity': '12', 'unit': 'ounces', 'name': ...",1 (932 g)
322863,43,"[""1 1/4 cups graham cracker crumbs"",""1/4 ...","[{'quantity': '1 1/4', 'unit': 'cups', 'name':...",1 (171 g)
...,...,...,...,...
468502,543733,"[""1/2 cup butter"",""1/2 cup flour"",""1 medium on...","[{'quantity': '1/2', 'name': 'cup butter'}, {'...",1 (566 g)
173576,543734,"[""3 tablespoons olive oil"",""3 tablespoons bals...","[{'quantity': '3', 'name': 'tablespoons olive ...",1 (38 g)
498991,543735,"[""3 tablespoons olive oil"",""3 tablespoons bals...","[{'quantity': '3', 'name': 'tablespoons olive ...",1 (38 g)
140049,543736,"[""4 extra-large sweet potatoes, Peeled and cut...","[{'quantity': '4', 'name': 'extra-large sweet ...",1 (62 g)


In [16]:
# Merge the ingredients dataset with the normal dataset

df = pd.merge(df, df_ingredients, how='inner', left_on='RecipeId', right_on='id')
df = df.drop_duplicates(subset='RecipeId', keep='first')

df = df.drop(columns=['ingredients_raw', 'RecipeId', 'RecipeIngredientQuantities', 'RecipeIngredientParts'])

In [17]:
# Drop rows that don't fit the format
df = df[df['serving_size'].apply(lambda x: True if re.search(r'1 \(\d+\s*g\)', str(x)) else False) == True]

# Extract formatted serving size
df['serving_size'] = df['serving_size'].apply(lambda x: re.search(r'1 \((\d+)\s*g\)', str(x)).group(1))

In [18]:
# Rename elements

df.rename(
    columns={'Name': 'name', 'AuthorName': 'author_name', 'CookTime': 'cooking_time', 'PrepTime': 'preparation_time',
             'DatePublished': 'created_at', 'Description': 'description', 'Images': 'images', 'Calories': 'calories',
             'FatContent': 'total_fat', 'SaturatedFatContent': 'saturated_fat', 'CholesterolContent': 'cholesterol',
             'SodiumContent': 'sodium', 'CarbohydrateContent': 'carbohydrates', 'FiberContent': 'fiber',
             'SugarContent': 'sugar', 'ProteinContent': 'protein', 'RecipeServings': 'servings',
             'RecipeInstructions': 'steps', 'servingSize': 'serving_size', 'Keywords': 'tags',
             'RecipeCategory': 'category'}, inplace=True)

df.drop(columns=['id'], inplace=True)

df

Unnamed: 0,name,author_name,cooking_time,preparation_time,created_at,description,images,category,tags,calories,...,cholesterol,sodium,carbohydrates,fiber,sugar,protein,servings,steps,ingredients,serving_size
0,Low-Fat Berry Blue Frozen Dessert,Dancer,1440,45,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...",170.9,...,8.0,29.8,37.1,3.6,30.2,3.2,4.0,"[Toss 2 cups berries with sugar., Let stand fo...","[{'quantity': '4', 'unit': 'cups', 'name': 'bl...",225
1,Biryani,elly9812,25,240,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...",1110.7,...,372.8,368.4,84.4,9.0,20.4,63.4,6.0,[Soak saffron in warm milk for 5 minutes and p...,"[{'quantity': '1', 'unit': 'tablespoon', 'name...",799
2,Best Lemonade,Stephen Little,5,30,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...",311.1,...,0.0,1.8,81.5,0.4,77.2,0.3,4.0,"[Into a 1 quart Jar with tight fitting lid, pu...","[{'quantity': '1 1/2', 'unit': 'cups', 'name':...",212
3,Carina's Tofu-Vegetable Kebabs,Cyclopz,20,1440,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...",536.1,...,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,"[Drain the tofu, carefully squeezing out exces...","[{'quantity': '12', 'unit': 'ounces', 'name': ...",932
4,Best Blackbottom Pie,Barefoot Beachcomber,120,20,1999-08-21T10:35:00Z,Make and share this Best Blackbottom Pie recip...,[],Pie,"[Dessert, Weeknight, Stove Top, < 4 Hours]",437.9,...,94.3,267.6,58.0,1.8,42.5,7.0,8.0,"[Graham Cracker Crust: In small bowl, combine ...","[{'quantity': '1 1/4', 'unit': 'cups', 'name':...",171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490471,Meg's Fresh Ginger Gingerbread,rdsxc,35,60,2020-12-22T15:27:00Z,Make and share this Meg's Fresh Ginger Gingerb...,[],Dessert,[< 4 Hours],316.6,...,54.4,278.2,48.5,0.8,22.8,3.9,8.0,[Preheat oven to 350&deg;F Grease an 8x8 cake ...,"[{'quantity': '3', 'name': 'tablespoons fresh ...",97
490472,Roast Prime Rib au Poivre with Mixed Peppercorns,Denver cooks,180,30,2020-12-22T15:32:00Z,"White, black, green, and pink peppercorns add ...",[https://img.sndimg.com/food/image/upload/w_55...,Very Low Carbs,"[High Protein, High In..., < 4 Hours]",2063.4,...,433.8,766.3,3.2,0.7,0.1,117.0,8.0,[Position rack in center of oven and preheat t...,"[{'quantity': '9', 'name': 'lbs prime rib roas...",662
490473,Kirshwasser Ice Cream,Jonathan F.,180,60,2020-12-22T15:33:00Z,Make and share this Kirshwasser Ice Cream reci...,[],Ice Cream,"[Dessert, < 4 Hours]",1271.3,...,470.9,192.5,33.9,0.0,17.3,12.8,6.0,[heat half and half and heavy cream to a simme...,"[{'quantity': '3', 'name': 'pints half-and-hal...",520
490474,Quick & Easy Asian Cucumber Salmon Rolls,CLUBFOODY,0,15,2020-12-22T22:11:00Z,"Extremely quick and easy to make, these are gr...",[https://img.sndimg.com/food/image/upload/w_55...,Canadian,"[< 15 Mins, Easy]",16.1,...,2.9,100.5,0.3,0.0,0.1,2.4,,"[In a small bowl, combine mayo and wasabi past...","[{'quantity': '4', 'name': 'tablespoons kewpie...",406


In [19]:
# Convert to JSON to preserve data for CSV

from json import dumps

df['ingredients'] = df['ingredients'].map(dumps)
df['images'] = df['images'].map(dumps)
df['steps'] = df['steps'].map(dumps)
df['tags'] = df['tags'].map(dumps)

In [20]:
df.to_csv('output/parsed_recipes.csv', index=False)