In [27]:
import pandas as pd
import numpy as np
import json
import ast


In [37]:
def _load_json(path):
    '''Read the file at `path` as UTF-8 text and return the parsed JSON object.'''
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale, which can fail or mis-decode
    # non-ASCII characters on some systems.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# importing allrecipes data and converting to dataframe
ar_json = _load_json('data/ar.json')
# transposing so that each top-level JSON entry becomes a row
ar = pd.DataFrame(ar_json).T

# importing epicurious data and converting to dataframe
epi_json = _load_json('data/epi.json')
epi = pd.DataFrame(epi_json).T

# importing foodnetwork data and converting to dataframe
fn_json = _load_json('data/fn.json')
fn = pd.DataFrame(fn_json).T

# merging all of the dataframes into one (row labels are renumbered from 0)
df = pd.concat([ar, epi, fn], ignore_index=True)
# removing rows with NaN values (the index is not reset, so labels may have gaps)
df = df.dropna()
# converting ingredients into strings; count_ingredients later parses these
# strings back into lists with ast.literal_eval
df['ingredients'] = df['ingredients'].astype(str)

df.head(2)

Unnamed: 0,title,ingredients,instructions,picture_link
0,Slow Cooker Chicken and Dumplings,"['4 skinless, boneless chicken breast halves A...","Place the chicken, butter, soup, and onion in ...",55lznCYBbs2mT8BTx6BTkLhynGHzM.S
1,Awesome Slow Cooker Pot Roast,['2 (10.75 ounce) cans condensed cream of mush...,"In a slow cooker, mix cream of mushroom soup, ...",QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe


In [35]:
# Strip the 'ADVERTISEMENT' marker from the ingredient strings.
# The space-prefixed form is removed first so that no stray space is left
# behind; the second pass removes any marker that had no leading space.
ingredients_clean = df['ingredients']
for marker in (' ADVERTISEMENT', 'ADVERTISEMENT'):
    ingredients_clean = ingredients_clean.str.replace(marker, '', regex=False)
df['ingredients'] = ingredients_clean
df.head(2)

Unnamed: 0,title,ingredients,instructions,picture_link
0,Slow Cooker Chicken and Dumplings,"['4 skinless, boneless chicken breast halves',...","Place the chicken, butter, soup, and onion in ...",55lznCYBbs2mT8BTx6BTkLhynGHzM.S
1,Awesome Slow Cooker Pot Roast,['2 (10.75 ounce) cans condensed cream of mush...,"In a slow cooker, mix cream of mushroom soup, ...",QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe


In [25]:
def count_ingredients(ingredient_str: str) -> int:
    '''
    Count the number of non-empty ingredients in a given recipe.

    Arguments:
    ingredient_str (str): A string representation of a list of ingredients, where each ingredient is represented as a string.
        The list may contain empty or whitespace-only strings, which are not counted.

    Returns:
    int: The count of non-empty ingredient strings. Returns 0 if the input is not a string, is empty or
        whitespace-only, or cannot be evaluated as a list (or tuple) literal. Elements that are not
        strings are ignored.
    '''
    # nothing to parse for non-strings and empty / whitespace-only strings
    if not isinstance(ingredient_str, str) or not ingredient_str.strip():
        return 0

    try:
        # converting string representation of the list to an actual list;
        # literal_eval only accepts Python literals, so it never executes code
        ingredients_list = ast.literal_eval(ingredient_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # malformed input: honour the documented contract instead of raising
        return 0

    # a valid literal that is not a sequence of ingredients (e.g. a bare
    # string or a dict) must not be iterated as if it were one
    if not isinstance(ingredients_list, (list, tuple)):
        return 0

    # counting non-empty strings in the list
    return sum(
        1
        for ingredient in ingredients_list
        if isinstance(ingredient, str) and ingredient.strip()
    )

In [28]:
# adding ingredient count to dataframe: one integer per recipe, computed by
# parsing that row's ingredient string with count_ingredients
df['ingredient_count'] = df['ingredients'].apply(count_ingredients)
# previewing a couple of rows, as in the earlier cells, instead of dumping the whole frame
df.head(2)