In [None]:
import pandas as pd
import numpy as np
import json
import ast
import re

#from collections import Counter

In [None]:
# NOTE(review): pandas and json are already imported in the first cell; repeating them here is harmless.
import pandas as pd
import nltk
# Fetch the NLTK resources that count_verbs relies on: the averaged perceptron tagger
# model backs nltk.pos_tag, and the punkt models back nltk.word_tokenize.
# The download call reports "already up-to-date" when the package is already present.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import json

def open_json(file_path: str):
    '''
    Load a JSON file and return its parsed contents.

    Arguments:
    file_path (str): Path to the JSON file to read.

    Returns:
    The deserialized JSON value, exactly as produced by json.load.'''

    # Decode explicitly as UTF-8. Without an encoding argument, open() falls back to
    # the platform's locale encoding (e.g. cp1252 on Windows), which mis-decodes or
    # rejects non-ASCII characters in the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def count_ingredients(ingredients_list: list) -> int:
    '''
    Tally the ingredient entries of a recipe that contain actual text.

    Arguments:
    ingredients_list (list): Ingredient strings for one recipe.

    Returns:
    int: How many entries are non-blank after stripping whitespace.
         An empty (or otherwise falsy) input yields 0.'''

    # Guard clause: nothing to count for an empty/falsy input
    if not ingredients_list:
        return 0

    non_blank = 0
    for entry in ingredients_list:
        # Entries that are empty or whitespace-only are skipped
        if entry.strip():
            non_blank += 1
    return non_blank

def count_verbs(dir_list):
    '''
    Count the verb-tagged tokens across a recipe's direction steps.

    Arguments:
    dir_list (list): Direction strings for one recipe; they are joined with
                     single spaces and tagged as one text.

    Returns:
    int: Number of tokens whose part-of-speech tag starts with 'VB'.'''

    joined_steps = ' '.join(dir_list)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(joined_steps))
    # Any tag beginning with 'VB' is counted (the verb tags of nltk's default tagset)
    verb_tags = [tag for _token, tag in tagged_tokens if tag.startswith('VB')]
    return len(verb_tags)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\romer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\romer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Processing allrecipes data

In [29]:
# Load the scraped allrecipes JSON and reshape it into a flat DataFrame (ar_data).
# The cell reloads from disk first, so re-running it rebuilds ar_data from scratch.

# importing allrecipes data and converting to dataframe
# (presumably the JSON is a list of recipe records, one per row - TODO confirm)
ar_data = open_json('scraping/recipes/recipes_allrecipes.json')
ar_data = pd.DataFrame(ar_data)

# splitting basic_info into separate columns and removing old basic_info column
# NOTE(review): the expanded columns appear to be matched to the four names by position,
# so this assumes every basic_info entry holds exactly these fields in this order - TODO confirm
ar_data[['title', 'category','rating','rating_count']] = ar_data['basic_info'].apply(pd.Series)
ar_data = ar_data.drop(columns=['basic_info'])

# splitting prep_data into separate columns and removing old prep_data column, only keeping cook_time, total_time, yield, and servings
# each prep_data entry must support .get(); a key that is absent becomes an empty string ''
ar_data[['cook_time', 'total_time', 'yield', 'servings']] = ar_data['prep_data'].apply(
    lambda x: pd.Series({
        'cook_time': x.get('cook_time', ''),
        'total_time': x.get('total_time', ''),
        'yield': x.get('yield', ''),
        'servings': x.get('servings', '')
    }))
ar_data = ar_data.drop(columns=['prep_data'])

# splitting nutritions column and removing old nutritions column
# dtype='object' is set explicitly on each expanded Series
# (presumably so entries with no nutrition data still expand cleanly - TODO confirm)
ar_data[['calories','fat','carbs','protein']] = ar_data['nutritions'].apply(lambda x: pd.Series(x, dtype = 'object'))
ar_data = ar_data.drop(columns=['nutritions'])

# removing rows where there are no directions
# only rows whose directions equal the empty list are dropped; any other value (e.g. None) is kept
ar_data = ar_data[ar_data['directions'].apply(lambda x: x != [])]

In [32]:
# Derive per-recipe features on ar_data and export the result to CSV.

# add verb count: number of verb-tagged tokens across all direction steps
# (used as a rough proxy for the number of steps)
ar_data['verb_count'] = ar_data['directions'].apply(count_verbs)

# add ingredients count (non-blank ingredient strings per recipe)
ar_data['ingredient_count'] = ar_data['ingredients'].apply(count_ingredients)

# merge yield and servings, row by row: keep the servings value where one is present,
# otherwise fall back to that row's yield.
# The previous expression tested Series.empty, which only says whether the whole column
# has zero rows, so it always copied 'servings' and never looked at individual values.
servings = ar_data['servings']
# a value counts as missing when it is NaN/None or a blank / whitespace-only string
servings_missing = servings.isna() | servings.astype(str).str.strip().eq('')
ar_data['yield_servings_merge'] = servings.where(~servings_missing, ar_data['yield'])

# save file to csv (index=False leaves the DataFrame index out of the file)
ar_data.to_csv('recipe_data.csv', index = False)