In [None]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse

import json
import ast
import re
import os

#from collections import Counter

In [3]:
def open_json(file_path: str):
    '''
    Load and parse a JSON file.

    Arguments:
    file_path (str): Path of the JSON file to read.

    Returns:
    The deserialized content of the file, as produced by json.load.
    '''
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the recipe text contains non-ASCII
    # characters such as fraction glyphs).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def count_ingredients(ingredients_list: list) -> int:
    '''
    Tally the usable ingredient entries of a recipe.

    Arguments:
    ingredients_list (list): Ingredient strings for one recipe.

    Returns:
    int: How many entries still contain text after stripping whitespace.
    A falsy input (for example an empty list) gives 0.
    '''
    # Guard clause: nothing to count
    if not ingredients_list:
        return 0

    # Keep only the entries that are not blank / whitespace-only
    non_blank = [entry for entry in ingredients_list if entry.strip()]
    return len(non_blank)

In [4]:
import pandas as pd
# nltk must be installed in the kernel environment; without it this cell
# stops with ModuleNotFoundError and count_verbs below cannot run.
import nltk
# Download the NLTK data packages for the tokenizer and part-of-speech tagger
# calls made in count_verbs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import json


def count_verbs(dir_list):
    """
    Count the verb tokens in a recipe's directions.

    Arguments:
    dir_list: Iterable of direction strings; they are joined with single
        spaces into one text before tokenizing.

    Returns:
    int: Number of tokens whose part-of-speech tag (from nltk.pos_tag)
    starts with 'VB'.
    """
    joined_text = ' '.join(dir_list)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(joined_text))
    # Every tag beginning with 'VB' is counted as a verb
    verb_tags = [tag for _, tag in tagged_tokens if tag.startswith('VB')]
    return len(verb_tags)

ModuleNotFoundError: No module named 'nltk'

### Processing allrecipes data

In [5]:
# Read the scraped allrecipes JSON and wrap the records in a DataFrame
data = pd.DataFrame(open_json('scraping/recipes/scraped_recipes.json'))

# Expand basic_info into one column per field, then discard the original column
data[['title', 'category','rating','rating_count']] = data['basic_info'].apply(pd.Series)
data = data.drop('basic_info', axis=1)

# From prep_data keep only cook_time, total_time, yield and servings
# (a missing key becomes an empty string), then discard the original column
data[['cook_time', 'total_time', 'yield', 'servings']] = data['prep_data'].apply(
    lambda prep: pd.Series(
        {field: prep.get(field, '') for field in ('cook_time', 'total_time', 'yield', 'servings')}
    )
)
data = data.drop('prep_data', axis=1)

# Expand nutritions into calories / fat / carbs / protein, then discard the original column
data[['calories','fat','carbs','protein']] = data['nutritions'].apply(
    lambda facts: pd.Series(facts, dtype = 'object')
)
data = data.drop('nutritions', axis=1)

# Keep only the recipes whose directions list is not empty
data = data[data['directions'].apply(lambda steps: steps != [])]

In [6]:
# Treat "N/A" and empty strings as missing values, then drop every row that
# has at least one missing value
data = (
    pd.DataFrame(data)
    .replace(["N/A", ""], pd.NA)
    .dropna()
)
data

Unnamed: 0,ingredients,directions,image_filename,category_url,title,category,rating,rating_count,cook_time,total_time,yield,servings,calories,fat,carbs,protein
0,"[1 pound lean ground beef, 2 tablespoons dried...",[Heat a large skillet over medium-high heat. C...,5410153-e9807f995fb74fe58bd2fcd56b541010.jpg,https://www.allrecipes.com/recipes/14930/main-...,Grilled Cheese Sloppy Joes,Recipes,4.8,4,1 hr,1 hr 10 mins,8 sandwiches,8,585,39g,32g,27g
3,"[1 cup butter, 1 cup packed brown sugar, 1 cup...",[Preheat the oven to 350 degrees F (175 degree...,1059053-70a5c73ef725427e9770ad083c423e87.jpg,https://www.allrecipes.com/recipes/2452/fruits...,Golden Yam Brownies,Recipes,4.5,497,30 mins,1 hr,24 servings,24,221,10g,32g,2g
4,[1 (14 ounce) can sweetened condensed milk (su...,[Preheat the oven to 350 degrees F (175 degree...,9281485-authentic-mexican-corn-bread-Lisa-Lode...,https://www.allrecipes.com/recipes/342/bread/q...,Authentic Mexican Corn Bread,Recipes,4.3,50,45 mins,1 hr,1 pan cornbread,12,448,14g,74g,11g
6,"[6 (5-inch) corn tortillas, 3 cups chopped coo...",[Place a paper towel on a microwave-safe plate...,5414541-1e498b617cf84c479998b9903009a851.jpg,https://www.allrecipes.com/recipes/17874/main-...,Traditional Mexican Street Tacos,Latin American,4.4,9,0 mins,10 mins,6 tacos,2,697,30g,44g,64g
7,"[1 ½ cups rolled oats, 1 ½ cups sifted pastry ...",[Preheat the oven to 350 degrees F (175 degree...,7368576-869e126fb14a489bbcb6df6fb3c827f6.jpg,https://www.allrecipes.com/recipes/841/holiday...,Gramma's Date Squares,Desserts,4.6,476,25 mins,50 mins,12 squares,12,363,13g,64g,4g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19061,"[3 large eggs, 2 cups white sugar, 1 cup veget...",[Preheat the oven to 350 degrees F (175 degree...,459186-chocolate-zucchini-muffins-Pam-Ziegler-...,https://www.allrecipes.com/recipes/348/bread/q...,Chocolate Zucchini Muffins,Bread,4.7,245,20 mins,35 mins,2 dozen muffins,24,219,10g,30g,3g
19066,"[3 cups all-purpose flour, 1 teaspoon baking s...",[Preheat the oven to 350 degrees F (175 degree...,688015-3c19325f6a6d4f93b0f8928cf0e47b2e.jpg,https://www.allrecipes.com/recipes/348/bread/q...,Chocolate Chip Orange Zucchini Bread,Recipes,4.7,476,50 mins,1 hr 10 mins,2 9x5-inch loaves,20,339,18g,42g,4g
19071,"[2 pounds ground chuck, 2 tablespoons chili po...","[Place the ground chuck in a large skillet, an...",2463212-tacos-de-matamoros-Allrecipes-Magazine...,https://www.allrecipes.com/recipes/17874/main-...,Tacos De Matamoros,Recipes,4.3,111,1 hr 30 mins,2 hrs,6 servings,6,629,39g,33g,38g
19072,"[4 cups chopped sweet potato, 1 (12 ounce) jar...",[Preheat the oven to 325 degrees F (165 degree...,524430-f5bc19f3cf7544a2939f79a85a680825.jpg,https://www.allrecipes.com/recipes/2452/fruits...,Easy Candy Yams,Side Dish,4.3,57,40 mins,45 mins,1 8x8-inch dish,6,413,16g,69g,3g


In [7]:
# Delete image files that are no longer referenced by any row of the cleaned
# DataFrame (rows with NaN or empty values were removed earlier)

# Rewrap the frame that the previous cells produced
data = pd.DataFrame(data)

# Folder holding the downloaded recipe images
images_directory = 'scraping/images'

# File names that the DataFrame still points to (NaN entries ignored)
image_filenames = set(data['image_filename'].dropna())

# Every .jpg currently present in the images folder
all_images = [name for name in os.listdir(images_directory) if name.endswith('.jpg')]

# Walk the folder listing and remove whatever the DataFrame does not reference
for image in all_images:
    if image in image_filenames:
        continue  # still in use, keep it
    image_path = os.path.join(images_directory, image)
    os.remove(image_path)
    print(f"Deleted {image}")


In [8]:
# Display the DataFrame as it stands after the image clean-up cell
data

Unnamed: 0,ingredients,directions,image_filename,category_url,title,category,rating,rating_count,cook_time,total_time,yield,servings,calories,fat,carbs,protein
0,"[1 pound lean ground beef, 2 tablespoons dried...",[Heat a large skillet over medium-high heat. C...,5410153-e9807f995fb74fe58bd2fcd56b541010.jpg,https://www.allrecipes.com/recipes/14930/main-...,Grilled Cheese Sloppy Joes,Recipes,4.8,4,1 hr,1 hr 10 mins,8 sandwiches,8,585,39g,32g,27g
3,"[1 cup butter, 1 cup packed brown sugar, 1 cup...",[Preheat the oven to 350 degrees F (175 degree...,1059053-70a5c73ef725427e9770ad083c423e87.jpg,https://www.allrecipes.com/recipes/2452/fruits...,Golden Yam Brownies,Recipes,4.5,497,30 mins,1 hr,24 servings,24,221,10g,32g,2g
4,[1 (14 ounce) can sweetened condensed milk (su...,[Preheat the oven to 350 degrees F (175 degree...,9281485-authentic-mexican-corn-bread-Lisa-Lode...,https://www.allrecipes.com/recipes/342/bread/q...,Authentic Mexican Corn Bread,Recipes,4.3,50,45 mins,1 hr,1 pan cornbread,12,448,14g,74g,11g
6,"[6 (5-inch) corn tortillas, 3 cups chopped coo...",[Place a paper towel on a microwave-safe plate...,5414541-1e498b617cf84c479998b9903009a851.jpg,https://www.allrecipes.com/recipes/17874/main-...,Traditional Mexican Street Tacos,Latin American,4.4,9,0 mins,10 mins,6 tacos,2,697,30g,44g,64g
7,"[1 ½ cups rolled oats, 1 ½ cups sifted pastry ...",[Preheat the oven to 350 degrees F (175 degree...,7368576-869e126fb14a489bbcb6df6fb3c827f6.jpg,https://www.allrecipes.com/recipes/841/holiday...,Gramma's Date Squares,Desserts,4.6,476,25 mins,50 mins,12 squares,12,363,13g,64g,4g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19061,"[3 large eggs, 2 cups white sugar, 1 cup veget...",[Preheat the oven to 350 degrees F (175 degree...,459186-chocolate-zucchini-muffins-Pam-Ziegler-...,https://www.allrecipes.com/recipes/348/bread/q...,Chocolate Zucchini Muffins,Bread,4.7,245,20 mins,35 mins,2 dozen muffins,24,219,10g,30g,3g
19066,"[3 cups all-purpose flour, 1 teaspoon baking s...",[Preheat the oven to 350 degrees F (175 degree...,688015-3c19325f6a6d4f93b0f8928cf0e47b2e.jpg,https://www.allrecipes.com/recipes/348/bread/q...,Chocolate Chip Orange Zucchini Bread,Recipes,4.7,476,50 mins,1 hr 10 mins,2 9x5-inch loaves,20,339,18g,42g,4g
19071,"[2 pounds ground chuck, 2 tablespoons chili po...","[Place the ground chuck in a large skillet, an...",2463212-tacos-de-matamoros-Allrecipes-Magazine...,https://www.allrecipes.com/recipes/17874/main-...,Tacos De Matamoros,Recipes,4.3,111,1 hr 30 mins,2 hrs,6 servings,6,629,39g,33g,38g
19072,"[4 cups chopped sweet potato, 1 (12 ounce) jar...",[Preheat the oven to 325 degrees F (165 degree...,524430-f5bc19f3cf7544a2939f79a85a680825.jpg,https://www.allrecipes.com/recipes/2452/fruits...,Easy Candy Yams,Side Dish,4.3,57,40 mins,45 mins,1 8x8-inch dish,6,413,16g,69g,3g


In [None]:
# add verb count (used as a rough proxy for the number of steps)
data['verb_count'] = data['directions'].apply(count_verbs)

# add ingredients count
data['ingredient_count'] = data['ingredients'].apply(count_ingredients)

# merge yield and servings row by row: keep 'servings' and fall back to
# 'yield' wherever 'servings' is missing.
# The previous expression tested data['yield'].empty, which is True only for a
# zero-length column, so it copied one whole column and never merged anything.
data['yield_servings_merge'] = data['servings'].combine_first(data['yield'])

In [7]:
# save file to csv
# Create the output folder first: to_csv does not create missing parent
# directories, so the export would fail when data/ does not exist yet.
# NOTE(review): in file order this cell comes before the category clean-up
# further down, so the CSV keeps the original 'category' values - confirm
# that this is intended.
os.makedirs('data', exist_ok=True)
data.to_csv('data/recipe_data.csv', index=False)

In [None]:
# Function to extract the last part of the URL
def extract_last_category(url):
    """
    Extract the last non-empty segment from the URL path.
    :param url: The full category URL.
    :return: The last non-empty path segment, or "Unknown" when the URL has
             no path segments at all.
    """
    # Drop empty segments explicitly: "".split("/") yields [""], which is
    # truthy, so testing the raw split result never reached the "Unknown"
    # fallback and returned an empty string instead.
    path_parts = [part for part in urlparse(url).path.split("/") if part]
    return path_parts[-1] if path_parts else "Unknown"

# Overwrite the category column with the last path segment of each category URL
data["category"] = data["category_url"].map(extract_last_category)

# Function to convert to proper format
def to_proper_format(category):
    """
    Turn a hyphen-separated category slug into a display name.

    The slug is split on "-", every piece goes through str.capitalize
    (first character upper-cased, the rest lower-cased) and the pieces are
    joined back together with single spaces.
    :param category: The category name.
    :return: Properly formatted category name.
    """
    pieces = category.split("-")
    return " ".join(map(str.capitalize, pieces))

# Reformat every category slug into its human-readable form
data["category"] = data["category"].map(to_proper_format)

# Show how many recipes fall into each category
data["category"].value_counts()

category
Ground            89
Zucchini Bread    54
Pumpkin Bread     50
Rhubarb Pie       50
Wheat Bread       46
                  ..
Labor Day          1
Pancit             1
Smoothies          1
Main Dish          1
Bloody Marys       1
Name: count, Length: 366, dtype: int64