In [30]:
from datasets import load_dataset
import pandas as pd
import re
from tqdm import tqdm
import os

In [2]:
# Load the "corbt/all-recipes" dataset through the `datasets` library.
dataset = load_dataset("corbt/all-recipes")



In [31]:
# Convert the "train" split to a pandas DataFrame and snapshot it to disk
# so later cells can reload it without going through `dataset` again.
df = dataset["train"].to_pandas()
df.to_pickle("DBCopy.pkl")

In [32]:
# Reload the DataFrame snapshot from "DBCopy.pkl".
# NOTE(review): a later cell writes the *filtered* frame to this same path, so
# what this loads depends on which cell wrote the file last.
df = pd.read_pickle("DBCopy.pkl")

In [33]:
# Lower-cased section headers a recipe text may contain; a row with any other
# "<header>:" line is dropped by the filtering step below.
ALLOWED_HEADERS = {
    "name",
    "ingredients",
    "directions",
}

In [34]:
def extract_headers(text):
    if not isinstance(text, str):
        return set()
    
    headers = re.findall(r"^\s*([A-Za-z ]+):",text,flags=re.MULTILINE)

    return {h.strip().lower() for h in headers}

In [35]:
def row_has_only_allowed_headers(text, allowed_headers):
    """Tell whether a recipe text uses at least one header and only allowed ones.

    Args:
        text: Raw recipe text, passed straight to ``extract_headers``.
        allowed_headers: Collection of lower-cased header names that are accepted.

    Returns:
        ``False`` when no header is found at all; otherwise ``True`` only if
        every header found is contained in ``allowed_headers``.
    """
    found = extract_headers(text)
    # A text without any header is rejected rather than treated as vacuously valid.
    return bool(found) and found.issubset(allowed_headers)

In [36]:
# Keep only the rows whose "input" text has at least one header and no header
# outside ALLOWED_HEADERS, then renumber the index from 0.
df_filtered = df[df["input"].apply(lambda x: row_has_only_allowed_headers(x, ALLOWED_HEADERS))].reset_index(drop=True)

In [37]:
# Persist the filtered frame.
# NOTE(review): this writes to the same "DBCopy.pkl" path the unfiltered
# snapshot was saved to earlier, replacing it on disk.
df_filtered.to_pickle("DBCopy.pkl")

In [38]:
def parse_recipe(text):
    """Split one raw recipe text into its name, ingredients and directions.

    The first non-blank line is taken as the recipe name. After that, a line
    starting with "ingredients" or "directions" (case-insensitive) switches the
    current section, and every line starting with "-" is added (without the
    dash) to the current section. Any other line is ignored.

    Args:
        text: Raw recipe text.

    Returns:
        A ``(name, ingredients, directions)`` tuple of strings, where the
        ingredient and direction items are joined with " | ". Text that is not
        a string, or that contains no non-blank line, yields ``("", "", "")``
        instead of raising.
    """
    # Mirror extract_headers: non-string cells (e.g. NaN) are treated as empty.
    if not isinstance(text, str):
        return "", "", ""

    lines = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]
    if not lines:
        # Nothing to take a name from; indexing lines[0] would raise IndexError.
        return "", "", ""

    name = lines[0]
    # Insertion order matters: "ingredients" is tested before "directions".
    sections = {"ingredients": [], "directions": []}
    current = None

    for line in lines[1:]:
        lowered = line.lower()

        header = next((key for key in sections if lowered.startswith(key)), None)
        if header is not None:
            current = header
            continue

        # Bullet lines seen before any section header belong nowhere and are dropped.
        if line.startswith("-") and current is not None:
            sections[current].append(line[1:].strip())

    return name, " | ".join(sections["ingredients"]), " | ".join(sections["directions"])

In [39]:
# Enable tqdm's pandas integration; the `progress_map` call below relies on it.
tqdm.pandas()

In [40]:
# Parse every recipe text into a (name, ingredients, directions) tuple, with a
# progress bar, and assemble the tuples into a three-column DataFrame.
nameIngDirDf = pd.DataFrame(
    df_filtered["input"].progress_map(parse_recipe).tolist(),
    columns=["name", "ingredients", "directions"]
)

100%|██████████| 2141272/2141272 [00:34<00:00, 61440.65it/s]


In [41]:
# Snapshot the parsed name/ingredients/directions frame to disk.
nameIngDirDf.to_pickle("Rearranged.pkl")

In [42]:
# Inspect the " | "-joined ingredient strings.
nameIngDirDf["ingredients"]

0          1 c. firmly packed brown sugar | 1/2 c. evapor...
1          1 small jar chipped beef, cut up | 4 boned chi...
2          2 (16 oz.) pkg. frozen corn | 1 (8 oz.) pkg. c...
3          1 large whole chicken | 2 (10 1/2 oz.) cans ch...
4          1 c. peanut butter | 3/4 c. graham cracker cru...
                                 ...                        
2141267    1 sweet onion, chopped (Vidalia, Maui, etc.) |...
2141268    1/2 cup chocolate hazelnut spread (recommended...
2141269    1 dozen eggs | 1 paprika | 1 salt and pepper t...
2141270    150 grams Daikon radish | 1 tbsp Sesame oil | ...
2141271    1 pound ground veal | 1/2 pound sweet Italian ...
Name: ingredients, Length: 2141272, dtype: str

In [43]:

# Pattern for a single ingredient phrase of the form
#   [quantity] [size] [unit] ingredient [(notes)]
# e.g. "1 1/2 c. flour", "2 large eggs", "1/2 cup milk (warm)".
# Phrases containing characters outside letters/whitespace/hyphens in the
# ingredient part (commas, digits, ...) do not match at all.
INGREDIENT_RE = re.compile(
    r"""
    ^\s*
    # Whole number, optionally followed by " n/d", ".d" or "/d".
    (?P<quantity>\d+(?:\s\d+\/\d+|\.\d+|\/\d+)?)?
    \s*
    # Size must be a whole word, not the prefix of a longer one.
    (?P<size>(?:large|small|medium)(?![a-zA-Z]))?
    \s*
    (?P<unit>
        # Spelled-out units. The look-ahead stops a unit from swallowing the
        # first letters of the ingredient ("c" in "chicken", "cup" in "cupcake").
        (?:cups?|tablespoons?|teaspoons?|lbs|packages?)(?![a-zA-Z])
        |
        # Abbreviations, each with an optional trailing dot.
        (?:c|tbsp|tsp|lb|oz|pkg)(?![a-zA-Z])\.?
    )?
    \s*
    # Lazy so that a trailing "(notes)" group and whitespace are left over.
    (?P<ingredient>[a-zA-Z][a-zA-Z\s\-]+?)
    (?:\s*\((?P<notes>[^)]+)\))?
    \s*$
    """,
    re.VERBOSE | re.IGNORECASE
)

In [44]:

def parse_ingredient(text: str) -> dict:
    """Break one ingredient phrase into structured fields via ``INGREDIENT_RE``.

    Args:
        text: A single ingredient phrase; surrounding whitespace is ignored.

    Returns:
        A dict with the keys "quantity", "unit", "size", "ingredient" and
        "notes". When the phrase does not match the pattern, "ingredient"
        holds the whole stripped phrase and every other field is ``None``.
    """
    cleaned = text.strip()

    # Start from the "no match" record and overwrite it when the pattern matches.
    fields = {
        "quantity": None,
        "unit": None,
        "size": None,
        "ingredient": cleaned,
        "notes": None,
    }

    match = INGREDIENT_RE.match(cleaned)
    if match is not None:
        for key in ("quantity", "unit", "size", "notes"):
            fields[key] = match.group(key)
        fields["ingredient"] = match.group("ingredient").strip()

    return fields

In [45]:
# Recipe-level table: an independent copy holding only name and directions.
recipes_df = nameIngDirDf[["name", "directions"]].copy()
# Number of recipes handled per batch by the ingredient export loop.
chunk_size = 10_000
# Destination CSV for the per-ingredient rows.
output_file = "Ingredients.csv"
# Total number of recipes to iterate over.
total_rows = len(nameIngDirDf)

In [46]:
# Sanity check: column names, first rows and row count of the recipe table.
print(recipes_df.columns)
print(recipes_df.head())
print(len(recipes_df))

Index(['name', 'directions'], dtype='str')
                    name                                         directions
0    No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...
1  Jewell Ball'S Chicken  Place chipped beef on bottom of baking dish. |...
2            Creamy Corn  In a slow cooker, combine all ingredients. Cov...
3          Chicken Funny  Boil and debone chicken. | Put bite size piece...
4     Reeses Cups(Candy)  Combine first four ingredients and press in 13...
2141272


In [47]:
# Count missing values per column.
nameIngDirDf.isna().sum()

name           0
ingredients    0
directions     0
dtype: int64

In [48]:
# Explode the " | "-joined ingredient strings into one CSV row per ingredient.
# The frame is processed in batches of `chunk_size` recipes so the parsed rows
# are flushed to disk batch by batch instead of all being held in memory.
#
# The first batch written truncates `output_file` and emits the header; later
# batches append. This keeps a re-run of the notebook from stacking a second
# copy of the data onto a file left over from a previous run.
header_written = False

for start in tqdm(range(0, total_rows, chunk_size), desc="Processing recipes"):
    chunk = nameIngDirDf.iloc[start:start + chunk_size]

    rows = []

    # recipe_id is the 1-based row position, matching the 1-based `id` column
    # that is assigned to recipes_df, so the two exported CSVs share a key.
    # Zipping the columns avoids building a Series per row as iterrows does.
    for recipe_id, (recipe_name, ingredients) in enumerate(
        zip(chunk["name"], chunk["ingredients"]), start=start + 1
    ):
        if pd.isna(ingredients):
            continue

        for ing in re.split(r"\s*\|\s*", ingredients):
            parsed = parse_ingredient(ing)
            parsed["recipe_id"] = recipe_id
            parsed["recipe_name"] = recipe_name
            rows.append(parsed)

    if not rows:
        continue

    ingredients_df = pd.DataFrame(rows)

    # Write header only once
    ingredients_df.to_csv(
        output_file,
        mode="a" if header_written else "w",
        index=False,
        header=not header_written
    )
    header_written = True

    del ingredients_df, rows  # free memory

Processing recipes:   0%|          | 0/215 [00:00<?, ?it/s]

Processing recipes: 100%|██████████| 215/215 [13:18<00:00,  3.72s/it]


In [49]:
# Row count followed by the notebook's truncated display of the parsed frame.
print(len(nameIngDirDf))
nameIngDirDf

2141272


Unnamed: 0,name,ingredients,directions
0,No-Bake Nut Cookies,1 c. firmly packed brown sugar | 1/2 c. evapor...,"In a heavy 2-quart saucepan, mix brown sugar, ..."
1,Jewell Ball'S Chicken,"1 small jar chipped beef, cut up | 4 boned chi...",Place chipped beef on bottom of baking dish. |...
2,Creamy Corn,2 (16 oz.) pkg. frozen corn | 1 (8 oz.) pkg. c...,"In a slow cooker, combine all ingredients. Cov..."
3,Chicken Funny,1 large whole chicken | 2 (10 1/2 oz.) cans ch...,Boil and debone chicken. | Put bite size piece...
4,Reeses Cups(Candy),1 c. peanut butter | 3/4 c. graham cracker cru...,Combine first four ingredients and press in 13...
...,...,...,...
2141267,Curried Asparagus Vichyssoise,"1 sweet onion, chopped (Vidalia, Maui, etc.) |...",Cook the onion in butter in a medium saucepan ...
2141268,Sunny's Fake Crepes,1/2 cup chocolate hazelnut spread (recommended...,Spread hazelnut spread on 1 side of each torti...
2141269,Devil Eggs,1 dozen eggs | 1 paprika | 1 salt and pepper t...,Boil eggs on medium for 30mins. | Then cool eg...
2141270,Extremely Easy and Quick - Namul Daikon Salad,150 grams Daikon radish | 1 tbsp Sesame oil | ...,Julienne the daikon and squeeze out the excess...


In [50]:
# NOTE(review): the bare expression below is not the cell's last statement, so
# it displays nothing and has no effect.
recipes_df
# Renumber the index from 0, then add a 1-based sequential "id" column.
recipes_df = recipes_df.reset_index(drop=True)
recipes_df["id"] = range(1, len(recipes_df) + 1)

In [51]:
# Confirm the new "id" column was added.
print(recipes_df.head())

                    name                                         directions  \
0    No-Bake Nut Cookies  In a heavy 2-quart saucepan, mix brown sugar, ...   
1  Jewell Ball'S Chicken  Place chipped beef on bottom of baking dish. |...   
2            Creamy Corn  In a slow cooker, combine all ingredients. Cov...   
3          Chicken Funny  Boil and debone chicken. | Put bite size piece...   
4     Reeses Cups(Candy)  Combine first four ingredients and press in 13...   

   id  
0   1  
1   2  
2   3  
3   4  
4   5  


In [54]:
# Move "id" to the front: pop removes the column and returns it, insert puts it
# back at position 0. The frame is modified in place.
recipes_df.insert(0, "id", recipes_df.pop("id"))
print(recipes_df.head())
print(recipes_df.columns)


   id                   name  \
0   1    No-Bake Nut Cookies   
1   2  Jewell Ball'S Chicken   
2   3            Creamy Corn   
3   4          Chicken Funny   
4   5     Reeses Cups(Candy)   

                                          directions  
0  In a heavy 2-quart saucepan, mix brown sugar, ...  
1  Place chipped beef on bottom of baking dish. |...  
2  In a slow cooker, combine all ingredients. Cov...  
3  Boil and debone chicken. | Put bite size piece...  
4  Combine first four ingredients and press in 13...  
Index(['id', 'name', 'directions'], dtype='str')


In [55]:
# Export the recipe table (id, name, directions) without the pandas index.
recipes_df.to_csv("recipes_cleaned.csv", index=False)