Takes the Kaggle dataset 'Recipes from Tasty' (https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json) and turns its recipes into basic dialogues using a preset list of user prompt templates.

In [None]:
# User-prompt templates for single-turn recipe requests. Each template has a
# `{title}` placeholder that is filled with the recipe name via `str.format`;
# one template is picked at random for every recipe.
ONE_STEP_TEMPLATES = [
    "How do I cook {title}?",
    "How do I make {title}?",
    "How do you make {title}?",
    "Help me make {title}.",
    "Tell me how to make {title}.",
    "How do I prepare {title}?",
    "Could you tell me how to prepare {title}?",
    "Have you got a recipe for {title}?",
    "Do you have a recipe for {title}?",
    "Could I have the recipe for {title}?",
    "Do you know how to make {title}?",
    "How do I go about making {title}?",
    "Can you tell me how to make {title}?",
]

# TWO_STEP_TEMPLATES_1 = ["What ingredients do I need to make {title}?","What ingredients do I need to cook {title}?","What do I need to make {title}?","What do I need to cook {title}?"]

# TWO_STEP_TEMPLATES_2 = ["What are the steps?","How do I prepare it?","How do I cook it?","How can I cook it?"]

In [None]:
import os
import kaggle
import pandas as pd
import json
import random
import unicodedata
import re
from fractions import Fraction
from IPython.display import display
from datasets import Dataset

# URL of the Kaggle dataset; stored in the SOURCE column of every generated row.
data_source = "https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty"
# Local data directory, created up front so later steps can write into it.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Convert fraction unicode characters to strings (e.g. ½ -> 1/2)
def convert_fraction_unicode_chars_to_strings(string):
    """Replace Unicode vulgar-fraction characters with ASCII fractions.

    Every character whose Unicode name starts with "VULGAR FRACTION"
    (e.g. ``½``, ``⅓``, ``¾``) is replaced by its ``numerator/denominator``
    spelling. All other characters, including unnamed ones such as newlines
    and tabs, are copied through unchanged.

    When a fraction directly follows a digit (a mixed number like ``1½``, or
    a previous fraction), a space is inserted so the result reads ``1 1/2``
    instead of the ambiguous ``11/2``.

    Args:
        string: The text to convert.

    Returns:
        The text with all vulgar-fraction characters spelled out in ASCII.
    """
    pieces = []
    for char in string:
        # Supplying a default keeps unicodedata.name from raising ValueError
        # on unnamed characters (control characters such as "\n" or "\t"),
        # so those characters are preserved rather than dropped.
        if not unicodedata.name(char, "").startswith("VULGAR FRACTION"):
            pieces.append(char)
            continue

        # unicodedata.numeric yields a float (e.g. 0.333...); limiting the
        # denominator recovers the exact small fraction (1/3).
        fraction = str(Fraction(unicodedata.numeric(char)).limit_denominator(100))

        # Separate the fraction from a directly preceding digit so that
        # "1½" becomes "1 1/2" rather than "11/2".
        if pieces and pieces[-1][-1:].isdecimal():
            pieces.append(" ")
        pieces.append(fraction)
    return "".join(pieces)

In [None]:
# Download the Tasty dataset archive and unzip it into the shared data
# directory (output_dir) instead of a second hard-coded copy of its path.
kaggle.api.dataset_download_files("zeeenb/recipes-from-tasty", output_dir, unzip=True)

In [None]:
def _format_ingredients(sections):
    """Render ingredient sections as a bulleted, optionally titled text list."""
    parts = []
    for section in sections:
        # Only named sections get a heading line.
        if section["name"]:
            parts.append(f"\n{section['name']}\n")
        for ingredient in section["ingredients"]:
            unit = ingredient["primary_unit"]
            # Quantity, unit and comment are optional (empty or None).
            quantity = (
                f"{convert_fraction_unicode_chars_to_strings(unit['quantity'])} " if unit["quantity"] else ""
            )
            unit_label = f"{unit['display']} " if unit["display"] else ""
            comment = f", {ingredient['extra_comment']}" if ingredient["extra_comment"] else ""
            parts.append(f"\n• {quantity}{unit_label}{ingredient['name']}{comment}")
        parts.append("\n")
    return "".join(parts)


def _format_instructions(steps):
    """Render instruction steps as a numbered list, dropping a trailing "Enjoy!"."""
    # The response already ends with its own "Enjoy ..." line. Slicing creates
    # a new list, so the loaded JSON data is not mutated; the emptiness check
    # avoids an IndexError for recipes without instructions.
    if steps and steps[-1]["display_text"] == "Enjoy!":
        steps = steps[:-1]
    return "".join(
        f"\n{number}. {convert_fraction_unicode_chars_to_strings(step['display_text'])}"
        for number, step in enumerate(steps, start=1)
    )


dishes = pd.read_csv(os.path.join(output_dir, "dishes.csv"), usecols=["language", "name", "slug"])
# Remove non-English recipes
dishes = dishes[dishes["language"] == "eng"]
# Load the per-recipe ingredients and instructions, keyed by recipe slug.
# The context manager closes the file handle deterministically.
with open(os.path.join(output_dir, "ingredient_and_instructions.json"), encoding="utf-8") as json_file:
    ingredient_and_instructions = json.load(json_file)

# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE
# The INSTRUCTION is a random choice from ONE_STEP_TEMPLATES with the title of the recipe filled in
# The RESPONSE is the ingredients and instructions for the recipe concatenated
# The SOURCE is the URL of the Kaggle dataset (data_source)
how_to_make_prefix = re.compile(r"How to Make ", flags=re.IGNORECASE)
recipes = []
for _, row in dishes.iterrows():
    # Strip the phrase "How to Make " from the recipe name (ignoring case)
    recipe_name = how_to_make_prefix.sub("", row["name"])

    recipe_data = ingredient_and_instructions[row["slug"]]
    ingredients = _format_ingredients(recipe_data["ingredient_sections"])
    instructions = _format_instructions(recipe_data["instructions"])

    # Construct the full response
    response = f"""Here's a recipe for {recipe_name}:

Ingredients:
{ingredients}
Instructions:
{instructions}

Enjoy your {recipe_name}!"""
    recipes.append(
        {
            "INSTRUCTION": random.choice(ONE_STEP_TEMPLATES).format(title=recipe_name),
            "RESPONSE": response,
            "SOURCE": data_source,
        }
    )
recipes = pd.DataFrame(recipes)

# Show the first 5 rows with full-width columns and with newlines in the
# RESPONSE column rendered correctly. `None` means "no column width limit";
# the old `-1` spelling is deprecated and rejected by current pandas.
with pd.option_context("display.max_colwidth", None):
    display(
        recipes.head().style.set_properties(
            **{
                "text-align": "left",
                "white-space": "pre-wrap",
            }
        )
    )

In [None]:
# Upload dataset to HF
# Write the dataframe to a local parquet file and load it back as a HF Dataset.
recipes.to_parquet("dataset.parquet", row_group_size=100, engine="pyarrow")
ds = Dataset.from_parquet("dataset.parquet")
# NOTE: the push below is live and runs every time this cell is executed;
# comment the line out to skip uploading the dataset to the HF Hub.
ds.push_to_hub("dctanner/oa_recipes")