In [11]:
import os
import re
import pytesseract
import pandas as pd


# text_df - the filename and ocr text from all images in the pages_corrected folder with columns = ["filename", "text"]
# recipe_df -  the manually corrected recipe text with columns = ["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"]
#loading the text_df and recipe_df dataframes from pickle
text_df = pd.read_pickle("text_df.pkl")
recipe_df = pd.read_pickle("recipe_df.pkl")
recipe_df.tail()

Unnamed: 0,page,title,author,ingredients,instructions,serving_size,misc
4,14,Juanita's Sugar Cookies,Juanita Ardoin of Mamou,\n2-1/2 Cups Flour\n1 Cup Sugar\n3/4 Cup Cooki...,"\nMix Sugar, Oil, Eggs, and Vanilla Flavor\n\n...",Makes about 3 dozen,\n
5,15,Juanita's Peanut Butter Cookies,Juanita Ardoin,\n2-1/2 Cups Flour\n1 Cup Butter\n1 Cup Peanut...,"\nMix flour, salt, baking soda, and set aside\...",No serving size listed,\n
6,16,Mama's Cajun French Toast,My Mama--Lestea S. Ardoin--wife of Kimbell Ard...,\n1 Egg\n3/4 Cup Sugar [I think he's talking a...,\nMix and Whip up all ingredients [except for ...,No serving size listed,\nI add enough Sugar to cover all the dark pla...
7,17,JAKE'S Peanut Butter Ball Cookies,JAKE ARDOIN of Eunice--a son of Dr. Brent C. A...,\nHoney\nCereal -- any kind (crunch it)\nPeanu...,\nPut 6 handfuls of cereal in zip-lock bag.\n\...,No serving size listed,"\nJake, 8 years old, is one of my grandsons. W..."
8,18,Juanita's Strawberry Bunt Cake,Juanita Ardoin,\n**Cake Ingredients**\n1 box Cake mix--white ...,"\n**Cake Instructions**\nMix the Cake mix, dry...",No serving size listed,\nThis is a grandma's recipe.\n


In [2]:
#setting up the recipe df. This only needs to be run once the first time
def setup_recipe_dataframe() -> None:
    """
    Setup for the recipe_df dataframe.
    Doesn't need to be run again after the first time.
    """
    recipe_df = pd.DataFrame(columns=["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"])
    recipe_df.to_pickle("recipe_df.pkl")
    
#setup_recipe_dataframe()

In [3]:
#Instructions to create a dataframe containing the ocr text from all images in the image_folder
#first use some application to lighten the backgrounds and darken the text of your images.
# I used GIMP and its auto color correction stuff, specifically white balance under the auto menu. It worked great.
# I couldn't figure out how to make it edit a batch of photos within GIMP, so I suggest using a macro program to do this easily

image_folder = "/mnt/g/My Drive/Project Stuff/Recipe_ripper/pages_corrected/"

#running tesseract ocr on all the images in the image folder
#to create a list of tuples named recipe_list containing (filename, text) for each page image
def run_ocr_on_images_in_folder(image_folder:str) -> list:
    """
    Runs tesseract ocr on all the images in the image folder. This won't work if your
    files are named differntly from mine "cookbook_page_{page_number}.png".

    Args:
        image_folder (str): The folder where the recipe scan images are located

    Returns:
        list: a list of tuples containing (filename, text) for each page image
    """    
    recipe_list = []
    for image in os.listdir(image_folder):
        file_name = image[9:-4]
        text = pytesseract.image_to_string(Image.open(f"{image_folder}"+image))
        recipe_list.append((file_name, text))
    return recipe_list

def create_dataframe_from_recipe_list(recipe_list:list[tuple[str,str]]) -> None:
    """
    Generates a dataframe from the items in the recipe list returned from run_ocr_on_images_in_folder()

    Args:
        recipe_list (list[tuple[str,str]]): a list of tuples of the form (filename, text)
    """    
    #create 
    text_df = pd.DataFrame([ [page[0], page[1]] for page in recipe_list], columns=["filename", "text"])
    #store the ocr text in a pickle file
    text_df.to_pickle("text_df.pkl")
    
# recipe_list = run_ocr_on_images_in_folder(image_folder)
# create_dataframe_from_recipe_list(recipe_list)


In [None]:
#renaming all the files because I accidently added 2 to every page number

image_folder = "/mnt/g/My Drive/Project Stuff/Recipe_ripper/pages_corrected/"

def add_2_to_every_page_number(image_folder:str) -> None:
    """
    Renaming all the files because I accidently added 2 to every page number

    Args:
        image_folder (str): string name of folder location with images
    """    
    #renaming all the files because I accidently added 2 to every page number
    for image in os.listdir(image_folder):
        filename = image[9:-4]
        pagenumber = int(filename[4:])
        pagenumber -= 2
        new_name = image_folder + "cookbook_page" + str(pagenumber) + ".png"
        image_location = image_folder + image
        os.rename(image_location, new_name)

In [19]:
#setting up recipe df. This only needs to be run once the first time. 
def create_recipe_dataframe() -> None:
    """
    Setup for the recipe_df dataframe.
    Doesn't need to be run again after the first time.
    """
    recipe_df = pd.DataFrame(columns=["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"])
    recipe_df.to_pickle("recipe_df.pkl")
    
# create_recipe_dataframe()

In [15]:
def get_title_and_author(text):
    if find_starting_phrase_end_index(text) != -1:
        index_to_start_on = find_starting_phrase_end_index(text) + 1
        tmp = text[index_to_start_on:].strip()
        print(tmp)
        title = tmp.split("\n\n")[0].strip()
        author = tmp.split("\n\n")[1].strip()

def split_recipe_into_parts(text_df_row:str) -> pd.DataFrame:
    """Splits the ocr text into the different parts of the recipe and stores them in the recipe_df dataframe

    Args:
        text_df_row (str): a row from the text_df dataframe

    Returns:
        pd.DataFrame: a dataframe containing the ocr text split into the different parts of the recipe
    """       
    
    
    #splitting the ocr text into the different parts of the recipe and storing them in the recipe_df dataframe
    
    page = text_df_row["filename"]
    text = text_df_row["text"]
    
    #get title and author
    if find_starting_phrase_end_index(text) != -1:
        tmp = text.split("Cajun Cuisine is Easy & Fast")[1]
        title = tmp.split("\n")
        author = tmp.split("\n")
        
    title = text.split("\n")[0]
    author = text.split("\n")[1]
    ingredients = text.split("Ingredients")[1][:text.find("Hands-On")]
    instructions = text.split("Instructions")[1].split("\n\n")[0]
    serving_size = "add manually, no specific format"
    misc = "add manually, no specific format"
    tmp_df = pd.DataFrame([{"page":page,"title":title,"author":author,
                                "ingredients":ingredients, "instructions":instructions,
                                "serving_size":serving_size,"misc":misc}])
    return tmp_df

split_recipe_into_parts(text_df.iloc[0])


IndexError: list index out of range

In [19]:
text = text_df.iloc[2]["text"]
repr(text)

'\'Cajun Cooking is Tasty and Festive\\n\\nSusan\\\'s Bread Pudding\\n\\nSusan Saunders of Point Blue\\n\\nIngredients\\n\\n9 Slices of Bread (broken into pieces)\\n4 Cups Whole Milk (warmed)\\n\\n2 Eggs (beaten)\\n\\n1-1/2 Cups Sugar\\n\\n1/2. Bar of Butter or margarine 8 oz. (melted)\\n1 Can Sweetened Condensed Milk 14 oz.\\n1 Tablespoon Vanilla Extract\\n\\nHands-On Cooking Instructions\\n\\nMix Sugar, Eggs, Butter, Vanilla and Condenses Milk in a|\\nlarge bowl.\\n\\nBeat ingredients until well blended (about 2 minutes with\\nwire whisk) ‘Then\\n\\nStir in warm Milk\\n\\nPlace broken up pieces of bread in 11 x 14 pan\\n\\nPre-heat oven to 300 degrees and Bake for 45 minutes\\n\\nIf you cut recipe in half, the cooking time is 30\\nminutes; and, the pudding should be baked in an\\n8-1/2 inch square pan.\\n\\nI sometimes use French bread for a crunchier texture,\\nI use about half a loaf.\\n\\nIt tastes best if served warm or at room temperature\\n\\n‘Susan owns "Susan\\\'s on Court" i

In [23]:
text = text_df.iloc[2]["text"]
if find_starting_phrase_end_index(text) != -1:
    index_to_start_on = find_starting_phrase_end_index(text) + 1
    tmp = text[index_to_start_on:].strip()
    print(tmp)
    title = tmp.split("\n\n")[0].strip()
    author = tmp.split("\n\n")[1].strip()
print(f"title: {title}\nauthor: {author}")

Bea's Bread Pudding

Bea Fontenot of Mamou

Ingredients

1 Can Evaporated Milk (12 02.)
2 Cups Milk

4 Egg Yokes

1/3 Cup Melted Butter

1 Cup Sugar

1 teaspoon Nutmeg

1 teaspoon Vanilla

10 Slices of Bread (cubed)

Hands-On Cooking Instructions

Beat Egg Yolks

Add Milk, Sugar, and melted butter into egg yolks
and stir until well mixed. (if you like raisinsadd now)
Fold Bread into mixture and let bread soak for about
1 minute.

Pour into baking dish and bake at 350 degrees for
45 minutes.

Making the Meringue
Use 4 Egg Whites. Add 1/4 teaspoon Cream of
Tartar and beat until stiff, then

‘Add Sugar 1 Tablespoon at a time while beating egg|
Whites (8 Tablespoons Sugar). Continue beating until
stiff peaks form.

Spread Meringue on hot pudding and retum to oven
until Meringue is brown.

12
title: Bea's Bread Pudding
author: Bea Fontenot of Mamou


In [18]:
def find_starting_phrase_end_index(text:str) -> int:
    """Find the ending index of whichever introductory phrase is used in the recipe.

    Args:
        text (str): the ocr text

    Returns:
        int: the index of the final character of the match. returns -1 if neither intro matches
    """    
    re_cajun_cuisine = re.compile(r"Cajun Cuisine is Easy & Fast")
    re_cajun_cooking = re.compile(r"Cajun Cooking is Tasty and Festive")
    if re_cajun_cuisine.match(text):
        return re_cajun_cuisine.match(text).end()
    elif re_cajun_cooking.match(text):
        return re_cajun_cooking.match(text).end()
    else:
        return -1


def test_find_starting_phrase_end_index():
    cajun_cuisine_text = text_df.iloc[2]["text"]
    cajun_cooking_text = text_df.iloc[1]["text"]
    neither_text = "this is some text that has neither 04e58][=--=897 tb+6814\n\n\n0+036525++0"

    assert(find_starting_phrase_end_index(cajun_cuisine_text) == 28)
    assert(find_starting_phrase_end_index(cajun_cuisine_text) == 34)
    assert(find_starting_phrase_end_index(neither_text) == -1)

28
34
Cajun Cuisine is Easy & Fast

Bea's Bread Pudding

Bea Fontenot of Mamou

Ingredients

1 Can Evaporated Milk (12 02.)
2 Cups Milk

4 Egg Yokes

1/3 Cup Melted Butter

1 Cup Sugar

1 teaspoon Nutmeg

1 teaspoon Vanilla

10 Slices of Bread (cubed)

Hands-On Cooking Instructions

Beat Egg Yolks

Add Milk, Sugar, and melted butter into egg yolks
and stir until well mixed. (if you like raisinsadd now)
Fold Bread into mixture and let bread soak for about
1 minute.

Pour into baking dish and bake at 350 degrees for
45 minutes.

Making the Meringue
Use 4 Egg Whites. Add 1/4 teaspoon Cream of
Tartar and beat until stiff, then

‘Add Sugar 1 Tablespoon at a time while beating egg|
Whites (8 Tablespoons Sugar). Continue beating until
stiff peaks form.

Spread Meringue on hot pudding and retum to oven
until Meringue is brown.

12




In [51]:


page = 18
title = "Juanita's Strawberry Bunt Cake"
author = "Juanita Ardoin"
serving_size = "No serving size listed"
ingredients = """
**Cake Ingredients**
1 box Cake mix--white or yellow. I use Duncan Hines
1 box Strawberry Gelatin (regular size)
10 oz. Frozen Strawberries (thawed)
1/4 Cup Cold Water
3/4 Cup high grade Oil (I use Canola)
4 whole Eggs

**Glaze Ingredients**
1/2 stick of Butter (1/4 lb.)
1 Cup Powdered Sugar
10 oz. Frozen Strawberries (thawed)
"""
instructions = """
Martha's Fresh Pear CakeMartha FuselierIngredients3 Cups Flour1 teaspoon Soda1 teaspoon Salt2 Cups Sugar1-1/2 Cups Cooking Oil3 EggsDice 3 Pears (or substitute 3 Apples)1 Cup Coconut1 Cup Pecans (pieces)1 teaspoon VanillaButterscotch ChipsHands-On Cooking InstructionsMix 3 Cups Flour, teaspoon Soda, and teaspoon Saltand Mix Well. ThenCream 2 Cups Sugar with 1-1/2 Cups CookingOil, 3 Eggs and teaspoon Vanilla -- Then Addto Above Mixture‘Add diced Pears (or Apples), Cup of Coconut,Cup of Pecans.Pour into tube Pan and cover cake withButterscotch ChipsPre-heat oven to 325 degrees and cook at 325degrees for 1-1/2 hours
"""
misc = """
This is a grandma's recipe.
"""

#run cell to add new to to the recipe_df
recipe_df = add_row_to_recipe_df(recipe_df=recipe_df, page=page, title=title, author=author, ingredients=ingredients, instructions=instructions, serving_size=serving_size, misc=misc)
#save it to pickle
recipe_df.to_pickle("recipe_df.pkl")
recipe_df.tail(5)


Unnamed: 0,page,title,author,ingredients,instructions,serving_size,misc
4,14,Juanita's Sugar Cookies,Juanita Ardoin of Mamou,\n2-1/2 Cups Flour\n1 Cup Sugar\n3/4 Cup Cooki...,"\nMix Sugar, Oil, Eggs, and Vanilla Flavor\n\n...",Makes about 3 dozen,\n
5,15,Juanita's Peanut Butter Cookies,Juanita Ardoin,\n2-1/2 Cups Flour\n1 Cup Butter\n1 Cup Peanut...,"\nMix flour, salt, baking soda, and set aside\...",No serving size listed,\n
6,16,Mama's Cajun French Toast,My Mama--Lestea S. Ardoin--wife of Kimbell Ard...,\n1 Egg\n3/4 Cup Sugar [I think he's talking a...,\nMix and Whip up all ingredients [except for ...,No serving size listed,\nI add enough Sugar to cover all the dark pla...
7,17,JAKE'S Peanut Butter Ball Cookies,JAKE ARDOIN of Eunice--a son of Dr. Brent C. A...,\nHoney\nCereal -- any kind (crunch it)\nPeanu...,\nPut 6 handfuls of cereal in zip-lock bag.\n\...,No serving size listed,"\nJake, 8 years old, is one of my grandsons. W..."
8,18,Juanita's Strawberry Bunt Cake,Juanita Ardoin,\n**Cake Ingredients**\n1 box Cake mix--white ...,"\n**Cake Instructions**\nMix the Cake mix, dry...",No serving size listed,\nThis is a grandma's recipe.\n


In [22]:

# run this to add the stuff above to the recipe_df and store it.
def add_row_to_recipe_df(recipe_df:pd.DataFrame, page:str, title:str, author:str,
                         ingredients:str, instructions:str, serving_size:str, misc:str) -> None:
    """Adds a new recipe row to the recipe_df dataframe

    Args:
        recipe_df (pd.DataFrame): the dataframe to add the recipes to
        page (str): page_number
        title (str): title of recipe
        author (str): author of recipe
        ingredients (str): ingredients list for recipe
        instructions (str): cooking instructions for recipe
        serving_size (str): serving size of the recipe
        misc (str): other info that doesn't fit elsewhere
    """ 
    #create new df from given data
    new_df = pd.DataFrame([{"page":page,"title":title,"author":author, "ingredients":ingredients, "instructions":instructions,"serving_size":serving_size,"misc":misc}])
    #concat recipe_df with the new_df and overwrite recipe_store with it
    recipe_df = pd.concat([recipe_df, new_df], ignore_index=True)
    return recipe_df