In [1]:
import os
import re
import pytesseract
import pandas as pd
from PIL import Image


# text_df - the filename and ocr text from all images in the pages_corrected folder with columns = ["filename", "text"]
# recipe_df -  the manually corrected recipe text with columns = ["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"]
#loading the text_df and recipe_df dataframes from pickle
text_df = pd.read_pickle("text_df.pkl")
recipe_df = pd.read_pickle("recipe_df.pkl")
recipe_df.tail()

Unnamed: 0,page,title,author,ingredients,instructions,serving_size,misc
10,20,Goldie's Black Eye Pea Cake,Goldie Jemerson -- one of her mother's recipes,Cake Ingredients:\n1 Cup Wesson Cooking Oil\n1...,To make Cake:\nMix all ingredients in one bowl...,None listed,manually lookup
11,23,Stephanie's Old Fasioned Syrup Cake,"Stephanie Dupre of Ville Platte, now living in...",1/2 Cup Buttermilk\n1/2 Cup Sugar\n1/2 Cup But...,"Mix Sugar, Syrup, Butter. -- Mix thoroughly.\n...",manually lookup,manually lookup
12,24,CJ's Pineapple Meringue Cake,Claudia--C.J. is married to Terry Fontenot of ...,\nIngredients for Cake:\n1 Cup Flour\n2 teaspo...,Instructions for Cake:\nGrease and Flour Two 9...,manually lookup,manually lookup
13,26,"Lucille's 1, 2,3, 4 Cake",Lucille Deville of Mamou,[Cake Ingredients]\n1 Cup Butter\n2 Cups Sugar...,[Cake Instructions]\nBlend Sugar with Butter a...,No serving size listed,\n\n
14,27,Verdie's Fig Cake--Loaf,Ms. Verdie from Eunice is married to Rev. Gabr...,1 Pint Cooked Figs with syrup\n1/2 Cup Butter\...,\nWe used 9x13 dish to cook. 9x15 may be bette...,No serving size listed,This goes real well with coffee and company; i...


In [2]:
#setting up the recipe df. This only needs to be run once the first time
def setup_recipe_dataframe() -> None:
    """
    Setup for the recipe_df dataframe.
    Doesn't need to be run again after the first time.
    """
    recipe_df = pd.DataFrame(columns=["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"])
    recipe_df.to_pickle("recipe_df.pkl")
    
#setup_recipe_dataframe()

In [None]:
#renaming all the files because I accidently added 2 to every page number

def add_2_to_every_page_number(image_folder:str) -> None:
    """
    Renaming all the files because I accidently added 2 to every page number

    Args:
        image_folder (str): string name of folder location with images
    """    
    #renaming all the files because I accidently added 2 to every page number
    for image in os.listdir(image_folder):
        filename = image[9:-4]
        pagenumber = int(filename[4:])
        pagenumber -= 2
        new_name = image_folder + "cookbook_page" + str(pagenumber) + ".png"
        image_location = image_folder + image
        os.rename(image_location, new_name)

In [2]:
#Instructions to create a dataframe containing the ocr text from all images in the image_folder
#first use some application to lighten the backgrounds and darken the text of your images.
# I used GIMP and its auto color correction stuff, specifically white balance under the auto menu. It worked great.
# I couldn't figure out how to make it edit a batch of photos within GIMP, so I suggest using a macro program to do this easily

image_folder = "/mnt/g/My Drive/Project Stuff/Recipe_ripper/pages_corrected/"

#running tesseract ocr on all the images in the image folder
#to create a list of tuples named recipe_list containing (filename, text) for each page image
def run_ocr_on_images_in_folder(image_folder:str) -> list:
    """
    Runs tesseract ocr on all the images in the image folder. This won't work if your
    files are named differntly from mine "cookbook_page_{page_number}.png".

    Args:
        image_folder (str): The folder where the recipe scan images are located

    Returns:
        list: a list of tuples containing (filename, text) for each page image
    """    
    recipe_list = []
    for image in os.listdir(image_folder):
        file_name = image[9:-4]
        text = pytesseract.image_to_string(Image.open(f"{image_folder}"+image))
        recipe_list.append((file_name, text))
    return recipe_list

def create_dataframe_from_recipe_list(recipe_list:list[tuple[str,str]]) -> None:
    """
    Generates a dataframe from the items in the recipe list returned from run_ocr_on_images_in_folder()

    Args:
        recipe_list (list[tuple[str,str]]): a list of tuples of the form (filename, text)
    """    
    #create 
    text_df = pd.DataFrame([ [page[0], page[1]] for page in recipe_list], columns=["filename", "text"])
    #store the ocr text in a pickle file
    text_df.to_pickle("text_df.pkl")
    
# recipe_list = run_ocr_on_images_in_folder(image_folder)
# create_dataframe_from_recipe_list(recipe_list)


In [9]:
#find the ending index of the header phrase for each page to make cleaning the text easier
def find_starting_phrase_end_index(text:str) -> int:
    """Find the ending index of whichever introductory phrase is used in the recipe.

    Args:
        text (str): the ocr text

    Returns:
        int: the index of the final character of the match. returns -1 if neither intro matches
    """    
    re_cajun_cuisine = re.compile(r"Fast")
    re_cajun_cooking = re.compile(r"Festive")
    if re_cajun_cuisine.search(text[:50]):
        return re_cajun_cuisine.search(text).end()
    elif re_cajun_cooking.search(text[:50]):
        return re_cajun_cooking.search(text).end()
    else:
        return -1
    
def cleanup_text(text:str):
    """
    Strips text, then removes the page number from end, then strips again
    Args:
        text (str): the ocr text of the recipe
    """   
    idx = find_starting_phrase_end_index(text)
    if idx != -1:
        text = text[idx:]
    text = text.strip()
    if len(text) > 0:
        while text[-1].isnumeric():
            text = text[:-1]
    return text.strip()

# for x in text_df["text"]:
#     text_df["text"][text_df["text"] == x] = cleanup_text(x)
# text_df.to_pickle("text_df.pkl")

Unnamed: 0,filename,text
0,page10,Juanita's ue be Doughnuts\nJuanita Ardoin (Mak...
1,page11,Susan's Bread Pudding\n\nSusan Saunders of Poi...
2,page12,Bea's Bread Pudding\n\nBea Fontenot of Mamou\n...
3,page13,This is_one of = =\n\nJuanita's Oatmeal Cookie...
4,page14,Juanita's Sugar Cookies\nJuanita Ardoin of Mam...


In [2]:
# attemps to automatically clean up and grab the relevant sections from the ocr text
# anything single section that fails will just return the DEFAULT_VALUE
# any exceptions that occur (2 ingredients sections, missing header text, page without a recipe, etc)
# will return DEFAULT_VALUE for all sections
DEFAULT_VALUE = "manually lookup"

def get_title_and_author(text):
    title = tmp.split("\n\n")[0].strip()
    author = tmp.split("\n\n")[1].strip()
    return title, author
    
def cleanup_text(text:str):
    """
    Strips text, then removes the page number from end, then strips again
    Args:
        text (str): the ocr text of the recipe
    """   
    text = text.strip()
    while text[-1].isnumeric():
        text = text[:-1]
    return text.strip()

def split_at_regex(text:str, split_str:str) -> list[str]:
    """
    Splits the text into everything before and everything after the heading
    to the instructions section. Then strips the whitespace on the ends of each section

    Args:
        text (str): the ocr text of the recipe
        split_str (str): the string to split the text on. 

    Returns:
        list[str]: the two sections in a list
    """
    
    reg = re.compile(r"{}".format(split_str))
    if reg.search(text):
        ret = reg.split(text)
        ret = [section.strip() for section in ret]
        return ret
    else:
        print(f"failed to find {split_str} in the text")
        return [DEFAULT_VALUE, DEFAULT_VALUE ]
  
    

def split_recipe_into_parts(text_df_row:str) -> pd.DataFrame:
    """Splits the ocr text into the different parts of the recipe returns a dataframe containing the parts.
    Because of the lack of consistent format, any issues will just return the DEFAULT_VALUE

    Args:
        text_df_row (str): a row from the text_df dataframe

    Returns:
        pd.DataFrame: a dataframe containing the ocr text split into the different parts of the recipe
    """       
    
    page = text_df_row["filename"].split("page")[1]
    text = text_df_row["text"]

    #removes the page number from the text and strips the whitespace on the ends
    text = cleanup_text(text)
    
    #get_instructions
    try:
        title_author_ingreds_text, instructions = split_at_regex(text, "Hands-On Cooking Instructions")
        
        #get ingredients
        title_author_text, ingredients = split_at_regex(title_author_ingreds_text, "Ingredients")
        ingredients = ingredients.replace("\n\n", "\n")

        #get title and author
        title, author = get_title_and_author(title_author_ingreds_text)
        
        serving_size = DEFAULT_VALUE #no specific format for this, if it appears will end up in other sections
        misc = DEFAULT_VALUE #no specific format for this, if it appears will end up in other sections
    except:
        return pd.DataFrame([{"page":page,"title":DEFAULT_VALUE,"author":DEFAULT_VALUE,
                                "ingredients":DEFAULT_VALUE, "instructions":DEFAULT_VALUE,
                                "serving_size":DEFAULT_VALUE,"misc":DEFAULT_VALUE}])
    
    tmp_df = pd.DataFrame([{"page":page,"title":title,"author":author,
                                "ingredients":ingredients, "instructions":instructions,
                                "serving_size":serving_size,"misc":misc}])
    return tmp_df

In [31]:
#create a dataframe for automatically parsing the pages using the split_recipe_into_parts() function
# pickles it after
parsed_recipes_df = pd.concat([split_recipe_into_parts(row) for index, row in text_df.iterrows()], ignore_index=True)
parsed_recipes_df.to_pickle("parsed_recipes_df.pkl")

failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Ingredients in the text
failed to find Ingredients in the text
failed to find Hands-On Cooking Instructions in the text
failed to find Ingredients in the text
failed to find Ingr

In [52]:
# just change the page number and it will output the relevant info
page_num = 24
tmp_df = parsed_recipes_df[parsed_recipes_df["page"] == str(page_num)]
row_index = 0
for index, row in tmp_df.iterrows():
    row_index = index
    print(row["title"] + "\n\n")
    print(row["author"] + "\n\n")
    print(row["ingredients"] + "\n\n")
    print(row["instructions"] + "\n\n")
    print(row["serving_size"] + "\n\n")
    print(row["misc"] + "\n\n")
    print("\n\n")

manually lookup


manually lookup


1 Cup Flour
2 teaspoons Baking Powder
2 teaspoons Vanilla
5 Tablespoons of Milk
4 Eggs (separate yolk from white)
1-1/2 Cups Sugar
1/2 Cup Vegetable Shortening
3/4 Cup Finely chopped Pecans


Grease and Flour Two 9" round cake pans

Sift Flour and Baking Powder onto wax paper
Beat Egg Whites until foamy; then beat in 1 Cup
Sugar until stiff (meringue forms peaks); then
add 1 teaspoon Vanilla

Beat Shortening and 1/2 Cup Sugar until smooth;
then beat in Egg Yolks

Beat in Flour alternately with Milk

Stir in 1 teaspoon Vanilla

Put Batter into 2 pans; top with half of the
meringue on each,

Top with Pecans

Bake at 350 degrees for 20 minutes and let cool|
.and then—go on to next page--Continued


manually lookup


manually lookup







In [53]:
#this is an area to do manual editing of the parse_df and add those recipes to the recipe_df

tmp_df["title"] ="CJ's Pineapple Meringue Cake"
tmp_df["author"] = "Claudia--C.J. is married to Terry Fontenot of Lafayette"
tmp_df["ingredients"] = """
Ingredients for Cake:
1 Cup Flour
2 teaspoons Baking Powder
2 teaspoons Vanilla
5 Tablespoons of Milk
4 Eggs (separate yolk from white)
1-1/2 Cups Sugar
1/2 Cup Vegetable Shortening
3/4 Cup Finely chopped Pecans

Ingredients for Filling:
1 small can crushed pineapple (8 oz.)
8 oz. Cool Whip
1/2 Cup Powdered Sugar
"""
tmp_df["instructions"] = """Instructions for Cake:
Grease and Flour Two 9" round cake pans

Sift Flour and Baking Powder onto wax paper

Beat Egg Whites until foamy; then beat in 1 Cup
Sugar until stiff (meringue forms peaks); then
add 1 teaspoon Vanilla

Beat Shortening and 1/2 Cup Sugar until smooth;
then beat in Egg Yolks

Beat in Flour alternately with Milk

Stir in 1 teaspoon Vanilla

Put Batter into 2 pans; top with half of the
meringue on each,

Top with Pecans

Bake at 350 degrees for 20 minutes and let cool.

Instructions for making Pineapple Cream Filling:
Drain pineapple -- press dry with paper towels
Add powdered sugar and cool whip and Mix Well.

Instructions for putting Cake together
Place 1 layer Meringue side down on a plate.
Spread Creme Filling on top, and then
Top with 2nd layer -- Meringue side up
"""
tmp_df["serving_size"] = ""
tmp_df["misc"] = ""
recipe_df = pd.concat([recipe_df, tmp_df], ignore_index=True)
recipe_df.to_pickle("recipe_df.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["title"] ="CJ's Pineapple Meringue Cake"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["author"] = "Claudia--C.J. is married to Terry Fontenot of Lafayette"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["ingredients"] = """
A value is trying to be set on a copy of a slice fro

In [4]:
recipe_df.tail()

Unnamed: 0,page,title,author,ingredients,instructions,serving_size,misc
10,20,Goldie's Black Eye Pea Cake,Goldie Jemerson -- one of her mother's recipes,Cake Ingredients:\n1 Cup Wesson Cooking Oil\n1...,To make Cake:\nMix all ingredients in one bowl...,None listed,manually lookup
11,23,Stephanie's Old Fasioned Syrup Cake,"Stephanie Dupre of Ville Platte, now living in...",1/2 Cup Buttermilk\n1/2 Cup Sugar\n1/2 Cup But...,"Mix Sugar, Syrup, Butter. -- Mix thoroughly.\n...",manually lookup,manually lookup
12,24,CJ's Pineapple Meringue Cake,Claudia--C.J. is married to Terry Fontenot of ...,\nIngredients for Cake:\n1 Cup Flour\n2 teaspo...,Instructions for Cake:\nGrease and Flour Two 9...,manually lookup,manually lookup
13,26,"Lucille's 1, 2,3, 4 Cake",Lucille Deville of Mamou,[Cake Ingredients]\n1 Cup Butter\n2 Cups Sugar...,[Cake Instructions]\nBlend Sugar with Butter a...,No serving size listed,\n\n
14,27,Verdie's Fig Cake--Loaf,Ms. Verdie from Eunice is married to Rev. Gabr...,1 Pint Cooked Figs with syrup\n1/2 Cup Butter\...,\nWe used 9x13 dish to cook. 9x15 may be bette...,No serving size listed,This goes real well with coffee and company; i...


In [66]:
# for manually adding recipes to the recipe_df
# this comment only here cuz i need this every so often:  °F
page = 81
title = "To Cook Rice the Cajun Way"
author = "Juanita Ardoin"
ingredients = """To cook short or medium grain rice use the following proportions: 1 cup water to 1 cup rice.
Long grain rice: 1-1/2 cups water to 1 cup rice.
Salt the water to taste. We usually salt it after adding water to the rice. We now use electric rice cookers.
"""
instructions = """[Instructions for cooking rice on Stove]
Use the above proportions in a pot. Heat on thestove until the water comes to a boil; then put lid on pot and lower heat until it simmers or just slightly boils--then put lid back on the pot.
If this is your first time, you should peep under the cover about every 4 minutes, as you need to know when water dissipates.
Let this go on until the water stops making bubbles on top of the rice. (The air bubbles that form will now be very big and have theappearance of starchy bubbles).
Lower wick to very low heat and lit cook forabout 20 minutes. Then...turn off the heat and let it set about 12 minutes before serving.
"""
misc = """Never stir the rice at any time during the entire Process. If you do, rice will be mushy or gooey.
If at first you don't succeed, try again. It really isn't difficult. Pretty soon you won't peek but once or twice.
"""
serving_size = """
"""

#run cell to add new to to the recipe_df
recipe_df = add_row_to_recipe_df(recipe_df=recipe_df, page=page, title=title, author=author, ingredients=ingredients, instructions=instructions, serving_size=serving_size, misc=misc)
#save it to pickle
recipe_df.to_pickle("recipe_df.pkl")
recipe_df.tail(5)

Unnamed: 0,page,title,author,ingredients,instructions,serving_size,misc
61,77,Juanita's Roux,Juanita Ardoin,1 Cup Flour\n1 Cup Cooking Oil\n,Mix together the above ingredients and brown s...,\n,\n
62,78,Juanita's Okra Gumbo,Juanita Ardoin,1 lb. fresh Okra (sliced 1/8 inch thick)\n2 Ta...,Put Cooking Oil in heavy pot and fry Okra unti...,\n,\n
63,79,Emelie's Rabbit--Turnip Stew,Emelie Frazer -- won 1997 award 'Woman of the ...,1 Rabbit (cut up)\n2 Large head of Onions (cho...,Put oil in pot. There should be enough to cove...,\n,"Emelie's recipe from her mother, Mrs. Edna Bre..."
64,80,Juanita's Old Fashioned Stew,Juanita Ardoin,2-1/2 lb. Ground Meat or Chicken\n3/4 Cup of R...,Mix chopped onions & chopped bell pepper toget...,\n,\n
65,81,To Cook Rice the Cajun Way,Juanita Ardoin,To cook short or medium grain rice use the fol...,[Instructions for cooking rice on Stove]\nUse ...,\n,Never stir the rice at any time during the ent...


In [3]:
# run this to add the stuff above to the recipe_df and store it.
def add_row_to_recipe_df(recipe_df:pd.DataFrame, page:str, title:str, author:str,
                         ingredients:str, instructions:str, serving_size:str, misc:str) -> None:
    """Adds a new recipe row to the recipe_df dataframe

    Args:
        recipe_df (pd.DataFrame): the dataframe to add the recipes to
        page (str): page_number
        title (str): title of recipe
        author (str): author of recipe
        ingredients (str): ingredients list for recipe
        instructions (str): cooking instructions for recipe
        serving_size (str): serving size of the recipe
        misc (str): other info that doesn't fit elsewhere
    """ 
    #create new df from given data
    new_df = pd.DataFrame([{"page":page,"title":title,"author":author, "ingredients":ingredients, "instructions":instructions,"serving_size":serving_size,"misc":misc}])
    #concat recipe_df with the new_df and overwrite recipe_store with it
    recipe_df = pd.concat([recipe_df, new_df], ignore_index=True)
    return recipe_df