In [1]:
import os
import pytesseract
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

%matplotlib inline


# Dataframes are saved to disk as pickle files (text_df.pkl and recipe_df.pkl).
# text_df - the filename and ocr text from all images in the pages_corrected folder with columns = ["filename", "text"]
# recipe_df -  the manually corrected recipe text with columns = ["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"]

In [2]:
#setting up the text and recipe df. This only needs to be run once the first time
def create_text_and_recipe_dataframes() -> None:
    """
    One-time setup: writes an empty recipe_df to "recipe_df.pkl".

    The stored frame has the recipe columns and no rows. Despite the
    function's name, no text_df is built or saved here.
    Doesn't need to be run again after the first time.
    """
    recipe_columns = ["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"]
    pd.DataFrame(columns=recipe_columns).to_pickle("recipe_df.pkl")
    
#create_text_and_recipe_dataframes()

In [5]:
#to create a dataframe containing the ocr text from all images in the image_folder

#first used the application IrfanView to apply color correction to all images
#and saved the resulting images to the image folder below

image_folder = "/mnt/g/My Drive/Project Stuff/Recipe_ripper/pages_corrected/"

#running tesseract ocr on all the images in the image folder
#to create a list of tuples named recipe_list containing (filename, text) for each page image
def run_ocr_on_images_in_folder(image_folder:str) -> list:
    """
    Runs tesseract ocr on all the images in the image folder

    Args:
        image_folder (str): path of the folder holding the page images; a
            trailing path separator is optional

    Returns:
        list: a list of tuples containing (filename, text) for each page image,
            ordered by sorted file name
    """
    recipe_list = []
    # os.listdir returns entries in arbitrary order; sorting makes repeated
    # runs produce the same list.
    for image in sorted(os.listdir(image_folder)):
        # Drop the first 9 and the last 4 characters of the file name,
        # e.g. "cookbook_page10.png" -> "page10".
        file_name = image[9:-4]
        # Close each image as soon as its text is extracted so open file
        # handles do not pile up over a large folder.
        with Image.open(os.path.join(image_folder, image)) as page_image:
            text = pytesseract.image_to_string(page_image)
        recipe_list.append((file_name, text))
    return recipe_list

def create_dataframe_from_recipe_list(recipe_list:list) -> None:
    """
    Builds text_df from the ocr results and stores it in "text_df.pkl"

    Args:
        recipe_list (list): one entry per page image; the first item of each
            entry becomes the "filename" column and the second the "text" column
    """
    text_df = pd.DataFrame([[page[0], page[1]] for page in recipe_list], columns=["filename", "text"])
    #store the ocr text in a pickle file
    text_df.to_pickle("text_df.pkl")
    
# Shows the (filename, text) list as this cell's output; the result is not
# assigned to a variable or passed to create_dataframe_from_recipe_list here.
run_ocr_on_images_in_folder(image_folder)


[('page10',
  "Cajun Cuisine is Easy & Fast\n\nJuanita's ue be Doughnuts\nJuanita Ardoin (Makes 5 dozen _(Gx3x3))\n\n8 Cups White Flour (not self-rising)\n2 Cups Sugar\n\n1 Cup Evaporated Milk\n\n1 Tablespoon Vanilla (pure vanilla)\n6 teaspoons baking Powder\n\n4 Eggs\n\nuse Martha White all purpose flour, Canola cooking oil\nand Clabber Girl baking powder. These doughnuts should\nnot absorb oil. If they do, try increasing the heat of your\ncooking oil.\n\nHands-On Cooking Instructions\n\nCream Eggs & Sugar\nAdd Milk & Vanilla and stir well\n\nMix Flour and Baking Powder together\nAdd Flour mixture to liquid mixture\n\nStir until if forms a gummy ball\n\nPrepare board by powdering with flour\n\nRoll the dough to about 1/4 inch thickness\n\nCut into squares of about 3 inches\n\nCut the 3 inch squares diagonally. This will make\nlitde triangles of about 3x3 inches which are easy to\nflip over when cooking.\n\nDeep Fat Fry until Golden Brown\n\nRecipe makes several dozen. One way to handl

In [None]:
#renaming all the files because I accidentally added 2 to every page number

image_folder = "/mnt/g/My Drive/Project Stuff/Recipe_ripper/pages_corrected/"

def add_2_to_every_page_number(image_folder:str) -> None:
    """
    Renaming all the files because I accidentally added 2 to every page number

    Despite its name, this function undoes that mistake: every file in the
    folder is renamed to "cookbook_page<N-2>.png", where N is the page number
    found in its current name.

    Args:
        image_folder (str): string name of folder location with images

    Raises:
        ValueError: if a file name has no integer page number at the expected
            position; raised before any file is renamed
        FileExistsError: if a rename would overwrite an existing file
    """
    # Parse every name before renaming anything, so a malformed file name
    # cannot leave the folder half renamed.
    pages = []
    for image in os.listdir(image_folder):
        # e.g. "cookbook_page12.png" -> "page12" -> 12
        filename = image[9:-4]
        pages.append((int(filename[4:]), image))

    # Rename the lowest page first. Page N moves to N-2, and whatever used to
    # be page N-2 has already moved by then, so a rename never lands on a file
    # that is still waiting for its turn (which would silently destroy it).
    for pagenumber, image in sorted(pages):
        new_name = os.path.join(image_folder, "cookbook_page" + str(pagenumber - 2) + ".png")
        image_location = os.path.join(image_folder, image)
        if os.path.exists(new_name):
            raise FileExistsError(f"refusing to overwrite existing file: {new_name}")
        os.rename(image_location, new_name)

In [None]:
#setting up recipe df. This only needs to be run once the first time. 
def create_recipe_dataframe() -> None:
    """
    Creates an empty recipe_df (columns only, no rows) and pickles it
    to "recipe_df.pkl".
    Only has to be run once, the first time.
    """
    column_names = ["page", "title", "author", "ingredients", "instructions", "serving_size", "misc"]
    empty_recipes = pd.DataFrame(columns=column_names)
    empty_recipes.to_pickle("recipe_df.pkl")
    
# create_recipe_dataframe()

In [4]:
#loading the text_df and recipe_df dataframes from pickle
#both paths are relative, so the files are looked up in the current working directory
text_df = pd.read_pickle("text_df.pkl")
recipe_df = pd.read_pickle("recipe_df.pkl")

In [None]:
	
# Manually corrected fields for the recipe on page 10.
page = 10
title = "Juanita's Cajun Doughnuts"
serving_size = "5 dozen"
author = "Juanita Ardoin"
ingredients = """
8 Cups White Flour (not self-rising)
2 Cups Sugar
1 Cup Evaporated Milk
1 Tablespoon Vanilla (pure vanilla)
6 teaspoons baking Powder
4 Eggs
"""
instructions = """
Cream Eggs & Sugar

Add Milk & Vanilla and stir well

Mix Flour and Baking Powder together

Add Flour mixture to liquid mixture

Stir until if forms a gummy ball

Prepare board by powdering with flour

Roll the dough to about 1/4 inch thickness

Cut into squares of about 3 inches

Cut the 3 inch squares diagonally. This will make
little triangles of about 3x3 inches which are easy to
flip over when cooking.

Deep Fry until Golden Brown

Recipe makes several dozen. One way to handle excess
dough: line a cookie sheet with wax paper, layer triangles
so they do not touch, cover this layer with another sheet of
wax paper and repeat the procedure. In quantities that meet
your needs, put in zip lock bags. Defrost about 10 minutes
before dropping into hot oil.
"""
misc = """
Juanita says, "I use Martha White all purpose flour, Canola cooking oil
and Clabber Girl baking powder. These doughnuts should
not absorb oil. If they do, try increasing the heat of your
cooking oil."
"""

#run to add the new row to the recipe_df
# NOTE: add_row_to_recipe_df is defined in a later cell of this notebook; that
# cell has to be run before this call, otherwise it raises NameError.
add_row_to_recipe_df(page=page, title=title, author=author, ingredients=ingredients, instructions=instructions, serving_size=serving_size, misc=misc)

In [None]:

# run this to add the stuff above to the recipe_df and store it.
def add_row_to_recipe_df(page:str, title:str, author:str, ingredients:str, instructions:str, serving_size:str, misc:str) -> None:
    """Adds a new recipe row to the recipe_df dataframe

    recipe_df is read from "recipe_df.pkl" (the same file the setup and load
    cells of this notebook use), the new row is appended, and the result is
    written back to that file so the next load sees the added recipe.

    Args:
        page (str): page_number
        title (str): title of recipe
        author (str): author of recipe
        ingredients (str): ingredients list for recipe
        instructions (str): cooking instructions for recipe
        serving_size (str): serving size of the recipe
        misc (str): other info that doesn't fit elsewhere
    """
    recipe_df = pd.read_pickle("recipe_df.pkl")
    #create new df from given data: one row holding the plain values
    new_df = pd.DataFrame([{"page":page, "title":title, "author":author, "ingredients":ingredients, "instructions":instructions, "serving_size":serving_size, "misc":misc}])
    #concat recipe_df with the new_df and overwrite the stored recipe_df with it
    #(a still-empty recipe_df is left out, since pandas deprecates concatenating empty frames)
    frames = [frame for frame in (recipe_df, new_df) if not frame.empty]
    pd.concat(frames, ignore_index=True).to_pickle("recipe_df.pkl")
