# Libraries

In [1]:
import torch
import os
import re
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# check if GPU is accessible

In [2]:
#check gpu access
if torch.cuda.is_available():
    print("Using GPU: "+torch.cuda.get_device_name(0) + " is available")
else: #we are using cpu
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU is available


# I. Document/Text Processing and Embedding Creation

Ingredients:
* PDF document of choice.
* Embedding model of choice.

Steps:
1. Import PDF document.
2. Process text for embedding (e.g. split into chunks of sentences).
3. Embed text chunks with embedding model.
4. Save embeddings to file for later use (embeddings will store on file for many years or until you lose your hard drive).

## 1. Import PDF document

In [3]:
# Get PDF document
pdf_path = "greg_doucette_cookbook_2_0.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, go to the link in the README to download it.")
else:
  print(f"File {pdf_path} exists.")

File greg_doucette_cookbook_2_0.pdf exists.


In [4]:

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        #text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number+1,  # adjust page numbers since our PDF starts on page 42
                                 "page_char_count": len(text),
                                 "page_word_count": len(text.split(" ")),
                                 "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

205it [00:00, 517.36it/s]


In [5]:
len(pages_and_texts)

205

# Data Cleaning

We separate the pages by recipe, table for fruits and table for vegetables.

In [6]:
table_of_contents = pages_and_texts[len(pages_and_texts)-19:len(pages_and_texts)-2]
recipes = pages_and_texts[16:len(pages_and_texts)-21]
vegetables_raw_and_legumes_servings_reference_table=table_of_contents[-3:-1]
fruits_servings_reference_table=table_of_contents[-4]

In [7]:
len(recipes)

168

In [8]:
len(vegetables_raw_and_legumes_servings_reference_table)

2

In [9]:
len(fruits_servings_reference_table)

6

## Recipe dataset creation

We create a dataset with the following columns:
* The page Number
* The Ingredients list
* The directions list
* The preparation time
* The ready in time

In [10]:
df = pd.DataFrame(recipes)
df.head()
df.to_csv('cookbook_data.csv', index=False)

### Remove the empty content (they are images in the book)

In [11]:
#get the rows that contain ingredients and Directions since everything else is plain explanation
df = df[df.text.str.contains('Ingredients|Directions')]

In [12]:
df.text[0]

'17\nwww.gregdoucette.com\nT H E  U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0\nBack to Table of Contents\nAnabolic Apple\nPie Breakfast Bake\nDirections\n1.\nPre-heat the oven to 400°F (204°C).\n2.\nChop the apples into small pieces.\n3.\nIn a bowl, whisk egg whites, cinnamon, sweetener, and vanilla.\n4.\nTear the bread into small pieces and place in a bowl with the egg\nwhites, cinnamon, sweetener, and vanilla. Mix with your hands\nuntil the bread pieces are well soaked with the batter.\n5.\nSpray a casserole dish with cooking spray for 1 second. Pour the\negg white/bread mixture into the casserole dish.\n6.\nPlace the casserole dish uncovered in the middle rack and cook in\nthe oven at 400°F/204°C for 40-50 minutes.\nP R E P  T I M E\nR E A D Y  I N\n20 MINUTES\n1 HOUR\nIngredients\nM A K E S  1  B A T C H .  S E R V I N G \nS I Z E  V A R I E S  D E P E N D I N G \nO N  H O W  L A R G E  O R  S M A L L \nY O U  C U T  T H E  P I E C E S .\n18 slices regular ass bread (o

we get now the elements we need from the text

In [32]:
def extract_directions(text: str) -> str:
    # List of separators to use in the search
    separators = [
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "\nIngredients\n",
        "R E A D Y"
    ]
    
    # Construct a regex pattern to match any of the separators
    separator_pattern = r'|'.join([re.escape(separator) for separator in separators])
    
    # Remove the "number.\n" -> "number. " since it is not useful
    text = re.sub(r'(\d+)\.\n', r'\1. ', text)
    
    # Search for the section "Directions"
    directions_match = re.search(r'Directions\s*([\s\S]*?)\s*(' + separator_pattern + r')', text, re.IGNORECASE)
    if directions_match:
        # Return the found directions
        return directions_match.group(1).strip()
    return None

#we test it on the first 5 rows
df['directions'] = df.text.progress_apply(extract_directions)

100%|██████████| 130/130 [00:00<00:00, 4990.71it/s]


In [51]:
def extract_ingredients(text: str) -> str:
    # List of separators to use in the search
    separators = [
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "R E A D Y",
        "C L I C K",
    ]
    
    # Construct a regex pattern to match any of the separators
    separator_pattern = r'|'.join([re.escape(separator) for separator in separators])
    
    # Remove the "number.\n" -> "number. " since it is not useful
    text = re.sub(r'(\d+)\.\n', r'\1. ', text)
    
    # Search for the section "Directions"
    ingredients_match = re.search(r'\nIngredients\n\s*([\s\S]*?)\s*(' + separator_pattern + r')', text, re.IGNORECASE)
    if ingredients_match:
        # Return the found ingredients
        return ingredients_match.group(1).strip()
    return None


df['ingredients'] = df.text.progress_apply(extract_ingredients)

100%|██████████| 130/130 [00:00<00:00, 12583.01it/s]


In [89]:
# Function to extract prep time and ready in time
def extract_prep_time_and_or_ready_in(text: str) -> dict:
    prep_time = None
    ready_in = None

    # Find all occurrences of "MINUTES" and "HOURS" along with their preceding numbers
    times = re.findall(r'(\d+)\s*(MINUTES|HOURS)', text)

    # Find the occurrence of "P R E P  T I M E" and "R E A D Y  I N"
    prep_time_index = text.find("\nP R E P  T I M E")
    ready_in_index = text.find("\nR E A D Y  I N")

    if prep_time_index != -1 and ready_in_index != -1:
        # Determine the order of prep time and ready in time based on their positions in the text
        if prep_time_index < ready_in_index:
            prep_time = times[0][0] if len(times) > 0 else None
            ready_in = times[1][0] if len(times) > 1 else None
        else:
            ready_in = times[0][0] if len(times) > 0 else None
            prep_time = times[1][0] if len(times) > 1 else None
    elif prep_time_index != -1:
        prep_time = times[0][0] if len(times) > 0 else None
    elif ready_in_index != -1:
        ready_in = times[0][0] if len(times) > 0 else None

    return {
        "prep_time": prep_time,
        "ready_in": ready_in
    }
#it has only ready in
page_80_text=df[df.page_number==80].text.values[0]
page_78_text=df[df.page_number==78].text.values[0]
page_77_text=df[df.page_number==77].text.values[0]
page_58_text=df[df.page_number==58].text.values[0]

print(extract_prep_time_and_or_ready_in(page_80_text))
print(extract_prep_time_and_or_ready_in(page_78_text))
print(extract_prep_time_and_or_ready_in(page_77_text))
print(extract_prep_time_and_or_ready_in(page_58_text))

{'prep_time': None, 'ready_in': '3'}
{'prep_time': '3', 'ready_in': '5'}
{'prep_time': None, 'ready_in': '10'}
{'prep_time': '20', 'ready_in': None}


In [86]:
page_58_text=df[df.page_number==58].text.values[0]
page_58_text

'58\nwww.gregdoucette.com\nT H E  U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0\nBack to Table of Contents\nStrawberry Peach \nProtein Muffins\nDirections\n1.\nPreheat the oven to 350°F (177°C). Place liners in a muffin\ntin and spray them with cooking spray.\n2.\nIn a bowl, mix all the dry ingredients together well. In a\nseparate bowl or a stand mixer, whip together the rest of\nthe ingredients until smooth. Add the dry ingredients to\nthe wet ingredients and mix until incorporated. Fold in the\nstrawberries and peaches and mix gently with a spoon until\nmixed.\n3.\nFill the muffin liners about ¾ of the way full with the batter.\nBake the muffins in the oven for 25 minutes or until a\ntoothpick comes out clean when you prick the muffins.\n4.\nWhile the muffins are in the oven, place the frosting\ningredients in a bowl and mix with a fork until well-blended.\n5.\nRemove the muffins from the oven and allow to cool for 15\nminutes before placing the icing on top and serving.\

In [None]:
text=df['text']

text

In [None]:
#we get the page number of the recipes it is always the first 2-3 characters of the text
page_numbers=[]
for i in text:
    page_numbers.append(i[:3])
page_numbers 

In [None]:
#we remove the rows that have page_sentence_count_raw <5
df = df[df['page_sentence_count_raw'] > 5]
df = df.reset_index(drop=True)
df.head()

In [None]:
df.to_csv("cookbook_data.csv", index=False)

In [None]:
pages_and_texts[1]

We remove the starting title "www.gregdoucette.com T H E  U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0 Back to Table of Contents" along with all the text that remove before www.


In [None]:
# Function to clean the text
def clean_text(page):
    marker = "U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0 Back to Table of Contents"
    text = page["text"]
    if marker in text:
        text = text.split(marker, 1)[1]

    #if text does not contain at the beginning "Title" we add it
    if not text.startswith("Title"):
        text = "Title" + text
    text.replace("\u00b0F", "°F").replace("\u00b0C", "°C").replace('regular ass bread','regular bread')
    return text


for page in pages_and_texts:
    page["text"] = clean_text(page)

pages_and_texts[0]['text']

We identified that \u00b0F is Fahrenheit (°F) and \u00b0C is Celcius (°C) So we replace it. And we know it is only in the direction, because, why would the degree too cook be in the ingredient list ? 

We remove the "ass" in the dataset... because we are good boys.

In the cookbook, there are notes and links that would not be necessary, as such we will remove it.
It starts with "click to...", we don't need it, so we remove it.

In [None]:
# List of phrases to remove
click_to_and_get_strings = [
    "CLICK TO ORDER ICON MEALS PROTEIN BREAD",
    "CLICK TO ORDER LOW-CALORIE SYRUP",
    "CLICK TO PURCHASE GUAR GUM",
    "CLICK TO ORDER WALDEN FARMS SYRUP",
    "CLICK TO PURCHASE A NINJA BLENDER",
    "CLICK TO PURCHASE MUSCLE EGG",
    "CLICK TO PURCHASE LIQUID MUSCLE",
    "CLICK TO PURCHASE MISSION CARB BALANCE TORTILLA",
    "CLICK TO PURCHASE YVES VEGGIE TOFU DOGS",
    "CLICK TO PURCHASE PALMINI LOW-CARB LASAGNA",
    "CLICK TO PURCHASE VEGGIE GROUND \"MEAT\"",
    "CLICK TO PURCHASE SUGAR-FREE CHOCOLATE SAUCE",
    "GET SUGAR-FREE CHOCOLATE JELL-O PUDDING",
    "GET CHOCOLATE SUGAR-FREE JELLO PUDDING MIX",
    "GET GUAR GUM",
    "GET PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PUMPKIN PURÉE",
    "CLICK TO PURCHASE PB2 (POWDERED PEANUT BUTTER)",
    "CLICK TO PURCHASE FIBER ONE BROWNIE BAR",
    "CLICK TO PURCHASE CHOCOLATE PB2 POWDER",
    "GET BANANA SUGAR-FREE JELLO PUDDING MIX",
    "CLICK TO PURCHASE WALDEN FARMS MAPLE WALNUT SYRUP",
    "CLICK TO PURCHASE HERSHEY'S HEALTH SHELL TOPPING",
    "CLICK TO PURCHASE SUGAR-FREE JELLO CHEESECAKE PUDDING POWDER",
    "CLICK TO PURCHASE LIBBY’S 100% PURE PUMPKIN",
    "CLICK TO PURCHASE SUGAR-FREE VANILLA PUDDING JELL-O",
    "GET CHOCOLATE PB2 POWDERED PEANUT BUTTER",
]

# Function to remove specific phrases from text
def remove_phrases(text, phrases):
    for phrase in phrases:
        text = text.replace(phrase, '')
    return text

# Apply cleaning to all texts in the dataset
df['text'] = df['text'].apply(lambda x: remove_phrases(x, click_to_and_get_strings))

# Save the cleaned data to a new CSV file
cleaned_file_path = '/mnt/data/cleaned_cookbook_data.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned DataFrame
df.head()


In [None]:
spaced_strings

In [None]:
cleaned_pages = []
for page in pages_and_texts:
    text = page['text']
    cleaned_text = re.sub(r'CLICK TO.*', '', text)
    cleaned_pages.append({'page_number': page['page_number'], 'page_char_count': len(cleaned_text), 'page_word_count': len(cleaned_text.split()), 'page_sentence_count_raw': len(cleaned_text.split('. ')), 'page_token_count': len(cleaned_text) / 4, 'text': cleaned_text})

cleaned_pages

In [None]:
# Input text
text = pages_and_texts[0]["text"]
text

In [None]:
import re
import json

# Input text
text = pages_and_texts[0]["text"]

# Function to parse the recipe text
def extract_directions_and_ingredients(text):
    
    # Extracting the directions
    directions_match = re.search(r'Directions(.*?)P R E P  T I M E', text, re.DOTALL)
    directions = directions_match.group(1).strip() if directions_match else "No Directions Found"
    
    # Extracting the ingredients
    ingredients_match = re.search(r'Ingredients(.*?)N U T R I T I O N P E R  S E R V I N G', text, re.DOTALL)
    ingredients = ingredients_match.group(1).strip() if ingredients_match else "No Ingredients Found"
    
    return {
        "directions": directions,
        "ingredients": ingredients
    }

# Parsing the provided text
parsed_recipe = extract_directions_and_ingredients(text)

# Displaying the parsed recipe
parsed_recipe
