# Libraries

In [34]:
import torch
import os

# check if GPU is accessible

In [35]:
#check gpu access
if torch.cuda.is_available():
    print("Using GPU: "+torch.cuda.get_device_name(0) + " is available")
else: #we are using cpu
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU is available


# I. Document/Text Processing and Embedding Creation

Ingredients:
* PDF document of choice.
* Embedding model of choice.

Steps:
1. Import PDF document.
2. Process text for embedding (e.g. split into chunks of sentences).
3. Embed text chunks with embedding model.
4. Save embeddings to file for later use (embeddings will store on file for many years or until you lose your hard drive).

## 1. Import PDF document

In [36]:
# Get PDF document
pdf_path = "greg_doucette_cookbook_2_0.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, go to the link in the README to download it.")
else:
  print(f"File {pdf_path} exists.")

File greg_doucette_cookbook_2_0.pdf exists.


In [37]:
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

0it [00:00, ?it/s]

In [38]:
len(pages_and_texts)

205

# Data Cleaning

we remove all the pages that are not recipes, so we remove all the pages before case 16. and the last 21 pages


In [39]:
pages_and_texts = pages_and_texts[16:len(pages_and_texts)-21]

In [40]:
len(pages_and_texts)

168

We remove the starting title "www.gregdoucette.com T H E  U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0 Back to Table of Contents" along with all the text that remove before www.


In [41]:
# Function to clean the text
def clean_text(page):
    marker = "U L T I M A T E  A N A B O L I C  C O O K B O O K  2 . 0 Back to Table of Contents"
    text = page["text"]
    if marker in text:
        text = text.split(marker, 1)[1]

    text.replace("\u00b0F", "°F").replace("\u00b0C", "°C").replace('Anabolic','').replace("anabolic",'').replace('regular ass bread','regular bread')
    return 'Title' + text


for page in pages_and_texts:
    page["text"] = clean_text(page)

pages_and_texts[0]['text']

'Title Anabolic Apple Pie Breakfast Bake Directions 1. Pre-heat the oven to 400°F (204°C). 2. Chop the apples into small pieces. 3. In a bowl, whisk egg whites, cinnamon, sweetener, and vanilla. 4. Tear the bread into small pieces and place in a bowl with the egg whites, cinnamon, sweetener, and vanilla. Mix with your hands until the bread pieces are well soaked with the batter. 5. Spray a casserole dish with cooking spray for 1 second. Pour the egg white/bread mixture into the casserole dish. 6. Place the casserole dish uncovered in the middle rack and cook in the oven at 400°F/204°C for 40-50 minutes. P R E P  T I M E R E A D Y  I N 20 MINUTES 1 HOUR Ingredients M A K E S  1  B A T C H .  S E R V I N G  S I Z E  V A R I E S  D E P E N D I N G  O N  H O W  L A R G E  O R  S M A L L  Y O U  C U T  T H E  P I E C E S . 18 slices regular ass bread (or one  loaf [570g] of regular ass bread) 1920g (4 cartons/2000ml) egg  whites 21g (3 tbsp) cinnamon 15g (1 tbsp) vanilla extract 15 packets 

We identified that \u00b0F is Fahrenheit (°F) and \u00b0C is Celcius (°C) So we replace it. And we know it is only in the direction, because, why would the degree too cook be in the ingredient list ? 

We remove the "ass" in the dataset... because we are good boys.

In the cookbook, there are notes and links that would not be necessary, as such we will remove it.
It starts with "click to...", we don't need it, so we remove it.

In [42]:
# Input text
text = pages_and_texts[0]["text"]
text

'Title Anabolic Apple Pie Breakfast Bake Directions 1. Pre-heat the oven to 400°F (204°C). 2. Chop the apples into small pieces. 3. In a bowl, whisk egg whites, cinnamon, sweetener, and vanilla. 4. Tear the bread into small pieces and place in a bowl with the egg whites, cinnamon, sweetener, and vanilla. Mix with your hands until the bread pieces are well soaked with the batter. 5. Spray a casserole dish with cooking spray for 1 second. Pour the egg white/bread mixture into the casserole dish. 6. Place the casserole dish uncovered in the middle rack and cook in the oven at 400°F/204°C for 40-50 minutes. P R E P  T I M E R E A D Y  I N 20 MINUTES 1 HOUR Ingredients M A K E S  1  B A T C H .  S E R V I N G  S I Z E  V A R I E S  D E P E N D I N G  O N  H O W  L A R G E  O R  S M A L L  Y O U  C U T  T H E  P I E C E S . 18 slices regular ass bread (or one  loaf [570g] of regular ass bread) 1920g (4 cartons/2000ml) egg  whites 21g (3 tbsp) cinnamon 15g (1 tbsp) vanilla extract 15 packets 

In [33]:
import re
import json

# Input text
text = pages_and_texts[0]["text"]

# Function to parse the recipe text
def parse_recipe(text):
    # Extracting the title
    title_match = re.search(r'Title(.*?)Directions', text)
    title = title_match.group(1).strip() if title_match else "No Title Found"
    
    # Extracting the directions
    directions_match = re.search(r'Directions(.*?)P R E P  T I M E', text, re.DOTALL)
    directions = directions_match.group(1).strip() if directions_match else "No Directions Found"
    
    # Extracting the ingredients
    ingredients_match = re.search(r'Ingredients(.*?)N U T R I T I O N P E R  S E R V I N G', text, re.DOTALL)
    ingredients = ingredients_match.group(1).strip() if ingredients_match else "No Ingredients Found"
    
    return {
        "title": title,
        "directions": directions,
        "ingredients": ingredients
    }

# Parsing the provided text
parsed_recipe = parse_recipe(text)

# Displaying the parsed recipe
parsed_recipe


{'title': 'Anabolic Apple Pie Breakfast Bake',
 'directions': '1. Pre-heat the oven to 400°F (204°C). 2. Chop the apples into small pieces. 3. In a bowl, whisk egg whites, cinnamon, sweetener, and vanilla. 4. Tear the bread into small pieces and place in a bowl with the egg whites, cinnamon, sweetener, and vanilla. Mix with your hands until the bread pieces are well soaked with the batter. 5. Spray a casserole dish with cooking spray for 1 second. Pour the egg white/bread mixture into the casserole dish. 6. Place the casserole dish uncovered in the middle rack and cook in the oven at 400°F/204°C for 40-50 minutes.',
 'ingredients': 'M A K E S  1  B A T C H .  S E R V I N G  S I Z E  V A R I E S  D E P E N D I N G  O N  H O W  L A R G E  O R  S M A L L  Y O U  C U T  T H E  P I E C E S . 18 slices regular ass bread (or one  loaf [570g] of regular ass bread) 1920g (4 cartons/2000ml) egg  whites 21g (3 tbsp) cinnamon 15g (1 tbsp) vanilla extract 15 packets (⅝ cup) sweetener 1500g or ~10 a