# Libraries

In [1]:
import torch
import os
import re
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# check if GPU is accessible

In [2]:
# Report which compute device PyTorch can see
if not torch.cuda.is_available():
    # No CUDA device is visible to PyTorch, so we are on the CPU
    print("Using CPU")
else:
    # Name of CUDA device 0
    print("Using GPU: " + torch.cuda.get_device_name(0) + " is available")

Using GPU: NVIDIA GeForce RTX 2060 is available


# I. Document/Text Processing and Embedding Creation

Ingredients:
* PDF document of choice.
* Embedding model of choice.

Steps:
1. Import PDF document.
2. Process text for embedding (e.g. split into chunks of sentences).
3. Embed text chunks with embedding model.
4. Save embeddings to file for later use (embeddings stay usable on disk for many years, or until you lose your hard drive).

## 1. Import PDF document

In [3]:
# Relative path of the cookbook PDF used throughout the notebook
pdf_path = "greg_doucette_cookbook_2_0.pdf"

# Only check that the PDF is present: nothing is downloaded here,
# the user is pointed to the README link instead.
if os.path.exists(pdf_path):
  print(f"File {pdf_path} exists.")
else:
  print("File doesn't exist, go to the link in the README to download it.")

File greg_doucette_cookbook_2_0.pdf exists.


In [4]:

def text_formatter(text: str) -> str:
    """Return `text` with every newline turned into a space and outer whitespace trimmed."""
    # Newline handling may need tuning per document (best to experiment)
    single_line = text.replace("\n", " ")

    # Further formatting steps can be chained here
    return single_line.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Open a PDF and collect the plain text of every page with simple size statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: One dictionary per page, in document order, with the keys:
            - "page_number": 1-based position of the page in the PDF.
            - "page_char_count": number of characters in the page text.
            - "page_word_count": number of pieces after splitting on a single space.
            - "page_sentence_count_raw": number of pieces after splitting on ". ".
            - "page_token_count": rough estimate, characters / 4.
            - "text": the page text exactly as returned by PyMuPDF.
    """
    pages_and_texts = []
    # The context manager closes the document even if extraction fails part-way
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so pass total= to let tqdm draw a real progress bar
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = page.get_text()  # get plain text encoded as UTF-8
            # The text is stored raw (text_formatter is not applied): the extraction
            # functions in this notebook anchor their patterns on "\n".
            pages_and_texts.append({"page_number": page_number + 1,  # enumerate is 0-based, pages are reported 1-based
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

# Read every page of the cookbook into a list of per-page dicts (text + statistics)
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

205it [00:00, 449.28it/s]


In [5]:
# Number of page dicts returned by open_and_read_pdf (one per PDF page)
len(pages_and_texts)

205

# Data Cleaning

We separate the pages into recipes, the fruit servings reference table and the vegetable/legume servings reference table.

In [6]:
# Positional slices of the page list; the offsets are hard-coded for this particular PDF.
# Pages from the 19th-from-last up to (but excluding) the 2nd-from-last.
# NOTE(review): the two reference tables below are carved out of this same slice — confirm the name still fits.
table_of_contents = pages_and_texts[len(pages_and_texts)-19:len(pages_and_texts)-2]
# Pages from index 16 up to (but excluding) the last 21 pages.
recipes = pages_and_texts[16:len(pages_and_texts)-21]
# Slice -> a list of two page dicts.
vegetables_raw_and_legumes_servings_reference_table=table_of_contents[-3:-1]
# Integer index -> a single page dict (not a list), unlike the slice above.
fruits_servings_reference_table=table_of_contents[-4]

In [7]:
# Number of pages kept in the recipe slice
len(recipes)

168

In [8]:
# Number of pages in the vegetables/legumes reference table (it is a list slice)
len(vegetables_raw_and_legumes_servings_reference_table)

2

In [9]:
# fruits_servings_reference_table is one page dict (taken with an integer index),
# so this counts the dict's keys rather than a number of pages.
len(fruits_servings_reference_table)

6

## P_D_I_P_R database creation

We create a dataset with the following columns:
* The page Number
* The directions list
* The Ingredients list
* The preparation time
* The ready in time

In [10]:
# One row per recipe page, one column per key of the page dicts
df = pd.DataFrame(recipes)

# Persist the raw extraction before any filtering or cleaning
df.to_csv('cookbook_data.csv', index=False)

# Preview goes last: only the final expression of a cell is displayed
df.head()

### Remove the empty content (they are images in the book)

In [11]:
# Keep only the pages whose text mentions "Ingredients" or "Directions";
# the other pages carry no recipe sections to extract.
# .copy() makes the result an independent frame, so the new columns assigned in
# later cells are not written onto a slice of the unfiltered frame
# (this is what triggers pandas' SettingWithCopyWarning).
df = df[df.text.str.contains('Ingredients|Directions')].copy()

We now extract the elements we need from the text.

In [12]:
def extract_directions(text: str) -> str:
    """
    Pull the directions section out of a page's text.

    The section starts right after the first "Directions" (matched
    case-insensitively) and stops before the first end marker that follows it.

    Parameters:
        text (str): Raw text of one page.

    Returns:
        str: The directions with surrounding whitespace stripped, or None when
        no "Directions ... <end marker>" span is found.
    """
    # Headings that mark where the directions section ends
    end_markers = (
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "\nIngredients\n",
        "R E A D Y",
    )

    # Alternation of the escaped markers, in the order listed above
    end_alternation = r'|'.join(map(re.escape, end_markers))

    # Put each step number back on the same line as its text: "1.\n" -> "1. "
    joined_steps = re.sub(r'(\d+)\.\n', r'\1. ', text)

    # Lazy capture, so the section ends at the earliest marker
    found = re.search(r'Directions\s*([\s\S]*?)\s*(' + end_alternation + r')', joined_steps, re.IGNORECASE)
    if found is None:
        return None
    return found.group(1).strip()

# Run the extractor on the text of every row (not just a sample);
# progress_apply behaves like apply but shows a tqdm progress bar.
df['directions'] = df.text.progress_apply(extract_directions)

100%|██████████| 130/130 [00:00<00:00, 5760.07it/s]


In [13]:
def extract_ingredients(text: str) -> str:
    """
    Pull the ingredients section out of a page's text.

    The section starts right after an "Ingredients" line (newline before and
    after, matched case-insensitively) and stops before the first end marker
    that follows it.

    Parameters:
        text (str): Raw text of one page.

    Returns:
        str: The ingredients with surrounding whitespace stripped, or None when
        no "Ingredients ... <end marker>" span is found.
    """
    # Headings that mark where the ingredients section ends
    end_markers = (
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "R E A D Y",
        "C L I C K",
    )

    # Alternation of the escaped markers, in the order listed above
    end_alternation = r'|'.join(map(re.escape, end_markers))

    # Put each number back on the same line as the text after it: "1.\n" -> "1. "
    joined_steps = re.sub(r'(\d+)\.\n', r'\1. ', text)

    # Locate the "Ingredients" heading; lazy capture ends at the earliest marker
    found = re.search(r'\nIngredients\n\s*([\s\S]*?)\s*(' + end_alternation + r')', joined_steps, re.IGNORECASE)
    if found is None:
        return None
    return found.group(1).strip()


# Run the extractor on the text of every row, with a tqdm progress bar
df['ingredients'] = df.text.progress_apply(extract_ingredients)

100%|██████████| 130/130 [00:00<00:00, 7857.67it/s]


In [14]:
import re
import pandas as pd

# Function to extract prep time and ready in time
def extract_prep_time_and_or_ready_in(text: str) -> dict:
    """
    Read the prep time and the ready-in time from a page's text.

    Durations are every "<number> MINUTES" / "<number> HOUR" found in the text,
    in order of appearance ("HOUR" also matches the start of "HOURS"). They are
    handed out by position: the label that appears first in the text gets the
    first duration, the other label gets the second one.

    Parameters:
        text (str): Raw text of one page.

    Returns:
        dict: {"prep_time": ..., "ready_in": ...}; each value is a
        "<number> <unit>" string or None. A prep time is only kept when its
        unit is MINUTES.
    """
    # (number, unit) pairs, in order of appearance
    durations = re.findall(r'(\d+)\s*(MINUTES|HOUR)', text)

    # Positions of the two labels (-1 when a label is absent)
    prep_pos = text.find("\nP R E P  T I M E")
    ready_pos = text.find("\nR E A D Y  I N")

    def _pick(slot, minutes_only):
        """Format the duration at `slot`, or return None when it cannot be used."""
        if slot >= len(durations):
            return None
        amount, unit = durations[slot]
        if minutes_only and unit != "MINUTES":
            return None
        return amount + " " + unit

    has_prep = prep_pos != -1
    has_ready = ready_pos != -1

    prep_time = None
    ready_in = None
    if has_prep and has_ready:
        # The label that comes first in the text owns the first duration
        if prep_pos < ready_pos:
            prep_slot, ready_slot = 0, 1
        else:
            prep_slot, ready_slot = 1, 0
        prep_time = _pick(prep_slot, True)
        ready_in = _pick(ready_slot, False)
    elif has_prep:
        prep_time = _pick(0, True)
    elif has_ready:
        ready_in = _pick(0, False)

    return {"prep_time": prep_time, "ready_in": ready_in}


# Run the extractor on every page's text. Wrapping each returned dict in a pd.Series
# makes apply() produce a two-column result, which is assigned to the new
# 'prep_time' and 'ready_in' columns.
df[['prep_time', 'ready_in']] = df['text'].apply(lambda x: pd.Series(extract_prep_time_and_or_ready_in(x)))


In [15]:
# Preview the first rows with the extracted directions, ingredients, prep_time and ready_in columns
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,directions,ingredients,prep_time,ready_in
0,17,2206,765,6,551.5,17\nwww.gregdoucette.com\nT H E U L T I M A T...,1. Pre-heat the oven to 400°F (204°C).\n2. Cho...,M A K E S 1 B A T C H . S E R V I N G \nS I...,20 MINUTES,1 HOUR
1,18,2205,616,8,551.25,18\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10 MINUTES,20 MINUTES
3,20,2736,653,3,684.0,20\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, mix the filling ingredients with...",M A K E S 2 R O L L - U P S ( 1 S E R V I ...,10 MINUTES,20 MINUTES
4,21,2019,556,8,504.75,21\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10 MINUTES,20 MINUTES
5,22,2312,816,8,578.0,22\nwww.gregdoucette.com\nT H E U L T I M A T...,1. Pre-heat the oven to 400°F (204°C).\n2. Cut...,M A K E S 1 B A T C H . S E R V I N G \nS I...,15 MINUTES,1 HOUR


We identified that \u00b0F is Fahrenheit (°F) and \u00b0C is Celsius (°C), so we replace them. We know they only appear in the directions: why would the cooking temperature be in the ingredient list?

We remove the "ass" in the dataset... because we are good boys.

In the cookbook, there are notes and links that are not necessary, so we remove them.
These start with "CLICK TO ..." or "GET ..."; we don't need them, so we remove them.

In [16]:
# Promotional "CLICK TO ..." / "GET ..." phrases to strip from the page text.
# remove_phrases() below uses str.replace, so a phrase is only removed on an exact,
# case-sensitive match.
click_to_and_get_strings = [
    "CLICK TO ORDER ICON MEALS PROTEIN BREAD",
    "CLICK TO ORDER LOW-CALORIE SYRUP",
    "CLICK TO PURCHASE GUAR GUM",
    "CLICK TO ORDER WALDEN FARMS SYRUP",
    "CLICK TO PURCHASE A NINJA BLENDER",
    "CLICK TO PURCHASE MUSCLE EGG",
    "CLICK TO PURCHASE LIQUID MUSCLE",
    "CLICK TO PURCHASE MISSION CARB BALANCE TORTILLA",
    "CLICK TO PURCHASE YVES VEGGIE TOFU DOGS",
    "CLICK TO PURCHASE PALMINI LOW-CARB LASAGNA",
    "CLICK TO PURCHASE VEGGIE GROUND \"MEAT\"",
    "CLICK TO PURCHASE SUGAR-FREE CHOCOLATE SAUCE",
    "GET SUGAR-FREE CHOCOLATE JELL-O PUDDING",
    "GET CHOCOLATE SUGAR-FREE JELLO PUDDING MIX",
    "GET GUAR GUM",
    "GET PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PUMPKIN PURÉE",
    "CLICK TO PURCHASE PB2 (POWDERED PEANUT BUTTER)",
    "CLICK TO PURCHASE FIBER ONE BROWNIE BAR",
    "CLICK TO PURCHASE CHOCOLATE PB2 POWDER",
    "GET BANANA SUGAR-FREE JELLO PUDDING MIX",
    "CLICK TO PURCHASE WALDEN FARMS MAPLE WALNUT SYRUP",
    "CLICK TO PURCHASE HERSHEY'S HEALTH SHELL TOPPING",
    "CLICK TO PURCHASE SUGAR-FREE JELLO CHEESECAKE PUDDING POWDER",
    "CLICK TO PURCHASE LIBBY’S 100% PURE PUMPKIN",
    "CLICK TO PURCHASE SUGAR-FREE VANILLA PUDDING JELL-O",
    "GET CHOCOLATE PB2 POWDERED PEANUT BUTTER",
]

# Function to remove specific phrases from text
def remove_phrases(text, phrases):
    """
    Delete every occurrence of each phrase from `text`.

    Phrases are removed one after another, in the order given, each by exact
    (case-sensitive) substring match.

    Parameters:
        text (str): The text to clean.
        phrases: Iterable of substrings to delete.

    Returns:
        str: The text with all listed phrases removed.
    """
    cleaned = text
    for unwanted in phrases:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

# Strip the promotional phrases from each page's raw text.
# NOTE(review): 'directions' and 'ingredients' were extracted from 'text' in earlier cells,
# so this cleanup does not change those columns — confirm whether they should be cleaned too.
df['text'] = df['text'].apply(lambda x: remove_phrases(x, click_to_and_get_strings))

# Save the cleaned data to a new CSV file (the DataFrame index is not written)
cleaned_file_path = 'cleaned_cookbook_data.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned DataFrame
df.head()


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,directions,ingredients,prep_time,ready_in
0,17,2206,765,6,551.5,17\nwww.gregdoucette.com\nT H E U L T I M A T...,1. Pre-heat the oven to 400°F (204°C).\n2. Cho...,M A K E S 1 B A T C H . S E R V I N G \nS I...,20 MINUTES,1 HOUR
1,18,2205,616,8,551.25,18\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10 MINUTES,20 MINUTES
3,20,2736,653,3,684.0,20\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, mix the filling ingredients with...",M A K E S 2 R O L L - U P S ( 1 S E R V I ...,10 MINUTES,20 MINUTES
4,21,2019,556,8,504.75,21\nwww.gregdoucette.com\nT H E U L T I M A T...,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10 MINUTES,20 MINUTES
5,22,2312,816,8,578.0,22\nwww.gregdoucette.com\nT H E U L T I M A T...,1. Pre-heat the oven to 400°F (204°C).\n2. Cut...,M A K E S 1 B A T C H . S E R V I N G \nS I...,15 MINUTES,1 HOUR


In [17]:
# Convert whole-hour durations to minutes so every value shares one unit.
# Values have the form "<n> HOUR" or "<n> MINUTES". Any number of hours is converted
# ("1 HOUR" -> "60 MINUTES", "2 HOUR" -> "120 MINUTES"), and the anchors stop
# multi-digit values such as "11 HOUR" from being rewritten from the middle.
# Missing values are left untouched by the .str accessor.
df['ready_in'] = df['ready_in'].str.replace(
    r'^(\d+) HOUR$',
    lambda m: f"{int(m.group(1)) * 60} MINUTES",
    regex=True,
)

In [18]:
# Rename the duration columns so the unit is part of the name.
# rename() returns a new frame; rebinding df avoids in-place mutation.
df = df.rename(columns={'ready_in': 'ready_in_minutes', 'prep_time': 'prep_time_minutes'})

# Strip the "MINUTES" unit and any surrounding whitespace, leaving only the number.
# Only "MINUTES" is removed here: hour values must already have been converted to minutes.
for minutes_col in ('ready_in_minutes', 'prep_time_minutes'):
    df[minutes_col] = df[minutes_col].str.replace('MINUTES', '').str.strip()

In [19]:
# Drop the raw page text and the per-page statistics columns (char, word, sentence and
# token counts). page_number is not in the drop list, so it is kept.
df.drop(columns=['text','page_char_count', 'page_word_count', 'page_sentence_count_raw', 'page_token_count'], inplace=True)

In [20]:
# Preview the final columns: page_number, directions, ingredients and the two duration columns
df.head()

Unnamed: 0,page_number,directions,ingredients,prep_time_minutes,ready_in_minutes
0,17,1. Pre-heat the oven to 400°F (204°C).\n2. Cho...,M A K E S 1 B A T C H . S E R V I N G \nS I...,20,60
1,18,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10,20
3,20,"1. In a bowl, mix the filling ingredients with...",M A K E S 2 R O L L - U P S ( 1 S E R V I ...,10,20
4,21,"1. In a bowl, add egg whites, sweetener, cinna...",M A K E S 1 S E R V I N G\n180g (¾ cup) egg ...,10,20
5,22,1. Pre-heat the oven to 400°F (204°C).\n2. Cut...,M A K E S 1 B A T C H . S E R V I N G \nS I...,15,60


In [21]:
# Write the finished table to CSV; index=False leaves the DataFrame index out of the file
df.to_csv('P_D_I_P_R.csv', index=False)

We have completed our first database, which contains each recipe's directions and ingredients along with the page number, the prep time and the ready-in time. Next, we create a second database that contains each recipe's title, its macronutrients and the page number.

##  P_T_C_F_C_F_P database creation

We create a dataset with the following columns:
* The page Number
* The title of the recipe
* The calories
* The Fat content
* the Carb content
* The fiber content
* The Protein content

We start off by getting the associated page.