# Libraries

In [None]:
import torch
import os
import re
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# check if GPU is accessible

In [None]:
# Report whether torch can see a CUDA device or will fall back to the CPU.
gpu_ready = torch.cuda.is_available()
device_report = (
    "Using GPU: " + torch.cuda.get_device_name(0) + " is available"
    if gpu_ready
    else "Using CPU"
)
print(device_report)

# I. Document/Text Processing and Embedding Creation

Ingredients:
* PDF document of choice.
* Embedding model of choice.

Steps:
1. Import PDF document.
2. Process text for embedding (e.g. split into chunks of sentences).
3. Embed text chunks with embedding model.
4. Save embeddings to file for later use (embeddings stay on file for many years or until you lose your hard drive).

## 1. Import PDF document

In [None]:
# Path to the cookbook PDF that the rest of the notebook parses
pdf_path = "greg_doucette_cookbook_2_0.pdf"

# Only check that the file is present; nothing is downloaded here
if not os.path.exists(pdf_path):
  print("File doesn't exist, go to the link in the README to download it.")
else:
  print(f"File {pdf_path} exists.")

In [None]:

def text_formatter(text: str) -> str:
    """Turn every newline into a space and trim the surrounding whitespace.

    The right clean-up differs from document to document, so further
    formatting steps can be added here.
    """
    flattened = " ".join(text.split("\n"))
    return flattened.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: One dictionary per page containing the 1-based page
        number, character count, word count (split on single spaces), raw
        sentence count (split on ". "), estimated token count and the
        extracted text.
    """
    doc = fitz.open(pdf_path)  # open a document
    try:
        pages_and_texts = []
        # enumerate() has no length, so pass the page count for a real progress bar.
        # Pages are numbered from 1 to match the numbering printed in the book.
        for page_number, page in tqdm(enumerate(doc, start=1), total=len(doc)):
            # The raw text is kept as-is: the later section extraction matches
            # on newline characters, so text_formatter is not applied here.
            text = page.get_text()  # get plain text encoded as UTF-8
            pages_and_texts.append({
                "page_number": page_number,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                "page_token_count": len(text) / 4,
                "text": text,
            })
    finally:
        # Release the file handle even if a page fails to parse
        doc.close()
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

In [None]:
len(pages_and_texts)

# Data Cleaning

We separate the pages into three groups: the recipes, the servings table for fruits, and the servings table for vegetables.

In [None]:
# Slice the page list by position; the offsets are hard-coded for this PDF's layout
table_of_contents = pages_and_texts[len(pages_and_texts)-19:len(pages_and_texts)-2]
recipes = pages_and_texts[16:len(pages_and_texts)-21]
# A slice, so this is a list of page dicts
vegetables_raw_and_legumes_servings_reference_table=table_of_contents[-3:-1]
# NOTE(review): a plain index, so this is one page dict rather than a list;
# len() of it counts the dict's keys, not pages
fruits_servings_reference_table=table_of_contents[-4]

In [None]:
len(recipes)

In [None]:
len(vegetables_raw_and_legumes_servings_reference_table)

In [None]:
len(fruits_servings_reference_table)

## P_D_I_P_R database creation

We create a dataset with the following columns:
* The page Number
* The directions list
* The Ingredients list
* The preparation time
* The ready in time

In [None]:
# One row per recipe page, with the statistics and raw text collected earlier
df = pd.DataFrame(recipes)
df.head()  # not the last expression in the cell, so a notebook will not display it
# Persist the unfiltered per-page data
df.to_csv('cookbook_data.csv', index=False)

### Remove the empty content (they are images in the book)

In [None]:
# Keep only the rows that mention "Ingredients" or "Directions"; everything else is plain explanation.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise pandas' SettingWithCopyWarning.
df = df[df.text.str.contains('Ingredients|Directions')].copy()

We now extract the elements we need from the text.

In [None]:
def extract_directions(text: str) -> "str | None":
    """Extract the "Directions" section from the text of one page.

    The section starts right after the word "Directions" and runs up to the
    next section marker (nutrition, note, prep/total/ready time, diet label
    or the "Ingredients" heading). When no marker follows, the section is
    taken to run to the end of the page instead of being discarded. Matching
    is case-insensitive.

    Parameters:
        text (str): Raw text of a single page.

    Returns:
        str | None: The stripped directions, or None when the page has no
        "Directions" heading.
    """
    # Markers that end the directions section
    separators = [
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "\nIngredients\n",
        "R E A D Y"
    ]

    # Alternation of the escaped markers
    separator_pattern = r'|'.join(re.escape(separator) for separator in separators)

    # Join each step number with its text: "1.\n" -> "1. "
    text = re.sub(r'(\d+)\.\n', r'\1. ', text)

    # Lazily capture everything up to the first marker, or up to the end of
    # the text (\Z) when the directions are the last section on the page
    directions_match = re.search(
        r'Directions\s*([\s\S]*?)\s*(?:' + separator_pattern + r'|\Z)',
        text,
        re.IGNORECASE,
    )
    if directions_match is None:
        return None
    return directions_match.group(1).strip()

# Extract the directions for every row, showing a tqdm progress bar
df['directions'] = df.text.progress_apply(extract_directions)

In [None]:
def extract_ingredients(text: str) -> "str | None":
    """Extract the "Ingredients" section from the text of one page.

    The section starts after an "Ingredients" heading that sits alone on its
    line and runs up to the next section marker (nutrition, note,
    prep/total/ready time, diet label or a "C L I C K" link). When no marker
    follows, the section is taken to run to the end of the page instead of
    being discarded. Matching is case-insensitive.

    Parameters:
        text (str): Raw text of a single page.

    Returns:
        str | None: The stripped ingredients, or None when the page has no
        "Ingredients" heading on its own line.
    """
    # Markers that end the ingredients section
    separators = [
        "N u t r i t i o n",
        "N o t e",
        "N U T R I T I O N",
        "N O T E",
        "P R E P",
        "T O T A L",
        "V E G E T A R I A N",
        "V E G A N",
        "R E A D Y",
        "C L I C K",
    ]

    # Alternation of the escaped markers
    separator_pattern = r'|'.join(re.escape(separator) for separator in separators)

    # Join each list number with its text: "1.\n" -> "1. "
    text = re.sub(r'(\d+)\.\n', r'\1. ', text)

    # Lazily capture everything up to the first marker, or up to the end of
    # the text (\Z) when the ingredients are the last section on the page
    ingredients_match = re.search(
        r'\nIngredients\n\s*([\s\S]*?)\s*(?:' + separator_pattern + r'|\Z)',
        text,
        re.IGNORECASE,
    )
    if ingredients_match is None:
        return None
    return ingredients_match.group(1).strip()


# Extract the ingredients for every row, showing a tqdm progress bar
df['ingredients'] = df.text.progress_apply(extract_ingredients)

In [None]:
import re
import pandas as pd

# Function to extract prep time and ready in time
def extract_prep_time_and_or_ready_in(text: str) -> dict:
    """Pull the prep time and the ready-in time out of the text of one page.

    Durations are the uppercase "<number> MINUTES" / "<number> HOUR" pairs in
    the text, in order of appearance. Which duration belongs to which field
    is decided by the relative position of the "P R E P  T I M E" and
    "R E A D Y  I N" labels. A prep time is only kept when it is in MINUTES.

    Returns:
        dict: {"prep_time": str | None, "ready_in": str | None}
    """
    durations = re.findall(r'(\d+)\s*(MINUTES|HOUR)', text)

    def ready_value(slot):
        # Any unit is accepted for the ready-in time
        if slot >= len(durations):
            return None
        return durations[slot][0] + " " + durations[slot][1]

    def prep_value(slot):
        # A prep time expressed in hours is ignored
        if slot >= len(durations) or durations[slot][1] != "MINUTES":
            return None
        return durations[slot][0] + " " + durations[slot][1]

    prep_label_at = text.find("\nP R E P  T I M E")
    ready_label_at = text.find("\nR E A D Y  I N")
    has_prep_label = prep_label_at != -1
    has_ready_label = ready_label_at != -1

    result = {"prep_time": None, "ready_in": None}
    if has_prep_label and has_ready_label:
        # The label that comes first in the text owns the first duration
        if prep_label_at < ready_label_at:
            prep_slot, ready_slot = 0, 1
        else:
            prep_slot, ready_slot = 1, 0
        result["prep_time"] = prep_value(prep_slot)
        result["ready_in"] = ready_value(ready_slot)
    elif has_prep_label:
        result["prep_time"] = prep_value(0)
    elif has_ready_label:
        result["ready_in"] = ready_value(0)
    return result


# Run the extraction on every page; wrapping the returned dict in a Series
# lets pandas spread its two keys over the two new columns
df[['prep_time', 'ready_in']] = df['text'].apply(lambda x: pd.Series(extract_prep_time_and_or_ready_in(x)))


In [None]:
df.head()

We identified that \u00b0F is Fahrenheit (°F) and \u00b0C is Celsius (°C), so we replace them. We know they only appear in the directions: why would the cooking temperature be in the ingredient list?

We remove the "ass" in the dataset... because we are good boys.

In the cookbook, there are notes and links that would not be necessary, as such we will remove it.
It starts with "click to...", we don't need it, so we remove it.

In [None]:
# List of phrases to remove
click_to_and_get_strings = [
    "CLICK TO ORDER ICON MEALS PROTEIN BREAD",
    "CLICK TO ORDER LOW-CALORIE SYRUP",
    "CLICK TO PURCHASE GUAR GUM",
    "CLICK TO ORDER WALDEN FARMS SYRUP",
    "CLICK TO PURCHASE A NINJA BLENDER",
    "CLICK TO PURCHASE MUSCLE EGG",
    "CLICK TO PURCHASE LIQUID MUSCLE",
    "CLICK TO PURCHASE MISSION CARB BALANCE TORTILLA",
    "CLICK TO PURCHASE YVES VEGGIE TOFU DOGS",
    "CLICK TO PURCHASE PALMINI LOW-CARB LASAGNA",
    "CLICK TO PURCHASE VEGGIE GROUND \"MEAT\"",
    "CLICK TO PURCHASE SUGAR-FREE CHOCOLATE SAUCE",
    "GET SUGAR-FREE CHOCOLATE JELL-O PUDDING",
    "GET CHOCOLATE SUGAR-FREE JELLO PUDDING MIX",
    "GET GUAR GUM",
    "GET PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PB2 POWDERED PEANUT BUTTER",
    "CLICK TO PURCHASE PUMPKIN PURÉE",
    "CLICK TO PURCHASE PB2 (POWDERED PEANUT BUTTER)",
    "CLICK TO PURCHASE FIBER ONE BROWNIE BAR",
    "CLICK TO PURCHASE CHOCOLATE PB2 POWDER",
    "GET BANANA SUGAR-FREE JELLO PUDDING MIX",
    "CLICK TO PURCHASE WALDEN FARMS MAPLE WALNUT SYRUP",
    "CLICK TO PURCHASE HERSHEY'S HEALTH SHELL TOPPING",
    "CLICK TO PURCHASE SUGAR-FREE JELLO CHEESECAKE PUDDING POWDER",
    "CLICK TO PURCHASE LIBBY’S 100% PURE PUMPKIN",
    "CLICK TO PURCHASE SUGAR-FREE VANILLA PUDDING JELL-O",
    "GET CHOCOLATE PB2 POWDERED PEANUT BUTTER",
]

# Function to remove specific phrases from text
def remove_phrases(text, phrases):
    """Return ``text`` with every occurrence of each phrase deleted.

    The phrases are removed one after another, in the order given.
    """
    cleaned = text
    for unwanted in phrases:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

# Strip the promotional phrases from the raw page text of every row
# NOTE(review): the directions/ingredients columns were already extracted in
# earlier cells, so this clean-up only changes the 'text' column — confirm that is intended
df['text'] = df['text'].apply(lambda x: remove_phrases(x, click_to_and_get_strings))

# Save the cleaned data to a new CSV file
cleaned_file_path = 'cleaned_cookbook_data.csv'
df.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned DataFrame
df.head()


In [None]:
# Convert whole-hour durations to minutes ("1 HOUR" -> "60 MINUTES", "2 HOUR" -> "120 MINUTES").
# The anchored pattern handles any hour count and cannot rewrite part of a
# longer value the way a plain '1 HOUR' substring replace would (e.g. "11 HOUR").
df['ready_in'] = df['ready_in'].str.replace(
    r'^(\d+) HOUR$',
    lambda match: f"{int(match.group(1)) * 60} MINUTES",
    regex=True,
)

In [None]:
# Rename the duration columns so the unit is part of the name, then reduce
# each value to the bare number: drop the word MINUTES and trim leftover spaces.
df.rename(columns={'ready_in': 'ready_in_minutes', 'prep_time': 'prep_time_minutes'}, inplace=True)

for duration_column in ('ready_in_minutes', 'prep_time_minutes'):
    df[duration_column] = df[duration_column].str.replace('MINUTES', '').str.strip()

In [None]:
# Drop the raw text and the per-page statistics; page_number is kept
df.drop(columns=['text','page_char_count', 'page_word_count', 'page_sentence_count_raw', 'page_token_count'], inplace=True)

In [None]:
df.head()

In [None]:
df.to_csv('P_D_I_P_R.csv', index=False)

We completed our first database, which contains the directions of each recipe along with the ingredients, the page number, the prep time and the ready-in time. Now we will focus on creating a second database that contains the recipe titles along with the macronutrients and the page.

##  P_T_C_F_C_F_P database creation

We create a dataset with the following columns:
* The page Number
* The title of the recipe
* The calories
* The Fat content
* the Carb content
* The fiber content
* The Protein content

We start off by getting the associated pages.

In [None]:
master_recipe_nutrition_table = table_of_contents[:-4]
master_recipe_nutrition_table

We are going to do this page by page.

In [None]:
#definition of the method to retreive the dataframe
def get_dataframe_from_table_page(text: str) -> pd.DataFrame:
    """Parse the text of one nutrition-table page into a DataFrame.

    The text is cut at the first "Vegetarian", line breaks around dashes and
    before "serving(s)" are undone, lines that look like a wrapped recipe
    title are merged, and the remaining values are read in groups of 7:
    page, recipe, calories, fat, carbs, fiber, protein.

    Parameters:
        text: Raw extracted text of a single table page.

    Returns:
        A DataFrame with the columns "Page", "Recipe", "Calories Per Serving",
        "Fat (g) per serving", "Carbs (g) per serving", "Fiber (g) per serving"
        and "Protein (g) per serving". All values are strings.

    Raises:
        IndexError: If the number of parsed values is not a multiple of 7.
    """

    # Drop everything before the first "Vegetarian" and remove every occurrence of that word.
    # NOTE(review): if "Vegetarian" is absent, find() returns -1 and only the last character is kept.
    text = text[text.find("Vegetarian"):].replace("Vegetarian","")

    # Undo line breaks next to a dash so "name - variant" stays on one line
    text = text.replace("\n-","-").replace("\n -"," -").replace("-\n","- ").replace("\n-\n","-").replace("- \n","- ")

    # Remove a newline that directly precedes "serving"/"servings" (either case of the initial s)
    text = re.sub(r'\n(?=\b[sS]erving[s]?\b)', '', text)

    # Turn only the first newline into a space
    text = text.replace("\n"," ",1)

    # Remove every "\nY" (presumably the check mark of the Vegetarian column — TODO confirm)
    text = text.replace("\nY","")

    array=text.split("\n")
    # Trim surrounding whitespace from every line
    array = [i.strip() for i in array]
    # Drop empty strings
    array = list(filter(None, array))

    # Merge wrapped recipe titles: when an element and its successor are both
    # longer than 3 characters, the successor is not purely numeric and the
    # element is not a number (periods ignored), the successor is appended to
    # the element and blanked. Elements of 3 characters or fewer are treated
    # as numbers and left alone.
    for i in range(len(array) - 1):
        if len(array[i+1]) > 3 and len(array[i]) > 3 and not array[i+1].isnumeric() and not is_number_without_period(array[i]):
            array[i] = array[i] + " " + array[i+1]
            array[i+1] = ""

    # Drop the entries blanked by the merge
    array = list(filter(None, array))
    # Each run of 7 consecutive values is one recipe: page, recipe name,
    # calories, fat, carbs, fiber, protein.

    pages=[]
    recipes=[]
    calories=[]
    fat=[]
    carbs=[]
    fiber=[]
    protein=[]

    # Step through the values 7 at a time; an incomplete last group raises IndexError
    for i in range(0,len(array),7):
        pages.append(array[i])
        recipes.append(array[i+1])
        calories.append(array[i+2])
        fat.append(array[i+3])
        carbs.append(array[i+4])
        fiber.append(array[i+5])
        protein.append(array[i+6])


    # Column name -> list of values
    nutrition_data = {
        "Page": pages,
        "Recipe": recipes,
        "Calories Per Serving": calories,
        "Fat (g) per serving": fat,
        "Carbs (g) per serving": carbs,
        "Fiber (g) per serving": fiber,
        "Protein (g) per serving": protein,
    }

    # Build the DataFrame for this page
    page_dataframe = pd.DataFrame(nutrition_data)

    return page_dataframe


# Number check that ignores periods (e.g. "12." or "1.5")
def is_number_without_period(s):
    """Return True when ``s`` is numeric once every "." has been removed."""
    without_periods = s.replace(".", "")
    return without_periods.isnumeric()

### Processing Pages of nutritional table

In [None]:
# Parse each nutrition-table page and write it to its own CSV file
for index, page in enumerate(master_recipe_nutrition_table):
    print(f"Processing page {index}...")
    # NOTE(review): this rebinds the global `df` that held the recipe data in earlier cells
    df = get_dataframe_from_table_page(page['text'])
    df.to_csv(f'nutrition_table_{index}.csv', index=False)

In [None]:
#load all the nutrition_table csv files and fuse them into one
import os
import glob

# Load every per-page CSV written by the previous cell, in page order.
# Counting the files avoids hard-coding how many table pages there are.
nutrition_page_count = len(glob.glob('nutrition_table_*.csv'))
nutrition_page_frames = [
    pd.read_csv(f'nutrition_table_{page_index}.csv')
    for page_index in range(nutrition_page_count)
]

#we concatenate all the dataframes
master_recipe_nutrition_table = pd.concat(nutrition_page_frames)

In [None]:
# we order it by the page number
P_T_C_F_C_F_P_dataframe = master_recipe_nutrition_table.sort_values(by='Page')
P_T_C_F_C_F_P_dataframe

Based on our observation of the cookbook and the dataset: for an unknown reason the page numbering is off starting from page 48, which became 49, and from then on all numbers are +1. We simply subtract this extra 1 and overwrite the page values.

In [None]:
# Every row from position 51 onward gets its page number reduced by 1.
# The first column (Page) is overwritten in a single positional assignment;
# a plain array is assigned so that no index alignment takes place.
shifted_rows = slice(51, None)
P_T_C_F_C_F_P_dataframe.iloc[shifted_rows, 0] = (
    P_T_C_F_C_F_P_dataframe.iloc[shifted_rows, 0].astype(int).to_numpy() - 1
)

In [None]:
P_T_C_F_C_F_P_dataframe

What we now need to do is manually set the vegan and vegetarian columns; we can check each recipe in the cookbook to see whether it is vegan or vegetarian.
We can see that on page 187 all recipes are vegetarian (except 29, 30, 31) and all are non-vegan, so we can simply hard-code this in Python.

In [None]:
#what we now need to do is is to manually set the vegan and vegetarian column, we can check in the cookbook each recipe to see if it is vegan or vegetarian. We can see that on page 187 there are all recipe are vegetaria (except 29,30,31) and all ar non vegan. So we can just write a python hard code that does that.

# Pages whose recipes are vegan (checked by hand in the cookbook)
# NOTE(review): page 152 is also listed in non_vegetarian_pages in the next cell — confirm which is right
vegan_pages = [51,66,76,78,91,121,142,152,164,169,196,170,175]

# Every recipe starts as non-vegan; the listed pages are then switched to "Yes"
P_T_C_F_C_F_P_dataframe['Vegan'] = "No"
P_T_C_F_C_F_P_dataframe.loc[P_T_C_F_C_F_P_dataframe['Page'].isin(vegan_pages), 'Vegan'] = "Yes"

# The Chocolate Protein Mug Cake is not vegan, so it is reset by name
is_mug_cake = P_T_C_F_C_F_P_dataframe['Recipe'] == 'Chocolate Protein Mug Cake'
P_T_C_F_C_F_P_dataframe.loc[is_mug_cake, 'Vegan'] = "No"

In [None]:
# Vegetarian flag: every recipe starts as "Yes" and the exceptions are marked afterwards
P_T_C_F_C_F_P_dataframe['Vegetarian'] = "Yes"

# Pages whose recipes are not vegetarian
non_vegetarian_pages = [
    29, 30, 31,
    63, 64, 68, 70,
    75, 80, 85,
    86, 87, 89, 90, 92, 93, 98, 99, 101, 102,
    105, 107, 108, 110, 111, 113, 115, 116, 118,
    126,
    152,
]
P_T_C_F_C_F_P_dataframe.loc[P_T_C_F_C_F_P_dataframe['Page'].isin(non_vegetarian_pages), 'Vegetarian'] = "No"

# Individual recipes that are not vegetarian, matched by their exact name
non_vegetarian_recipes = [
    'Cauliflower PIzza Crust - Per 2 Meat Lovers Pizza',
    'Sloppy Greg Sandwich - Total',
    'Sloppy Greg Sandwich - Per Serving',
    'Grilled Chicken Wrap with Mango Relish - 1 Wrap',
]
P_T_C_F_C_F_P_dataframe.loc[P_T_C_F_C_F_P_dataframe['Recipe'].isin(non_vegetarian_recipes), 'Vegetarian'] = "No"

# Individual recipes that are vegetarian, matched by their exact name
vegetarian_recipes = [
    'Egg Whites on Flatout Light OR La Tortilla OR 90-110 Calorie Wrap of Choice',
    'Egg Whites on Joseph\'s Lavash',
]
P_T_C_F_C_F_P_dataframe.loc[P_T_C_F_C_F_P_dataframe['Recipe'].isin(vegetarian_recipes), 'Vegetarian'] = "Yes"

Now that this works, we can create a category based on the table of contents.
* Breakfast: all pages from 17 to 60
* Appetizer: all pages from 62 to 67
* Tacos, Wraps and Sandwiches : all pages from 70 to 92
* Dinner: 95 to 122
* Treats: 125 to 156
* Dessert: 158 to 185

In [None]:
# Now we succeded, we can create a category based on the table of content. 
# * Breakfast: all pages from 17 to 60
# * Appetizer: all pages from 62 to 67
# * Tacos, Wraps and Sandwiches : all pages from 70 to 92
# * Dinner: 95 to 122
# * Treats: 125 to 156
# * Dessert: 158 to 185

# Each section is keyed by its first page. A page that falls between two of
# the ranges listed in the table of contents (e.g. 68 or 93, which both hold
# recipes) is placed in the section that precedes it instead of silently
# keeping the "Breakfast" default.
section_first_pages = [
    (62, "Appetizer"),
    (70, "Tacos, Wraps and Sandwiches"),
    (95, "Dinner"),
    (125, "Treats"),
    (158, "Dessert"),
]

#we create the category column
P_T_C_F_C_F_P_dataframe['Category'] = "Breakfast"
#we set the category column to the right category; later sections overwrite earlier ones
for first_page, section_name in section_first_pages:
    P_T_C_F_C_F_P_dataframe.loc[P_T_C_F_C_F_P_dataframe['Page'] >= first_page, 'Category'] = section_name


In [None]:
#we save the dataframe
P_T_C_F_C_F_P_dataframe.to_csv('P_T_C_F_C_F_P.csv', index=False)

In [None]:
# Optional clean-up: delete the intermediate per-page nutrition CSV files
for leftover_csv in glob.glob('nutrition_table_*.csv'):
    os.remove(leftover_csv)

# Dataset Optimizations Possibilities:
- On the PTCFCFP dataset, we need to correct the pages; some recipes do not exist and must be deleted.
  - Ex: "Hot Hamburg", "Ham and Cheese - Regular Ass White Bread" and "Chicken Burger" do not exist, and neither do "PB2 and Jam Sandwhich - ICON", "PB2 and Jam Sandwhich - Ezekiel" and "PB2 and Jam Sandwhich - Regular Ass White Bread".
- Add another dataset built from the fruits and vegetables tables for better information.


# Get final dataset "data"
We now have the 2 databases that hold all the information about the recipes. What we can do now is fuse them into one single dataset. We can create a dataset with the following structure:
* Page number
* page_char_count
* page_word_count
* page_sentence_count_raw
* page_token_count
* Recipe information that is composed of the "title", "ingredients", "directions", "Prep_and_ready_time_in_minutes", "Diet" and "category".

In [None]:
# Reload the two datasets saved earlier so they can be combined into the final "data" dataset:
# P_D_I_P_R.csv (page, directions, ingredients, prep time, ready-in time) and
# P_T_C_F_C_F_P.csv (page, recipe title, macros, diet flags, category)

# Load both CSV files into DataFrames
P_D_I_P_R_dataframe = pd.read_csv('P_D_I_P_R.csv')
P_T_C_F_C_F_P_dataframe = pd.read_csv('P_T_C_F_C_F_P.csv')


In [None]:
P_D_I_P_R_dataframe

In [None]:

#we create a method that based on a certain page number, we return the recipe
def get_recipe_from_page(page_number):
    """Build the directions / ingredients / timing text for one recipe page.

    The page is looked up in the global ``P_D_I_P_R_dataframe`` and the first
    matching row is formatted.

    Raises:
        IndexError: If no row has the given page number.
    """
    page_rows = P_D_I_P_R_dataframe.loc[P_D_I_P_R_dataframe['page_number'] == page_number]

    wanted_columns = ('directions', 'ingredients', 'prep_time_minutes', 'ready_in_minutes')
    directions, ingredients, prep_minutes, ready_minutes = (
        page_rows[column].values[0] for column in wanted_columns
    )

    # The template ends with a newline followed by four spaces; character
    # counts computed from the resulting text include them.
    template = "DIRECTIONS:\n{}\n\nINGREDIENTS:\n{}\nPREP TIME (in m): {}\nREADY IN (in m): {}\n    "
    return template.format(directions, ingredients, prep_minutes, ready_minutes)

# we get the title and macros from a certain page number
def get_title_and_macros(page_number):
    """Build the title / macros / diet text for every recipe on a page.

    A page can hold several recipes in the global ``P_T_C_F_C_F_P_dataframe``;
    one text block is produced per matching row and the blocks are
    concatenated. An empty string is returned when no row matches.
    """
    on_this_page = P_T_C_F_C_F_P_dataframe[P_T_C_F_C_F_P_dataframe['Page'] == page_number]

    blocks = []
    for entry in on_this_page.to_dict('records'):
        vegan_label = "vegan" if entry['Vegan'] == "Yes" else "non-vegan"
        vegetarian_label = " , vegetarian" if entry['Vegetarian'] == "Yes" else " , non-vegetarian"
        blocks.append(
            """TITLE: {}\nCALORIES (in kcal): {}\nFAT (in g): {}\nCARBS (in g): {}\nFIBER (in g): {}\nPROTEIN (in g): {}\nDIET: {}\n\n""".format(
                entry['Recipe'],
                entry['Calories Per Serving'],
                entry['Fat (g) per serving'],
                entry['Carbs (g) per serving'],
                entry['Fiber (g) per serving'],
                entry['Protein (g) per serving'],
                vegan_label + vegetarian_label,
            )
        )
    return "".join(blocks)



#We write a method to get all the informations of a recipe:
def get_recipe(page_number):
    """Return the full text for a page: title and macros first, then the recipe body."""
    body = get_recipe_from_page(page_number)
    header = get_title_and_macros(page_number)
    return f"{header}{body}"

print(get_recipe(18))

Now we are going to create the dataset "data"

In [100]:
# Empty frame that fixes the column order of the final dataset
dataset_columns = [
    'page_number',
    'page_char_count',
    'page_word_count',
    'page_sentence_count_raw',
    'page_token_count',
    'text',
]
recipe_dataset = pd.DataFrame(columns=dataset_columns)

In [101]:
# One combined recipe text per page, plus simple size statistics on that text
recipe_dataset['page_number'] = P_D_I_P_R_dataframe['page_number']
recipe_dataset['text'] = recipe_dataset['page_number'].apply(get_recipe)

recipe_texts = recipe_dataset['text']
recipe_dataset['page_char_count'] = recipe_texts.apply(len)
recipe_dataset['page_word_count'] = recipe_texts.apply(lambda body: len(body.split(" ")))
recipe_dataset['page_sentence_count_raw'] = recipe_texts.apply(lambda body: len(body.split(". ")))

# Rough estimate of 4 characters per token
recipe_dataset['page_token_count'] = recipe_dataset['page_char_count'] / 4


In [99]:
recipe_dataset.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,17,1561,296,10,390.25,TITLE: Anabolic Apple Pie Breakfast Bake - Ent...
1,18,1251,205,11,312.75,TITLE: Anabolic French Toast - Per Serving\nCA...
2,20,1885,324,10,471.25,"TITLE: Banana ""No""Tella French Toast Roll-Ups ..."
3,21,1408,235,12,352.0,TITLE: Blueberry French Toast - Large\nCALORIE...
4,22,1546,295,10,386.5,TITLE: MEGA Peach French Toast Bake - Total\nC...


In [103]:
#we check the description of the dataset
recipe_dataset.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,130.0,130.0,130.0,130.0,130.0
mean,101.5,1376.47,232.18,10.03,344.12
std,50.41,471.78,82.18,3.59,117.95
min,17.0,422.0,71.0,1.0,105.5
25%,55.25,1049.0,174.0,8.0,262.25
50%,101.5,1344.0,224.5,10.0,336.0
75%,144.75,1636.75,283.0,12.0,409.19
max,184.0,2515.0,431.0,20.0,628.75


Okay, looks like our average token count per page is about 344.

For this particular use case, it means we could embed an average whole page with the `all-mpnet-base-v2` model (this model has an input capacity of 384 tokens).

### Further text processing (splitting pages into sentences)

We are going to decompose the strings into sentences:
we want to follow the workflow of:

`Ingest text -> split it into groups/chunks -> embed the groups/chunks -> use the embeddings`

* Easier to handle than larger pages of text (especially if pages are densely filled with text).
* Can get specific and find out which group of sentences were used to help within a RAG pipeline.
> **Resource:** See [spaCy install instructions](https://spacy.io/usage). 
Let's use spaCy to break our text into sentences since it's likely a bit more robust than just using `text.split(". ")`. 