# Indexing Recipes Using LazyGraphRag
This notebook demonstrates how to index using the LazyGraphRag library.

Learn more about LazyGraphRag here: [GraphRAG](https://datastax.github.io/graph-rag/examples/lazy-graph-rag/?h=lazy)

## Datasets
The datasets used in this notebook are:
- **CookingRecipes Dataset**:
    source: https://huggingface.co/datasets/CodeKapital/CookingRecipes
    description: A dataset of cooking recipes with ingredients, directions, and other relevant information.
- **Q&A For Recipes Dataset**:
    source: https://huggingface.co/datasets/Hieu-Pham/cooking_squad
    description: A dataset of cooking-related questions and answers to help users troubleshoot issues with recipe directions. The context for each question is a recipe from the CookingRecipes dataset.
- **General preference Q&A Dataset**:
    source: https://huggingface.co/datasets/andrewsiah/se_cooking_preference_sft
    description: A dataset of questions and answers to help better inform users about cooking techniques and ingredients.

## Instantiation

In [None]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

# Load environment variables from the .env file one directory above the notebook
# (presumably where the OpenAI credentials live -- confirm).
load_dotenv(dotenv_path='../.env')

# Embedding model handle; it is passed to the vector store when documents are indexed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# !pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

In [None]:
### This is a speed test for reading a parquet file with Pandas and cuDF
# import cudf
# import pandas as pd
# import time
# # Pandas
# pd_start = time.time()
# pf_df = pd.read_parquet("./data/cooking_recipes.parquet")
# # pf_df["tip_percentage"] = pf_df["tip"] / pf_df["total_bill"] * 100
# print(pf_df.shape)
# pd_end = time.time()
# print(f"Pandas read_parquet took {pd_end - pd_start} seconds")

# # cuDF
# cudf_start = time.time()
# cudf_df = cudf.read_parquet("./data/cooking_recipes.parquet")
# # cudf_df["tip_percentage"] = cudf_df["tip"] / cudf_df["total_bill"] * 100
# print(cudf_df.shape)
# cudf_end = time.time()
# print(f"cuDF read_parquet took {cudf_end - cudf_start} seconds")

############################################################################

## Prepare Recipe Data

In [None]:
# %load_ext cudf.pandas
# %reload_ext cudf.pandas
import requests 
import polars as pl
import pandas as pd
import os
import time

# Maximum number of recipes to keep; the cached parquet file holds at most this many rows.
n = 250000
recipes_cache_path = './data/cooking_recipes.parquet.gzip'

if os.path.exists(recipes_cache_path):
    # Reuse the local copy instead of downloading the dataset again.
    recipes_df = pd.read_parquet(recipes_cache_path).head(n)
else:
    # The response body is iterated as a list of parquet shard URLs for the train split.
    r = requests.get(
        "https://huggingface.co/api/datasets/CodeKapital/CookingRecipes/parquet/default/train",
        timeout=60,
    )
    # Fail loudly on an HTTP error instead of treating an error payload as a URL list.
    r.raise_for_status()

    shard_frames = []
    for shard_url in r.json():
        print(shard_url)
        shard_frames.append(pl.read_parquet(shard_url).to_pandas())

    # ignore_index gives the combined frame a fresh 0..N-1 index; a bare .reindex()
    # returns the frame unchanged and would leave every shard's index overlapping.
    recipes_df = pd.concat(shard_frames, ignore_index=True)

    # only keep the first n rows
    recipes_df = recipes_df.head(n)
    # save the data to parquet in ./data, creating the directory on a first run
    os.makedirs(os.path.dirname(recipes_cache_path), exist_ok=True)
    recipes_df.to_parquet(recipes_cache_path, compression='gzip')

# rename the column 'Unnamed: 0' to 'i64' (returns a new frame rather than mutating a head() slice)
recipes_df = recipes_df.rename(columns={'Unnamed: 0': 'i64'})

In [None]:
# recipes_df.head(5)

In [None]:
import json
import re

# Matches every character that is not an ASCII letter (digits, spaces and punctuation included).
regex = re.compile('[^a-zA-Z]')


def _parse_json_list(value):
    """Decode a JSON-encoded string; return any non-string value unchanged.

    The pass-through makes this cell safe to re-run: after the first run the columns
    already hold decoded objects, and json.loads would raise a TypeError on them.
    """
    return json.loads(value) if isinstance(value, str) else value


# Keep rows that have a title, keep the first row per title, and rename "NER" to "ner".
# Chaining builds a new frame, so the column assignments below do not write into a slice.
recipes_df = (
    recipes_df[recipes_df['title'].notnull()]
    .drop_duplicates(subset='title')
    .rename(columns={'NER': 'ner'})
)

# source_id is "<i64>_<title>" where the title keeps only its ASCII letters, lowercased.
# The regex removes spaces along with every other non-letter, so the title part has no
# separators (the former .replace(' ', '_') never matched anything and was dropped).
recipes_df['source_id'] = [
    f"{i64}_{regex.sub('', title).lower()}"
    for i64, title in zip(recipes_df['i64'], recipes_df['title'])
]

# Decode the JSON-encoded directions, ingredients and ner columns.
for json_column in ('directions', 'ingredients', 'ner'):
    recipes_df[json_column] = recipes_df[json_column].apply(_parse_json_list)

In [None]:
# NOTE: The AI will receive recipe context formatted as Markdown.

def recipe_format_md(r):
    """Render one recipe row as a Markdown string.

    `r` is indexed by the keys 'title', 'ingredients' and 'directions'; the last two
    are sequences of strings. The result has a level-1 title heading followed by
    bulleted "Ingredients" and "Directions" sections and ends with a newline.
    """
    title = r['title']
    ingredient_items = '\n- '.join(r['ingredients'])
    direction_items = '\n- '.join(r['directions'])
    sections = [
        f"# {title}",
        "",
        "## Ingredients",
        f"- {ingredient_items}",
        "",
        "## Directions",
        f"- {direction_items}",
        "",  # yields the trailing newline
    ]
    return "\n".join(sections)

# Render every recipe row to Markdown and store the text in the 'md' column.
recipes_df = recipes_df.assign(md=lambda frame: frame.apply(recipe_format_md, axis=1))


#########
# uncomment the following line to test the function on a single row
# print(recipes_df['md'].iloc[0])
#########


## Clean Named Entities in Recipe Data

In [None]:
def clean_ner_data(recipes_df):
    """Build a table of cleaned entity names and the recipes that mention them.

    Args:
        recipes_df: frame with a 'source_id' column and a list-valued 'ner' column.

    Returns:
        A frame with columns 'ner' (capitalized entity name), 'source_id' (list of
        recipe ids mentioning it) and 'item_count' (length of that list), sorted by
        'item_count' in descending order.
    """
    entity_sources = (
        recipes_df[['source_id', 'ner']]
        # one row per (recipe, raw entity)
        .explode('ner')
        # collect the recipe ids of each raw entity
        .groupby('ner')['source_id']
        .apply(list)
        .reset_index()
        # drop single-character entities
        .loc[lambda frame: frame['ner'].str.len() > 1]
        # trim surrounding whitespace
        .assign(ner=lambda frame: frame['ner'].str.strip())
        # drop the literal entity 'A.'
        .loc[lambda frame: frame['ner'] != 'A.']
        # order by entity length (ascending), then keep entities longer than two characters
        .assign(ner_str_len=lambda frame: frame['ner'].str.len())
        .sort_values(['ner_str_len'])
        .loc[lambda frame: frame['ner_str_len'] > 2]
        # capitalize, which can make previously distinct spellings identical
        .assign(ner=lambda frame: frame['ner'].str.capitalize())
        # regroup so spellings merged by strip/capitalize share one row
        .explode('source_id')[['source_id', 'ner']]
        .groupby('ner')['source_id']
        .apply(list)
        .reset_index()
    )
    # number of recipe ids collected per entity, most frequent entities first
    entity_sources['item_count'] = entity_sources['source_id'].apply(len)
    return entity_sources.sort_values(['item_count'], ascending=[False])

# clean_ner_data starts from a column selection of its argument and only ever rebinds
# its local name, so the recipes frame is not modified and no defensive deep copy of
# the whole (large) frame is needed.
ner_df = clean_ner_data(recipes_df)
# ner_df.head()

### Use spaCy to Consolidate Named Entities

In [None]:
import spacy 

spacy.prefer_gpu()
nlp = spacy.load('en_core_web_lg')
# Run every entity name through the pipeline in one batch; the listed components are
# disabled because only the resulting Doc objects (and their vectors) are used below.
ner_df['ner_token'] = list(nlp.pipe(ner_df['ner'], disable=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]))


# Flag entities whose Doc has a vector.
ner_df['ner_token_has_vector'] = ner_df['ner_token'].apply(lambda x: x.has_vector)
# Keep only entities with a vector. The .copy() makes the result an independent frame,
# so the column assignment below does not write into a slice of the unfiltered frame.
ner_df = ner_df[ner_df['ner_token_has_vector'].astype(bool)].copy()
# # vector_norm attribute
# ner_df['ner_token_vector_norm'] = ner_df['ner_token'].apply(lambda x: x.vector_norm)
# is_oov is a token-level attribute in spaCy; Doc objects do not expose it, so the former
# hasattr(doc, 'is_oov') guard always produced None. The flag is now True when at least
# one token of the entity is out of vocabulary.
ner_df['ner_token_is_oov'] = ner_df['ner_token'].apply(lambda doc: any(token.is_oov for token in doc))



In [None]:
from tqdm.notebook import tqdm
# TODO: Partition the data into chunks of 1000 rows each and save each chunk to a separate parquet file in /data. 
# cross join each partition with every other partition at least once and calculate the similarity between the NER tokens in each row. 
# After each cross join, save the resulting data to a parquet file in /data.

# Rows per partition (the TODO above mentions 1000; the code currently uses 3000).
PARTITION_ROWS = 3000

# Slice ner_df into consecutive row blocks of at most PARTITION_ROWS rows.
partition_starts = range(0, len(ner_df), PARTITION_ROWS)
ner_df_partitions = [ner_df.iloc[start:start + PARTITION_ROWS] for start in partition_starts]

# Despite its name, ner_df_len holds the number of partitions, not the number of rows.
ner_df_len = len(ner_df_partitions)
print(ner_df_len)

In [None]:
# Warning: This cell will take a long time to run

from tqdm.notebook import trange, tqdm

# Cached results are keyed only by the two partition indices, so delete this directory
# whenever ner_df or the partition size changes.
similarity_dir = './data/similarity_scores'
# Create the cache directory up front so the first write does not fail on a fresh checkout.
os.makedirs(similarity_dir, exist_ok=True)

ner_similarity_partitions = []
for i in trange(ner_df_len):
    part_ner_df_1 = ner_df_partitions[i]
    for j in trange(ner_df_len):
        save_name = f'{similarity_dir}/ner_similarity_{i}_{j}.parquet.gzip'
        if os.path.exists(save_name):
            # Reuse the pairs computed by an earlier run.
            cross_join_df = pd.read_parquet(save_name)
        else:
            part_ner_df_2 = ner_df_partitions[j]
            # Pair every entity of partition i with every entity of partition j
            cross_join_df = pd.merge(part_ner_df_1[['ner', 'ner_token']], part_ner_df_2[['ner', 'ner_token']], how='cross', suffixes=('_1', '_2'))
            # Calculate the similarity between the two NER tokens of each pair
            cross_join_df['similarity'] = cross_join_df.apply(lambda x: x['ner_token_1'].similarity(x['ner_token_2']), axis=1)
            # keep only pairs whose similarity is above 0.99
            cross_join_df = cross_join_df[cross_join_df['similarity'] > 0.99]
            # Drop the ner_token_* columns before saving AND before collecting, so freshly
            # computed partitions have the same columns as cached ones and the token
            # objects of every pair are not kept in memory.
            saved_columns = [col for col in cross_join_df.columns if not col.startswith('ner_token')]
            cross_join_df = cross_join_df[saved_columns]
            cross_join_df.to_parquet(save_name, compression='gzip')
        ner_similarity_partitions.append(cross_join_df)

In [None]:
# Only the entity-name pairs are needed; other columns of the partitions are ignored.
similar_pairs = pd.concat(ner_similarity_partitions, ignore_index=True)[['ner_1', 'ner_2']]
ner_sources = ner_df[['ner', 'source_id']]

# Attach the recipe ids of both members of every similar pair (inner joins on the entity name).
paired_sources = (
    similar_pairs
    .merge(ner_sources.rename(columns={'ner': 'ner_1', 'source_id': 'ner_1_source_id'}), on='ner_1')
    .merge(ner_sources.rename(columns={'ner': 'ner_2', 'source_id': 'ner_2_source_id'}), on='ner_2')
)

# For each pair, credit the first entity to the recipes of both entities.
ner_grouped_similarities = paired_sources[['ner_1']].rename(columns={'ner_1': 'ner'})
ner_grouped_similarities['source_id'] = pd.Series(
    [
        ids_1 + ids_2
        for ids_1, ids_2 in zip(paired_sources['ner_1_source_id'], paired_sources['ner_2_source_id'])
    ],
    index=ner_grouped_similarities.index,
    dtype=object,
)

# Combine the original entity -> recipes table with the similarity-expanded one.
cleaned_ner_df = pd.concat([ner_sources, ner_grouped_similarities], ignore_index=True)

# Normalize the entity text: drop a leading "'s", remove double quotes, strip the leading
# characters ()+/:,. , trim whitespace and capitalize the first letter.
cleaned_ner_df['ner'] = (
    cleaned_ner_df['ner']
    .str.replace(r"^'s", '', regex=True)
    .str.replace('"', '', regex=False)
    .str.lstrip('()+/:,.')
    .str.strip()
    .str.capitalize()
)
# Entities made up only of the stripped characters are now empty strings; drop them.
cleaned_ner_df = cleaned_ner_df[cleaned_ner_df['ner'].str.len() > 0]

# One row per (recipe, entity), then one keyword list per recipe. The pair table includes
# every entity paired with itself and overlaps the original table, so without the
# de-duplication the same keyword would be listed several times for a recipe.
cleaned_ner_df = (
    cleaned_ner_df
    .explode('source_id')
    .drop_duplicates(subset=['source_id', 'ner'])
    .groupby('source_id')['ner']
    .apply(list)
    .reset_index()
    .rename(columns={'ner': 'cleaned_ner'})
)

In [None]:
# Drop a 'cleaned_ner' column left over from an earlier run of this cell; otherwise the
# merge would produce cleaned_ner_x / cleaned_ner_y and the lookup below would raise a KeyError.
recipes_df = recipes_df.drop(columns=['cleaned_ner'], errors='ignore')
# left-join recipes_df with cleaned_ner_df on source_id so every recipe row is kept
recipes_df = pd.merge(recipes_df, cleaned_ner_df, on='source_id', how='left')
# recipes without a match get NaN from the join; replace anything that is not a list with []
recipes_df['cleaned_ner'] = recipes_df['cleaned_ner'].apply(lambda x: x if isinstance(x, list) else [])

### Recipe Documents
Before loading the recipe data, we need to prepare the recipe documents.
The `page_content` will be the Markdown representation of the recipe.
LazyGraphRag will generate the graph edges using metadata from the recipe documents:
- `keywords`: The CookingRecipes dataset came with a `ner` field that contains entities extracted from the recipe. These entities would be ingredients found in the recipe.
- `source_id`: The unique identifier for the recipe.
- `type`: All recipes will have the type `recipe`. This will help distinguish the recipe nodes from other nodes in the graph, such as the question-answer nodes.

In [None]:
from langchain_core.documents import Document

# Build one LangChain document per recipe: the Markdown text is the page content, the
# source_id doubles as the document id, and the cleaned entities become the keywords.
recipe_docs = []
for recipe_row in recipes_df.to_dict(orient='records'):
    recipe_metadata = {
        'keywords': recipe_row['cleaned_ner'],
        'source_id': recipe_row['source_id'],
        'type': 'recipe',
    }
    recipe_docs.append(
        Document(page_content=recipe_row['md'], id=recipe_row['source_id'], metadata=recipe_metadata)
    )

## Prepare Cooking Q&A W/ Recipe Context
As discussed earlier, [Hieu-Pham's dataset](https://huggingface.co/datasets/Hieu-Pham/cooking_squad) contains questions and answers related to the CookingRecipes dataset. We will use this dataset to generate the question-answer nodes in the graph. We will leverage the connection between the recipe and the question-answer nodes to generate the graph edges.

In [None]:
# Load the Q&A dataset and link every question to its recipe in a single pipeline.
recipe_qa_df = (
    pd.read_json("hf://datasets/Hieu-Pham/cooking_squad/squad_cooking_transformed.json")
    # pull 'answer_start' and 'text' out of the 'answers' dict, then drop the dict column
    .assign(
        answer_start=lambda frame: frame['answers'].apply(lambda ans: ans['answer_start']),
        answer=lambda frame: frame['answers'].apply(lambda ans: ans['text']),
    )
    .drop(columns=['answers'])
    # the text before the first newline of 'context' is used as the recipe title
    .assign(title=lambda frame: frame['context'].apply(lambda ctx: ctx.split('\n')[0]))
    .drop(columns=['context'])
    # left join on title: every Q&A row is kept, matched or not
    .merge(recipes_df[['title', 'source_id', 'md']], on='title', how='left')
    # the recipe Markdown becomes the new 'context'
    .rename(columns={'md': 'context'})
)

# format the qa pairs as tagged text for the AI
def qa_format_md(qa):
    """Render one Q&A row as text with <question>, <answer> and <context> sections.

    `qa` is indexed by the keys 'question', 'answer' and 'context'. The result starts
    and ends with a newline; the question and answer lines each end with one space.
    """
    question = qa['question']
    answer = qa['answer']
    context = qa['context']
    parts = [
        "",  # leading newline
        "<question>",
        f"{question} ",
        "</question>",
        "",
        "<answer>",
        f"{answer} ",
        "</answer>",
        "",
        "<context>",
        f"{context}",
        "</context>",
        "",  # trailing newline
    ]
    return "\n".join(parts)

# Render each Q&A row into the 'md' column, then rename the 'id' column to 'qa_id'.
recipe_qa_df = (
    recipe_qa_df
    .assign(md=lambda frame: frame.apply(qa_format_md, axis=1))
    .rename(columns={'id': 'qa_id'})
)


In [None]:
# Preview the first five rows to sanity-check the title join and the rendered 'md' text.
recipe_qa_df.head(5)

### Question & Answer Documents
Before loading the Q&A data, we need to prepare its documents, just as we did for the recipes.
The `page_content` will be the Markdown representation of the Q&A.
LazyGraphRag will generate the graph edges using metadata from the documents:
- `source_id`: The unique identifier for the recipe context linked to the document.
- `type`: All Q&A documents will have the type `question-answer`. This will help distinguish the nodes from other nodes in the graph.

In [None]:

# Build one document per Q&A row: the rendered text is the page content, qa_id is the
# document id, and source_id carries the link to the recipe used as context.
recipe_qa_docs = []
for qa_row in recipe_qa_df.to_dict(orient='records'):
    qa_metadata = {'source_id': qa_row['source_id'], 'type': 'question-answer'}
    recipe_qa_docs.append(
        Document(page_content=qa_row['md'], id=qa_row['qa_id'], metadata=qa_metadata)
    )

## Populating the Vector store

In [None]:
from langchain_chroma.vectorstores import Chroma
from langchain_graph_retriever.transformers import ShreddingTransformer

#########
# If you want to only store the recipe documents, uncomment the following variable assignment and comment the one below it
# vector_store = Chroma.from_documents(
#     documents=list(ShreddingTransformer().transform_documents(recipe_docs)),
#     embedding=embeddings,
#     collection_name="recipes",
#     persist_directory="./data/recipes_chroma_db"
# )
#########

# The same transformer instance is kept in `shredder` because the graph retriever's
# Chroma adapter is constructed with it later on.
shredder = ShreddingTransformer() 

# Recipes and Q&A documents go into a single collection.
all_docs = recipe_docs + recipe_qa_docs
shredded_docs = list(shredder.transform_documents(all_docs))

vector_store = Chroma.from_documents(
    documents=shredded_docs,
    embedding=embeddings,
    collection_name="recipe_qa_combined",
    persist_directory="./data/recipe_qa_combined_chroma_db"
)

## Graph Traversal

In [None]:
from graph_retriever.strategies import Eager
from langchain_graph_retriever import GraphRetriever
from langchain_graph_retriever.adapters.chroma import ChromaAdapter

# Adapter over the Chroma store, built with the shredder used at indexing time and the
# set of metadata fields it applies to ("keywords").
store_adapter = ChromaAdapter(vector_store, shredder, {"keywords"})

# Documents are linked when they share a keyword or share a source_id.
edge_definitions = [("keywords", "keywords"), ("source_id", "source_id")]

traversal_strategy = Eager(k=5, start_k=2, max_depth=3)

traversal_retriever = GraphRetriever(
    store=store_adapter,
    edges=edge_definitions,
    strategy=traversal_strategy,
)

In [None]:
# Test the retrieval on a single question. This should return relevant recipes and their context
test_question = "I'm in Ohio and I just had a small round chocolate that had peanut butter. I can't remeber the name of it. All I remember is that it had an 'eye' in the name. If you find it, get me the recipe"
#########
# To test the retrieval of a Q&A about a specific recipe instead, uncomment the following line
# test_question = "No Bake Cookies: How long should the clusters stand until the firm up?"
#########
results = traversal_retriever.invoke(test_question)

# Print id, content and keywords of every retrieved document.
for doc in results:
    retrieved_keywords = doc.metadata.get('keywords', [])
    print(f"{doc.id}:\n{doc.page_content}")
    print(retrieved_keywords)
    print("\n\n")

## Use within a chain

In [None]:
from langchain.chat_models import init_chat_model

# Chat model used as the final step of the question-answering chain.
llm = init_chat_model(
    "gpt-4o-mini",
    model_provider="openai",
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt with two placeholders: {context} (the formatted retrieved documents) and
# {question} (the user's question). It tells the model to answer from the context only.
answer_template = (
    "Answer the question based only on the context provided.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}"
)
prompt = ChatPromptTemplate.from_template(answer_template)


def format_docs(docs):
    """Join documents into one context string.

    Each document is rendered as "text: <page_content> metadata: <metadata>" and the
    rendered documents are separated by a blank line. An empty input yields "".
    """
    rendered = []
    for doc in docs:
        rendered.append(f"text: {doc.page_content} metadata: {doc.metadata}")
    return "\n\n".join(rendered)


# chain = (
#     {"sources": traversal_retriever}
#     | {"context": RunnableLambda(lambda x: format_docs(x['sources'])), "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

# Prompt inputs: "context" is the retriever output passed through format_docs,
# "question" is the chain input passed along unchanged.
chain_inputs = {
    "context": traversal_retriever | format_docs,
    "question": RunnablePassthrough(),
}

# The StrOutputParser step stays disabled, so the chain's output is whatever `llm`
# returns rather than a plain string.
chain = chain_inputs | prompt | llm

In [None]:
# Other questions to try -- assign one of them to `question` instead:
# "I'm in Ohio and I just had a small round chocolate that had peanut butter. I can't remeber the name of it. All I remember is that it had an 'eye' in the name. If you find it, get me the recipe"
# "What are some recipe that use chocolate and creamcheese? Give me the recipes"
# "I'm looking for some chili recipes that use pork tenderloin?"
# "What is the id for the recipe Fruit Medley?"
question = "I'm looking for some seafood recipes. Can you help me?"
response = chain.invoke(question)
response

In [None]:
# Display the result of the response object's model_dump() call to inspect all of its fields.
response.model_dump()

In [None]:
response = chain.invoke("Get me the recipe for Seafood And Pasta Salad")
# The chain ends at the chat model (its string output parser is commented out), so
# `response` is presumably a message object whose text lives in `.content`. Print that
# text instead of the object's repr; fall back to the raw response if there is no `.content`.
print(getattr(response, "content", response))