In [None]:
import os
os.chdir("../")

In [None]:
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [None]:
from typing import List
from langchain.schema import Document
def extracted_require_data(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document]=[]
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content = doc.page_content,
                metadata ={"source": src}
            )
        )
    return minimal_docs

In [None]:
#split the Data into Text chunk
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

In [62]:
#Embedding The text
from langchain.embeddings import HuggingFaceBgeEmbeddings
def Hugging_face_embedding():
    embedding = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embedding
embeddings = Hugging_face_embedding()

In [63]:
extracted_data = load_pdf_file(data='data/')

In [65]:

minimal_docs = extracted_require_data(extracted_data)
text_chunk = text_split(minimal_docs)

text_chunk

[Document(metadata={'source': 'data\\cookbook.pdf'}, page_content='FOOD HERO\n FOOD HERO\nMy Recipes\nBe a \nFood Hero \nCook together. Eat together. Talk together.'),
 Document(metadata={'source': 'data\\cookbook.pdf'}, page_content='Table of Contents\nAll About the Recipes\nThe recipes used in this book are some \nof Food Hero’s most popular and require \na limited set of common ingredients and \nbasic cooking tools. Each recipe provides \nideas to make it your own.\nRefrigerate or freeze leftovers within  \n1 to 2 hours.\nNutrition Facts Labels provided are \nestimates based on federal food data and \ndo not include any “optional” foods listed in \nthe ingredients lists. Your recipe may vary.\nVisit www.foodhero.org to:'),
 Document(metadata={'source': 'data\\cookbook.pdf'}, page_content='• view or share recipes in this book, see \nrecipe user comments and find more \nrecipes.\n• sign-up to receive the award-winning \nFood Hero Monthly by email—each month \na different food ingredie

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY =os.environ.get('OPENAI_API_KEY ')


In [57]:
#initialize the pinecone
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key = pinecone_api_key)



In [59]:
#create index in pinecone
from pinecone import ServerlessSpec
index_name ="food-recipes"

if not pc.has_index(index_name):
    pc.create_index (
        name = index_name,
        dimension = 384,
        metric = "cosine",
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
      )
index = pc.Index(index_name)

In [66]:
#store each value inside pinecone
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents = text_chunk,
    index_name = index_name,
    embedding=embeddings
)

In [67]:
#Load existing Index
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding=embeddings
)

In [69]:
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs={"k":3})

In [72]:
retriever_docs = retriever.invoke("Give me some food Recipes")
retriever_docs

[Document(id='4723d4ac-54c5-4b65-b700-4b3294c41286', metadata={'source': 'data\\cookbook.pdf'}, page_content='• view or share recipes in this book, see \nrecipe user comments and find more \nrecipes.\n• sign-up to receive the award-winning \nFood Hero Monthly by email—each month \na different food ingredient is featured.\n• connect with Food Hero on social media.\nWe would love to hear from you!\nEmail us at food.hero.@oregonstate.edu.\nDownload this book  with extra content \nat: https:/ /foodhero.org/cookbook.\nFruit Storage and Cooking Tools .................... 3–4'),
 Document(id='0f4c6664-7ee4-4b02-9e6c-03b9cfbd004a', metadata={'source': 'data\\cookbook.pdf'}, page_content='Table of Contents\nAll About the Recipes\nThe recipes used in this book are some \nof Food Hero’s most popular and require \na limited set of common ingredients and \nbasic cooking tools. Each recipe provides \nideas to make it your own.\nRefrigerate or freeze leftovers within  \n1 to 2 hours.\nNutrition Facts

In [73]:
from langchain_openai import ChatOpenAI
chatModel = ChatOpenAI(model="gpt-4o")

In [74]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [75]:
system_prompt = (
    "You are an assistant for question-answering tasks for Food Recipes. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [77]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [83]:
response = rag_chain.invoke({"input":"how many Table of contents you have"})
print(response["answer"])

The provided text includes a table of contents with distinct sections for various recipes and guides, including "Baked Cauliflower Tots," "Roasted Vegetables," "Stovetop Rice," "Vegetable Storage and Flavored Water," and a "Kitchen Measuring Guide." There are five items listed in the table of contents from pages 29 to 33.
