In [2]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
from PyPDF2 import PdfReader

In [3]:
load_dotenv()

#using openai api key and loading data
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
data_path = "./data/"

db_directory = "./vector/"

pdf_files= os.listdir(data_path)
print(pdf_files)

def get_pdf_text(data_path, pdf_files):
    
    text = ""

    for pdf_file in pdf_files:
        reader = PdfReader(data_path+pdf_file)
        for page in reader.pages:
            text += page.extract_text()

    return text

text = get_pdf_text(data_path, pdf_files)

['Easy_recipes.pdf', 'Recipe-Book.pdf']


In [5]:
def get_chunk_text(text):
    
    text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len
    )

    chunks = text_splitter.split_text(text)

    return chunks

# print(get_chunk_text(text))
data = get_chunk_text(text)
print(data)

Created a chunk of size 1916, which is longer than the specified 1000
Created a chunk of size 3436, which is longer than the specified 1000
Created a chunk of size 1055, which is longer than the specified 1000
Created a chunk of size 2777, which is longer than the specified 1000
Created a chunk of size 1467, which is longer than the specified 1000
Created a chunk of size 5395, which is longer than the specified 1000
Created a chunk of size 1206, which is longer than the specified 1000


['Quick\t\r \xa0and\t\r \xa0easy\t\r \xa0recipes\t\r \xa0\t\r \xa0Start\t\r \xa0cooking\t\r \xa0today!\t\r \xa0You\t\r \xa0can\t\r \xa0do\t\r \xa0it!\t\r \xa0\t\r \xa0Cooking\t\r \xa0meals\t\r \xa0is\t\r \xa0much\t\r \xa0cheaper\t\r \xa0than\t\r \xa0eating\t\r \xa0outside.\t\r \xa0This\t\r \xa0is\t\r \xa0true\t\r \xa0everywhere\t\r \xa0but\t\r \xa0even\t\r \xa0more\t\r \xa0in\t\r \xa0Geneva...\t\r \xa0Take\t\r \xa0turns\t\r \xa0and\t\r \xa0cook\t\r \xa0for\t\r \xa0your\t\r \xa0roommate\t\r \xa0and\t\r \xa0your\t\r \xa0friends!\t\r \xa0 You’ll\t\r \xa0 save\t\r \xa0 even\t\r \xa0 more\t\r \xa0m o n e y\t\r \xa0–\t\r \xa0a n d \t\r \xa0t h e \t\r \xa0k i t c h e n \t\r \xa0w i l l \t\r \xa0p r o b a b l y \t\r \xa0s t a y \t\r \xa0cleaner...\t\r \xa0\t\r \xa0I\t\r \xa0have\t\r \xa0selected\t\r \xa0only\t\r \xa0a\t\r \xa0few\t\r \xa0recipes\t\r \xa0from\t\r \xa0these\t\r \xa0websites:\t\r \xa0http://www.studentrecipes.com/\t\r \xa0http://www.squidoo.com/studentfood#module55467342\t\r \xa0

In [6]:
pdf_loader = DirectoryLoader("./data/", glob="**/*.pdf")
document = pdf_loader.load()
print(type(document))

<class 'list'>


In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
document1= text_splitter.split_documents(document)

chat_history = []

embeddings = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(documents=document1, embedding=embeddings, persist_directory=db_directory)

qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), vectorstore.as_retriever())

query = "give me the omelette recipe with ingredients and directions"

response = qa({"question": query, "chat_history": chat_history})

#index_creator = VectorstoreIndexCreator()

# docsearch = index_creator.from_loaders(document)

# chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.vectorstore.as_retriever(), input_key="question")

# query = "Recommend me a soup recipe which contains tomato and give me the ingredients and the instructions"

print(response["answer"])



Ingredients

3 Eggs
Salt and Pepper
Butter
Grated cheese (optional)
Mixed herbs (optional)

Method

1) Mix the eggs in any sort of bowl or jug you like (a pint glass works well!) with salt, pepper and herbs if desired.

2) Heat the butter (or oil) in a reasonably large frying pan.

3) Pour in the egg mixture slowly and swirl the pan a little to even it out.

4) After 30 seconds or so the top should firm up, at this point add grated cheese if you like.

5) Continue cooking a little until the cheese begins to melt, then fold the omelette in half with a bendy spatula.

6) If its nice and cooked on the bottom already then serve, otherwise, continue cooking on each side a little more!
