In [1]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if any) into the process environment.
load_dotenv()

# Indexing os.environ (rather than .get) raises KeyError right here when the
# key is absent, instead of failing later inside an API call.
OPENAI_KEY = os.environ["OPENAI_API_KEY"]

## 0. Install Dependencies

In [2]:
# !pip install langchain
# !pip install weaviate-client
# !pip install openai
# !pip install "unstructured[pdf]"

## 1. Data Reading

In [3]:
from langchain.document_loaders import DirectoryLoader

# Pick up every PDF under ../files, including nested sub-directories
# (the "**" in the glob pattern recurses).
loader = DirectoryLoader(
    '../files',
    glob="**/*.pdf",
)
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Quick sanity check on what the loader returned.
num_documents = len(data)
print(f'You have {num_documents} documents in your data')

# Only the first loaded document is measured here.
first_document_chars = len(data[0].page_content)
print(f'There are {first_document_chars} characters in your document')

You have 1 documents in your data
There are 15047 characters in your document


## 2. Text Splitting

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 512 units with a 256-unit
# overlap between neighbouring chunks, so each chunk shares up to half of its
# content with the next one.
# NOTE(review): units are presumably characters (the splitter's default length
# function) - confirm against the installed langchain version.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=256,
    chunk_size=512,
)
docs = text_splitter.split_documents(data)

## 3. Embedding Conversion

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Client-side OpenAI embedding model.
# NOTE(review): `embeddings` is not referenced by any later cell visible in
# this notebook; the Weaviate class below is configured with its own
# "text2vec-openai" vectorizer - confirm whether this object is still needed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)

  warn_deprecated(


## 4. Vector Database Storage

In [7]:
import weaviate
from langchain.vectorstores import Weaviate

# Connect to a Weaviate instance on localhost. The OpenAI key is sent as a
# request header so it is available to Weaviate alongside each call.
# NOTE(review): startup_period is presumably the number of seconds to wait for
# the instance to become ready - confirm against the weaviate-client docs.
client = weaviate.Client(
    url="http://localhost:8080",
    startup_period=10,
    additional_headers={"X-OpenAI-Api-Key": OPENAI_KEY},
)

In [8]:
# Define the input structure: (re)create the "Chatbot" class from scratch.
#
# WARNING: delete_all() wipes the schema of the connected Weaviate instance,
# not just the "Chatbot" class. Only run this cell against an instance that is
# dedicated to this notebook.
client.schema.delete_all()

# A single class, "Chatbot", with one text property, "content", that holds a
# chunk of the source document. The class is configured with the
# "text2vec-openai" vectorizer module.
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            # Include the property's value, but not its name,
                            # in what gets vectorized.
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)

# LangChain wrapper over the "Chatbot" class, using "content" as the text key.
# NOTE(review): the "source" attribute requested here is not declared in the
# schema above; presumably it is created by Weaviate's auto-schema when the
# first objects are inserted - confirm.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])



In [9]:
# Load the chunk texts and their metadata into the vectorstore.
# `docs` is expected to hold the chunks produced by the text-splitting cell.
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]

# Build the two parallel lists with comprehensions rather than unpacking
# zip(*pairs): the unpacking form raises ValueError when there are no chunks,
# and it yields tuples where add_texts expects lists.
texts = [content for content, _ in text_meta_pair]
meta = [metadata for _, metadata in text_meta_pair]

# Returns the ids of the inserted objects (shown as the cell output).
vectorstore.add_texts(texts, meta)

['f3f1a1f5-9cd5-41a6-9fb2-bf238bf4ae1c',
 '237a9daf-0944-402c-8a5b-f0875dcf6ca9',
 '9778bb23-d5bf-4b83-a5e7-c0f691d73f83',
 '75dd5629-cfc9-4816-a92f-2946ff4f2e87',
 'e53e0017-4570-41a7-9e75-ee5d9c0b6c8e',
 '120daaa0-cf54-405a-b7c8-fa5480583af4',
 '4d1c4b53-e488-407e-b66b-7ec643c85a0d',
 '3a0bc228-666f-4be1-bc1d-f6273e1d61a2',
 '3e88727a-b8e6-4dca-830e-4244c516f163',
 'fadcdbfe-ca98-4a1c-b90d-576423b99500',
 '6df9d397-a600-4758-a300-ebc6570d0251',
 '07b6a059-3951-4c3c-9954-42d042b83c59',
 '24cdee46-fb01-46b4-8b9d-d44e9ebdb0f9',
 '9c59d2f9-98f3-487e-a0a8-bbb0203d45bd',
 'eb20f911-4539-4312-82ac-f7042a95bb7f',
 '434dfb73-7002-44c7-9cbe-88052722a3a5',
 'd40bdaf4-b2b1-4f15-8ff6-8c8c0fc50200',
 'dc4b45ec-65cf-40f6-9921-5e48a6baf105',
 'd22835fc-821e-4eb5-acc4-4ff310f9396f',
 'f41dab57-a801-42da-87de-279584678feb',
 '420e9dee-1c50-4b4b-87c1-f62bec51189d',
 '51da12a1-714b-4dbd-8d12-24c769c13c50',
 'd6973439-e73f-498b-81e6-0c7644f73b00',
 '1b805d23-3150-4561-8de7-a62c5e51b3d2',
 '0fd8cda2-4be5-

## 5. Similarity Search

In [10]:
query = "What are the payments to the Advisor under the Agreement?"

# Fetch the 4 stored chunks most similar to the query.
# NOTE: this rebinds `docs`, which until now held the chunks from the
# text-splitting step; from here on it holds the search hits instead.
docs = vectorstore.similarity_search(
    query,
    k=4,
)

## 6. Our Custom ChatBot

In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Build the question-answering chain. temperature=0 keeps the completion as
# deterministic as the model allows.
# NOTE(review): the "stuff" chain type presumably places all retrieved
# documents into a single prompt - confirm against the langchain docs.
llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")

# Answer the query from the retrieved chunks; the answer is the cell output.
chain.run(input_documents=docs, question=query)

  warn_deprecated(
/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/langchain_community/llms/openai.py:466: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


' The payments to the Advisor under the Agreement consist of a combination of salary payments and payments for other statutory rights and benefits as an employee of the Company, as well as hourly fees for services rendered and a monthly payment for a workspace.'