In [2]:
import os

from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then read the OpenAI
# key from it. A missing OPENAI_API_KEY raises KeyError right here.
load_dotenv()
OPENAI_KEY = os.environ["OPENAI_API_KEY"]

## 0. Install Dependencies

In [3]:
# Optional one-time setup: uncomment the lines below to install the packages
# this notebook imports (langchain, weaviate-client, openai, unstructured).
# NOTE(review): in notebooks `%pip install ...` is generally preferred over
# `!pip install ...` because it targets the running kernel's environment;
# consider pinning versions as well.
# !pip install langchain
# !pip install weaviate-client
# !pip install openai
# !pip install "unstructured[pdf]"

## 1. Data Reading

In [4]:
from langchain.document_loaders import DirectoryLoader

# Where the source PDFs live and which files to pick up (recursive glob).
PDF_DIR = '../files'
PDF_GLOB = "**/*.pdf"

# Build the loader, then read every matching file into a list of documents.
loader = DirectoryLoader(PDF_DIR, glob=PDF_GLOB)
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sanity check: how many documents were loaded ...
doc_count = len(data)
print(f'You have {doc_count} documents in your data')

# ... and how long the text of the first one is.
first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 1 documents in your data
There are 15047 characters in your document


## 2. Text Splitting

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: target chunk length in characters, and how many
# characters consecutive chunks share.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded documents into smaller chunk documents.
docs = text_splitter.split_documents(data)

## 3. Embedding Conversion

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI embeddings client, authenticated with the key read from the
# environment in the first cell.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_KEY,
)

NameError: name 'OPENAI_API_KEY' is not defined

## 4. Vector Database Storage

In [None]:
import weaviate
from langchain.vectorstores import Weaviate

# Address of the Weaviate instance this notebook talks to.
WEAVIATE_URL = "http://localhost:8080"

# The OpenAI key is forwarded to Weaviate as a request header.
openai_headers = {"X-OpenAI-Api-Key": OPENAI_KEY}

client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers=openai_headers,
    startup_period=10,
)

In [None]:
# Define the Weaviate class that stores the document chunks, then wrap it in a
# LangChain vector store.

# Single source of truth for the class name and the text property, so the
# schema and the vector store below cannot drift apart.
CLASS_NAME = "Chatbot"
TEXT_KEY = "content"

# WARNING: this wipes the *entire* schema of the connected Weaviate instance
# (every class, not just CLASS_NAME), so re-running the cell starts from a
# clean slate. Do not point this notebook at a shared instance.
client.schema.delete_all()

schema = {
    "classes": [
        {
            "class": CLASS_NAME,
            "description": "Documents for chatbot",
            # Vectorization is delegated to Weaviate's text2vec-openai module.
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": TEXT_KEY,
                },
            ],
        },
    ]
}

client.schema.create(schema)

# `attributes` lists the extra properties returned as metadata with each hit.
# NOTE(review): "source" is not declared in the schema above; presumably it is
# added by Weaviate's auto-schema on first insert — confirm for your instance.
vectorstore = Weaviate(client, CLASS_NAME, TEXT_KEY, attributes=["source"])



In [None]:
# load text into the vectorstore
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts, meta = list(zip(*text_meta_pair))
vectorstore.add_texts(texts, meta)

['cc77bd4d-f181-4b84-91ef-ee6a22259674',
 'c87eb877-cce7-4655-b6af-bc055def14ea',
 '92afd204-78af-4432-892b-d259ddba1bd8',
 'a8870587-e42b-4066-b7e8-2340157e0c54',
 '4711f0e6-f4f5-4cc8-9906-09c1ce125d9a',
 '808a7b13-0841-4495-aaa5-75bf3c48f9b7',
 '3fcbf53f-2480-47fd-9282-f20d17de391a',
 'cd1aa59c-9012-403e-a0bb-657313f16620',
 '45ce2966-dfe7-4cb1-b825-7e9e8926997f',
 '7b6d9362-1010-4a81-8f57-7b6d887eb4ba',
 'e4699c02-50b4-4c4c-b63b-b0d54ddf1cb9',
 '6638a77a-403c-4ed7-83b0-352b855a3f6d',
 'fa8f1c10-33d6-438e-bf48-4f4d86183e78',
 '08f22077-720d-4640-af29-472cc6e96500',
 'e6941473-93a0-46a9-9689-cee4e31b2b51',
 'ed902d1c-c21e-4cbc-87e0-034a63ea1ea3',
 'b37716cc-d7c5-49c5-ada3-3f6b3ed9a8cb',
 '95036cf2-c1b3-463d-a3ae-829964789bce',
 'cabfdee5-2e8c-45b2-8517-3a770b5e8c65',
 'd4da63ab-f169-45b1-a076-06eb91f4549d',
 '80948d8d-212c-4326-864c-1af43b68cc68']

## 5. Similarity Search

In [None]:
# The question we want answered from the indexed document.
query = "Is there a non-compete obligation to the Advisor?"

# Number of chunks to retrieve for the query.
TOP_K = 5

# Retrieve the stored chunks most similar to the query.
# Note: this rebinds `docs` from the split chunks to the search results.
docs = vectorstore.similarity_search(query, k=TOP_K)

## 6. Our Custom ChatBot

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Language model used to phrase the answer; temperature is set to 0.
llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0)

# Question-answering chain of type "stuff" built on top of that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Answer the query from the retrieved chunks; as the last expression of the
# cell, the answer string is displayed.
chain.run(input_documents=docs, question=query)

/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/langchain_community/llms/openai.py:466: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


' No, the Advisor cannot charge for meal time as it is not considered a Billable Hour under the definition provided in the context.'