In [1]:
from dotenv import load_dotenv
import os

# Populate the process environment before reading the key; load_dotenv() is
# expected to pick up a local .env file if one exists.
load_dotenv()

# Subscript access (rather than .get) makes a missing key fail right here with
# a KeyError instead of surfacing later as an authentication error.
OPENAI_KEY = os.environ["OPENAI_API_KEY"]

## 0. Install Dependencies

In [2]:
# Uncomment the lines below to install the packages this notebook imports.
# NOTE(review): inside Jupyter, `%pip install ...` targets the running kernel's
# environment; consider it (with pinned versions) instead of `!pip`.
# NOTE(review): the first cell imports `dotenv`, which is distributed as the
# `python-dotenv` package and is not listed here.
# !pip install langchain
# !pip install weaviate-client
# !pip install openai
# !pip install "unstructured[pdf]"

## 1. Data Reading

In [3]:
from langchain.document_loaders import DirectoryLoader

# Location of the source PDFs, relative to the notebook, and the pattern used
# to pick them up ("**" descends into sub-directories).
PDF_DIR = '../files'
PDF_GLOB = "**/*.pdf"

loader = DirectoryLoader(PDF_DIR, glob=PDF_GLOB)

# One entry per loaded file; each exposes `page_content` and `metadata`,
# which later cells read.
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity-check the load: how many documents came back, then the size of the
# first one. The first document is only indexed after the count is printed,
# so an empty `data` still reports "0 documents" before failing.
doc_count = len(data)
print(f'You have {doc_count} documents in your data')

first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 1 documents in your data
There are 15047 characters in your document


## 2. Text Splitting

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, kept together so they are easy to tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0  # consecutive chunks share no text

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break the loaded documents into the smaller pieces that get indexed.
docs = text_splitter.split_documents(data)

## 3. Embedding Conversion

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Client-side OpenAI embedding model.
# NOTE(review): no later cell shown in this notebook references `embeddings`;
# the Weaviate class is configured with its own `text2vec-openai` vectorizer.
# Confirm whether this object is still needed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)

  warn_deprecated(


## 4. Vector Database Storage

In [7]:
import weaviate
from langchain.vectorstores import Weaviate

# Connection settings for the local Weaviate instance.
WEAVIATE_URL = "http://localhost:8080"

# The OpenAI key travels as a request header; presumably it is consumed by
# Weaviate's OpenAI vectorizer module on the server side.
weaviate_headers = {"X-OpenAI-Api-Key": OPENAI_KEY}

client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers=weaviate_headers,
    startup_period=10,  # presumably seconds to wait for the server — confirm
)

In [8]:
# Define the Weaviate class that stores the document chunks and bind a
# LangChain vector store to it.
CLASS_NAME = "Chatbot"
TEXT_KEY = "content"

# Reset only this notebook's class so the cell can be re-run from a clean
# index. `client.schema.delete_all()` would also wipe every unrelated class
# living in the same Weaviate instance.
existing_classes = {
    cls["class"] for cls in (client.schema.get().get("classes") or [])
}
if CLASS_NAME in existing_classes:
    client.schema.delete_class(CLASS_NAME)

# Input structure: a single text property, vectorized server-side by the
# `text2vec-openai` module.
# NOTE(review): the `source` attribute requested from the vector store below
# is not declared here; presumably Weaviate's auto-schema adds it on first
# insert — confirm.
schema = {
    "classes": [
        {
            "class": CLASS_NAME,
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": TEXT_KEY,
                },
            ],
        },
    ]
}

client.schema.create(schema)

# `source` is returned alongside each hit so answers can be traced to a file.
vectorstore = Weaviate(client, CLASS_NAME, TEXT_KEY, attributes=["source"])



In [9]:
# Load text into the vectorstore.
# Two parallel lists are built directly instead of unpacking `zip(*pairs)`:
# the zip form raises an opaque "not enough values to unpack" ValueError when
# `docs` is empty, and yields tuples rather than lists.
texts = [chunk.page_content for chunk in docs]
meta = [chunk.metadata for chunk in docs]

# Left as the cell's last expression so the returned object ids are displayed.
vectorstore.add_texts(texts, meta)

['6f115e22-eef7-47e4-9008-128429a811ff',
 'e0b9f53c-e662-4218-a043-73e53b5b385a',
 '5ae3792f-ed14-431a-b118-65a6bd425dae',
 '7e6ca4e3-8cb9-4186-b889-89141b7e4281',
 'c766d8e7-940f-47d6-8259-049551d60852',
 'f5d63202-5727-4a7d-a822-12c8f1901eb8',
 '8d2cfca5-906a-44d6-a726-16aa7d400c0c',
 '04fdd3ec-f5e3-4993-9a1d-9d80351c50fe',
 '30e7cf5a-a1c7-46e6-8a68-320f46cfba1e',
 '19f910c7-266c-43a3-83c5-80800c6d1577',
 'd501b365-7994-44d6-ac0b-5f5919df4aea',
 'e6d0f822-a890-4196-8e20-6cac8037ab31',
 'f753bdda-1528-4f0f-b92c-90e4d84b5849',
 'da6cb60c-7e7f-415a-b517-0767314c70ff',
 '07d7925b-4c9c-489e-9c20-6f323faf481c',
 'afcb42d8-e592-4626-b2ed-c1471216c695',
 'eccc3323-e294-4deb-a1d8-aed2b5909b4f',
 '169eda9d-5660-49f7-8305-c3091b8aeeb3',
 '79737103-6d11-42f8-a000-92d2d7739add',
 '6da57ef2-d536-459c-bfe0-d1328380afc2',
 'c3f7ba33-b7de-4f37-ba9a-6e85235c53e8']

## 5. Similarity Search

In [10]:
# Question to answer from the indexed document.
query = "Is there a non-compete obligation to the Advisor?"

# How many of the closest chunks to pull back.
TOP_K = 5

# Retrieve the chunks most related to the query.
# NOTE: this rebinds `docs` (previously the full list of split chunks) to the
# search hits; the name is kept because the next cell reads it.
docs = vectorstore.similarity_search(query, k=TOP_K)

## 6. Our Custom ChatBot

In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Completion model; temperature=0 is used to keep the answer as repeatable as
# the API allows.
llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0)

# Question-answering chain of type "stuff" built on that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Produce the answer; left as the last expression so the notebook displays it.
chain.run(input_documents=docs, question=query)

  warn_deprecated(
/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/langchain_community/llms/openai.py:466: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/home/mubarek/anaconda3/envs/auto/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


' Yes, there is a non-compete obligation to the Advisor for a period of 12 months after the term of engagement with the Company.'