In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
openai_api_key = os.environ["OPENAI_API_KEY"]

In [3]:
os.environ["ACTIVELOOP_TOKEN"] = os.environ["DEEPLAKE_API_KEY"]

In [4]:
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"]

In [5]:
my_activeloop_dataset_name = "basic-rag-with-deeplake"

In [6]:
from langchain_openai import OpenAIEmbeddings

In [7]:
from langchain_community.vectorstores import DeepLake



In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
from langchain.chains import RetrievalQA

In [10]:
usa_curious_facts = [
    """
    The US celebrates Independence Day from the British Empire 
    on July 4. However, the country’s Declaration of Independence 
    was passed on July 2. It was only officially ratified on July 4.
    """,
    """
    The very first documented European to arrive in North America was 
    the Spaniard Juan Ponce de León, who landed in Florida in 1513.
    """
]

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 0
)

In [12]:
doc_chunks = text_splitter.create_documents(usa_curious_facts)

In [13]:
print(f"Now you have {len(doc_chunks)} chunks.")

Now you have 2 chunks.


In [14]:
embeddings = OpenAIEmbeddings()

In [15]:
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

In [16]:
db = DeepLake(
    dataset_path=dataset_path,
    embedding=embeddings
)

Deep Lake Dataset in hub://bilal420/basic-rag-with-deeplake already exists, loading from the storage


In [17]:
db.add_documents(doc_chunks)

Creating 2 embeddings in 1 batches of size 2:: 100%|█████████████████████████████████████| 1/1 [00:24<00:00, 24.85s/it]

Dataset(path='hub://bilal420/basic-rag-with-deeplake', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (2, 1)      str     None   
 metadata     json      (2, 1)      str     None   
 embedding  embedding  (2, 1536)  float32   None   
    id        text      (2, 1)      str     None   





['96a6e8bc-2a70-11ef-91f3-dc215cf36ec4',
 '96a6e8bd-2a70-11ef-a70a-dc215cf36ec4']

In [17]:
qa_chain.run("What is the largest state in the US?")

NameError: name 'qa_chain' is not defined

In [18]:
from langchain_openai import OpenAI

In [19]:
llm = OpenAI()

In [20]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever()
)

In [21]:
qa_chain.run("When was actually passed the U.S. Declaration of Independence?")

  warn_deprecated(


" The U.S. Declaration of Independence was officially passed on July 4, but it was first proposed on July 2 and wasn't ratified until July 4."

In [22]:
qa_chain.invoke("When was actually passed the U.S. Declaration of Independence?")

{'query': 'When was actually passed the U.S. Declaration of Independence?',
 'result': ' The Declaration of Independence was passed on July 2.'}

In [23]:
additional_usa_curious_facts = [
    """
    Alaska is the largest state in the US, and used to belong 
    to the Russian Empire before the US purchased it.
    """,
    """
    Big cities and regions have their own style of pizza: Chicago 
    Deep-Dish, New York Style, Detroit Pizza, St Louis-Style, and 
    New England Beach Pizza are just a few different varieties.
    """
]

In [24]:
additional_doc_chunks = text_splitter.create_documents(additional_usa_curious_facts)

In [25]:
db.add_documents(additional_doc_chunks)

Creating 2 embeddings in 1 batches of size 2:: 100%|█████████████████████████████████████| 1/1 [00:33<00:00, 33.29s/it]

Dataset(path='hub://bilal420/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (4, 1536)  float32   None   
    id        text      (4, 1)      str     None   
 metadata     json      (4, 1)      str     None   
   text       text      (4, 1)      str     None   





['49dacce7-3540-11ef-899e-dc215cf36ec4',
 '49dacce8-3540-11ef-b859-dc215cf36ec4']