In [1]:
!pip install -q llama-index==0.9.14.post3 deeplake==3.8.8 openai==1.3.8 cohere==4.37

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.2.1 requires langchain-core<0.4,>=0.3, but you have langchain-core 0.0.13 which is incompatible.
langchain-openai 0.2.1 requires openai<2.0.0,>=1.40.0, but you have openai 1.3.8 which is incompatible.
langchain-openai 0.2.1 requires tiktoken<1,>=0.7, but you have tiktoken 0.5.2 which is incompatible.[0m[31m
[0m

In [None]:
import os
from getpass import getpass

# Keep any credentials that are already present in the environment; otherwise
# prompt for them without echoing, so real keys are never typed into (and
# saved with) the notebook source.
for _secret_name in ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

In [None]:
import logging
import sys

# Send log records to stdout so they show up in the cell output.
# DEBUG is the most verbose level; use level=logging.INFO for less detail.
#
# force=True replaces whatever handlers are already attached to the root
# logger, so exactly one stdout handler exists after this cell runs. Adding a
# second StreamHandler on top of basicConfig's own one prints every record
# twice, and adds one more copy each time the cell is re-run.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

# LlamaHub Wikipedia Integration

In [None]:
from llama_index import download_loader

# Fetch the reader class registered under the name "WikipediaReader",
# then create the reader instance used by the following cells.
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()



In [None]:
# Wikipedia page titles to load with the reader.
wiki_pages = ['Natural Language Processing', 'Artificial Intelligence']
documents = loader.load_data(pages=wiki_pages)

In [None]:
# Number of loaded documents (displayed as the cell result).
len(documents)

2

# Save on DeepLake

In [None]:
from llama_index.vector_stores import DeepLakeVectorStore

# Activeloop organization and dataset name that make up the dataset path.
my_activeloop_org_id = "genai360"
my_activeloop_dataset_name = "LlamaIndex_intro"
dataset_path = "hub://" + my_activeloop_org_id + "/" + my_activeloop_dataset_name

# Create the Deep Lake vector store for that path (no overwrite requested).
# The index over the documents is built in a later cell.
vector_store = DeepLakeVectorStore(overwrite=False, dataset_path=dataset_path)

Your Deep Lake dataset has been successfully created!




In [None]:
from llama_index.storage.storage_context import StorageContext

# Storage context that uses the Deep Lake vector store and defaults otherwise.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
from llama_index import VectorStoreIndex

# Build the index from the loaded documents, passing the storage context
# so the vector store configured earlier is used.
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Uploading data to deeplake dataset.


100%|██████████| 23/23 [00:00<00:00, 69.43it/s]


Dataset(path='hub://genai360/LlamaIndex_intro', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (23, 1)      str     None   
 metadata     json      (23, 1)      str     None   
 embedding  embedding  (23, 1536)  float32   None   
    id        text      (23, 1)      str     None   




# Create Nodes

In [None]:
from llama_index.node_parser import SimpleNodeParser

# `documents` must already be loaded by the earlier cells.

# Chunking settings handed to the parser.
node_parser_settings = {"chunk_size": 512, "chunk_overlap": 20}
parser = SimpleNodeParser.from_defaults(**node_parser_settings)

# Turn the documents into nodes and report how many were produced.
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

48


# Create index from Documents

In [None]:
from llama_index import GPTVectorStoreIndex

# Index the documents with default settings (this rebinds `index`).
index = GPTVectorStoreIndex.from_documents(documents)

# Ask a single question through the index's default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("What does NLP stands for?")

# Show only the text of the answer as the cell result.
response.response

'NLP stands for Natural Language Processing.'

# Saving and Loading Indexes Locally

In [None]:
# Write the index to disk so it does not have to be rebuilt on every run.
# No directory is passed, so the library's default location is used
# (the 'storage' folder).
index_storage = index.storage_context
index_storage.persist()

In [None]:
# Index Storage Checks
import os.path

# This notebook pins llama-index 0.9.x, where these names are imported from
# the top-level package (as in the earlier cells). The `llama_index.core`
# import layout belongs to the 0.10+ releases.
from llama_index import (
    StorageContext,
    VectorStoreIndex,
    download_loader,
    load_index_from_storage,
)

# Folder the index is persisted to and reloaded from.
PERSIST_DIR = "./storage"

# Let's see if our index already exists in storage.
if not os.path.exists(PERSIST_DIR):
    # If not, we'll load the Wikipedia data and create a new index
    WikipediaReader = download_loader("WikipediaReader")
    loader = WikipediaReader()
    documents = loader.load_data(pages=['Natural Language Processing', 'Artificial Intelligence'])
    index = VectorStoreIndex.from_documents(documents)
    # Store the freshly built index so the next run takes the branch below.
    index.storage_context.persist(persist_dir=PERSIST_DIR)

else:
    # If the index already exists, we'll just load it:
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# Routers

In [None]:
# not from the course but referenced by the course
# code adapted from https://docs.llamaindex.ai/en/stable/module_guides/querying/router/#routers

# Import paths for the llama-index 0.9.x release pinned by this notebook
# (the `llama_index.core.*` layout belongs to 0.10+).
from llama_index import SummaryIndex
from llama_index.query_engine import RouterQueryEngine
from llama_index.selectors.pydantic_selectors import PydanticSingleSelector
from llama_index.tools import QueryEngineTool

# The two engines the router chooses between. The documentation snippet
# assumes they already exist; they are built here from the notebook's
# `documents` and `index` so the cell runs on a fresh kernel.
list_query_engine = SummaryIndex.from_documents(documents).as_query_engine(
    response_mode="tree_summarize"
)
vector_query_engine = index.as_query_engine()

# Wrap each engine in a tool; the selector picks a tool from its description.
list_tool = QueryEngineTool.from_defaults(
    query_engine=list_query_engine,
    description="Useful for summarization questions related to the data source",
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description="Useful for retrieving specific context related to the data source",
)

query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[
        list_tool,
        vector_tool,
    ],
)
# "<query>" is a placeholder; replace it with a real question.
query_engine.query("<query>")

# LangChain vs. LlamaIndex

**LlamaIndex:** LlamaIndex specializes in processing, structuring, and accessing private or domain-specific data, with a focus on specific LLM interactions. It is well suited to tasks that demand high precision and quality when working with specialized, domain-specific data. Its main strength lies in linking Large Language Models (LLMs) to any data source.

**LangChain:** LangChain is dynamic, suited for context-rich interactions, and effective for applications like chatbots and virtual assistants. These features make it highly appropriate for quick prototyping and application development.

Although the two are generally used independently, functions from LangChain and LlamaIndex can be combined where their strengths differ, so they can serve as complementary tools. We also designed a small table below to help you understand the differences better. The attached video in the course also aims to help you decide which tool to use for your application: LlamaIndex, LangChain, OpenAI Assistants, or building everything from scratch yourself.

# Environment

In [None]:
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.1
aiosignal                        1.3.1
alabaster                        0.7.13
albumentations                   1.3.1
altair                           4.2.2
anyio                            3.7.1
appdirs                          1.4.4
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array-record                     0.5.0
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.0
attrs                            23.1.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.13.1
backcall                         0.2.0
backoff                          2.2.1
beautifulsoup4                   4.12.2
bi