### 🔨 Setup

⚠ In order to run this notebook you need two separate services:

- ChromaDB: `chroma run --path ./data/chromadb --port 7000`
- vLLM:     `vllm serve <your-model> --chat-template <your-template>`

In [1]:
from pathlib import Path

LOCAL_OPENAI_API_URI = "http://localhost:8000/v1"
LOCAL_CHROMA_URI = "localhost:7000"
OPENAI_API_KEY = "EMPTY"
DATA_PATH = "./data"
DOCS_PATH = Path(DATA_PATH) / "processed_docs"

DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
DEFAULT_CHROMA_COLLECTION_NAME = "llm.xplore"

# FIXME: VSCode has trouble importing local modules if the notebook is not at the root
# See:
# - https://github.com/microsoft/pylance-release/issues/3035
# - https://github.com/microsoft/vscode-jupyter/issues/7926
# -
# So we change to the root of the repo (RUN ONLY ONCE!)
%cd ..
print(Path.cwd())

/home/ubuntu/code/LLM-exploratory
/home/ubuntu/code/LLM-exploratory


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### 🔌 OpenAI-like API

In [None]:
from openai import OpenAI

# Check what model is available from the API
client = OpenAI(api_key=OPENAI_API_KEY, base_url=LOCAL_OPENAI_API_URI)
available_llms = client.models.list().data
print(f"Available models:")
print(" - " + " - ".join(llm.id for llm in available_llms))

In [3]:
from llama_index.llms.openai_like import OpenAILike

llm = OpenAILike(
    model=available_llms[0].id,
    api_base=LOCAL_OPENAI_API_URI,
    api_key=OPENAI_API_KEY
)

response = llm.complete("What is the best thing about LLMs?")
print(str(response))

  from .autonotebook import tqdm as notebook_tqdm


 The best thing about LLMs is their ability to learn and improve over time


### ❓ DocQA

#### 🧬 Embeddings

> NOTE: Try out `BGE` and `Instruct` embeddings VS `SBERT`

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
embeddings = embed_model.get_text_embedding("What is the best thing about LLMs?")
print(len(embeddings))
print(embeddings[:5])

  from .autonotebook import tqdm as notebook_tqdm


384
[-0.039648737758398056, -0.0409272238612175, 0.020214181393384933, -0.00946382712572813, 0.00874999351799488]


#### 📚 Indexing

> TODO: Check out [ingestion pipelines](https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/)

In [4]:
import json

from llama_index.core.schema import Document
from llm.xplore.helpers import collect_files

# Index some documents
doc_files = collect_files(DOCS_PATH)
print(f"📚 Total files: {len(doc_files)}")

documents = []
for doc_file in doc_files:
    print(f"📖 Indexing doc '{doc_file.name}'")
    doc_data = json.loads(doc_file.open().read())
    documents.append(Document(
        doc_id=doc_data["sha1"],
        text="\n\n".join(doc_data["text"]),
        extra_info={
            "name": doc_data["name"],
            "type": doc_data["type"],
        },
    ))

📚 Total files: 8
📖 Indexing doc 'image_page.json'
📖 Indexing doc 'california-commercial-lease-agreement_full_text.json'
📖 Indexing doc 'DATA_README.json'
📖 Indexing doc '20220405 - Glennmont Partners 2-way NDA.json'
📖 Indexing doc 'ConstructionSampleClause2_image_only.json'
📖 Indexing doc 'LinkedIn_MNG_4.19.2024_Adv, Mktg, Digial_Contract Cover Sheet_encryption_layer.json'
📖 Indexing doc 'mixed_page.json'
📖 Indexing doc 'Copy of MMG.DD NDA_Fails_OCR.json'


In [6]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from IPython.display import Markdown, display
import chromadb


# db = chromadb.PersistentClient(path=str(DATA_PATH / "chromadb"))
host, port = LOCAL_CHROMA_URI.split(":")
db = chromadb.HttpClient(host, port)
chroma_collection = db.get_or_create_collection(DEFAULT_CHROMA_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


# TODO: Check index.insert for subsequent inserts
# TODO: explore 'SemanticSplitterNodeParser'
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model,
    transformations=[SentenceSplitter(chunk_size=100, chunk_overlap=20)],
)

# OR to get the index directly
index = VectorStoreIndex.from_vector_store(vector_store, embed_model)

In [15]:
print("Collection [BEFORE]: ", db.list_collections())
# db.delete_collection(DEFAULT_CHROMA_COLLECTION_NAME)
# print("Collections [AFTER]: ", db.list_collections())

# col = db.get_collection(DEFAULT_CHROMA_COLLECTION_NAME)
# col.get()["metadatas"]

Collection [BEFORE]:  [Collection(id=e03d96d9-111b-48c9-8372-6fcf6847dfe7, name=llm.xplore)]
Collections [AFTER]:  []


#### 🧲 Retrieval

##### Filtering by metadata

In [17]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

def print_sources(retrieved_nodes, print_text:bool = False):
    for node in retrieved_nodes:
        print(f"Document ID: {node.node.ref_doc_id} [node ID: {node.node.node_id}]")
        if print_text:
            print(f"Text: {node.node.text}")
        print(f"Score: {node.score}")
        print("---")


# filtering by metadata
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="type", operator=FilterOperator.EQ, value="Reseller Agreement"
        ),
    ]
)

# Retrieve nodes for a specific metadata filter
retriever = index.as_retriever(filters=filters)
retrieved_nodes = retriever.retrieve("")
print_sources(retrieved_nodes)

Document ID: ebfc841fbb0e2126d6cc29df0aa73d0454711e69 [node ID: 2de34741-2e79-4d73-b131-39000d6e66a5]
Score: None
---
Document ID: ebfc841fbb0e2126d6cc29df0aa73d0454711e69 [node ID: 95d33659-58af-493e-92a5-ca2ad76d44bd]
Score: None
---


##### Filtering by doc IDs

In [13]:
# Retrieve nodes for a specific document
# FIXME: Filtering by `doc_ids` doesn't work.
# See [llama_index/issues/14121](https://github.com/run-llama/llama_index/issues/14121)
retriever = index.as_retriever(doc_ids=["ebfc841fbb0e2126d6cc29df0aa73d0454711e69"])
retrieved_nodes = retriever.retrieve("")
print_sources(retrieved_nodes)

Document ID: 17f14f0a89df3ba96cc4966588682288e07fbc8d [node ID: e09a844f-f7e7-473d-b40d-7632f25768bd]
Score: None
---
Document ID: 17f14f0a89df3ba96cc4966588682288e07fbc8d [node ID: 527df76b-8035-4e84-b67b-5a432a849040]
Score: None
---


##### Filtering by Query

In [14]:
# Retrieve nodes for a specific query
retriever = index.as_retriever()
retrieved_nodes = retriever.retrieve("terms of the lease agreement in California")
print_sources(retrieved_nodes, True)

Document ID: 132d216ac71cd53e993251450243e588f548bf1d [node ID: 187cc315-3f02-48c7-8cb7-014b3251da41]
Text: 24. Headings. 
The headings used in this Lease are for convenience of the parties only and shall not be 
considered in interpreting the meaning of any provision of this Lease. 
25. Successors. 
The provisions of this Lease shall extend to and be binding upon Landlord and Tenant and their 
respective legal representatives, successors and assigns. 
26. Consent.
Score: 0.7072424642345998
---
Document ID: 132d216ac71cd53e993251450243e588f548bf1d [node ID: 0c758248-d190-43b9-8ab0-a49474ccbbcb]
Text: Tenant 
shall be relieved from paying rent and other charges during any portion of the Lease term that the 
Leased Premises are inoperable or unfit for occupancy, or use, in whole or in part, for Tenant's 
Download Free Templates & Forms at Speedy Template http://www.SpeedyTemplate.com/


purposes.
Score: 0.7049398819743917
---


#### 🦜 Generate answers

In [56]:
# Query Data from the index
query_engine = index.as_query_engine(
    llm=llm,
    doc_ids=["78944398eb5ef89fba1e4e7cd512066619982766"]
)
response = query_engine.query("What does LinkedIn provide?")
display(Markdown(f"{response}"))

 A Sales Navigator Advanced tool that allows our BDR and Sales teams to connect with

In [57]:
response = query_engine.query("Who are the parties?")
display(Markdown(f"{response}"))

 Glennmont Partners and the other party is not specified in the given context information.

In [58]:
response = query_engine.query("Who are is the disclosing party?")
display(Markdown(f"{response}"))

8.2 The Disclosing Party (including its Representatives) does not make any

### 🧪 Semantic chunking

In [5]:
from llama_index.core.node_parser import SemanticSplitterNodeParser

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)
nodes = splitter.get_nodes_from_documents(documents)

In [26]:
from llm.xplore.helpers import print_table

print_table(
    "Semantic Nodes",
    columns=[
        {"name": "Doc Name", "style": "cyan", "max_width": 25},
        {"name": "Node", "style": "magenta", "min_width": 75},
    ],
    rows=[(node.metadata["name"], node.text) for node in nodes],
)

ImportError: cannot import name 'print_table' from 'llm.xplore.helpers' (/home/ubuntu/code/LLM-exploratory/llm/xplore/helpers.py)