# Constructing an Index-Based Deep Lake Vector Store for Semantic Search with LlamaIndex and OpenAI

copyright 2024, Denis Rothman

A Practical Guide to Building a Semantic Search Engine with Deep Lake, LlamaIndex, and OpenAI:

*   Installing the Environment
*   Creating and populating the Vector Store &   dataset
*   Getting started with  index-based semantic search




# Installing the environment

In [3]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os
import pickle

# Scopes for Google Drive API
SCOPES = ['https://www.googleapis.com/auth/drive']

def authenticate_google_drive():
    creds = None
    # Check for previously saved credentials
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If no valid credentials, authenticate user
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for future use
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return build('drive', 'v3', credentials=creds)

def list_files(service):
    results = service.files().list(
        pageSize=10, fields="files(id, name)").execute()
    items = results.get('files', [])
    if not items:
        print('No files found.')
    else:
        print('Files:')
        for item in items:
            print(f"{item['name']} ({item['id']})")

if __name__ == '__main__':
    drive_service = authenticate_google_drive()
    list_files(drive_service)




Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=527172115948-rog70b503hkbpkl9fai3ral4mjn28ugl.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A45623%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=6WSC3knuD2MjzcscTdXmrTmM14wDl8&access_type=offline


[90489:90515:1231/163923.083857:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.084342:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.084423:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.084453:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.085278:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.388584:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.388665:ERROR:bus.cc(407)] Failed to connect t

"{{\"ping\",\"success\"}}}"


[90489:90515:1231/163923.627617:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90515:1231/163923.627707:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/user/1000/bus: No such file or directory
[90489:90597:1231/163923.920985:ERROR:object_proxy.cc(576)] Failed to call method: org.freedesktop.DBus.Properties.Get: object_path= /org/freedesktop/UPower: org.freedesktop.DBus.Error.ServiceUnknown: The name org.freedesktop.UPower was not provided by any .service files
[90489:90597:1231/163923.921345:ERROR:object_proxy.cc(576)] Failed to call method: org.freedesktop.UPower.GetDisplayDevice: object_path= /org/freedesktop/UPower: org.freedesktop.DBus.Error.ServiceUnknown: The name org.freedesktop.UPower was not provided by any .service files
[90489:90597:1231/163923.921688:ERROR:object_proxy.cc(576)] Failed to call method: org.freedesktop.UPower.EnumerateDevices: object_path= /org/

Files:
activeloop.txt (1eRoLD9eFtfsTjv_SPYYuNiMXKHe6HIDC)
activeloop.txt (19iifJ1d_Do5qVVwry0JijpGFBmI0Ox-V)
api_key.txt (1u4M-skJmfqO7Xp5QirzWQYu7WoExcuC4)
api_key.txt (1x055GOhnKBrqBNQPOn79JJ6ivEfh5FDsJqhSyfNo1tE)
openai_api_key.txt (1i15toAA1iEn7GXFD357wyvMT3vRl7kw4)
RAG_book (1JYnqwBSgAlTNJRwwuDz9bfHgupxoFMNW)
openai_api_key.txt (15eZSHywfoScnR9vJJr3zi5HKmpbfiJd0)
AdvancedCyber_RAG_v2.ipynb (1rb8k02TdNLxirA0w0naTKCl7IXVmnioG)
On Being Human - Reading Group.pdf (1BDK8x1LgH8ocn2qpEOwspQW0MlFHM6R-)
Finding_Ranking_v3.ipynb (1244BZvf_rcuKMz_-GeaKkr8B0_1_s3QX)


*First run the following cells and restart Google Colab session if prompted. Then run the notebook again cell by cell to explore the code.*

In [26]:

def download_file(service, file_id, output_path):
    request = service.files().get_media(fileId=file_id)
    with open(output_path, 'wb') as f:
        f.write(request.execute())
    print(f"File downloaded to {output_path}")


file_id = "1u4M-skJmfqO7Xp5QirzWQYu7WoExcuC4" 
download_file(drive_service, file_id, 'api_key.txt')

# Read the API key from the file
with open('api_key.txt', 'r') as file:
    api_key = file.read().strip()
    OPENAI_API_KEY = api_key

os.environ['OPENAI_API_KEY'] = api_key
print(f"API Key: {api_key}")




File downloaded to api_key.txt
API Key: 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


In [27]:
file_id = "1eRoLD9eFtfsTjv_SPYYuNiMXKHe6HIDC" 
download_file(drive_service, file_id, 'activeloop.txt')

# Read the API key from the file
with open('activeloop.txt', 'r') as file:
    activeloop_api_key = file.read().strip()


os.environ['ACTIVELOOP_TOKEN'] =activeloop_api_key


File downloaded to activeloop.txt


In [6]:
!pip install llama-index-vector-stores-deeplake==0.1.6

Collecting llama-index-vector-stores-deeplake==0.1.6
  Downloading llama_index_vector_stores_deeplake-0.1.6-py3-none-any.whl.metadata (709 bytes)
Collecting llama-index-core<0.11.0,>=0.10.1 (from llama-index-vector-stores-deeplake==0.1.6)
  Downloading llama_index_core-0.10.68.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting SQLAlchemy>=1.4.49 (from SQLAlchemy[asyncio]>=1.4.49->llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.6)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.6)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.6)
  Downloading Deprecated-1.2.15-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11

LlamaIndex supports Deep Lake vector stores through the DeepLakeVectorStore class.

In [14]:
!pip install deeplake==3.9.18



In [15]:
!pip install llama-index==0.10.64



Next, let's import the required modules and set the needed environmental variables:

In [16]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

In [17]:
!pip install sentence-transformers==3.0.1



# Pipeline 1 : Collecting and preparing the documents

In [18]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [19]:
import requests
from bs4 import BeautifulSoup
import re
import os

urls = [
    "https://github.com/VisDrone/VisDrone-Dataset",
    "https://paperswithcode.com/dataset/visdrone",
    "https://openaccess.thecvf.com/content_ECCVW_2018/papers/11133/Zhu_VisDrone-DET2018_The_Vision_Meets_Drone_Object_Detection_in_Image_Challenge_ECCVW_2018_paper.pdf",
    "https://github.com/VisDrone/VisDrone2018-MOT-toolkit",
    "https://en.wikipedia.org/wiki/Object_detection",
    "https://en.wikipedia.org/wiki/Computer_vision",
    "https://en.wikipedia.org/wiki/Convolutional_neural_network",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://www.faa.gov/uas/",
    "https://www.tensorflow.org/",
    "https://pytorch.org/",
    "https://keras.io/",
    "https://arxiv.org/abs/1804.06985",
    "https://arxiv.org/abs/2202.11983",
    "https://motchallenge.net/",
    "http://www.cvlibs.net/datasets/kitti/",
    "https://www.dronedeploy.com/",
    "https://www.dji.com/",
    "https://arxiv.org/",
    "https://openaccess.thecvf.com/",
    "https://roboflow.com/",
    "https://www.kaggle.com/",
    "https://paperswithcode.com/",
    "https://github.com/"
]

In [20]:
import requests
import re
import os
from bs4 import BeautifulSoup

def clean_text(content):
    # Remove references and unwanted characters
    content = re.sub(r'\[\d+\]', '', content)   # Remove references
    content = re.sub(r'[^\w\s\.]', '', content)  # Remove punctuation (except periods)
    return content

def fetch_and_clean(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise exception for bad responses (e.g., 404)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritize "mw-parser-output" but fall back to "content" class if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        # Remove specific sections, including nested ones
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()
                section = content.find('span', id=section_title)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)
        text = clean_text(text)
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None  # Return None on error

# Directory to store the output files
output_dir = './data/'  # More descriptive name
os.makedirs(output_dir, exist_ok=True)

# Processing each URL (and skipping invalid ones)
for url in urls:
    article_name = url.split('/')[-1].replace('.html', '')  # Handle .html extension
    filename = os.path.join(output_dir, f"{article_name}.txt")

    clean_article_text = fetch_and_clean(url)
    if clean_article_text:  # Only write to file if content exists
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(clean_article_text)

print(f"Content(ones that were possible) written to files in the '{output_dir}' directory.")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Content(ones that were possible) written to files in the './data/' directory.


In [21]:
# load documents
documents = SimpleDirectoryReader("./data/").load_data()

In [22]:
documents[0]

Document(id_='141a7bc8-69d0-490f-86bc-f08d42e43326', embedding=None, metadata={'file_path': '/home/dk/RAG_book_1/RAG-Driven-Generative-AI/Chapter03/data/1804.06985.txt', 'file_name': '1804.06985.txt', 'file_type': 'text/plain', 'file_size': 3798, 'creation_date': '2024-12-31', 'last_modified_date': '2024-12-31'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='High Energy Physics  Theory arXiv1804.06985 hepth Submitted on 19 Apr 2018 Title A Near Horizon Extreme Binary Black Hole Geometry Authors Jacob Ciafre  Maria J. Rodriguez View a PDF of the paper titled A Near Horizon Extreme Binary Black Hole Geometry by Jacob Ciafre and Maria J. Rodriguez View PDF Abstract A new solution of fourdimensional vacuum General Relativity is presented. It describes the 

# Pipeline 2 : Creating and populating a Deep Lake Vector Store

**Replace `hub://denis76/drone_v2` by your organization and dataset name**

In [28]:
from llama_index.core import StorageContext

vector_store_path = "hub://stoneygalatia/rag_book_ch3"
dataset_path = "hub://stoneygalatia/rag_book_ch3"
import openai
import os

from openai import OpenAI
client = OpenAI(api_key=api_key)

# overwrite=True will overwrite dataset, False will append it
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Create an index over the documents
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Your Deep Lake dataset has been successfully created!




Uploading data to deeplake dataset.


100%|██████████| 86/86 [00:01<00:00, 55.00it/s]
\

Dataset(path='hub://stoneygalatia/rag_book_ch3', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (86, 1)      str     None   
 metadata     json      (86, 1)      str     None   
 embedding  embedding  (86, 1536)  float32   None   
    id        text      (86, 1)      str     None   


 

In [29]:
import deeplake
ds = deeplake.load(dataset_path)  # Load the dataset

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/stoneygalatia/rag_book_ch3



|

hub://stoneygalatia/rag_book_ch3 loaded successfully.



 

In [30]:
import json
import pandas as pd
import numpy as np

# Assuming 'ds' is your loaded Deep Lake dataset

# Create a dictionary to hold the data
data = {}

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    # Check if the tensor is multi-dimensional
    if tensor_data.ndim > 1:
        # Flatten multi-dimensional tensors
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        # Convert 1D tensors directly to lists and decode text
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

In [31]:
# Function to display a selected record
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record.get("id", "N/A"),
        "Metadata": record.get("metadata", "N/A"),
        "Text": record.get("text", "N/A"),
        "Embedding": record.get("embedding", "N/A")
    }

    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

# Function call to display a record
rec = 0  # Replace with the desired record number
display_record(rec)

ID:
['f5a1b9bb-1985-44cb-b9b6-4978ba8615b0']

Metadata:
file_path: /home/dk/RAG_book_1/RAG-Driven-Generative-AI/Chapter03/data/1804.06985.txt
file_name: 1804.06985.txt
file_type: text/plain
file_size: 3798
creation_date: 2024-12-31
last_modified_date: 2024-12-31
_node_content: {"id_": "f5a1b9bb-1985-44cb-b9b6-4978ba8615b0", "embedding": null, "metadata": {"file_path": "/home/dk/RAG_book_1/RAG-Driven-Generative-AI/Chapter03/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3798, "creation_date": "2024-12-31", "last_modified_date": "2024-12-31"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "141a7bc8-69d0-490f-86bc-f08d42e43326", "node_type": "4", "metadata": {"file_path": "/home/dk/RAG_book_1/

# Original documents

In [32]:
# Ensure 'text' column is of type string
df['text'] = df['text'].astype(str)
# Create documents with IDs
documents = [Document(text=row['text'], doc_id=str(row['id'])) for _, row in df.iterrows()]

# Pipeline 3:Index-based RAG

## User input and RAG parameters

In [33]:
user_input="How do drones identify vehicles?"

#similarity_top_k
k=3
#temperature
temp=0.1
#num_output
mt=1024

## Cosine similarity metric

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]

  from tqdm.autonotebook import tqdm, trange


# Vector store index query engine

In [36]:
from llama_index.core import VectorStoreIndex
vector_store_index = VectorStoreIndex.from_documents(documents)

In [37]:
print(type(vector_store_index))

<class 'llama_index.core.indices.vector_store.base.VectorStoreIndex'>


In [38]:
vector_query_engine = vector_store_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

In [39]:
print(type(vector_query_engine))

<class 'llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine'>


## Query response and source

In [42]:
import pandas as pd
import textwrap

#Function to manage the query and return information on the content of the response.
def index_query(input_query):
    response = vector_query_engine.query(input_query)

    
    print(textwrap.fill(str(response), 100))

    node_data = []
    for node_with_score in response.source_nodes:
        node = node_with_score.node
        node_info = {
            'Node ID': node.id_,
            'Score': node_with_score.score,
            'Text': node.text
        }
        node_data.append(node_info)

    df = pd.DataFrame(node_data)

    # Instead of printing, return the DataFrame and the response object
    return df, response


In [43]:
import time
#start the timer
start_time = time.time()
df, response = index_query(user_input)
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(df.to_markdown(index=False, numalign="left", stralign="left"))  # Display the DataFrame using markdown

Drones identify vehicles through Remote ID, which is a system that makes drone locations, controller
locations, and other information public from takeoff to shutdown. This system helps in identifying
and tracking vehicles for various purposes.
Query execution time: 1.1656 seconds
| Node ID                              | Score    | Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

Node information and relationships

In [44]:
nodeid=response.source_nodes[0].node_id
nodeid

'b5bf379c-fd54-48b7-8d90-63050c9c9769'

In [45]:
response.source_nodes[0].get_text()

"['pp. 1223 1232. arXiv  2103.13933 .  Organisciak Daniel Poyser Matthew Alsehaim Aishah Hu Shanfeng IsaacMedina Brian K. S. Breckon Toby P. Shum Hubert P. H. 2022. UAVReID A Benchmark on Unmanned Aerial Vehicle Reidentification in Video Imagery. Proceedings of the 17th International Joint Conference on Computer Vision Imaging and Computer Graphics Theory and Applications . SciTePress. pp. 136 146. arXiv  2104.06219 . doi  10.52200010836600003124 . ISBN 9789897585555 .  Heathrow picks CUAS to combat drone disruption . Archived from the original on 9 November 2019 . Retrieved 13 March 2019 .  Muscat International Airport to install USD10 million Aaronia counterUAS system . 21 January 2019. Archived from the original on 9 November 2019 . Retrieved 21 January 2019 .  GrandClément Sarah Bajon Theò 19 October 2022. Uncrewed Aerial Systems A Primer . United Nations Institute for Disarmament Research . Archived from the original on 5 January 2023 . Retrieved 5 January 2023 .  cite journal    

## Optimized chunking

In [46]:
# Assuming you have the 'response' object from query_engine.query()

for node_with_score in response.source_nodes:
    node = node_with_score.node  # Extract the Node object from NodeWithScore
    chunk_size = len(node.text)
    print(f"Node ID: {node.id_}, Chunk Size: {chunk_size} characters")

Node ID: b5bf379c-fd54-48b7-8d90-63050c9c9769, Chunk Size: 3309 characters
Node ID: a4a32362-7f88-4eaa-b7ae-fd65da5f5668, Chunk Size: 4044 characters
Node ID: 51ec3466-8165-4267-b212-915e8361d3d0, Chunk Size: 3309 characters


## Performance metric

In [47]:
import numpy as np

def info_metrics(response):
  # Calculate the performance (handling None scores)
  scores = [node.score for node in response.source_nodes if node.score is not None]
  if scores:  # Check if there are any valid scores
      weights = np.exp(scores) / np.sum(np.exp(scores))
      perf = np.average(scores, weights=weights) / elapsed_time
  else:
      perf = 0  # Or some other default value if all scores are None

  average_score=np.average(scores, weights=weights)
  print(f"Average score: {average_score:.4f}")
  print(f"Query execution time: {elapsed_time:.4f} seconds")
  print(f"Performance metric: {perf:.4f}")

In [48]:
info_metrics(response)

Average score: 0.8321
Query execution time: 1.1656 seconds
Performance metric: 0.7139


# Tree index query engine

In [None]:
from llama_index.core import TreeIndex
tree_index = TreeIndex.from_documents(documents)

In [None]:
print(type(tree_index))

<class 'llama_index.core.indices.tree.base.TreeIndex'>


In [None]:
tree_query_engine = tree_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

In [None]:
import time
import textwrap
# Start the timer
start_time = time.time()
response = tree_query_engine.query(user_input)
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

Query execution time: 4.3360 seconds
Drones identify vehicles using computer vision technology related to object detection. This
technology involves detecting instances of semantic objects of a certain class, such as vehicles, in
digital images and videos. Drones can be equipped with object detection algorithms, such as YOLOv3
models trained on datasets like COCO, to detect vehicles in real-time by analyzing the visual data
captured by the drone's cameras.


## Performance metric

In [None]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.731
Query execution time: 4.3360 seconds
Performance metric: 0.1686


# List index query engine

In [49]:
from llama_index.core import ListIndex
list_index = ListIndex.from_documents(documents)

In [50]:
print(type(list_index))

<class 'llama_index.core.indices.list.base.SummaryIndex'>


In [51]:
list_query_engine = list_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

In [52]:
#start the timer
start_time = time.time()
response = list_query_engine.query(user_input)
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

Query execution time: 11.7559 seconds
Drones identify vehicles through computer vision techniques, analyzing image data from onboard
cameras to detect and classify objects based on features, shapes, and movements. Machine learning
and deep learning models are used to train drones for real-time vehicle recognition, supporting
applications like traffic monitoring and autonomous navigation.


## Performance metric

In [53]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.697
Query execution time: 11.7559 seconds
Performance metric: 0.0593


# Keyword index query index

In [54]:
from llama_index.core import KeywordTableIndex
keyword_index = KeywordTableIndex.from_documents(documents)

In [55]:
# Extract data for DataFrame
data = []
for keyword, doc_ids in keyword_index.index_struct.table.items():
    for doc_id in doc_ids:
        data.append({"Keyword": keyword, "Document ID": doc_id})

# Create the DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Keyword,Document ID
0,asymptotically flat,2e99d219-413b-405b-a005-acf4054a44bb
1,high,2e99d219-413b-405b-a005-acf4054a44bb
2,high,29a6e137-1e32-4ac8-add4-e73b285ade7f
3,entropy,2e99d219-413b-405b-a005-acf4054a44bb
4,asymptotically,2e99d219-413b-405b-a005-acf4054a44bb
...,...,...
4364,countries,0c90c20f-4ebc-4af5-8a1a-140e01104e75
4365,vancouver island university,0c90c20f-4ebc-4af5-8a1a-140e01104e75
4366,countries producing stealth aircraft,0c90c20f-4ebc-4af5-8a1a-140e01104e75
4367,civil aircraft,0c90c20f-4ebc-4af5-8a1a-140e01104e75


In [58]:
keyword_query_engine = keyword_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

In [59]:
import time

# Start the timer
start_time = time.time()

# Execute the query (using .query() method)
response = keyword_query_engine.query(user_input)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

Query execution time: 1.2054 seconds
Drones can identify vehicles through various means such as visual sensors, cameras, and advanced
imaging technologies. These technologies allow drones to capture images and videos of vehicles,
analyze their shapes, sizes, and movements, and use this data to identify and track vehicles
effectively.


## Performance metric

In [60]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.817
Query execution time: 1.2054 seconds
Performance metric: 0.6774
