In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

## 파이프라인 1: 문서 수집과 준비

In [3]:
import os
import re
import requests
from bs4 import BeautifulSoup

urls = [
    "https://github.com/VisDrone/VisDrone-Dataset",
    "https://paperswithcode.com/dataset/visdrone",
    "https://openaccess.thecvf.com/content_ECCVW_2018/papers/11133/Zhu_VisDrone-DET2018_The_Vision_Meets_Drone_Object_Detection_in_Image_Challenge_ECCVW_2018_paper.pdf",
    "https://github.com/VisDrone/VisDrone2018-MOT-toolkit",
    "https://en.wikipedia.org/wiki/Object_detection",
    "https://en.wikipedia.org/wiki/Computer_vision",
    "https://en.wikipedia.org/wiki/Convolutional_neural_network",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://www.faa.gov/uas/",
    "https://www.tensorflow.org/",
    "https://pytorch.org/",
    "https://keras.io/",
    "https://arxiv.org/abs/1804.06985",
    "https://arxiv.org/abs/2202.11983",
    "https://motchallenge.net/",
    "http://www.cvlibs.net/datasets/kitti/",
    "https://www.dronedeploy.com/",
    "https://www.dji.com/",
    "https://arxiv.org/",
    "https://openaccess.thecvf.com/",
    "https://roboflow.com/",
    "https://www.kaggle.com/",
    "https://paperswithcode.com/",
    "https://github.com/"
]

In [4]:
def clean_text(content):
    content = re.sub(r'\[\d+\]', '', content) # 참조 제거
    content = re.sub(r'[^\w\s\.]', '', content) # 문장부호 제거 (마침표 제외)
    return content

def fetch_and_clean(url):
    try:
        response = requests.get(url)
        response.raise_for_status() # 나쁜 응답에 대한 예외 발생 (예: 404)
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()
                section = content.find('span', id=section_title)
            
            text = content.get_text(separator=' ', strip=True)
            text = clean_text(text)
            return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None

In [4]:
output_dir = './data'
os.makedirs(output_dir, exist_ok=True)

for url in urls:
    article_name = url.split('/')[-1].replace('.html', '')
    filename = os.path.join(output_dir, f'{article_name}.txt')

    clean_article_text = fetch_and_clean(url)
    if clean_article_text:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(clean_article_text)

print(f"Content(ones that were possible) written to files in the '{output_dir} directory.")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Content(ones that were possible) written to files in the './data directory.


In [5]:
import textwrap

def print_formatted(text):
    # Wrap text to 80 characters width
    wrapped_text = textwrap.fill(text, width=80)
    print(wrapped_text)

In [6]:
documents = SimpleDirectoryReader("./data/").load_data()

In [6]:
documents[0].text

'High Energy Physics  Theory arXiv1804.06985 hepth Submitted on 19 Apr 2018 Title A Near Horizon Extreme Binary Black Hole Geometry Authors Jacob Ciafre  Maria J. Rodriguez View a PDF of the paper titled A Near Horizon Extreme Binary Black Hole Geometry by Jacob Ciafre and Maria J. Rodriguez View PDF Abstract A new solution of fourdimensional vacuum General Relativity is presented. It describes the near horizon region of the extreme maximally spinning binary black hole system with two identical extreme Kerr black holes held in equilibrium by a massless strut. This is the first example of a nonsupersymmetric asymptotically flat near horizon extreme binary black hole geometry of two uncharged black holes. The black holes are corotating and the solution is uniquely specified by the mass. The binary extreme system has finite entropy. The distance between the black holes is fixed but there is a zerodistance limit where the objects collapse into one. This limiting geometry corresponds to the

## 파이프라인 2: 벡터 저장소 준비

In [7]:
from llama_index.core import StorageContext

vector_store_path = "hub://secufibre/drone_v2"
dataset_path = "hub://secufibre/drone_v2"

In [8]:
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Your Deep Lake dataset has been successfully created!


 

Uploading data to deeplake dataset.


100%|██████████| 90/90 [00:02<00:00, 40.80it/s]
|

Dataset(path='hub://secufibre/drone_v2', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (90, 1)      str     None   
 metadata     json      (90, 1)      str     None   
 embedding  embedding  (90, 1536)  float32   None   
    id        text      (90, 1)      str     None   


 

In [9]:
documents

[Document(id_='b73ced47-88b3-4d67-9999-7bedd6d96266', embedding=None, metadata={'file_path': '/storage2/RAG/003_RAG_index-based_llamaindex_deeplake_openai/data/1804.06985.txt', 'file_name': '1804.06985.txt', 'file_type': 'text/plain', 'file_size': 3798, 'creation_date': '2025-04-03', 'last_modified_date': '2025-04-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='High Energy Physics  Theory arXiv1804.06985 hepth Submitted on 19 Apr 2018 Title A Near Horizon Extreme Binary Black Hole Geometry Authors Jacob Ciafre  Maria J. Rodriguez View a PDF of the paper titled A Near Horizon Extreme Binary Black Hole Geometry by Jacob Ciafre and Mari

In [9]:
import deeplake

ds = deeplake.load(dataset_path)

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/secufibre/drone_v2



 

hub://secufibre/drone_v2 loaded successfully.



 

In [28]:
import json
import pandas as pd
import numpy as np

data = {}

for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    if tensor_data.ndim > 1:
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

df = pd.DataFrame(data)

In [29]:
df.head()

Unnamed: 0,embedding,id,metadata,text
0,"[-0.0027042930014431477, 0.007869635708630085,...",[668b88ea-59f2-496b-86b0-35219acff460],[{'file_path': '/storage2/RAG/003_RAG_index-ba...,[High Energy Physics Theory arXiv1804.06985 h...
1,"[-0.014088690280914307, 0.014641465619206429, ...",[bb732a80-c435-416b-88df-51415d4b19ac],[{'file_path': '/storage2/RAG/003_RAG_index-ba...,[Computer Science Computer Vision and Pattern...
2,"[-0.014408417046070099, -0.0004371690738480538...",[fea05bc2-e717-4a11-8e32-1480be7231ad],[{'file_path': '/storage2/RAG/003_RAG_index-ba...,[Computerized information extraction from imag...
3,"[-0.004417059011757374, -0.0012542684562504292...",[93a9d953-f1d4-4e17-8678-63562939a727],[{'file_path': '/storage2/RAG/003_RAG_index-ba...,[These include the concept of scalespace the ...
4,"[-0.018791090697050095, 0.00625045457854867, 0...",[1e4ffa59-1f44-4861-8386-1eb19358dbb7],[{'file_path': '/storage2/RAG/003_RAG_index-ba...,[Also some of the learningbased methods develo...


In [30]:
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record["id"] if "id" in record else "N/A",
        "Metadata": record["metadata"] if "metadata" in record else "N/A",
        "Text": record["text"] if "text" in record else "N/A",
        "Embedding": record["embedding"] if "embedding" in record else "N/A"
    }
    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

In [31]:
rec = 0
display_record(rec)

ID:
['668b88ea-59f2-496b-86b0-35219acff460']

Metadata:
file_path: /storage2/RAG/003_RAG_index-based_llamaindex_deeplake_openai/data/1804.06985.txt
file_name: 1804.06985.txt
file_type: text/plain
file_size: 3798
creation_date: 2025-04-03
last_modified_date: 2025-04-03
_node_content: {"id_": "668b88ea-59f2-496b-86b0-35219acff460", "embedding": null, "metadata": {"file_path": "/storage2/RAG/003_RAG_index-based_llamaindex_deeplake_openai/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3798, "creation_date": "2025-04-03", "last_modified_date": "2025-04-03"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "958609f8-f8f4-4ff8-b0b4-718934cb602a", "node_type": "4", "metadata": {"file_path": "/storage

## 파이프라인 3: 색인 기반 RAG

In [32]:
user_input = "How do drones identify vehicles?"

In [33]:
k = 3
temp = 0.1
mt = 1024

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]

In [35]:
from llama_index.core import VectorStoreIndex

vector_store_index = VectorStoreIndex.from_documents(documents)

In [36]:
print(type(vector_store_index))

<class 'llama_index.core.indices.vector_store.base.VectorStoreIndex'>


In [37]:
vector_query_engine = vector_store_index.as_query_engine(
    similarity_top_k=k,
    temperature=temp,
    num_output=mt
)

In [38]:
import textwrap
import pandas as pd

def index_query(input_query):
    response = vector_query_engine.query(input_query)
    print(textwrap.fill(str(response), width=100))

    node_data = []
    for node_with_score in response.source_nodes:
        node = node_with_score.node
        node_info = {
            'Node ID': node.id_,
            'Score': node_with_score.score,
            'Text': node.text
        }
        node_data.append(node_info)
    
    df = pd.DataFrame(node_data)
    return df, response

In [40]:
import time

start_time = time.time()
df, response = index_query(user_input)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(df.to_markdown(index=False, numalign="left", stralign="left"))

Drones can identify vehicles across different cameras with different viewpoints and hardware
specifications using reidentification methods.
Query execution time: 1.5290 seconds
| Node ID                              | Score    | Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [41]:
nodeid = response.source_nodes[0].node_id
nodeid

'9882d5d1-8ad4-4e93-88ae-54ce9aefc25e'

In [42]:
response.source_nodes[0].get_text()

'Automatic tracking and detection of UAVs from commercial cameras have become accurate thanks to the development of deep learning based machine learning algorithms.  218  It is also possible to automatically identify UAVs across different cameras with different viewpoints and hardware specification with reidentification methods.  219  Commercial systems such as the Aaronia AARTOS have been installed on major international airports.  220   221  Once a UAV is detected it can be countered with kinetic force missiles projectiles or another UAV or by nonkinetic force laser microwaves communications jamming.  222  Antiaircraft missile systems such as the Iron Dome are also being enhanced with CUAS technologies. Utilising a smart UAV swarm to counter one or more hostile UAVs is also proposed.  223  Regulation  edit  Main article Regulation of unmanned aerial vehicles Regulatory bodies around the world are developing unmanned aircraft system traffic management solutions to better integrate UAV

In [52]:
response.source_nodes[0].node

TextNode(id_='9882d5d1-8ad4-4e93-88ae-54ce9aefc25e', embedding=None, metadata={'file_path': '/storage2/RAG/003_RAG_index-based_llamaindex_deeplake_openai/data/Unmanned_aerial_vehicle.txt', 'file_name': 'Unmanned_aerial_vehicle.txt', 'file_type': 'text/plain', 'file_size': 104466, 'creation_date': '2025-04-03', 'last_modified_date': '2025-04-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7c0b268e-7f65-4502-a93b-a59dfbf6121f', node_type='4', metadata={'file_path': '/storage2/RAG/003_RAG_index-based_llamaindex_deeplake_openai/data/Unmanned_aerial_vehicle.txt', 'file_name': 'Unmanned_aerial_vehicle.txt', 'file_type': 'text/plain', 'file_size': 104466, 'creation_date': '2025-04-03', 'last_modified_date': 

In [53]:
for node_with_score in response.source_nodes:
    node = node_with_score.node
    chunk_size = len(node.text)
    print(f"Node ID: {node.id_}, Chunk Size: {chunk_size} characters")

Node ID: 9882d5d1-8ad4-4e93-88ae-54ce9aefc25e, Chunk Size: 4645 characters
Node ID: cf53226b-ac11-4092-a3b2-9945f3203f87, Chunk Size: 4622 characters
Node ID: 9919f108-7cd0-409d-8c6f-fd741ba99853, Chunk Size: 4955 characters


In [56]:
import numpy as np

def info_metrics(response):
    scores = [node.score for node in response.source_nodes if node.score is not None]
    if scores:
        weights = np.exp(scores) / np.sum(np.exp(scores))
        perf = np.average(scores, weights=weights) / elapsed_time
    else:
        perf = 0

    average_score=np.average(scores, weights=weights)
    print(f"Average score: {average_score:.4f}")
    print(f"Query execution time: {elapsed_time:.4f} seconds")
    print(f"Performance metric: {perf:.4f}")

In [57]:
info_metrics(response)

Average score: 0.8325
Query execution time: 1.5290 seconds
Performance metric: 0.5445


## 트리 색인 쿼리 엔진

In [58]:
from llama_index.core import TreeIndex

tree_index = TreeIndex.from_documents(documents)

In [59]:
print(type(tree_index))

<class 'llama_index.core.indices.tree.base.TreeIndex'>


In [60]:
tree_query_engine = tree_index.as_query_engine(
    similarity_top_k=k,
    temperature=temp,
    num_output=mt
)

In [61]:
import time
import textwrap

start_time = time.time()
response = tree_query_engine.query(user_input)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 4.0114 seconds
Drones identify vehicles by utilizing convolutional neural networks (CNNs) for image processing.
These networks are trained on large datasets to recognize patterns and features in images, allowing
drones to distinguish vehicles based on visual information captured by their cameras.


In [63]:
response.response

'Drones identify vehicles by utilizing convolutional neural networks (CNNs) for image processing. These networks are trained on large datasets to recognize patterns and features in images, allowing drones to distinguish vehicles based on visual information captured by their cameras.'

In [64]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response.response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")

performance = similarity_score / elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.774
Query execution time: 4.0114 seconds
Performance metric: 0.1930


# 목록 색인 쿼리 엔진

In [65]:
from llama_index.core import ListIndex

list_index = ListIndex.from_documents(documents)

In [66]:
print(type(list_index))

<class 'llama_index.core.indices.list.base.SummaryIndex'>


In [67]:
list_query_engine = list_index.as_query_engine(
    similarity_top_k=k,
    temperature=temp,
    num_output=mt
)

In [68]:
start_time = time.time()
response = list_query_engine.query(user_input)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 19.0948 seconds
Drones can identify vehicles through computer vision methods, which analyze images or video from the
drone's cameras to detect and recognize vehicles based on visual characteristics. Object detection
algorithms, often utilizing convolutional neural networks (CNNs) or other machine learning models,
are employed to classify and pinpoint vehicles within the captured scenes.


In [69]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response.response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")

performance = similarity_score / elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.764
Query execution time: 19.0948 seconds
Performance metric: 0.0400


## 키워드 색인 쿼리 엔진

In [70]:
from llama_index.core import KeywordTableIndex

keyword_index = KeywordTableIndex.from_documents(documents)

In [72]:
data = []
for keyword, doc_ids in keyword_index.index_struct.table.items():
    for doc_id in doc_ids:
        data.append({"Keyword": keyword, "Document ID": doc_id})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Keyword,Document ID
0,binary,e625c299-bf5c-4af2-b9bb-c472fc7f7d15
1,kerr,e625c299-bf5c-4af2-b9bb-c472fc7f7d15
2,hole,e625c299-bf5c-4af2-b9bb-c472fc7f7d15
3,extreme binary black hole geometry,e625c299-bf5c-4af2-b9bb-c472fc7f7d15
4,general relativity,e625c299-bf5c-4af2-b9bb-c472fc7f7d15


In [73]:
keyword_query_engine = keyword_index.as_query_engine(
    similarity_top_k=k,
    temperature=temp,
    num_output=mt
)

In [74]:
start_time = time.time()
response = keyword_query_engine.query(user_input)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 1.8174 seconds
Drones can identify vehicles through various methods such as radar-electrooptical data fusion for
noncooperative sense and avoid systems. Additionally, drones can utilize advanced technologies like
adaptive control systems and artificial intelligence for vehicle identification purposes.


In [75]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response.response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")

performance = similarity_score / elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.783
Query execution time: 1.8174 seconds
Performance metric: 0.4310
