In [169]:
import os
from dotenv import load_dotenv

import logging
import sys

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext, load_index_from_storage

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Load variables from a local .env file into the process environment;
# the Azure settings cell below reads them through os.environ.
load_dotenv()

True

## Settings

In [40]:
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [44]:
# Shared question string, reused by several query cells later in the notebook.
query = "What are the misunderstandings in Warbreaker? Return output as a list."

In [41]:

# --- Azure OpenAI connection settings -------------------------------------
# Every value read from the environment is None when its variable is unset.
api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_version = os.getenv("AZURE_OPENAI_API_VERSION")
model_name = os.getenv("AZURE_OPENAI_MODEL")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
# The embedding model name is fixed in code rather than read from the environment.
embed_model_name = "text-embedding-3-small"

# Chat-completion client.
llm = AzureOpenAI(
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    model=model_name,
    deployment_name=deployment_name,
)

# Embedding client. You need to deploy your own embedding model as well as
# your own chat completion model; the deployment name passed here is the
# embedding model name itself.
embed_model = AzureOpenAIEmbedding(
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    model=embed_model_name,
    deployment_name=embed_model_name,
)

# Register both clients on the llama_index global Settings object.
Settings.llm, Settings.embed_model = llm, embed_model

In [None]:
# Read the .txt files under ./data and build an in-memory vector index from them.
# `required_exts` is documented as a list of extensions, so pass a list
# instead of a bare string.
documents = SimpleDirectoryReader("data", required_exts=[".txt"]).load_data()
index = VectorStoreIndex.from_documents(documents)



In [None]:
# Ask a sample question against the index built in the previous cell.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)

# Storage - Saving your Index

In [None]:
# Build-once cache for the index: reuse what is on disk when present,
# otherwise build from ./data and write it out.
# NOTE(review): ".data/storage" is a hidden ".data" directory, while the other
# paths in this notebook start with "./data" - confirm this is intended.
PERSIST_DIR = ".data/storage"
if os.path.exists(PERSIST_DIR):
    # A persisted index already exists: load it back.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # Nothing persisted yet: read the documents, index them, store for later.
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

In [None]:
# Same sample question as before, now against the persisted-or-loaded index.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)

## Obsidian Reader

In [200]:
from llama_index.readers.obsidian import ObsidianReader

In [202]:
# Vault location: taken from the OBSIDIAN_VAULT_PATH environment variable when
# it is set, so the notebook can run on another machine; otherwise the original
# local path is used unchanged.
obsidian_path = os.environ.get(
    "OBSIDIAN_VAULT_PATH",
    "/Users/mo/Library/Mobile Documents/iCloud~md~obsidian/Documents/MainVault",
)
obsidian_documents = ObsidianReader(input_dir=obsidian_path).load_data()
# Number of documents the reader produced.
len(obsidian_documents)

2863

In [203]:
def filter_empty_documents(documents):
    """Return the documents whose id does not belong to an empty document.

    A document is considered empty when its ``text`` is exactly ``""``.
    Filtering is done by ``id_``, so any document sharing an id with an empty
    document is dropped as well. Input order is preserved.

    Args:
        documents: Iterable of objects exposing ``text`` and ``id_``.

    Returns:
        A new list with the remaining documents.
    """
    # A set keeps each membership test O(1) instead of scanning a list for
    # every document.
    empty_ids = {doc.id_ for doc in documents if doc.text == ""}
    return [doc for doc in documents if doc.id_ not in empty_ids]

# Rebind the name to the filtered list; the unfiltered list is no longer
# reachable under this name after this line runs.
obsidian_documents = filter_empty_documents(obsidian_documents)

In [None]:
# Document count after the empty-document filter above.
len(obsidian_documents)

In [None]:
# Build a vector index over the filtered Obsidian documents.
obsidian_index = VectorStoreIndex.from_documents(obsidian_documents)

In [None]:
def create_persisted_index(path, index):
    """Persist ``index`` to ``path`` unless something already exists there.

    Args:
        path: Directory to persist into. It is both the location checked for
            an existing copy and the location written to.
        index: Object exposing ``storage_context.persist(persist_dir=...)``.

    Returns:
        None. When ``path`` already exists nothing is written.
    """
    if not os.path.exists(path):
        # Write to the path that was checked, using the index that was passed
        # in, so a later load from the same path finds the data.
        index.storage_context.persist(persist_dir=path)

In [None]:
# Target directory for the persisted Obsidian index, handed to the helper above.
persist_path = "./data/obsidians_index_storage"
create_persisted_index(persist_path, obsidian_index)

In [None]:
# Rebinds `query_engine` (used by earlier cells) to the Obsidian index.
query_engine = obsidian_index.as_query_engine()

In [None]:

# Ask a question against whichever index `query_engine` is currently bound to.
response = query_engine.query("What do we undestand about Warbreaker?")
print(response)

# Load from Storage

In [78]:
# Directory the persisted Obsidian index is loaded from in the cells below.
path = "data/obsidians_index_storage"

def load_index(path):
    """Load a persisted index from ``path``.

    Args:
        path: Directory that holds a previously persisted index.

    Returns:
        The index loaded from storage, or ``None`` when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None
    # Rebuild the storage context from disk and load the existing index from it.
    storage_context = StorageContext.from_defaults(persist_dir=path)
    return load_index_from_storage(storage_context)


In [128]:
# Rebinds `index` and `query_engine` to the index loaded from disk.
# load_index returns None when `path` does not exist; in that case the
# as_query_engine() call below raises AttributeError.
index = load_index(path)
query_engine = index.as_query_engine()

In [45]:
# Run the shared `query` string through the current query engine.
response = query_engine.query(query)
print(response)

1. The Hallandren gods believe that Idrians are rebels, while the Idrians simply want to be left alone.
2. Vivenna thinks Siri is being forced to wear colorful clothes and participate in heresies to demonstrate control, but Siri is actually enjoying it.
3. There is a misconception that one religion is the same as Hallandren, but they only accept the God King as their ruler, not as their God.


# Index with LanceDB


In [198]:
from llama_index.vector_stores.lancedb import LanceDBVectorStore

In [204]:
# LanceDB-backed vector store at ./data/obsidian_lancdb, opened with
# mode="overwrite" and query_type="hybrid".
vector_store = LanceDBVectorStore(
    uri="./data/obsidian_lancdb", mode="overwrite", query_type="hybrid"
)
# Rebinds `storage_context`, which an earlier cell also assigns.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Rebinds `index`: from here on it refers to the LanceDB-backed index built
# from the Obsidian documents.
index = VectorStoreIndex.from_documents(
    obsidian_documents, storage_context=storage_context
)

[2024-09-04T10:38:25Z WARN  lance::dataset] No existing dataset at /Users/mo/Projects/obsidian-rag-chat/data/obsidian_lancdb/vectors.lance, it will be created


In [209]:
# Query engine over whichever index `index` currently refers to.
l_query_engine = index.as_query_engine()


# Load Index from LanceDB

In [210]:
# `lancedb` is imported here because no cell above this one imports it;
# without this the cell raises NameError when the notebook runs top to bottom.
import lancedb

# Connect to the local LanceDB directory and open its "vectors" table.
db = lancedb.connect(uri='./data/obsidian_lancdb')
lance_table = db.open_table("vectors")

In [211]:
def index_from_lancedb(table):
    """Wrap a LanceDB table in a vector store and return an index over it.

    Args:
        table: LanceDB table handed to ``LanceDBVectorStore.from_table``.

    Returns:
        The ``VectorStoreIndex`` built from that vector store.
    """
    return VectorStoreIndex.from_vector_store(LanceDBVectorStore.from_table(table))

# def index_from_lancedb_vectorstrore

# Index over the "vectors" table opened in the previous cell.
loaded_index = index_from_lancedb(lance_table)

In [212]:
# Query engine over the index rebuilt from the LanceDB table.
loaded_query_engine = loaded_index.as_query_engine()

In [213]:
# Bare last expression: the full Response repr is displayed, including the
# embeddings of the source nodes.
loaded_query_engine.query(query)

Response(response='The provided context does not contain any information regarding misunderstandings in "Warbreaker." Therefore, I cannot provide a list related to that topic.', source_nodes=[NodeWithScore(node=TextNode(id_='9ed08dbf-6fd1-4a4c-8cd8-5c3c4853703c', embedding=[0.004109594970941544, 0.0010162964463233948, 0.04003695398569107, -0.03465314581990242, -0.014976280741393566, 0.01990916021168232, 0.007255843840539455, -0.025238312780857086, -0.02777990698814392, 0.008150868117809296, -0.00858813151717186, -0.04315245896577835, -0.000634544703643769, -0.010214205831289291, 0.017381230369210243, 0.07012616097927094, 0.021808525547385216, 0.029405981302261353, -0.02559358812868595, 0.062255412340164185, 0.012311705388128757, -0.008226023055613041, -0.01966319978237152, -0.033751290291547775, -0.005472627934068441, -0.0036279219202697277, 0.015891801565885544, -0.005954301450401545, 0.021507905796170235, -0.01575515605509281, -0.02286069095134735, -0.018009796738624573, -0.074225507

# Save to LanceDB

In [19]:
from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.vector_stores.lancedb import LanceDBVectorStore
import lancedb
# Initialize LanceDBVectorStore
# vector_store = LanceDBVectorStore(
#     uri="./data/obsidian_lancedb",  # Local directory or remote URI for LanceDB storage
#     table_name="obsidian_notes",  # Table name in LanceDB
#     vector_column_name="vector",  # Column name for vectors
#     nprobes=20,  # Number of probes for search
#     refine_factor=None  # Refine factor for re-ranking (optional)
# )

In [171]:

# from llama_index.indices.base import BaseIndex
def get_embeddings_from_index(index) -> tuple:
    """Collect embeddings, texts and metadata for the nodes of an index.

    Iterates over ``index.docstore.docs`` and looks up each node id in
    ``index.vector_store`` via ``get``. Nodes whose lookup returns ``None``
    are skipped and a warning is printed.

    Args:
        index: Object exposing ``vector_store.get(node_id)`` and
            ``docstore.docs`` (a mapping of node id to node, where each node
            has ``text`` and ``metadata``).

    Returns:
        A 3-tuple ``(embeddings, texts, metadata)`` of equally long lists,
        aligned by position.
    """
    embeddings, texts, metadata = [], [], []

    # Access the underlying vector store once, outside the loop.
    vector_store = index.vector_store

    for node_id, node in index.docstore.docs.items():
        embedding = vector_store.get(node_id)
        if embedding is None:
            print(f"Warning: No embedding found for node {node_id}")
            continue
        embeddings.append(embedding)
        texts.append(node.text)
        metadata.append(node.metadata)

    return embeddings, texts, metadata


# Uses whichever index the module-level name `index` is bound to when this
# cell runs; several earlier cells rebind that name.
embeddings, texts, metadata = get_embeddings_from_index(index)

In [172]:
# LanceDB location and table name used by the "Save to LanceDB" cells below.
uri = "./data/obsidian_lancedb"
table_name = "obsidian_notes"

In [146]:
from lancedb.pydantic import Vector, LanceModel
from typing import List, Dict, Any
import json
from pydantic import BaseModel, Field

class Metadata(BaseModel):
    """Pydantic model for per-note metadata."""
    # Both fields are required (no defaults are declared).
    source: str
    tags: List[str]
    # Add other metadata fields as needed

class Note(LanceModel):
    """LanceDB model describing one note row: text, embedding and metadata."""
    text: str
    # Fixed-size vector of 1536 values.
    # NOTE(review): presumably the output size of the embedding model - confirm.
    embedding: Vector(1536)
    metadata: Metadata

    # @property
    # def metadata(self) -> Metadata:
    #     import json
    #     return Metadata(**json.loads(self.metadata_json))

In [147]:
# Pair each text with its embedding and print the number of pairs as a sanity check.
a = list(zip(texts, embeddings))
print(len(a))

2858


In [149]:
import pyarrow as pa

# Arrow schema with a text column, a fixed-size float32 list column and an
# empty struct column.
# NOTE(review): `schema` is not referenced by any other cell visible in this
# notebook, and the table-creation cell names the vector column "vector",
# not "embedding" - confirm which is intended.
schema = pa.schema([
    pa.field("text", pa.string()),
    pa.field("embedding", pa.list_(pa.float32(), 1536)),  # 1536-dimensional vector
    pa.field("metadata", pa.struct([]))  # Empty struct for metadata
])

In [194]:
    

# Rebinds `db`, which an earlier cell also assigns, to the database at `uri`.
db = lancedb.connect(uri)

# Create a new table (or open an existing one)
# table = db.create_table(table_name, data=[
#     {"text": text, "vector": emb, **meta}
#     for text, emb, meta in zip(texts, embeddings, metadata)
# ],  mode="overwrite")


# One row per (text, embedding) pair; mode="overwrite" is passed to create_table.
# NOTE(review): the collected `metadata` list is not written - every row gets
# an empty string in the "metadata" column. The rows also carry no "id" column;
# confirm what LanceDBVectorStore.from_table expects before querying this table.
table = db.create_table(table_name, data=[
    {"text": text, "vector": emb, "metadata": ""}
    for text, emb in zip(texts, embeddings)
],  mode="overwrite")

In [188]:
# table.alter_table('metadata', 'map', default={})

AttributeError: 'LanceTable' object has no attribute 'alter_table'

In [190]:
# Display the schema of the table created above.
table.schema

text: string
vector: fixed_size_list<item: float>[1536]
  child 0, item: float
metadata: struct<>

In [185]:
# Print only the truthy (non-empty) metadata entries.
for entry in filter(None, metadata):
    print(entry)

In [179]:
# table = db.drop_table(table_name)

# Index from LanceDB Table

In [195]:
from llama_index.vector_stores.lancedb import LanceDBVectorStore

In [196]:
# NOTE(review): this repeats the `index_from_lancedb` definition from the
# earlier "Load Index" section; the later definition shadows the earlier one.
def index_from_lancedb(table):
    """Build a ``VectorStoreIndex`` on top of an existing LanceDB table.

    Args:
        table: LanceDB table passed to ``LanceDBVectorStore.from_table``.

    Returns:
        The index created by ``VectorStoreIndex.from_vector_store``.
    """
    lance_store = LanceDBVectorStore.from_table(table)
    return VectorStoreIndex.from_vector_store(lance_store)

# def index_from_lancedb_vectorstrore

# Index over the `table` created in the "Save to LanceDB" section.
l_index = index_from_lancedb(table)

In [197]:
# Run the shared `query` against the hand-built table; the output recorded
# under this cell is an AttributeError.
l_index.as_query_engine().query(query)

AttributeError: 'Series' object has no attribute 'id'

In [151]:
from llama_index.vector_stores.lancedb import LanceDBVectorStore
 
# Rebinds `vector_store` to the hand-built "obsidian_notes" table.
# The vector column name must match the column the table-creation cell wrote,
# which is "vector"; the previous value "embedding" names a column that table
# does not have.
vector_store = LanceDBVectorStore(
    uri="./data/obsidian_lancedb",  # Local directory or remote URI for LanceDB storage
    table_name="obsidian_notes",  # Table name in LanceDB
    vector_column_name="vector",  # Column name for vectors, as written by create_table
    nprobes=20,  # Number of probes for search
    refine_factor=None  # Refine factor for re-ranking (optional)
)

In [152]:
# Index over the LanceDB vector store; `query_engine` is rebound once more here.
lancedb_index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = lancedb_index.as_query_engine()

In [153]:
# Run the shared `query`; the output recorded under this cell is an OSError
# raised by Lance.
query_engine.query(query,)

OSError: Io error: Execution error: LanceError(Arrow): Invalid argument error: all columns in a record batch must have the same length, /Users/runner/work/lance/lance/rust/lance-file/src/reader.rs:439:12