In [9]:
import os
from elasticsearch import Elasticsearch
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv
from mistralai import Mistral

load_dotenv()

True

In [10]:
# Step 1: Load the lotr.txt file
file_path = Path("../../lotr.txt")
with open(file_path, "r") as file:
    text = file.read()
    
print(text[:468])

chapters = text.split("\n\n")

Three Rings for the Elven-kings under the sky,
               Seven for the Dwarf-lords in their halls of stone,
            Nine for Mortal Men doomed to die,
              One for the Dark Lord on his dark throne
           In the Land of Mordor where the Shadows lie.
               One Ring to rule them all, One Ring to find them,
               One Ring to bring them all and in the darkness bind them
           In the Land of Mordor where the Shadows lie.
    


In [11]:
# Step 2: Create an elasticsearch client (local)
# client = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

# index_name = "lotr_index"

# if not client.indices.exists(index=index_name):
#     client.indices.create(index=index_name)

# for i, chapter in tqdm(enumerate(chapters)):
#     doc = {
#         "chapter_number": i + 1,
#         "content": chapter
#     }
#     client.index(index=index_name, id=i + 1, document=doc)

In [12]:
# Step 2: Create an elasticsearch client (managed)
client = Elasticsearch(
    "https://420a234f58fe4d28bdededc52bf27dbc.westeurope.azure.elastic-cloud.com:443",
    api_key=os.getenv("ELASTICSEARCH_API_KEY")
)

In [13]:
# Create an index with the specified mappings
index_name = "lotr_index"
if not client.indices.exists(index=index_name):
    client.indices.create(
        index=index_name,
        mappings={
            "properties": {
                "text": {"type": "text"}
            }
        }
    )

# Index each chapter
for i, chapter in enumerate(chapters):
    doc = {
        "text": chapter
    }
    client.index(index=index_name, id=i + 1, document=doc)

In [14]:
def keyword_search(query, index_name="lotr_index"):
    res = client.search(index=index_name, query={"match": {"text": query}})
    return res['hits']['hits']

# Example usage
results = keyword_search("Frodo")
for hit in results:
    print(f"Chapter {hit['_id']}: {hit['_source']['text'][:100]}...")

Chapter 766:      He was naked, lying as if in a swoon on a heap of filthy rags: his arm was flung up, shielding ...
Chapter 37:      Frodo took it from his breeches-pocket, where it was clasped to a chain that hung from his belt...
Chapter 344:      Aragorn sprang swiftly away and went in pursuit of Sam. Just as he reached the little lawn amon...
Chapter 56:      The song ended. 'And _now_ to bed! And _now_ to bed!' sang Pippin in a high voice.
     'Hush!'...
Chapter 73: 
     There was a terrific splash, and a shout of _Whoa!_ from Frodo. It appeared that a lot of Pipp...
Chapter 770:      Sam had just wits enough left to thrust the phial back into his breast. 'Run, Mr. Frodo!' he cr...
Chapter 589:      Faramir sat for a moment in thought. `Very good,' he said at last. `I surrender you to your mas...
Chapter 767:      Frodo sat for a while and shivered, dreadful fears chasing one another through his mind. Then h...
Chapter 34:      It was just at this time that Gandalf reappeared a

In [15]:
# Step 3: Vector search
vector_index_name = "lotr_vectors"
client.indices.create(
    index=vector_index_name,
    mappings={
        "properties": {
            "vector": {"type": "dense_vector", "dims": 1024},
            "text": {"type": "text"}
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'lotr_vectors'})

In [None]:
batch_size = 20
batches = [chapters[i:i + batch_size] for i in range(0, len(chapters), batch_size)]

# Generate embeddings using Mistral AI
api_key = os.getenv("MISTRAL_API_KEY")
model = "mistral-embed"

mistral_client = Mistral(api_key=api_key)

# Iterate over each batch and generate embeddings
for batch_index, batch in enumerate(batches):
    embeddings_batch_response = mistral_client.embeddings.create(
        model=model,
        inputs=batch,  # Send the current batch of chapters
    )

    # Extract and store embeddings for the current batch
    for i, response in enumerate(embeddings_batch_response.data):
        embedding_vector = response.embedding
        chapter_index = batch_index * batch_size + i + 1  # Calculate the chapter number
        chapter_content = batch[i]

        # Index the chapter along with its embedding in Elasticsearch
        doc = {
            "chapter_number": chapter_index,
            "content": chapter_content,
            "embedding": embedding_vector
        }
        client.index(index=vector_index_name, id=chapter_index, document=doc)
    
    print(f"Generated and indexed embeddings for batch with {len(batch)} chapters.")


In [25]:
# Define a function to perform a vector search in Elasticsearch
def vector_search(query_embedding, index_name=vector_index_name, size=5):
    script_score_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding}
            }
        }
    }
    response = client.search(
        index=index_name,
        query=script_score_query,
        size=size
    )
    return response['hits']['hits']

def get_query_embeddings(query):
    embeddings_batch_response = mistral_client.embeddings.create(
        model="mistral-embed",
        inputs=query,  # Send the current batch of chapters
    )
    return embeddings_batch_response.data[0].embedding

sample_query_embedding = get_query_embeddings("Where was the council of Elrond held?")  # Replace with actual query embedding
results = vector_search(sample_query_embedding)
for hit in results:
    print(f"Chapter {hit['_source']['chapter_number']}: {hit['_source']['content']}...")

Chapter 225: 
                           _Chapter 2_
            The Council of Elrond...
Chapter 226:      Next day Frodo woke early, feeling refreshed and well. He walked along the terraces above the loud-flowing Bruinen and watched the pale, cool sun rise above the far mountains, and shine down. Slanting through the thin silver mist; the dew upon the yellow leaves was glimmering, and the woven nets of gossamer twinkled on every bush. Sam walked beside him, saying nothing. but sniffing the air, and looking every now and again with wonder in his eyes at the great heights in the East. The snow was white upon their peaks.
     On a seat cut in the stone beside a turn in the path they came upon Gandalf and Bilbo deep in talk. `Hullo! Good morning!' said Bilbo. `Feel ready for the great council?'
     `I feel ready for anything,' answered Frodo. `But most of all I should like to go walking today and explore the valley. I should like to get into those pine-woods up there.' He pointed away 

In [None]:
# Step 4: Reranker

see for example: https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb

In [None]:
# Step 5: LLM and final steps