In [1]:
import sys
from pathlib import Path

# Repository root = two directory levels above the notebook's working directory.
project_root = str(Path.cwd().parent.parent)

# Prepend it only once so re-running this cell does not stack duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Now 'from src.ingestion...' will work because 'src' is a folder inside project_root


In [2]:
from src.ingestion.parsers.get_parser import get_parser

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the parser through the project factory imported from src.ingestion.parsers.get_parser.
parser = get_parser()

In [4]:
import os

# Sample document, addressed relative to the notebook's working directory.
pdf_path = Path("../../data/Word2Vec.pdf")

# Report whether the file is reachable before handing it to the parser.
pdf_found = os.path.exists(pdf_path)
print(f"File exists: {pdf_found}")

File exists: True


In [5]:
# Parse the PDF at pdf_path; the result is what chunker.chunk() consumes in a later cell.
parsed_doc = parser.parse(pdf_path=pdf_path)

[32m2026-01-27 13:54:53[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m22[0m - [1mStarting to parse PDF: ../../data/Word2Vec.pdf[0m
[32m2026-01-27 13:55:06[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m42[0m - [1mDocument converted successfully: 12 pages[0m
[32m2026-01-27 13:55:06[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m64[0m - [1mStructure extracted: 23 chapters[0m
[32m2026-01-27 13:55:06[0m | [32m[1mSUCCESS [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m66[0m - [32m[1mSuccessfully parsed Word2Vec.pdf[0m


In [6]:
from src.ingestion.chunking.get_chunker import get_chunker

In [7]:
# Build the chunker through the project factory imported from src.ingestion.chunking.get_chunker.
chunker = get_chunker()

In [8]:
# Split the parsed document into Chunk objects (a list, as the next cell's output shows).
chunked_doc = chunker.chunk(parsed_doc)

In [9]:
chunked_doc

[Chunk(content='## Efficient Estimation of Word Representations in Vector Space\n\n', metadata=ChunkMetadata(source_doc_title='Word2Vec', chapter_name='Efficient Estimation of Word Representations in Vector Space', page_range=(1, 1), char_span=(0, 65), chunk_id=UUID('19fa3370-79d2-4014-9fa0-af77b51b198a'))),
 Chunk(content='## Tomas Mikolov\n\nGoogle Inc., Mountain View, CA tmikolov@google.com\n\nGreg Corrado Google Inc., Mountain View, CA\n\ngcorrado@google.com\n\nKai Chen\n\nGoogle Inc., Mountain View, CA kaichen@google.com\n\nJeffrey Dean Google Inc., Mountain View, CA jeff@google.com\n\n', metadata=ChunkMetadata(source_doc_title='Word2Vec', chapter_name='Tomas Mikolov', page_range=(1, 1), char_span=(65, 323), chunk_id=UUID('9a41cbb8-5542-44eb-873e-6c79d534dd8d'))),
 Chunk(content='## Abstract\n\nWepropose two novel model architectures for computing continuous vector representations of words from very large data sets. The quality of these representations is measured in a word simila

In [10]:
from src.ingestion.embedding.get_embbedder import get_embedder

In [11]:
# Build the embedder through the project factory; per the log output this loads a
# SentenceTransformer model, so the cell takes a few seconds.
embedder = get_embedder()

[32m2026-01-27 13:55:06[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m
[32m2026-01-27 13:55:10[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m


In [12]:
# Embed every chunk; each returned item exposes .embedding, .content, .metadata and
# .vector_id, which add_deformed() reads further down.
embeddings = embedder.embed_chunk(chunks=chunked_doc)

[32m2026-01-27 13:55:10[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.base_embed[0m:[36membed_chunk[0m:[36m62[0m - [1mSuccessfully embedded 33 chunks[0m


In [13]:
# Dump the raw vector of every embedded chunk, one vector per printed line.
for embedded_chunk in embeddings:
    vector = embedded_chunk.embedding
    print(vector)

[-0.018124599009752274, -0.0657244324684143, 0.03242418169975281, 0.04572271928191185, 0.010175470262765884, 0.09852025657892227, 0.07759235799312592, -0.010129225440323353, 0.08585502952337265, -0.03264926001429558, -0.022502316161990166, 0.004979969933629036, 0.07186438888311386, 0.046965450048446655, -0.0974593311548233, 0.07228535413742065, 0.06446640938520432, 0.03975442051887512, -0.05744875594973564, -0.06686121225357056, 0.004967727232724428, 0.016569532454013824, -0.014979768544435501, -0.05204135179519653, 0.11752759665250778, 0.08382836729288101, -0.05942831188440323, -0.04501870647072792, 0.0009588769171386957, 0.03139142319560051, 0.021123964339494705, 0.06622755527496338, 0.06624582409858704, 0.014868378639221191, -0.05803801491856575, -0.003226876026019454, -0.0869993045926094, 0.0037594810128211975, -0.030382655560970306, -0.025092557072639465, -0.05093955248594284, 0.0074411421082913876, -0.044781263917684555, 0.12565484642982483, 0.12197480350732803, 0.017130622640252

In [14]:
import chromadb

In [15]:
# NOTE(review): chromadb.Client() is presumably the in-memory (non-persistent) client,
# unlike the PersistentClient used by ChromaStore further down — confirm.
client = chromadb.Client()

In [16]:
# Fetch the "my-chunks" collection, creating it on first run so the cell can be re-executed.
collection = client.get_or_create_collection(name="my-chunks")

In [17]:
from chromadb.api.models.Collection import Collection
from src.shared.models import EmbeddedChunk
from typing import List, cast
from chromadb.api.types import Metadata, Embedding

In [54]:
def add_deformed(colect: Collection, embch: List[EmbeddedChunk]) -> None:
    """Add embedded chunks to a Chroma collection in a single ``add`` call.

    Each chunk contributes its stringified ``vector_id`` as the record id,
    its metadata dumped in JSON mode, its text content, and its embedding.

    Args:
        colect: Target collection; ``colect.add`` is invoked once with all records.
        embch: Embedded chunks to store. An empty list is a no-op.
    """
    if not embch:
        # Nothing to insert; skip the call rather than hand Chroma empty lists.
        return

    ids = [str(embed.vector_id) for embed in embch]
    # mode="json" makes the metadata JSON-serialisable before it is stored.
    metadatas: List[Metadata] = [
        embed.metadata.model_dump(mode="json") for embed in embch
    ]
    documents = [embed.content for embed in embch]
    # cast() only informs the type checker; the vectors are passed through unchanged.
    embeddings: List[Embedding] = cast(
        List[Embedding], [embed.embedding for embed in embch]
    )

    colect.add(ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas)


In [19]:
# Store every embedded chunk (ids, vectors, documents, metadata) in the collection.
add_deformed(colect=collection, embch=embeddings)

In [20]:
# Sanity check: show a sample of the stored records (ids, embeddings, ...).
collection.peek()

{'ids': ['19fa3370-79d2-4014-9fa0-af77b51b198a',
  '9a41cbb8-5542-44eb-873e-6c79d534dd8d',
  '72f8ecb9-77ab-40eb-bafd-385af9ad7f40',
  '9ddf52ec-e0f3-4f2a-aa63-2cf82e03fc75',
  '5effe0d4-7fdf-4667-a4a3-e9496d5ab282',
  'ccd8a396-fa41-4594-a33e-bbc5dd273e07',
  '1c744b63-52ae-493d-aa91-494aa1f0d00b',
  '81869f89-f502-4acc-9f4d-1614370a0a02',
  '520fc9ac-6e59-4aa2-b298-a54b9f23f2c9',
  '7ec2cab9-1a69-4939-8254-e1d9b2e1e990'],
 'embeddings': array([[-0.0181246 , -0.06572443,  0.03242418, ...,  0.03036081,
          0.0118225 , -0.02007868],
        [-0.09818865, -0.02859351,  0.07665604, ..., -0.06689522,
         -0.05767047, -0.02376575],
        [-0.01030247, -0.13050696, -0.04658394, ...,  0.0074774 ,
         -0.00310586,  0.05391414],
        ...,
        [-0.02716015, -0.12122066, -0.02988934, ...,  0.04182279,
          0.00207599,  0.02949899],
        [ 0.00143512, -0.0763093 ,  0.01381659, ..., -0.0299913 ,
          0.01125311, -0.00047237],
        [-0.0614777 , -0.09614382, 

In [42]:
# Embed both example questions in one batch; cast() only narrows the type for the checker.
query_texts = ["what kind of model is used", "explain the architecture"]
query_embedding = cast(List[Embedding], embedder._embed_batch(query_texts))

In [36]:
# Retrieve the 5 nearest stored chunks for each query embedding.
results = collection.query(query_embeddings=query_embedding, n_results=5)

In [38]:
# Results are indexed as [query][hit]: take the second hit of the second query.
json_data = results["metadatas"][1][1]
json_data

{'chunk_id': '2f0dcbea-aeb7-46bd-9d4a-83328ec3cb65',
 'chapter_name': '3.2 Continuous Skip-gram Model',
 'page_range': '4-5',
 'source_doc_title': 'Word2Vec',
 'char_span': '13878-15383'}

In [None]:
from src.shared.models import ChunkMetadata, Chunk

# Round-trip the stored metadata dict back into a ChunkMetadata model; string fields
# such as page_range '4-5' come back as tuples (see the output below).
metadata_obj = ChunkMetadata.model_validate(json_data)
metadata_obj

ChunkMetadata(source_doc_title='Word2Vec', chapter_name='3.2 Continuous Skip-gram Model', page_range=(4, 5), char_span=(13878, 15383), chunk_id=UUID('2f0dcbea-aeb7-46bd-9d4a-83328ec3cb65'))

In [40]:
results["documents"]

[['## 2.2 Recurrent Neural Net Language Model (RNNLM)\n\nRecurrent neural network based language model has been proposed to overcome certain limitations of the feedforward NNLM, such as the need to specify the context length (the order of the model N ), and because theoretically RNNs can efficiently represent more complex patterns than the shallow neural networks [15, 2]. The RNN model does not have a projection layer; only input, hidden and output layer. What is special for this type of model is the recurrent matrix that connects hidden layer to itself, using time-delayed connections. This allows the recurrent model to form some kind of short term memory, as information from the past can be represented by the hidden layer state that gets updated based on the current input and the state of the hidden layer in the previous time step.\n\nThe complexity per training example of the RNN model is\n\n$$Q = H \\times H + H \\times V ,$$\n\nwhere the word representations D have the same dimensi

In [41]:
results["distances"]

[[1.551588773727417,
  1.6100014448165894,
  1.623084545135498,
  1.6412570476531982,
  1.6435401439666748],
 [1.5580549240112305,
  1.664104700088501,
  1.664544939994812,
  1.6696982383728027,
  1.6891388893127441]]

In [44]:
query_embedding

[[-0.07572223991155624,
  -0.03169030696153641,
  -0.01724713109433651,
  -0.013565749861299992,
  -0.011240549385547638,
  0.034649807959795,
  -0.05242108553647995,
  0.07192692905664444,
  0.06342612206935883,
  0.015297705307602882,
  0.053337212651968,
  0.047895465046167374,
  0.007288785185664892,
  0.051717039197683334,
  -0.04862339794635773,
  -0.08518565446138382,
  0.06834647059440613,
  -0.0022389728110283613,
  -0.011954331770539284,
  -0.0014335684245452285,
  0.015452960506081581,
  0.007973087951540947,
  -0.07643653452396393,
  0.03734312579035759,
  -0.043057799339294434,
  -0.04115079715847969,
  0.015157021582126617,
  0.06808321923017502,
  0.024052733555436134,
  -0.1167907640337944,
  0.010339424014091492,
  0.031004028394818306,
  0.02035791613161564,
  0.000696839124429971,
  -0.0647440180182457,
  -0.0663616806268692,
  0.0013286208268254995,
  -0.006806595716625452,
  -0.050364136695861816,
  -0.007163184229284525,
  -0.016460007056593895,
  -0.0735343322157

In [None]:
from pydantic import BaseModel, Field

In [50]:
class SearchResult(Chunk):
    """A ``Chunk`` extended with the score of the query hit that returned it."""

    score: float = Field(
        description="Similarity score (closer to 0 is better for L2)",
    )

In [56]:
chunks_output: List[List[SearchResult]] = []

# The result dict holds parallel per-query lists; query_idx selects one query's hits.
for query_idx, _ in enumerate(results["ids"]):
    hits = zip(
        results["documents"][query_idx],
        results["metadatas"][query_idx],
        results["distances"][query_idx],
    )
    # Rebuild typed objects: the stored metadata dict is validated back into ChunkMetadata.
    chunks_output.append(
        [
            SearchResult(
                content=text,
                metadata=ChunkMetadata.model_validate(raw_meta),
                score=distance,
            )
            for text, raw_meta, distance in hits
        ]
    )


In [57]:
chunks_output

[[SearchResult(content='## 2.2 Recurrent Neural Net Language Model (RNNLM)\n\nRecurrent neural network based language model has been proposed to overcome certain limitations of the feedforward NNLM, such as the need to specify the context length (the order of the model N ), and because theoretically RNNs can efficiently represent more complex patterns than the shallow neural networks [15, 2]. The RNN model does not have a projection layer; only input, hidden and output layer. What is special for this type of model is the recurrent matrix that connects hidden layer to itself, using time-delayed connections. This allows the recurrent model to form some kind of short term memory, as information from the past can be represented by the hidden layer state that gets updated based on the current input and the state of the hidden layer in the previous time step.\n\nThe complexity per training example of the RNN model is\n\n$$Q = H \\times H + H \\times V ,$$\n\nwhere the word representations D 

In [None]:
from src.shared.models import Chunk, ChunkMetadata

In [None]:
class ChromaStore:
    """Persistent Chroma-backed store for embedded chunks.

    Bundles a ``chromadb.PersistentClient`` collection with the project
    embedder so callers can ingest ``EmbeddedChunk`` objects and search
    with plain-text queries.
    """

    def __init__(self, client_path, collection_name) -> None:
        """Open (or create) the collection and build the embedder.

        Args:
            client_path: Directory passed to ``chromadb.PersistentClient(path=...)``.
            collection_name: Name of the collection to get or create.

        Raises:
            ValueError: If the collection cannot be obtained; the underlying
                exception is chained as ``__cause__``.
        """
        self.client = chromadb.PersistentClient(path=client_path)
        try:
            self.collection = self.client.get_or_create_collection(name=collection_name)
        except Exception as e:
            # Chain explicitly so the original Chroma error stays in the traceback.
            raise ValueError(f"Probably messed up the  name  huh {e}") from e
        self.embedder = get_embedder()

    def ingest(self, embch: List[EmbeddedChunk]) -> None:
        """Add embedded chunks to the collection in a single ``add`` call.

        Args:
            embch: Embedded chunks to store. An empty list is a no-op.
        """
        if not embch:
            # Nothing to insert; skip the call rather than hand Chroma empty lists.
            return

        ids = [str(embed.vector_id) for embed in embch]
        # mode="json" makes the metadata JSON-serialisable before it is stored.
        metadatas: List[Metadata] = [
            embed.metadata.model_dump(mode="json") for embed in embch
        ]
        documents = [embed.content for embed in embch]
        embeddings: List[Embedding] = cast(
            List[Embedding], [embed.embedding for embed in embch]
        )

        self.collection.add(
            ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas
        )

    def query(self, sentences: List[str], n_result: int) -> List[List[SearchResult]]:
        """Embed each sentence and return its nearest stored chunks.

        Args:
            sentences: Query texts; one result list is produced per sentence.
            n_result: Number of hits requested per sentence.

        Returns:
            A list parallel to ``sentences``; each entry holds the hits for that
            sentence as ``SearchResult`` objects whose ``score`` is the distance
            reported by the collection. Empty ``sentences`` yields ``[]``.
        """
        if not sentences:
            return []

        # Use this instance's embedder. The previous code read a notebook-global
        # `embedder`, which breaks on a fresh kernel and can differ from the one
        # configured in __init__.
        query_embedding = cast(
            List[Embedding],
            self.embedder._embed_batch(sentences),
        )
        results = self.collection.query(
            query_embeddings=query_embedding, n_results=n_result
        )

        chunks_output: List[List[SearchResult]] = []
        # The result dict holds parallel per-query lists; i selects one query's hits.
        for i in range(len(results["ids"])):
            hits = zip(
                results["documents"][i],
                results["metadatas"][i],
                results["distances"][i],
            )
            chunks_output.append(
                [
                    SearchResult(
                        content=doc_text,
                        # Stored metadata is a plain dict; validate it back into the model.
                        metadata=ChunkMetadata.model_validate(meta_json),
                        score=score,
                    )
                    for doc_text, meta_json, score in hits
                ]
            )
        return chunks_output

    def count(self) -> int:
        """Return the number of records currently in the collection."""
        return self.collection.count()
                    
                


In [None]:
class RedisCache:
    """Placeholder for a Redis-backed cache; no behaviour is implemented yet."""

    def __init__(self) -> None:
        """Create an empty instance; nothing is configured or connected."""