In [11]:
import os
import sys
import warnings

# Make the parent directory and its dataScripts subfolder importable.
# The entries are built with os.pardir / os.path.join instead of hard-coded
# "..\\" literals so the cell also works on non-Windows systems, and entries
# already present are skipped so re-running the cell does not grow sys.path.
_extra_import_paths = [os.pardir, os.path.join(os.pardir, "dataScripts")]
sys.path.extend([p for p in _extra_import_paths if p not in sys.path])

# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

import torch
import numpy as np


from data_utils import TextEmbedding, PdfChunkReader, normalize_vectors
from vector_database_fill import FaissLoader

In [12]:
# Check whether PyTorch can use a CUDA device in this environment.
torch.cuda.is_available()

True

**VECTOR EMBEDDINGS**

In [13]:
# Build the embedding wrapper for the Snowflake Arctic model. No device is
# passed here, so whatever default TextEmbedding defines is used.
text_embed = TextEmbedding(
    model_name="Snowflake/snowflake-arctic-embed-l-v2.0",
)
text_embed

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

<data_utils.TextEmbedding at 0x24089d9e7b0>

In [14]:
# Small hand-written sample used to exercise the embedder; the third and the
# fifth sentences are identical.
docs = [
    "My name is Luchian!",
    "I really have to be careful with my daydreaming",
    "Never you mind that",
    "Bears are really big",
    "Never you mind that",
]
docs

['My name is Luchian!',
 'I really have to be careful with my daydreaming',
 'Never you mind that',
 'Bears are really big',
 'Never you mind that']

In [15]:
# Embed the sample sentences and iterate over what embed() returns, printing
# each item followed by one blank line.
result = text_embed.embed(docs)
for chunk_batch in result:
    print(chunk_batch)
    print()

[[-0.01403946 -0.01075113 -0.07139118 ... -0.01289096 -0.0308235
  -0.00545011]
 [-0.02945525  0.01525113  0.00643799 ... -0.02518091 -0.02136866
   0.01746517]
 [ 0.00131223  0.00379818 -0.00503858 ...  0.00616909 -0.00267411
  -0.00238802]]



In [16]:
# Second dimension of the word-embedding weight's shape
# (presumably the embedding width; the saved output shows 1024).
text_embed.model.embeddings.word_embeddings.weight.shape[1]

1024

**PDF CHUNKING**

In [17]:
# Create the chunk reader for one PDF from the knowledge_data folder.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will not run on another machine without editing it.
pdf_chunking = PdfChunkReader(pdf_path="C:\\main\\GitHub\\documentReviewSystem\\knowledge_data\\Think like a Scientist_ Physics-guided LLM Agent for Equation Discovery.pdf")

In [18]:
# Retrieve the chunks from the reader and preview the first five entries.
chunk_result = pdf_chunking.get_chunks()
chunk_result[:5]

['Think like a Scientist: Physics-guided LLM Agent for Equation Discovery\nJianke Yang1 Ohm Venkatachalam1 Mohammad Kianezhad 1 Sharvaree Vadgama1 Rose Yu1\nAbstract\nExplaining observed phenomena through sym-\nbolic, interpretable formulas is a fundamental\ngoal of science. Recently, large language mod-\nels (LLMs) have emerged as promising tools for\nsymbolic equation discovery, owing to their broad\ndomain knowledge and strong reasoning capabili-\nties. However, most existing LLM-based systems\ntry to guess equations directly from data, with-\nout modeling the multi-step reasoning process\nthat scientists often follow: first inferring phys-\nical properties such as symmetries, then using\nthese as priors to restrict the space of candidate\nequations. We introduceKeplerAgent, an agentic\nframework that explicitly follows this scientific\nreasoning process. The agent coordinates physics-\nbased tools to extract intermediate structure and\nuses these results to configure symbolic regre

In [19]:
# Report how many chunks were produced and the length of each one.
print(
    f"Length of chunk list: {len(chunk_result)}",
    f"Chunk lengths: {[len(chunk) for chunk in chunk_result]}",
    sep="\n",
)

Length of chunk list: 35
Chunk lengths: [2442, 2498, 2475, 2444, 2494, 2470, 2479, 2450, 2471, 2453, 2499, 2487, 2487, 2464, 2468, 2488, 2485, 2454, 2460, 2482, 2481, 2487, 2452, 2442, 2467, 2474, 2492, 2489, 2401, 2499, 2470, 2429, 2488, 2388, 317]


**VECTOR NORMALIZATION**

In [20]:
# Draw the demo matrix from a dedicated, seeded generator so that
# Restart & Run All reproduces exactly the same values. The previous
# np.random.random call used the unseeded global state and changed every run.
demo_rng = np.random.default_rng(42)
row_vectors = demo_rng.random((50, 50))
print(f"Array shape: {row_vectors.shape}", end='\n\n')
print(row_vectors)

Array shape: (50, 50)

[[0.71395498 0.98597342 0.06117661 ... 0.87636645 0.63359924 0.92423667]
 [0.09507358 0.88459612 0.82751223 ... 0.66849116 0.86123556 0.60486795]
 [0.24279599 0.99529196 0.45899813 ... 0.75227985 0.02561144 0.4009926 ]
 ...
 [0.26141672 0.1991353  0.91255568 ... 0.63875685 0.03916873 0.29501127]
 [0.92662155 0.8422958  0.34323096 ... 0.69827152 0.53924579 0.83517773]
 [0.5133955  0.59265681 0.62781872 ... 0.2795474  0.23262978 0.90813289]]


In [21]:
# Pass the random matrix through normalize_vectors, then show the shape and
# the values of what it returns.
norm_vectors = normalize_vectors(row_vectors)
print(f"Norm vectors shape: {norm_vectors.shape}")
norm_vectors

Norm vectors shape: (50, 50)


array([[0.17064237, 0.23565749, 0.01462182, ..., 0.20946033, 0.15143654,
        0.22090179],
       [0.02184984, 0.20329815, 0.19017911, ..., 0.15363284, 0.19792942,
        0.13901094],
       [0.0634686 , 0.26017641, 0.11998538, ..., 0.19665131, 0.00669501,
        0.10482232],
       ...,
       [0.06822845, 0.05197331, 0.23817244, ..., 0.16671232, 0.01022284,
        0.07699646],
       [0.21680767, 0.19707743, 0.08030798, ..., 0.16337913, 0.12617084,
        0.19541196],
       [0.11327456, 0.13076262, 0.13852067, ..., 0.06167878, 0.05132697,
        0.20036863]], shape=(50, 50))

In [22]:
# Row-wise L2 norms in a single vectorized call; this replaces the per-row
# Python loop that np.apply_along_axis ran over np.linalg.norm.
row_norms = np.linalg.norm(norm_vectors, axis=1)
# A mean of 1.0 can hide an individual badly scaled row, so every row is
# checked against unit length as well.
np.testing.assert_allclose(row_norms, 1.0)
print("Average norm of vectors: ")
row_norms.mean().item()

Average norm of vectors: 


1.0

**FAISS LOADING**

In [23]:
# Second embedder instance, this time with device="cpu" passed explicitly,
# handed to the FAISS loader as its embedding backend.
embedding_class = TextEmbedding(
    model_name="Snowflake/snowflake-arctic-embed-l-v2.0",
    device="cpu",
)
faiss_loader = FaissLoader(embedding_class=embedding_class)

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

In [24]:
import faiss

# Load the FAISS index from disk and display its repr.
# NOTE(review): absolute, machine-specific Windows path.
read_index = faiss.read_index(
    "C:\\main\\GitHub\\documentReviewSystem\\project_data\\initial_vector_db.index"
)
read_index

<faiss.swigfaiss_avx2.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x000002409ABBEA90> >

In [25]:
# Number of vectors stored in the loaded index.
read_index.ntotal

312

**EXPERIMENTS**