In [1]:
import sys
import warnings

# Make the parent directory and its dataScripts folder importable from this notebook.
# Forward slashes are used so the entries resolve on Windows, Linux and macOS alike
# (backslash-separated literals only form valid paths on Windows).
# NOTE(review): both entries are relative to the kernel's working directory.
for extra_path in ("..", "../dataScripts"):
    # Skip entries that are already present so re-running the cell stays idempotent.
    if extra_path not in sys.path:
        sys.path.append(extra_path)
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings("ignore")

import torch
import numpy as np


from data_utils import TextEmbedding, PdfChunkReader, normalize_vectors
from vector_database_fill import FaissLoader

In [2]:
# Report whether PyTorch can use a CUDA device in this kernel (displays True/False).
torch.cuda.is_available()

True

**VECTOR EMBEDDINGS**

In [3]:
# Instantiate the project's TextEmbedding wrapper for the Snowflake Arctic Embed L v2.0
# checkpoint; no device is passed here, so the wrapper's default applies.
text_embed = TextEmbedding(model_name='Snowflake/snowflake-arctic-embed-l-v2.0')
# Bare expression: the notebook displays the object's repr as the cell output.
text_embed



Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

<data_utils.TextEmbedding at 0x180b39e9e80>

In [4]:
# Small hand-written corpus used to exercise the embedding model.
# The third and fifth sentences are identical.
docs = [
    "My name is Luchian!",
    "I really have to be careful with my daydreaming",
    "Never you mind that",
    "Bears are really big",
    "Never you mind that",
]
# Bare expression: display the list as the cell output.
docs

['My name is Luchian!',
 'I really have to be careful with my daydreaming',
 'Never you mind that',
 'Bears are really big',
 'Never you mind that']

In [5]:
# Embed the sample sentences and print every batch that comes back.
result = text_embed.embed(docs)
for chunk_batch in result:
    print(chunk_batch)
    # A bare print() leaves one blank line after every batch.
    print()

[[-0.0140395  -0.01075119 -0.07139113 ... -0.01289097 -0.03082362
  -0.00545018]
 [-0.0294552   0.01525114  0.00643797 ... -0.02518092 -0.02136865
   0.01746519]
 [ 0.00131221  0.00379814 -0.00503861 ...  0.0061691  -0.00267408
  -0.00238797]]



In [6]:
# Look up the weight of the model's word-embedding layer, then display the size of
# its second axis (presumably the embedding dimension — confirm against the model card).
word_embedding_weight = text_embed.model.embeddings.word_embeddings.weight
word_embedding_weight.shape[1]

1024

**PDF CHUNKING**

In [7]:
# Point the chunk reader at one paper from the knowledge_data folder.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will
# not run elsewhere until it is derived from a configurable data directory.
pdf_chunking = PdfChunkReader(
    pdf_path=(
        "C:\\main\\GitHub\\documentReviewSystem\\knowledge_data\\"
        "Think like a Scientist_ Physics-guided LLM Agent for Equation Discovery.pdf"
    )
)

In [8]:
# Ask the reader for its chunks of the PDF; the result is kept for the cells below.
chunk_result = pdf_chunking.get_chunks()
# Preview only the first five entries rather than the whole collection.
chunk_result[:5]

['Think like a Scientist: Physics-guided LLM Agent for Equation Discovery\nJianke Yang1 Ohm Venkatachalam1 Mohammad Kianezhad 1 Sharvaree Vadgama1 Rose Yu1\nAbstract\nExplaining observed phenomena through sym-\nbolic, interpretable formulas is a fundamental\ngoal of science. Recently, large language mod-\nels (LLMs) have emerged as promising tools for\nsymbolic equation discovery, owing to their broad\ndomain knowledge and strong reasoning capabili-\nties. However, most existing LLM-based systems\ntry to guess equations directly from data, with-\nout modeling the multi-step reasoning process\nthat scientists often follow: first inferring phys-\nical properties such as symmetries, then using\nthese as priors to restrict the space of candidate\nequations. We introduceKeplerAgent, an agentic\nframework that explicitly follows this scientific\nreasoning process. The agent coordinates physics-\nbased tools to extract intermediate structure and\nuses these results to configure symbolic regre

In [9]:
# Summarise the chunking result: how many chunks there are and the length of each one.
print(f"Length of chunk list: {len(chunk_result)}")
# len() is applied per chunk with a comprehension; no lambda wrapper is needed.
print(f"Chunk lengths: {[len(chunk) for chunk in chunk_result]}")

Length of chunk list: 35
Chunk lengths: [2442, 2498, 2475, 2444, 2494, 2470, 2479, 2450, 2471, 2453, 2499, 2487, 2487, 2464, 2468, 2488, 2485, 2454, 2460, 2482, 2481, 2487, 2452, 2442, 2467, 2474, 2492, 2489, 2401, 2499, 2470, 2429, 2488, 2388, 317]


**VECTORS NORMALIZATION**

In [10]:
# Build a reproducible 50x50 test matrix of uniform samples in [0, 1).
# A dedicated, seeded Generator makes Restart & Run All print the same values every
# time; an unseeded draw would produce a different matrix on each run.
SEED = 42
rng = np.random.default_rng(SEED)
row_vectors = rng.random((50, 50))
print(f"Array shape: {row_vectors.shape}", end='\n\n')
print(row_vectors)

Array shape: (50, 50)

[[0.9893784  0.88823197 0.84266298 ... 0.78338485 0.06438843 0.05812483]
 [0.98246577 0.59662927 0.86581921 ... 0.68747287 0.15787295 0.44664512]
 [0.55978128 0.49564102 0.26215102 ... 0.85227642 0.72454081 0.62760113]
 ...
 [0.57052043 0.31251165 0.42543481 ... 0.05119564 0.27779329 0.72722329]
 [0.16713336 0.34794824 0.67314635 ... 0.38260872 0.95604403 0.52796018]
 [0.48332006 0.91987695 0.78523003 ... 0.10945848 0.61015699 0.43100638]]


In [11]:
# Run the project's normalize_vectors helper on the random matrix.
# NOTE(review): presumably scales each row to unit L2 norm — confirm in data_utils.
norm_vectors = normalize_vectors(row_vectors)
# Print the result's shape so it can be compared with the input's shape.
print(f"Norm vectors shape: {norm_vectors.shape}")
# Bare expression: display the normalised array as the cell output.
norm_vectors

Norm vectors shape: (50, 50)


array([[0.24143029, 0.21674831, 0.20562847, ..., 0.19116329, 0.01571221,
        0.01418375],
       [0.23389192, 0.14203728, 0.20612231, ..., 0.16366407, 0.03758422,
        0.10633112],
       [0.13479136, 0.11934684, 0.0631241 , ..., 0.20522211, 0.17446428,
        0.1511219 ],
       ...,
       [0.14021402, 0.07680446, 0.10455704, ..., 0.0125821 , 0.0682719 ,
        0.17872612],
       [0.0389014 , 0.08098727, 0.15667929, ..., 0.08905473, 0.22252561,
        0.12288625],
       [0.11289462, 0.21486623, 0.1834152 , ..., 0.02556747, 0.14252138,
        0.10067511]], shape=(50, 50))

In [12]:
# Sanity check: the rows of norm_vectors are expected to have unit L2 norm, so the
# mean of the row norms should come out as 1.0 (up to floating-point rounding).
print("Average norm of vectors: ")
# np.linalg.norm reduces along axis=1 in one vectorised call instead of invoking the
# norm once per row from Python.
np.linalg.norm(norm_vectors, axis=1).mean().item()

Average norm of vectors: 


1.0

**FAISS LOADING**

In [13]:
# Create a separate TextEmbedding instance explicitly placed on the CPU.
# NOTE(review): an earlier cell reports CUDA as available — confirm CPU is intentional here.
embedding_class = TextEmbedding(model_name='Snowflake/snowflake-arctic-embed-l-v2.0', device="cpu")
# Hand the embedding object to FaissLoader (despite the parameter name, an instance is passed, not a class).
faiss_loader = FaissLoader(embedding_class=embedding_class)

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

In [15]:
# Process the knowledge_data folder and write the result to the given .index file
# (presumably a FAISS index — confirm in vector_database_fill).
# NOTE(review): both paths are absolute and machine-specific, and the recorded run took
# about 14 minutes for 10 files, so consider a cache guard before re-running.
faiss_loader.load_and_save_database(
    "C:\\main\\GitHub\\documentReviewSystem\\knowledge_data",
    save_to=(
        "C:\\main\\GitHub\\documentReviewSystem\\project_data\\"
        "initial_vector_db.index"
    ),
)

Going through filenames...: 100%|██████████| 10/10 [14:09<00:00, 84.96s/it]
