# Text Extraction from PDF

In [None]:
import PyPDF2

# Define the minimum word count for a paragraph to be considered valid
MIN_WORD_COUNT = 8  

# Open the PDF file in read-binary mode
with open('data/paper.pdf', 'rb') as file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Get the number of pages
    num_pages = len(pdf_reader.pages)
    
    # List to store extracted paragraphs with page numbers
    paragraphs_with_pages = []

    # Loop through each page and extract text
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text = page.extract_text()
        
        if text:  # Ensure the page contains text
            # Split text into paragraphs based on newlines
            paragraphs = text.split("\n")
            
            # Clean up empty lines and unwanted characters
            paragraphs = [p.strip() for p in paragraphs if p.strip()]
            
            # Filter out short paragraphs
            valid_paragraphs = [p for p in paragraphs if len(p.split()) >= MIN_WORD_COUNT]
            
            # Store paragraphs with the associated page number
            for paragraph in valid_paragraphs:
                paragraphs_with_pages.append((paragraph, page_num + 1))


FloatObject (b'0.00-16291952') invalid; use 0.0 instead
FloatObject (b'0.00-17076502') invalid; use 0.0 instead
FloatObject (b'0.00-18528813') invalid; use 0.0 instead
FloatObject (b'0.00-16291952') invalid; use 0.0 instead
FloatObject (b'0.00-16291952') invalid; use 0.0 instead


Total valid paragraphs: 1337
(Page 1) Towards characterizing dark matter subhalo perturbations in stellar streams with graph neural

(Page 1) 1Department of Astronomy, UC Berkeley, 501 Campbell Hall, Berkeley, CA, 94720, United States of America

(Page 1) 2Department of Mathematics, University of Toronto, 40 St. George Street, Toronto, ON, M5S 2E4, Canada

(Page 1) 3Department of Physics, Imperial College London, Blackett Laboratory, Prince Consort Road, London, SW7 2AZ, United Kingdom

(Page 1) 4Dunlap Institute for Astronomy and Astrophysics, University of Toronto, 50 St. George Street, Toronto, ON, M5S 3H4, Canada

(Page 1) 5David A. Dunlap Department of Astronomy and Astrophysics, University of Toronto,

(Page 1) 50 St. George Street, Toronto, ON, M5S 3H4, Canada

(Page 1) 6Department of Science, Technology and Society, Division of Natural Science, York University,

(Page 1) 218 Bethune College, Toronto, ON, M3J 1P3, Canada

(Page 1) The phase space of stellar streams is proposed t

### Separation of extracted text in paragraphs

### Filtering by length

# Model import

In [73]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector Database

### Creation

In [74]:
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

### Embedding

In [75]:
"""texts = ["king",
         "queen",
         "dictator",
         "hitler",
         "Berlin is the German capital city and is near Austria, Hitler's birthplace",
         "austria",
         "Hitler was born in the city of Vienna",
         "The city of Vienna is in Austria"
]"""

'texts = ["king",\n         "queen",\n         "dictator",\n         "hitler",\n         "Berlin is the German capital city and is near Austria, Hitler\'s birthplace",\n         "austria",\n         "Hitler was born in the city of Vienna",\n         "The city of Vienna is in Austria"\n]'

In [77]:
embeddings = embedding_model.encode(paragraphs)
print(embeddings.shape)  # (2, 384)

(5189, 384)


### Embeddings import in database

In [None]:
from qdrant_client.models import PointStruct

points = [
    PointStruct(
        id=idx,
        vector=vector,
        payload={
            "text": text,
            "page": num_pag
        },
    )
    for idx, (vector, (text, num_pag)) in enumerate(zip(embeddings, paragraphs_with_pages))
    
]

print(len(points))

1337


In [81]:
from qdrant_client.models import VectorParams, Distance

collection_name = "pdf_embeddings"

client.delete_collection(collection_name)

client.create_collection(
    collection_name,
    vectors_config=VectorParams(
        size=384,
        distance=Distance.COSINE,
    ),
)

True

### Uploading embeddings to database

In [82]:
# Example of batching points
batch_size = 1000  # Adjust as needed
for i in range(0, len(points), batch_size):
    batch = points[i:i + batch_size]
    client.upsert(collection_name, batch)

# Querying Database

In [None]:
result = client.query_points(
    collection_name=collection_name,
    query=embedding_model.encode("asd?"),
    limit=10
)

In [None]:
for point in result.points:
    print(point.payload["page"])

313
5
318
71
305
187
498
168
156
228


In [None]:
result

QueryResponse(points=[ScoredPoint(id=312, version=0, score=0.37698925, payload={'text': 'sequentially as they are in ASM. The Resource', 'page': 313}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=4, version=0, score=0.34734514, payload={'text': 'This book is provided “as -is” and expresses the', 'page': 5}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=317, version=0, score=0.33376485, payload={'text': 'the ASM model, all services had to be updated at', 'page': 318}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=70, version=0, score=0.33192196, payload={'text': 'but also for those who need a refresher a nd', 'page': 71}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=304, version=0, score=0.26356944, payload={'text': 'the same resource group and manage a nd', 'page': 305}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=186, version=0, score=0.25505117, payload={'text': 'software services you require on 