# Text Extraction from PDF

In [48]:
import PyPDF2

# Open the PDF file in read-binary mode
with open('data/azure_docs.pdf', 'rb') as file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Get the number of pages in the PDF
    num_pages = len(pdf_reader.pages)
    
    # Initialize a string to hold the extracted text
    extracted_text = ''
    
    # Loop through each page and extract text
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        extracted_text += page.extract_text()

### Separation of extracted text in paragraphs

In [49]:
# Split the text into paragraphs based on newlines or custom delimiters
paragraphs = extracted_text.split('\n')

# Clean up empty lines or unwanted characters
paragraphs = [p.strip() for p in paragraphs if p.strip()]


# Model import

In [50]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector Database

### Creation

In [51]:
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

### Embedding

In [52]:
"""texts = ["king",
         "queen",
         "dictator",
         "hitler",
         "Berlin is the German capital city and is near Austria, Hitler's birthplace",
         "austria",
         "Hitler was born in the city of Vienna",
         "The city of Vienna is in Austria"
]"""

'texts = ["king",\n         "queen",\n         "dictator",\n         "hitler",\n         "Berlin is the German capital city and is near Austria, Hitler\'s birthplace",\n         "austria",\n         "Hitler was born in the city of Vienna",\n         "The city of Vienna is in Austria"\n]'

In [53]:
embeddings = embedding_model.encode(paragraphs)
print(embeddings.shape)  # (2, 384)

(11420, 384)


### Embeddings import in database

In [58]:
from qdrant_client.models import PointStruct

points = [
    PointStruct(
        id=idx,
        vector=data,
        payload={"text": text},
    )
    for idx, (data, text) in enumerate(zip(embeddings, paragraphs))
    
]

[PointStruct(id=0, vector=[-0.08935342729091644, -0.0027112418320029974, -0.015764549374580383, 0.018348753452301025, 0.007916406728327274, 0.02348656952381134, 0.043074049055576324, 0.0744745135307312, 0.027069082483649254, 0.04214061424136162, -0.0034745316952466965, 0.060231342911720276, -0.01739695854485035, -0.0007344487821683288, -0.07785063236951828, -0.08178450167179108, 0.004055880941450596, 0.0031763180159032345, -0.1030840128660202, -0.003989437595009804, 0.027467647567391396, -0.0499088279902935, -0.019314415752887726, -0.012732123024761677, -0.07421362400054932, 0.07581399381160736, 0.03668233007192612, 0.024651149287819862, 0.007525783032178879, -0.15027666091918945, 0.0009189140982925892, 0.048122793436050415, 0.006224657874554396, -0.01812112331390381, -0.06815818697214127, -0.008398870937526226, 0.10647297650575638, -0.07583651691675186, 0.01122576929628849, 0.0008471235050819814, -0.04105352982878685, -0.0810575783252716, 0.017879726365208626, 0.003712973091751337, 0.

In [None]:
from qdrant_client.models import VectorParams, Distance

collection_name = "pdf_embeddings"

client.delete_collection(collection_name)

client.create_collection(
    collection_name,
    vectors_config=VectorParams(
        size=384,
        distance=Distance.COSINE,
    ),
)

# Example of batching points
batch_size = 1000  # Adjust as needed
for i in range(0, len(points), batch_size):
    batch = points[i:i + batch_size]
    client.upsert(collection_name, batch)
