## Connection to Cosmos DB

In [None]:
%pip install azure-storage-blob azure-cosmos PyPDF2


In [None]:
from azure.storage.blob import BlobServiceClient
from azure.cosmos import CosmosClient, PartitionKey
import PyPDF2
import io
import uuid

# ---------- CONFIG ----------
# Blob Storage
# Both connection strings are empty placeholders; supply real values before
# running this cell.
BLOB_CONNECTION_STRING = ""
BLOB_CONTAINER_NAME = "ragdata"

# Cosmos DB
COSMOS_CONNECTION_STRING = ""
DATABASE_NAME = "RAGrelatedPDFs"
CONTAINER_NAME = "RawDataPDFstorage"
PARTITION_KEY_PATH = "/PDFId"

# Cosmos DB restricts '/', '\', '?' and '#' in an item's `id`. Blob names of
# files inside virtual folders contain '/', so those characters are mapped to
# '_' before the name is used as an id. Names without them are unchanged.
_COSMOS_ID_SAFE = str.maketrans("/\\?#", "____")

# ---------- STEP 1: CONNECT TO BLOB STORAGE ----------
blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(BLOB_CONTAINER_NAME)

print("📥 Fetching PDF list from Blob Storage...")

# ---------- STEP 2: CONNECT TO COSMOS DB ----------
# Both calls are "if not exists", so re-running the cell reuses the existing
# database and container instead of failing.
cosmos_client = CosmosClient.from_connection_string(COSMOS_CONNECTION_STRING)
database = cosmos_client.create_database_if_not_exists(id=DATABASE_NAME)
container = database.create_container_if_not_exists(
    id=CONTAINER_NAME,
    partition_key=PartitionKey(path=PARTITION_KEY_PATH),
    offer_throughput=400
)

# ---------- STEP 3: PROCESS EACH PDF ----------
# For every *.pdf blob: download it, extract the text of all pages and store
# the concatenated text as a single Cosmos DB item.
for blob in container_client.list_blobs():
    # Only PDF files are ingested; everything else in the container is skipped.
    if not blob.name.lower().endswith(".pdf"):
        continue

    print(f"\n📄 Processing: {blob.name}")

    # Download PDF file (fully into memory)
    blob_data = container_client.download_blob(blob.name).readall()

    # Extract text from PDF: one newline-terminated chunk per page that yields
    # text. Pages with no extractable text are dropped.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(blob_data))
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    full_text = "".join(f"{text}\n" for text in page_texts if text)

    # Generate unique PDFId.
    # NOTE(review): a fresh UUID is generated on every run, so re-running this
    # cell inserts a new item per PDF rather than updating the previous one.
    pdf_id = str(uuid.uuid4())

    # Create document for Cosmos DB
    document = {
        "id": blob.name.translate(_COSMOS_ID_SAFE),  # Human-readable file name
        "PDFId": pdf_id,                             # Unique partition key
        "content": full_text
    }

    # Insert into Cosmos DB
    container.create_item(body=document)
    print(f"✅ Inserted into Cosmos DB with PDFId: {pdf_id}")

print("\n🎯 All PDFs processed successfully.")


📥 Fetching PDF list from Blob Storage...

📄 Processing: fees105.pdf
✅ Inserted into Cosmos DB with PDFId: 6e021b52-35a9-43d4-ae08-eec375bddf18

🎯 All PDFs processed successfully.


In [None]:
from azure.storage.blob import BlobServiceClient
from azure.cosmos import CosmosClient, PartitionKey
import PyPDF2
import io
import uuid
import openai

# ---------- CONFIG ----------
# Blob Storage
# Both connection strings are empty placeholders; supply real values before
# running this cell.
BLOB_CONNECTION_STRING = ""
BLOB_CONTAINER_NAME = "ragdata"

# Cosmos DB
COSMOS_CONNECTION_STRING = ""
DATABASE_NAME = "RAGrelatedPDFs"
RAW_CONTAINER_NAME = "RawDataPDFstorage"
# NOTE(review): "PDFembeedings" / "Embeedings" look like typos for
# "embeddings"; they are kept unchanged because they are the names actually
# written to Cosmos DB.
EMB_CONTAINER_NAME = "PDFembeedings"
PARTITION_KEY_PATH = "/PDFId"
FOLDER_PREFIX = "myfolder/"  # Change this to your folder name in the container

# OpenAI
openai.api_key = "<your-openai-api-key>"  # Or use Azure OpenAI endpoint/key

# Cosmos DB restricts '/', '\', '?' and '#' in an item's `id`; map them to '_'.
_COSMOS_ID_SAFE = str.maketrans("/\\?#", "____")

# ---------- STEP 1: CONNECT TO BLOB STORAGE ----------
blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(BLOB_CONTAINER_NAME)

print("📥 Fetching PDF list from Blob Storage...")

# ---------- STEP 2: CONNECT TO COSMOS DB ----------
cosmos_client = CosmosClient.from_connection_string(COSMOS_CONNECTION_STRING)
database = cosmos_client.create_database_if_not_exists(id=DATABASE_NAME)

# Create Raw Data container (one item per PDF holding the full text)
raw_container = database.create_container_if_not_exists(
    id=RAW_CONTAINER_NAME,
    partition_key=PartitionKey(path=PARTITION_KEY_PATH),
    offer_throughput=400
)

# Create Embeddings container (one item per page holding its vector)
emb_container = database.create_container_if_not_exists(
    id=EMB_CONTAINER_NAME,
    partition_key=PartitionKey(path=PARTITION_KEY_PATH),
    offer_throughput=400
)

# ---------- STEP 3: PROCESS EACH PDF ----------
# Only blobs whose name starts with FOLDER_PREFIX are listed.
for blob in container_client.list_blobs(name_starts_with=FOLDER_PREFIX):
    if not blob.name.lower().endswith(".pdf"):
        continue

    print(f"\n📄 Processing: {blob.name}")

    # Download PDF file (fully into memory)
    blob_data = container_client.download_blob(blob.name).readall()

    # Read PDF and extract the text of every page exactly once; the same
    # per-page strings feed both the raw document and the embeddings.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(blob_data))
    page_texts = [page.extract_text() for page in pdf_reader.pages]
    pdf_id = str(uuid.uuid4())  # Unique partition key for this PDF

    # Store only the filename (last path segment) as the id base, with
    # characters Cosmos DB restricts in ids replaced.
    file_id = blob.name.split("/")[-1].translate(_COSMOS_ID_SAFE)

    # Store Raw Data (full PDF text): newline-terminated text of every page
    # that produced any text.
    raw_doc = {
        "id": file_id,
        "PDFId": pdf_id,
        "content": "".join(f"{text}\n" for text in page_texts if text)
    }
    raw_container.create_item(body=raw_doc)
    print(f"✅ Raw text inserted for {blob.name} with PDFId {pdf_id}")

    # Store Page-wise Embeddings. Page numbers are 1-based positions in the
    # PDF, so numbering is not compacted when empty pages are skipped.
    for page_number, page_text in enumerate(page_texts, start=1):
        if not page_text:
            continue

        # Generate embedding for page text.
        # NOTE(review): `openai.Embedding.create` is the legacy (openai<1.0)
        # interface; confirm the installed openai version supports it.
        emb_response = openai.Embedding.create(
            input=page_text,
            model="text-embedding-3-small"
        )
        embedding_vector = emb_response['data'][0]['embedding']

        emb_doc = {
            "id": f"{file_id}_page_{page_number}",  # Unique per page
            "PDFId": pdf_id,                        # Same partition key as raw data
            "PageNumber": page_number,
            "Embeedings": embedding_vector
        }
        emb_container.create_item(body=emb_doc)

    print(f"✅ Page-wise embeddings inserted for {blob.name}")

print("\n🎯 All PDFs processed successfully.")


In [None]:
from azure.storage.blob import BlobServiceClient
from azure.cosmos import CosmosClient, PartitionKey
import PyPDF2
import io
import uuid
from sentence_transformers import SentenceTransformer

# ---------- CONFIG ----------
# Blob Storage
# Both connection strings are empty placeholders; supply real values before
# running this cell.
BLOB_CONNECTION_STRING = ""
BLOB_CONTAINER_NAME = "ragdata"

# Cosmos DB
COSMOS_CONNECTION_STRING = ""
DATABASE_NAME = "RAGrelatedPDFs"
RAW_CONTAINER_NAME = "RawDataPDFstorage"
# NOTE(review): "PDFembeedings" / "Embeedings" look like typos for
# "embeddings"; they are kept unchanged because they are the names actually
# written to Cosmos DB.
EMB_CONTAINER_NAME = "PDFembeedings"
PARTITION_KEY_PATH = "/PDFId"

# Cosmos DB restricts '/', '\', '?' and '#' in an item's `id`. Blob names of
# files inside virtual folders contain '/', so those characters are mapped to
# '_' before the name is used as an id. Names without them are unchanged.
_COSMOS_ID_SAFE = str.maketrans("/\\?#", "____")


# ---------- STEP 1: CONNECT TO BLOB STORAGE ----------
blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(BLOB_CONTAINER_NAME)

print("📥 Fetching PDF list from Blob Storage...")

# ---------- STEP 2: CONNECT TO COSMOS DB ----------
cosmos_client = CosmosClient.from_connection_string(COSMOS_CONNECTION_STRING)
database = cosmos_client.create_database_if_not_exists(id=DATABASE_NAME)

# Create Raw Data container (one item per PDF holding the full text)
raw_container = database.create_container_if_not_exists(
    id=RAW_CONTAINER_NAME,
    partition_key=PartitionKey(path=PARTITION_KEY_PATH),
    offer_throughput=400
)

# Create Embeddings container (one item per page holding its vector)
emb_container = database.create_container_if_not_exists(
    id=EMB_CONTAINER_NAME,
    partition_key=PartitionKey(path=PARTITION_KEY_PATH),
    offer_throughput=400
)

# Load the embedding model once. Constructing SentenceTransformer inside the
# page loop would reload the model for every single page.
model = SentenceTransformer('all-MiniLM-L6-v2')

# ---------- STEP 3: PROCESS EACH PDF ----------
for blob in container_client.list_blobs():
    if not blob.name.lower().endswith(".pdf"):
        continue

    print(f"\n📄 Processing: {blob.name}")

    # Download PDF file (fully into memory)
    blob_data = container_client.download_blob(blob.name).readall()

    # Read PDF and extract the text of every page exactly once; the same
    # per-page strings feed both the raw document and the embeddings.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(blob_data))
    page_texts = [page.extract_text() for page in pdf_reader.pages]
    pdf_id = str(uuid.uuid4())  # Unique partition key for this PDF
    file_id = blob.name.translate(_COSMOS_ID_SAFE)

    # Store Raw Data (full PDF text): newline-terminated text of every page
    # that produced any text.
    raw_doc = {
        "id": file_id,      # Human-readable file name
        "PDFId": pdf_id,    # Partition key
        "content": "".join(f"{text}\n" for text in page_texts if text)
    }
    raw_container.create_item(body=raw_doc)
    print(f"✅ Raw text inserted for {blob.name} with PDFId {pdf_id}")

    # Store Page-wise Embeddings. Page numbers are 1-based positions in the
    # PDF, so numbering is not compacted when empty pages are skipped.
    for page_number, page_text in enumerate(page_texts, start=1):
        if not page_text:
            continue

        # Generate embedding for page text; .tolist() turns the encoder output
        # into a plain list so it can be stored in the item body.
        embedding_vector = model.encode(page_text).tolist()

        emb_doc = {
            "id": f"{file_id}_page_{page_number}",  # Unique per page
            "PDFId": pdf_id,                        # Same partition key as raw data
            "PageNumber": page_number,
            "Embeedings": embedding_vector
        }
        emb_container.create_item(body=emb_doc)

    print(f"✅ Page-wise embeddings inserted for {blob.name}")

print("\n🎯 All PDFs processed successfully.")
