In [10]:
import streamlit as st
import fitz  # PyMuPDF
from azure.storage.blob import BlobServiceClient
from io import BytesIO
import os
from dotenv import load_dotenv

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

STORAGE_ACCOUNT_NAME = os.getenv("STORAGE_ACCOUNT_NAME")
STORAGE_ACCOUNT_KEY = os.getenv("STORAGE_ACCOUNT_KEY")
container_name = os.getenv("CONTAINER_NAME")

# Fail fast on missing settings: os.getenv returns None for an unset variable,
# and the f-string below would otherwise embed the literal text "None" into the
# connection string instead of reporting the real problem.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("STORAGE_ACCOUNT_NAME", STORAGE_ACCOUNT_NAME),
        ("STORAGE_ACCOUNT_KEY", STORAGE_ACCOUNT_KEY),
        ("CONTAINER_NAME", container_name),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Azure setup: account-key connection string -> service client -> container client.
connect_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT_NAME};"
    f"AccountKey={STORAGE_ACCOUNT_KEY};"
    "EndpointSuffix=core.windows.net"
)
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client(container_name)

# Get all blobs (PDFs only)
def list_pdfs(prefix="SAP_JSON/Invoice/"):
    """Return the names of the unprocessed PDF blobs below ``prefix``.

    Args:
        prefix: Blob-name prefix ("folder") to list. Defaults to the invoice
            folder the original implementation was hard-wired to.

    Returns:
        list[str]: Names of blobs under ``prefix`` that end with ``.pdf`` and
        are not inside that folder's ``processed/`` sub-folder.
    """
    # Every name returned by the listing already starts with `prefix`, so the
    # exclusion must be expressed relative to it; comparing against a bare
    # "processed/" can never match and lets processed files through.
    processed_prefix = f"{prefix.rstrip('/')}/processed/" if prefix else "processed/"
    return [
        blob.name
        for blob in container_client.list_blobs(name_starts_with=prefix)
        if blob.name.endswith(".pdf") and not blob.name.startswith(processed_prefix)
    ]

def get_blob(blob_name):
    """Download ``blob_name`` from the container and return everything ``readall()`` yields."""
    downloader = container_client.get_blob_client(blob_name).download_blob()
    return downloader.readall()

def move_blob(blob_name, new_name, dest_prefix="SAP_JSON/Bills/2025-04-27", timeout=300):
    """Move a blob inside the container by copying it and then deleting the source.

    Args:
        blob_name: Full name of the existing source blob.
        new_name: File name the blob gets below ``dest_prefix``.
        dest_prefix: Destination "folder" inside the container. Defaults to the
            location the original implementation was hard-wired to.
        timeout: Maximum number of seconds to wait for the server-side copy.

    Raises:
        ValueError: If source and destination are the same blob (deleting the
            source afterwards would destroy the only copy).
        TimeoutError: If the copy is still pending after ``timeout`` seconds.
        RuntimeError: If the copy ends in any state other than "success".

    The source blob is deleted only after the copy is confirmed successful.
    """
    import time

    dest_name = f"{dest_prefix.rstrip('/')}/{new_name}"
    if dest_name == blob_name:
        raise ValueError(f"Source and destination are the same blob: {blob_name!r}")

    source_blob = container_client.get_blob_client(blob_name)
    dest_blob = container_client.get_blob_client(dest_name)

    # Use the client's own URL rather than assembling one by hand, so special
    # characters in the blob name are encoded correctly.
    dest_blob.start_copy_from_url(source_blob.url)

    # start_copy_from_url only *starts* the copy; wait until the service
    # reports a final status before touching the source.
    deadline = time.monotonic() + timeout
    copy_status = dest_blob.get_blob_properties().copy.status
    while copy_status == "pending":
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Copy of {blob_name!r} to {dest_name!r} still pending after {timeout}s"
            )
        time.sleep(0.5)
        copy_status = dest_blob.get_blob_properties().copy.status

    if copy_status != "success":
        raise RuntimeError(
            f"Copy of {blob_name!r} to {dest_name!r} ended with status {copy_status!r}"
        )

    container_client.delete_blob(blob_name)

# PDF Renderer
def render_pdf(data, page_num):
    """Render one page of an in-memory PDF as PNG bytes.

    Args:
        data: Raw PDF content, passed to PyMuPDF as a stream.
        page_num: Index of the page to render. Values at or beyond the page
            count are clamped to the last page.

    Returns:
        bytes: The rendered page encoded as PNG.
    """
    # The context manager closes the document even if rendering raises;
    # previously the document was never closed.
    with fitz.open(stream=data, filetype="pdf") as pdf:
        last_page = len(pdf) - 1
        page = pdf.load_page(min(page_num, last_page))
        # Encode before leaving the block, while the document is still open.
        return page.get_pixmap().tobytes("png")

In [12]:
# Relocate the listed bills with move_blob, one (source blob, new file name)
# pair at a time, in exactly this order.
bill_moves = (
    (r"SAP_JSON/Bills/sharathhebbar24_gmail_com_cd3a32884f.pdf", "sharathhebbar24_gmail_com_cd3a32884f.pdf"),
    (r"SAP_JSON/Bills/sharathhebbar24_gmail_com_B0001.pdf", "sharathhebbar24_gmail_com_B0001.pdf"),
    (r"SAP_JSON/Bills/processed/sharathhebbar24_gmail_com_cd3a32884f.pdf", "sharathhebbar24_gmail_com_cd3a32884f.pdf"),
    (r"SAP_JSON/Bills/processed/sharathhebbar24_gmail_com_B0001.pdf", "sharathhebbar24_gmail_com_B0001.pdf"),
)
for source_blob_name, target_file_name in bill_moves:
    move_blob(source_blob_name, target_file_name)

In [25]:
# Invoice folder and its "processed" sub-folder inside the container.
container_path = "SAP_JSON/Invoice"
processed_path = f"{container_path}/processed"

# Names of PDFs under the invoice folder that are not below the processed path.
[
    candidate.name
    for candidate in container_client.list_blobs(name_starts_with=container_path)
    if candidate.name.endswith('.pdf')
    and not candidate.name.startswith(processed_path)
]

['SAP_JSON/Invoice/invoice.pdf']

In [28]:
import re
# Replace every character that is not a word character, "-", "_" or "." with "_".
unsafe_chars = re.compile(r"[^\w\-_.]")
sanitized_name = unsafe_chars.sub("_", "Flipkart.pdf")

# Blob name the sanitized file would get inside the processed folder.
f"{processed_path}/{sanitized_name}"

'SAP_JSON/Invoice/processed/Flipkart.pdf'

In [14]:
# Build a blob client for the processed "folder" name and show the blob name it carries.
processed_folder_client = container_client.get_blob_client("SAP_JSON/Invoice/processed/")
processed_folder_client.blob_name

'SAP_JSON/Invoice/processed/'

In [None]:
from pathlib import Path
# Path.name keeps only the final component of the blob path, i.e. the file
# name without its "folder" prefix.
Path("SAP_JSON/Invoice/santhosh_s_leskode_com_LSK-INV2425-034.pdf").name

'santhosh_s_leskode_com_LSK-INV2425-034.pdf'

In [1]:
import os

In [2]:
import fitz  # PyMuPDF
from azure.storage.blob import BlobServiceClient
from io import BytesIO
import os
from dotenv import load_dotenv

In [None]:
# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

STORAGE_ACCOUNT_NAME = os.getenv("STORAGE_ACCOUNT_NAME")
STORAGE_ACCOUNT_KEY = os.getenv("STORAGE_ACCOUNT_KEY")
# This cell targets a fixed container rather than one read from the environment.
container_name = "mappings"

# Fail fast on missing credentials: os.getenv returns None for an unset
# variable, and the f-string below would otherwise embed the literal text
# "None" into the connection string instead of reporting the real problem.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("STORAGE_ACCOUNT_NAME", STORAGE_ACCOUNT_NAME),
        ("STORAGE_ACCOUNT_KEY", STORAGE_ACCOUNT_KEY),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Azure setup: account-key connection string -> service client -> container client.
connect_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT_NAME};"
    f"AccountKey={STORAGE_ACCOUNT_KEY};"
    "EndpointSuffix=core.windows.net"
)
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client(container_name)

In [8]:
# Echo the container name that the container client was created for.
container_name

'mappings'

In [13]:
# Every PDF name in the container, skipping names under a top-level "processed/" prefix.
all_blob_names = (entry.name for entry in container_client.list_blobs())
[
    blob_name
    for blob_name in all_blob_names
    if blob_name.endswith('.pdf') and not blob_name.startswith("processed/")
]

['SAP_JSON/Bills/santhosh_s_leskode_com_LSK-INV2425-034.pdf',
 'SAP_JSON/Bills/sharathhebbar24_gmail_com_B0001.pdf',
 'SAP_JSON/Bills/sharathhebbar24_gmail_com_cd3a32884f.pdf',
 'SAP_JSON/Invoice/santhosh_s_leskode_com_LSK-INV2425-034.pdf',
 'SAP_JSON/Invoice/santhosh_s_leskode_com_LSK-INV2425-039.pdf',
 'SAP_JSON/PurchaseOrder/2025-04-14/dagarwal_nsight-inc_com_PO-00017.pdf',
 'SAP_JSON/PurchaseOrder/dagarwal_nsight-inc_com_PO-00017.pdf',
 'SAP_JSON/PurchaseOrder/sharathhebbar24_gmail_com_PO-00017.pdf',
 'SAP_JSON/SalesOrder/santhosh_s_leskode_com_SO-000319.pdf',
 'SAP_JSON/SalesOrder/sharathhebbar24_gmail_com_SO-00032.pdf']

In [15]:
def _is_unprocessed_pdf(blob_name):
    """Tell whether a blob name ends with ".pdf" and is not under a top-level "processed/" prefix."""
    if not blob_name.endswith(".pdf"):
        return False
    return not blob_name.startswith("processed/")


# Same container-wide listing, expressed as a filter over the blob names.
list(filter(_is_unprocessed_pdf, (item.name for item in container_client.list_blobs())))

['SAP_JSON/Bills/santhosh_s_leskode_com_LSK-INV2425-034.pdf',
 'SAP_JSON/Bills/sharathhebbar24_gmail_com_B0001.pdf',
 'SAP_JSON/Bills/sharathhebbar24_gmail_com_cd3a32884f.pdf',
 'SAP_JSON/Invoice/santhosh_s_leskode_com_LSK-INV2425-034.pdf',
 'SAP_JSON/Invoice/santhosh_s_leskode_com_LSK-INV2425-039.pdf',
 'SAP_JSON/PurchaseOrder/2025-04-14/dagarwal_nsight-inc_com_PO-00017.pdf',
 'SAP_JSON/PurchaseOrder/dagarwal_nsight-inc_com_PO-00017.pdf',
 'SAP_JSON/PurchaseOrder/sharathhebbar24_gmail_com_PO-00017.pdf',
 'SAP_JSON/SalesOrder/santhosh_s_leskode_com_SO-000319.pdf',
 'SAP_JSON/SalesOrder/sharathhebbar24_gmail_com_SO-00032.pdf']