In [18]:
from google.cloud import documentai_v1 as documentai
from google.cloud import storage
from google.api_core.client_options import ClientOptions
import pandas as pd
import os

# Export entities extracted from a PDF to a CSV:
#   1. download the PDF from a GCS bucket,
#   2. send it to a Document AI processor,
#   3. flatten the returned entities into a table (one column per entity type),
#   4. write the table to a local CSV and upload it back to the same bucket.

# === CONFIG ===
project_id = "vercillopersonal"  # project segment of the processor resource name
location = "us"  # used for both the API endpoint and the resource name
processor_id = "fe61eee8945a8018"
bucket_name = "vercillo_projects"
input_blob_path = "transactions/amex/2025/2025-06-03.pdf"
output_blob_path = "transactions/amex/exports/2025-06-03.csv"
local_pdf_path = "./temp/amex.pdf"
local_csv_path = "./temp/amex.csv"

# Create the scratch directories implied by the configured local paths, so
# changing a path above never leaves the script writing into a missing folder.
for local_path in (local_pdf_path, local_csv_path):
    local_dir = os.path.dirname(local_path)
    if local_dir:  # a bare filename has no directory component to create
        os.makedirs(local_dir, exist_ok=True)

# === Download PDF from GCS ===
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(input_blob_path)
blob.download_to_filename(local_pdf_path)

# === Setup Document AI client ===
# The endpoint is regional and must match the location in the resource name.
client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
)
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# === Load PDF content ===
with open(local_pdf_path, "rb") as f:
    raw_document = documentai.RawDocument(content=f.read(), mime_type="application/pdf")

# === Call Document AI ===
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
result = client.process_document(request=request)
document = result.document

# === Parse Entities ===
# One row per entity; "page" comes from the entity's first page reference,
# or None when the entity carries no page anchor.
rows = []
for entity in document.entities:
    page_refs = entity.page_anchor.page_refs
    rows.append({
        "field": entity.type_,
        "value": entity.mention_text,
        "confidence": round(entity.confidence, 2),
        "page": page_refs[0].page if page_refs else None,
    })

# Explicit columns keep the frame well-formed even when `rows` is empty.
df = pd.DataFrame(rows, columns=["field", "value", "confidence", "page"])

# Fail loudly instead of crashing inside the pivot with an opaque KeyError
# (or uploading a meaningless export) when nothing was extracted.
if df.empty:
    raise ValueError(
        f"Document AI returned no entities for gs://{bucket_name}/{input_blob_path}; "
        "nothing to export."
    )

# A field type can occur many times on one page. Pivoting on "page" alone
# with aggfunc="first" would keep only the first value of each field per page
# and silently discard the rest. Number the repeats of each (page, field) pair
# and pivot on (page, occurrence) so every extracted value lands in its own row.
# NOTE(review): this pairs the n-th occurrence of each field on a page into
# one row, which assumes the processor emits fields in aligned order -
# confirm against the processor schema.
# NOTE(review): entities without a page anchor (page=None) appear to be
# dropped by the pivot's grouping, as before - confirm that is intended.
df["_occurrence"] = df.groupby(["page", "field"], dropna=False).cumcount()
df_clean = (
    df.pivot_table(
        index=["page", "_occurrence"],
        columns="field",
        values="value",
        aggfunc="first",  # each cell now holds exactly one value; "first" passes it through
    )
    .reset_index()
    .drop(columns="_occurrence")  # helper index only; keep the CSV schema as page + fields
)

# === Save to CSV locally ===
df_clean.to_csv(local_csv_path, index=False)

# === Upload CSV to GCS ===
output_blob = bucket.blob(output_blob_path)
output_blob.upload_from_filename(local_csv_path)

print(f"✅ CSV exported to: gs://{bucket_name}/{output_blob_path}")


✅ CSV exported to: gs://vercillo_projects/transactions/amex/exports/2025-06-03.csv
