In [1]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
import pandas as pd
import json
import io

# Pipeline: download one statement PDF from GCS, run it through a Document AI
# processor, and export the extracted entities as JSON and CSV (both to GCS
# and to the local working directory).

# --- CONFIG ---
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"
bucket_name = "vercillo_projects"

# Single source of truth for the statement being processed: every path below
# is derived from it, so switching statements is a one-line change.
statement_date = "2025-06-03"

input_gcs_uri = (
    f"gs://{bucket_name}/transactions/amex/"
    f"{statement_date[:4]}/{statement_date}.pdf"
)
json_gcs_path = f"transactions/amex/exports/{statement_date}.json"
csv_gcs_path = f"transactions/amex/exports/{statement_date}.csv"
local_json_path = f"{statement_date}.json"
local_csv_path = f"{statement_date}.csv"

# --- Setup clients ---
storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name)
# Point the client at the regional endpoint matching `location`; without this
# the client and `processor_path` disagree as soon as `location` is not "us".
client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
)
processor_path = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# --- Download PDF bytes from GCS ---
# Strip the bucket prefix explicitly instead of str.replace(): replace() would
# silently leave the whole "gs://..." string as the blob name when the URI
# points at a different bucket, which only surfaces later as a confusing 404.
bucket_prefix = f"gs://{bucket_name}/"
if not input_gcs_uri.startswith(bucket_prefix):
    raise ValueError(
        f"input_gcs_uri {input_gcs_uri!r} is not inside bucket {bucket_name!r}"
    )
pdf_blob_path = input_gcs_uri[len(bucket_prefix):]
pdf_blob = bucket.blob(pdf_blob_path)
pdf_bytes = pdf_blob.download_as_bytes()

# --- Call Document AI ---
raw_document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
request = documentai.ProcessRequest(
    name=processor_path,
    raw_document=raw_document,
    skip_human_review=True,
)
result = client.process_document(request=request)
document = result.document

# --- Extract entities ---
# One flat record per entity: its type, the text as it appears in the
# document, and the confidence rounded to two decimals.
entities = [
    {
        "field": entity.type_,
        "value": entity.mention_text,
        "confidence": round(entity.confidence, 2),
    }
    for entity in document.entities
]

# Serialize once so the GCS object and the local file are guaranteed identical.
entities_json = json.dumps(entities, indent=2)

# --- Save JSON to GCS ---
json_blob = bucket.blob(json_gcs_path)
json_blob.upload_from_string(entities_json, content_type="application/json")
print(f"✅ Extracted JSON saved to: gs://{bucket_name}/{json_gcs_path}")

# --- Save JSON locally (optional, already in memory) ---
with open(local_json_path, "w", encoding="utf-8") as f:
    f.write(entities_json)
print(f"✅ JSON saved locally to: {local_json_path}")

# --- Load into DataFrame ---
df = pd.DataFrame(entities)
print("📄 DataFrame preview:")
print(df.head())

# Render the CSV once into memory; both the local file and the GCS upload
# reuse this exact text.
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)
csv_text = csv_buffer.getvalue()

# --- Save CSV locally ---
# newline="" keeps the line terminators pandas already wrote from being
# translated a second time by the text-mode file object.
with open(local_csv_path, "w", encoding="utf-8", newline="") as f:
    f.write(csv_text)
print(f"✅ CSV saved locally to: {local_csv_path}")

# --- Upload CSV to GCS ---
csv_blob = bucket.blob(csv_gcs_path)
csv_blob.upload_from_string(csv_text, content_type="text/csv")
print(f"✅ CSV uploaded to: gs://{bucket_name}/{csv_gcs_path}")


✅ Extracted JSON saved to: gs://vercillo_projects/transactions/amex/exports/2025-06-03.json
✅ JSON saved locally to: 2025-06-03.json
📄 DataFrame preview:
    field     value  confidence
0  amount  3,755.09        0.84
1  amount  5,852.93        1.00
2  amount      0.00        0.97
3  amount      0.00        1.00
4  amount  2.097.84        1.00
✅ CSV saved locally to: 2025-06-03.csv
✅ CSV uploaded to: gs://vercillo_projects/transactions/amex/exports/2025-06-03.csv
