In [5]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
import json

# --- CONFIG ---
# Everything a reader might need to change lives here: the GCP project, the
# Document AI processor (and its region), the source PDF and the destination
# object for the extracted entities.
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"
input_gcs_uri = "gs://vercillo_projects/transactions/amex/2025/2025-06-03.pdf"
output_gcs_path = "transactions/amex/exports/2025-06-03.json"
bucket_name = "vercillo_projects"

# --- GCS Download (PDF as bytes) ---
storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name)

# Strip only the *leading* "gs://<bucket>/" from the URI. str.replace() would
# remove every occurrence of that substring, and would silently leave the full
# "gs://..." string as the object name when the URI points at another bucket.
bucket_prefix = f"gs://{bucket_name}/"
if not input_gcs_uri.startswith(bucket_prefix):
    raise ValueError(
        f"input_gcs_uri must start with {bucket_prefix!r}, got {input_gcs_uri!r}"
    )
blob_path = input_gcs_uri[len(bucket_prefix):]
pdf_blob = bucket.blob(blob_path)
pdf_bytes = pdf_blob.download_as_bytes()

# --- Document AI client ---
# Use the regional endpoint derived from `location` so that changing the
# region in the config above is enough; the default endpoint only serves "us".
client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
)
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# --- Inline document request ---
# The PDF is sent inline as raw bytes rather than by GCS reference.
raw_document = documentai.RawDocument(
    content=pdf_bytes,
    mime_type="application/pdf"
)

request = documentai.ProcessRequest(
    name=name,
    raw_document=raw_document,
    skip_human_review=True
)

result = client.process_document(request=request)
document = result.document

# --- Extract entities ---
# One flat record per entity: its type, the text as it appears in the
# document, and the confidence rounded to two decimals.
entities = [
    {
        "field": entity.type_,
        "value": entity.mention_text,
        "confidence": round(entity.confidence, 2)
    }
    for entity in document.entities
]

# --- Save to GCS ---
# Written to the same bucket the PDF was read from.
output_blob = bucket.blob(output_gcs_path)
output_blob.upload_from_string(
    data=json.dumps(entities, indent=2),
    content_type="application/json"
)

print(f"✅ Parsed and saved: gs://{bucket_name}/{output_gcs_path}")


✅ Parsed and saved: gs://vercillo_projects/transactions/amex/exports/2025-06-03.json


In [7]:
from google.cloud import storage
import pandas as pd
import json

# --- CONFIG ---
# Source object in GCS and the local file names used for the JSON copy and
# the CSV export (both relative to the notebook's working directory).
project_id = "vercillopersonal"
bucket_name = "vercillo_projects"
gcs_blob_path = "transactions/amex/exports/2025-06-03.json"
local_json_path = "2025-06-03.json"
local_csv_path = "2025-06-03.csv"

# --- GCS Client ---
storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(gcs_blob_path)

# --- Download JSON ---
blob.download_to_filename(local_json_path)
print(f"✅ File downloaded to: {local_json_path}")

# --- Load JSON as list of dicts ---
# JSON is UTF-8; decode it explicitly instead of relying on the platform's
# locale encoding, which differs between operating systems.
with open(local_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Convert to DataFrame ---
# NOTE: the `value` column holds the raw extracted text (e.g. "3,755.09"),
# not numbers; it must be cleaned before any arithmetic is done on it.
df = pd.DataFrame(data)
print("📄 DataFrame preview:")
print(df.head())

# --- Save as CSV ---
df.to_csv(local_csv_path, index=False)
print(f"✅ CSV saved to: {local_csv_path}")


✅ File downloaded to: 2025-06-03.json
📄 DataFrame preview:
    field     value  confidence
0  amount  3,755.09        0.84
1  amount  5,852.93        1.00
2  amount      0.00        0.97
3  amount      0.00        1.00
4  amount  2.097.84        1.00
✅ CSV saved to: 2025-06-03.csv
