In [2]:
from google.cloud import storage, documentai_v1 as documentai
from google.api_core.client_options import ClientOptions
import pandas as pd
from pathlib import Path
import time
import json


# === Setup clients ===
storage_client = storage.Client()
documentai_client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint="us-documentai.googleapis.com")
)

# === CONFIG ===
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

bucket_name = "vercillo_projects"
prefix = "transactions/amex/2025/"
gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"

# === List PDF input URIs ===
pdf_blobs = storage_client.list_blobs(bucket_name, prefix=prefix)
gcs_documents = [
    documentai.GcsDocument(gcs_uri=f"gs://{bucket_name}/{blob.name}", mime_type="application/pdf")
    for blob in pdf_blobs if blob.name.endswith(".pdf")
]

if not gcs_documents:
    raise ValueError("No PDF files found to process.")

# === Build input config for batch processing ===
input_config = documentai.BatchDocumentsInputConfig(
    gcs_documents=documentai.GcsDocuments(documents=gcs_documents)
)

# === Output config ===
output_config = documentai.DocumentOutputConfig(
    gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(gcs_uri=gcs_output_uri)
)

# === Submit request ===
request = documentai.BatchProcessRequest(
    name=name,
    input_documents=input_config,
    document_output_config=output_config
)

operation = documentai_client.batch_process_documents(request)

print("Waiting for operation to finish...")
operation.result(timeout=600)  # You may increase timeout if needed

print("✅ Document AI processing complete for all PDFs.")

Waiting for operation to finish...
✅ Document AI processing complete for all PDFs.


In [3]:
import json

# === Locate and process all JSON files ===
output_bucket_name = gcs_output_uri.replace("gs://", "").split("/")[0]
output_prefix = "/".join(gcs_output_uri.replace("gs://", "").split("/")[1:])
json_blobs = [
    b for b in storage_client.list_blobs(output_bucket_name, prefix=output_prefix)
    if b.name.endswith(".json")
]

if not json_blobs:
    raise ValueError("No JSON files found in export path.")

# Process each JSON and remove after processing
for blob in sorted(json_blobs, key=lambda b: b.updated):
    # Load JSON content
    json_str = blob.download_as_text()
    doc = json.loads(json_str)

    # Try to infer source PDF filename
    uri_path = doc.get("uri", blob.name)  # fallback to blob name
    pdf_filename = Path(uri_path).name.replace(".json", ".pdf")
    pdf_prefix = Path(pdf_filename).stem

    print(f"📄 Processing {pdf_filename}")

📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 


In [4]:
# List to collect all DataFrames
all_rows = []

for blob in sorted(json_blobs, key=lambda b: b.updated):
    json_str = blob.download_as_text()
    doc = json.loads(json_str)

    # === Get file info for naming
    uri_path = doc.get("uri", blob.name)
    pdf_filename = Path(uri_path).name.replace(".json", ".pdf")
    pdf_prefix = Path(pdf_filename).stem
    print(f"📄 Processing {pdf_filename}")

    # === Extract full text and pages
    full_text = doc.get("text", "")
    pages = doc.get("pages", [])
    rows = []

    def find_y_from_tokens(start_idx):
        for page in pages:
            for token in page.get("tokens", []):
                segs = token["layout"]["textAnchor"].get("textSegments", [])
                if segs:
                    token_start = int(segs[0].get("startIndex", -1))
                    if token_start == start_idx:
                        return round(token["layout"]["boundingPoly"]["normalizedVertices"][0]["y"], 4), page["pageNumber"]
        return None, None

    for entity in doc.get("entities", []):
        type_ = entity.get("type")
        value = entity.get("mentionText")
        confidence = round(entity.get("confidence", 0), 2)

        text_segments = entity.get("textAnchor", {}).get("textSegments", [{}])
        start_index = int(text_segments[0].get("startIndex", -1))
        end_index = int(text_segments[0].get("endIndex", -1))

        y_pos, page = find_y_from_tokens(start_index)

        rows.append({
            "type": type_,
            "value": value,
            "confidence": confidence,
            "page": page,
            "start_index": start_index,
            "end_index": end_index,
            "y_position": y_pos,
            "source_file": pdf_filename,  # Optional: track which file each row came from
        })

    df_single = pd.DataFrame(rows)
    all_rows.append(df_single)
    print(f"✅ Parsed {len(df_single)} entities from {pdf_filename}")

# Combine everything
df_all = pd.concat(all_rows, ignore_index=True)
print(df_all.info())


📄 Processing 
✅ Parsed 315 entities from 
📄 Processing 
✅ Parsed 309 entities from 
📄 Processing 
✅ Parsed 419 entities from 
📄 Processing 
✅ Parsed 207 entities from 
📄 Processing 
✅ Parsed 272 entities from 
📄 Processing 
✅ Parsed 208 entities from 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   type         1730 non-null   object 
 1   value        1730 non-null   object 
 2   confidence   1730 non-null   float64
 3   page         1730 non-null   int64  
 4   start_index  1730 non-null   int64  
 5   end_index    1730 non-null   int64  
 6   y_position   1730 non-null   float64
 7   source_file  1730 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 108.3+ KB
None


In [7]:
import re

# === Global regex patterns for parsing ===
date_regex = re.compile(r"^[A-Za-z]{3,9} \d{1,2}$")  # e.g. May 5
amount_regex = re.compile(r"-?\$?[\d,]+\.\d{2}$")    # e.g. -45.00, $123.45

all_payment_rows = []

for blob in sorted(json_blobs, key=lambda b: b.updated):
    json_str = blob.download_as_text()
    doc = json.loads(json_str)

    uri_path = doc.get("uri", blob.name)
    pdf_filename = Path(uri_path).name.replace(".json", ".pdf")
    pdf_prefix = Path(pdf_filename).stem
    print(f"📄 Processing {pdf_filename}")

    full_text = doc.get("text", "")
    pages = doc.get("pages", [])
    
    # Extract all payment entities from the doc
    rows = []
    def find_y_from_tokens(start_idx):
        for page in pages:
            for token in page.get("tokens", []):
                segs = token["layout"]["textAnchor"].get("textSegments", [])
                if segs:
                    token_start = int(segs[0].get("startIndex", -1))
                    if token_start == start_idx:
                        return round(token["layout"]["boundingPoly"]["normalizedVertices"][0]["y"], 4), page["pageNumber"]
        return None, None

    for entity in doc.get("entities", []):
        type_ = entity.get("type")
        value = entity.get("mentionText")
        confidence = round(entity.get("confidence", 0), 2)

        text_segments = entity.get("textAnchor", {}).get("textSegments", [{}])
        start_index = int(text_segments[0].get("startIndex", -1))
        end_index = int(text_segments[0].get("endIndex", -1))

        y_pos, page = find_y_from_tokens(start_index)
        rows.append({
            "type": type_,
            "value": value,
            "confidence": confidence,
            "page": page,
            "start_index": start_index,
            "end_index": end_index,
            "y_position": y_pos
        })

    df = pd.DataFrame(rows)

    # === Payment parsing for this file ===
    df_payment_entities = df[df["type"] == "payment"].sort_values(by="start_index").reset_index(drop=True)

    payment_rows = []
    for _, row in df_payment_entities.iterrows():
        start_idx = row["start_index"]

        # Collect all lines before this payment
        lines_before = []
        for page in pages:
            for line in page.get("lines", []):
                segs = line["layout"]["textAnchor"].get("textSegments", [])
                if not segs:
                    continue
                line_start = int(segs[0].get("startIndex", 0))
                line_end = int(segs[0].get("endIndex", 0))
                if line_end <= start_idx:
                    text = full_text[line_start:line_end].strip()
                    lines_before.append((line_start, text))

        # Sort lines by start index
        lines_before = sorted(lines_before, key=lambda x: x[0])

        # Extract all date lines and get the first one
        date_lines = [text for _, text in lines_before if date_regex.match(text)]
        tx_date = date_lines[0] if date_lines else None

        # Get lines after last date line
        last_date_idx = max([i for i, (_, text) in enumerate(lines_before) if date_regex.match(text)], default=-1)
        vendor_lines = [text for _, text in lines_before[last_date_idx+1:]]

        # Join vendor lines into single string
        vendor_full = " | ".join(vendor_lines).strip()

        # Extract amount
        amount_match = re.search(amount_regex, vendor_full)
        amount = amount_match.group(0).replace(",", "").replace("$", "") if amount_match else None
        vendor_clean = vendor_full[:vendor_full.rfind(amount_match.group(0))].strip(" |") if amount_match else vendor_full

        # Build row
        payment_rows.append({
            "source_file": pdf_filename,
            "transaction_date": tx_date,
            "posting_date": tx_date,
            "Vendor": vendor_clean,
            "amount": amount,
            "location": None,
        })

    df_payment_rows = pd.DataFrame(payment_rows)
    df_payment_rows = df_payment_rows[df_payment_rows["transaction_date"].notnull()]
    all_payment_rows.append(df_payment_rows)

# After loop: concatenate
df_all_payments = pd.concat(all_payment_rows, ignore_index=True)
print(df_all_payments.info())


📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 
📄 Processing 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source_file       14 non-null     object
 1   transaction_date  14 non-null     object
 2   posting_date      14 non-null     object
 3   Vendor            14 non-null     object
 4   amount            14 non-null     object
 5   location          0 non-null      object
dtypes: object(6)
memory usage: 804.0+ bytes
None


In [None]:
# # === Clean up all JSONs in the exports folder (after processing) ===
# export_prefix = "transactions/amex/exports/"

# for blob in storage_client.list_blobs("vercillo_projects", prefix=export_prefix):
#     if blob.name.endswith(".json"):
#         print(f"Deleting {blob.name}")
#         blob.delete()

Deleting transactions/amex/exports/3148426476992466026/0/2025-01-03-0.json
Deleting transactions/amex/exports/3148426476992466026/1/2025-02-03-0.json
Deleting transactions/amex/exports/3148426476992466026/2/2025-03-03-0.json
Deleting transactions/amex/exports/3148426476992466026/3/2025-04-03-0.json
Deleting transactions/amex/exports/3148426476992466026/4/2025-05-03-0.json
Deleting transactions/amex/exports/3148426476992466026/5/2025-06-03-0.json


In [8]:
df_payment_rows 

Unnamed: 0,source_file,transaction_date,posting_date,Vendor,amount,location
1,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-2500.0,
2,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-1689.8,
3,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-1245.69,
4,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-178.4,
5,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-99.46,
6,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-5.65,
7,,May 5,May 5,PAYMENT RECEIVED - THANK YOU | Reference AT251...,-68.57,


In [7]:
df.head(10)

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,PTZ INSURANCE SERVICES,1.0,2,2251,2273,0.3757
1,Vendor,ETSY.COM - CUSTOMMOOSE,0.99,2,2301,2323,0.3942
2,Vendor,LCBO/RAO #633,0.99,2,2387,2400,0.4194
3,Vendor,SECURITY NATIONAL INSUR,0.99,2,2439,2462,0.4379
4,Vendor,THE HOME DEPOT #7011,0.99,2,2497,2517,0.4569
5,Vendor,HOMESENSE 013,0.99,2,2546,2559,0.4758
6,Vendor,WINNERS 278,0.98,2,2598,2609,0.4947
7,Vendor,MIRVISH PRODUCTIONS,0.99,2,2623,2642,0.512
8,Vendor,SIMON SUSHI 001,0.99,2,2676,2691,0.5318
9,Vendor,THE HOME DEPOT #7274,1.0,2,2712,2732,0.5494


In [8]:
target_types = [
    "Vendor", "amount", "location", "payment",
    "posting_date", "transaction_date"
]

# Split: dedup these
df_dedup_target = df[df["type"].isin(target_types)].copy()

# Keep all other types untouched
df_other = df[~df["type"].isin(target_types)].copy()

# Only filter low-confidence 'location' values
mask_location = df_dedup_target["type"] == "location"
df_dedup_target = df_dedup_target[~mask_location | (df_dedup_target["confidence"] >= 0.90)]

# Deduplicate by type + page + y_position
df_dedup_target = df_dedup_target.drop_duplicates(subset=["type", "page", "y_position"])

# Combine both
df_cleaned = pd.concat([df_dedup_target, df_other], ignore_index=True)

df_cleaned


Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,PTZ INSURANCE SERVICES,1.00,2,2251,2273,0.3757
1,Vendor,ETSY.COM - CUSTOMMOOSE,0.99,2,2301,2323,0.3942
2,Vendor,LCBO/RAO #633,0.99,2,2387,2400,0.4194
3,Vendor,SECURITY NATIONAL INSUR,0.99,2,2439,2462,0.4379
4,Vendor,THE HOME DEPOT #7011,0.99,2,2497,2517,0.4569
...,...,...,...,...,...,...,...
266,transaction_date,Apr 19,1.00,4,5564,5570,0.4300
267,closing_date,"May 03, 2025",0.96,1,632,644,0.1615
268,opening_date,"Apr 04, 2025",0.98,1,619,631,0.1611
269,points_earned,5885,0.97,7,13368,13373,0.2701


In [None]:
anchor_types = ["transaction_date", "posting_date", "Vendor", "amount", "location"]
anchor_rows_all = []

# Inside your for blob in json_blobs loop, after `df_cleaned` is created:
df_anchor = df_cleaned[df_cleaned["type"].isin(anchor_types)].copy()

anchored_parts = []
for type_ in anchor_types:
    df_type = df_cleaned[df_cleaned["type"] == type_].copy()
    df_type = df_type.sort_values(by="start_index").reset_index(drop=True)
    df_type["row_id"] = range(len(df_type))  # Local row ID within the type
    anchored_parts.append(df_type[["type", "value", "start_index", "row_id"]])

df_anchor = pd.concat(anchored_parts).sort_values(by=["row_id", "type"]).reset_index(drop=True)

# Add file info (optional, good for tracing)
df_anchor["source_file"] = pdf_filename

# Collect all
anchor_rows_all.append(df_anchor)

anchor_rows_all

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         265 non-null    object
 1   value        265 non-null    object
 2   start_index  265 non-null    int64 
 3   row_id       265 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 8.4+ KB


In [None]:
# Rebuild wide-format table from this file’s anchors
df_output = df_anchor.pivot_table(
    index="row_id", columns="type", values="value", aggfunc="first"
).reset_index()

# Add file identifier for traceability
df_output["source_file"] = pdf_filename

# Collect for final merge
final_outputs.append(df_output)

df_ouput_all = pd.concat(final_outputs, ignore_index=True)
print(df_ouput_all.info())


type,row_id,Vendor,amount,location,posting_date,transaction_date
0,0,PTZ INSURANCE SERVICES,21.64,OAKVILLE,Apr 4,Apr 3
1,1,ETSY.COM - CUSTOMMOOSE,112.18,CANADA,Apr 4,Apr 4
2,2,LCBO/RAO #633,61.9,ETOBICOKE,Apr 5,Apr 4
3,3,SECURITY NATIONAL INSUR,111.64,MONTREAL,Apr 5,Apr 4
4,4,THE HOME DEPOT #7011,15.42,ETOBICOKE,Apr 7,Apr 6


In [11]:
# Create a helper function to extract single-value fields
def extract_single_value(df, field_name):
    matches = df[df["type"] == field_name]["value"]
    return matches.iloc[0] if not matches.empty else None

closing_date = extract_single_value(df_cleaned, "closing_date")
opening_date = extract_single_value(df_cleaned, "opening_date")

df_ouput["closing_date"] = closing_date
df_ouput["opening_date"] = opening_date

df_ouput.head()

type,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date
0,0,PTZ INSURANCE SERVICES,21.64,OAKVILLE,Apr 4,Apr 3,"May 03, 2025","Apr 04, 2025"
1,1,ETSY.COM - CUSTOMMOOSE,112.18,CANADA,Apr 4,Apr 4,"May 03, 2025","Apr 04, 2025"
2,2,LCBO/RAO #633,61.9,ETOBICOKE,Apr 5,Apr 4,"May 03, 2025","Apr 04, 2025"
3,3,SECURITY NATIONAL INSUR,111.64,MONTREAL,Apr 5,Apr 4,"May 03, 2025","Apr 04, 2025"
4,4,THE HOME DEPOT #7011,15.42,ETOBICOKE,Apr 7,Apr 6,"May 03, 2025","Apr 04, 2025"


In [12]:
points_earned = extract_single_value(df_cleaned, "points_earned")
points_redeemed = extract_single_value(df_cleaned, "points_redeemed")

points_rows = pd.DataFrame([
    {
        "row_id": df_ouput["row_id"].max() + 1,
        "Vendor": "points_earned",
        "amount": points_earned,
        "location": None,
        "posting_date": None,
        "transaction_date": None,
        "closing_date": closing_date,
        "opening_date": opening_date
    },
    {
        "row_id": df_ouput["row_id"].max() + 2,
        "Vendor": "points_redeemed",
        "amount": points_redeemed,
        "location": None,
        "posting_date": None,
        "transaction_date": None,
        "closing_date": closing_date,
        "opening_date": opening_date
    }
])

df_ouput = pd.concat([df_ouput, points_rows], ignore_index=True)

In [13]:
# Add row_id and meta fields before final export
base_row_id = df_ouput["row_id"].max() + 1
df_payment_rows["row_id"] = range(base_row_id, base_row_id + len(df_payment_rows))
df_payment_rows["closing_date"] = closing_date
df_payment_rows["opening_date"] = opening_date

df_ouput = pd.concat([df_ouput, df_payment_rows], ignore_index=True)

In [14]:
df_ouput.head()

Unnamed: 0,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date
0,0,PTZ INSURANCE SERVICES,21.64,OAKVILLE,Apr 4,Apr 3,"May 03, 2025","Apr 04, 2025"
1,1,ETSY.COM - CUSTOMMOOSE,112.18,CANADA,Apr 4,Apr 4,"May 03, 2025","Apr 04, 2025"
2,2,LCBO/RAO #633,61.9,ETOBICOKE,Apr 5,Apr 4,"May 03, 2025","Apr 04, 2025"
3,3,SECURITY NATIONAL INSUR,111.64,MONTREAL,Apr 5,Apr 4,"May 03, 2025","Apr 04, 2025"
4,4,THE HOME DEPOT #7011,15.42,ETOBICOKE,Apr 7,Apr 6,"May 03, 2025","Apr 04, 2025"


In [10]:
# Local export path (make sure this folder exists)
local_csv = rf"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\{pdf_prefix}_cleansed.csv"

# Save locally
df_all_payments.to_csv(local_csv, index=False)
print(f"Exported CSV saved to: {local_csv}")

Exported CSV saved to: C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\_cleansed.csv
