In [16]:
import json
import time
import warnings
from pathlib import Path

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

# === CONFIG ===
# Identify the Document AI processor; `location` is used both for the regional
# API endpoint and inside the processor resource name built below.
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"

# === INPUT/OUTPUT PATHS ===
# One statement PDF is read from GCS; results are written under gcs_output_uri.
gcs_input_uri = "gs://vercillo_projects/transactions/amex/2025/2025-02-03.pdf"
pdf_filename = Path(gcs_input_uri).name               # last path component, e.g. "2025-02-03.pdf"
pdf_prefix = pdf_filename.replace(".pdf", "")        # name without ".pdf"; reused to name the CSV export
gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"

# === Setup Client ===
# The client is pointed at the endpoint derived from `location`.
client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
)
# Full resource name of the processor that will handle the request.
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# === GCS input/output config ===
# A single PDF document is submitted in the batch.
input_config = documentai.BatchDocumentsInputConfig(
    gcs_documents=documentai.GcsDocuments(
        documents=[documentai.GcsDocument(gcs_uri=gcs_input_uri, mime_type="application/pdf")]
    )
)

# Results are written to GCS under gcs_output_uri.
output_config = documentai.DocumentOutputConfig(
    gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri
    )
)

# === Submit batch process ===
request = documentai.BatchProcessRequest(
    name=name,
    input_documents=input_config,
    document_output_config=output_config
)

# Returns an operation handle; the result is awaited below.
operation = client.batch_process_documents(request)

print("Waiting for operation to finish...")
# Wait for the operation with a 300-second timeout.
# NOTE(review): presumably raises if the timeout elapses or the job fails —
# confirm against the google-api-core Operation.result documentation.
operation.result(timeout=300)

print("Document AI processing complete.")

Waiting for operation to finish...
Document AI processing complete.


In [17]:
import json

# === Locate the newest JSON file under the output path ===
storage_client = storage.Client()

# "gs://bucket/some/prefix/" -> bucket name and object prefix.
_output_location = gcs_output_uri.replace("gs://", "")
output_bucket_name, _sep, output_prefix = _output_location.partition("/")

blobs = list(storage_client.list_blobs(output_bucket_name, prefix=output_prefix))
json_blobs = [blob for blob in blobs if blob.name.endswith(".json")]

if len(json_blobs) == 0:
    raise ValueError("No JSON output found. Wait a few more seconds or check if the processor ran successfully.")

# Newest first; only the most recently updated JSON is used.
# NOTE(review): this looks at every JSON under the prefix, so output left over
# from another run (or additional shards of the same run) is not distinguished
# here — confirm that one JSON per run is what is expected.
json_blobs.sort(key=lambda blob: blob.updated, reverse=True)
output_blob = json_blobs[0]

# Download the blob as text and parse it into a plain dict.
json_str = output_blob.download_as_text()
doc = json.loads(json_str)


In [18]:
# Pieces of the parsed Document AI payload used by this cell.
pages = doc.get("pages", [])      # scanned by find_y_from_tokens
full_text = doc.get("text", "")
rows = []                         # one dict per entity, filled by the loop below

def find_y_from_tokens(start_idx, doc_pages=None):
    """Return ``(y, page_number)`` for the token whose text starts at ``start_idx``.

    Pages and their tokens are scanned in order and the first token whose first
    text segment has ``startIndex == start_idx`` wins.

    Args:
        start_idx: Character offset to look for. A token whose first segment has
            no ``startIndex`` key is treated as starting at ``-1``, which is the
            same default the entity loop in this notebook uses.
        doc_pages: Optional list of page dicts to search. Defaults to the
            notebook-level ``pages`` list.

    Returns:
        ``(y, page_number)`` where ``y`` is the ``y`` of the token's first
        normalized vertex rounded to 4 decimals, or ``(None, None)`` when no
        token matches. ``y`` is ``None`` if the matching token has no vertices.
    """
    if doc_pages is None:
        doc_pages = pages

    for page in doc_pages:
        for token in page.get("tokens", []):
            layout = token.get("layout", {})
            segs = layout.get("textAnchor", {}).get("textSegments", [])
            if not segs:
                # Tokens without a text anchor cannot be matched by offset.
                continue
            if int(segs[0].get("startIndex", -1)) != start_idx:
                continue

            vertices = layout.get("boundingPoly", {}).get("normalizedVertices", [])
            # A vertex dict may lack "y"; fall back to 0.0 instead of raising.
            # NOTE(review): assumes an omitted coordinate means 0.0 (proto3
            # JSON default omission) — confirm against the Document AI output.
            y = round(vertices[0].get("y", 0.0), 4) if vertices else None
            return y, page["pageNumber"]
    return None, None

# Build one row per extracted entity: its type, text, confidence, the character
# span of its first text segment, and the page / vertical position of the token
# that starts at the same offset.
entity_columns = [
    "type", "value", "confidence", "page",
    "start_index", "end_index", "y_position",
]

for entity in doc.get("entities", []):
    type_ = entity.get("type")
    value = entity.get("mentionText")
    confidence = round(entity.get("confidence", 0), 2)

    # `or []` covers both a missing key and an explicitly empty list, which
    # would otherwise raise IndexError on `[0]`.
    text_segments = entity.get("textAnchor", {}).get("textSegments") or []
    first_segment = text_segments[0] if text_segments else {}
    start_index = int(first_segment.get("startIndex", -1))
    end_index = int(first_segment.get("endIndex", -1))

    if text_segments:
        # Get Y and page by matching entity start index to token
        y_pos, page = find_y_from_tokens(start_index)
    else:
        # No text anchor at all: do not look up -1, which would match any
        # token that happens to lack a startIndex.
        y_pos, page = None, None

    rows.append({
        "type": type_,
        "value": value,
        "confidence": confidence,
        "page": page,
        "start_index": start_index,
        "end_index": end_index,
        "y_position": y_pos
    })

# Passing `columns` keeps the expected schema even when there are no entities,
# so later cells can still select df["type"].
df = pd.DataFrame(rows, columns=entity_columns)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   type         309 non-null    object 
 1   value        309 non-null    object 
 2   confidence   309 non-null    float64
 3   page         309 non-null    int64  
 4   start_index  309 non-null    int64  
 5   end_index    309 non-null    int64  
 6   y_position   309 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 17.0+ KB
None


In [19]:
import re

date_regex = re.compile(r"^[A-Za-z]{3,9} \d{1,2}$")   # e.g. Dec 4
amount_regex = re.compile(r"-?\$?[\d,]+\.\d{2}$")     # e.g. -2,481.67

# Column layout of the payment table. Passing it to the DataFrame constructor
# keeps the columns present even when no payment entity was extracted.
payment_columns = ["transaction_date", "posting_date", "Vendor", "amount", "location"]


def _parse_payment_entity(value):
    """Split the multi-line text of one ``payment`` entity into a row dict.

    Args:
        value: The entity text. Anything that is not a string (for example a
            missing value) is treated as empty text.

    Returns:
        A dict with the keys listed in ``payment_columns``. ``amount`` is the
        last non-blank line with ``,`` and ``$`` removed when that line matches
        ``amount_regex``, otherwise ``None``. ``transaction_date`` is the first
        line matching ``date_regex``; ``posting_date`` is the second one and
        falls back to ``transaction_date``. ``Vendor`` joins the remaining
        lines with ``" | "``. ``location`` is always ``None``.
    """
    text = value if isinstance(value, str) else ""
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # === Extract amount
    amount = None
    if lines and amount_regex.match(lines[-1]):
        amount = lines[-1].replace(",", "").replace("$", "")
        lines = lines[:-1]

    # === Extract dates
    dates = [line for line in lines if date_regex.match(line)]
    transaction_date = dates[0] if len(dates) > 0 else None
    posting_date = dates[1] if len(dates) > 1 else transaction_date
    vendor_lines = [line for line in lines if line not in dates]

    # === Clean vendor
    vendor_clean = " | ".join(vendor_lines).strip()
    if "payment received" in vendor_clean.lower():
        vendor_clean = "PAYMENT RECEIVED"

    return {
        "transaction_date": transaction_date,
        "posting_date": posting_date,
        "Vendor": vendor_clean,
        "amount": amount,
        "location": None,
    }


# Filter for payment entities
df_payment_entities = df[df["type"] == "payment"].sort_values(by="start_index").reset_index(drop=True)

payment_rows = [_parse_payment_entity(value) for value in df_payment_entities["value"]]

# Rows without any recognizable date are dropped.
df_payment_rows = pd.DataFrame(payment_rows, columns=payment_columns)
df_payment_rows = df_payment_rows[df_payment_rows["transaction_date"].notnull()].reset_index(drop=True)


In [20]:
# === Clean up all JSONs in the exports folder (after processing) ===
# Bucket and prefix are derived from gcs_output_uri so the cleanup always
# targets the location the batch job wrote to, instead of a second hard-coded
# copy of the same path.
export_bucket, _sep, export_prefix = gcs_output_uri.replace("gs://", "", 1).partition("/")

# An empty prefix would match every object in the bucket; refuse to delete then.
if not export_prefix:
    raise ValueError(f"Refusing to clean up: no object prefix in {gcs_output_uri!r}")

for blob in storage_client.list_blobs(export_bucket, prefix=export_prefix):
    if blob.name.endswith(".json"):
        print(f"Deleting {blob.name}")
        blob.delete()

Deleting transactions/amex/exports/15139426876766062364/0/2025-02-03-0.json


In [21]:
# Display the payment rows parsed from the `payment` entities.
df_payment_rows 

Unnamed: 0,transaction_date,posting_date,Vendor,amount,location
0,Jan 4,Jan 4,PAYMENT RECEIVED,-562.48,
1,Jan 13,Jan 13,PAYMENT RECEIVED,-1000.0,
2,Jan 24,Jan 24,PAYMENT RECEIVED,-4000.0,


In [22]:
# Preview the first ten rows of the per-entity table.
df.head(10)

Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,PTZ INSURANCE SERVICES,0.99,2,2336,2358,0.4043
1,Vendor,CANADA COMPUTERS ONLIN,0.95,2,2386,2408,0.4228
2,Vendor,CANADA COMPUTERS ONLINE,0.95,2,2442,2465,0.4413
3,Vendor,TST-DOOR FIFTYFIVE,0.99,2,2500,2518,0.4607
4,Vendor,AMZN MKTP CA*ZP9B72SS1,0.99,2,2549,2571,0.4783
5,Vendor,SECURITY NATIONAL INSUR,0.98,2,2605,2628,0.4973
6,Vendor,UW WATSPEED,0.98,2,2656,2667,0.5166
7,Vendor,AMZN MKТР СА,0.99,2,2698,2710,0.5343
8,Vendor,WALMART.CA,0.98,2,2745,2755,0.5528
9,Vendor,DELL CANADA INC.,0.99,2,2786,2802,0.5713


In [23]:
target_types = [
    "Vendor", "amount", "location", "payment",
    "posting_date", "transaction_date"
]

is_target = df["type"].isin(target_types)

# Every other entity type passes through untouched.
df_other = df.loc[~is_target].copy()

# Target types: drop 'location' rows below 0.90 confidence (other types are not
# confidence-filtered), then keep the first row per (type, page, y_position).
df_dedup_target = (
    df.loc[is_target]
    .copy()
    .loc[lambda frame: frame["type"].ne("location") | frame["confidence"].ge(0.90)]
    .drop_duplicates(subset=["type", "page", "y_position"])
)

# Deduplicated targets first, then the untouched rest, with a fresh index.
df_cleaned = pd.concat([df_dedup_target, df_other], ignore_index=True)

df_cleaned


Unnamed: 0,type,value,confidence,page,start_index,end_index,y_position
0,Vendor,PTZ INSURANCE SERVICES,0.99,2,2336,2358,0.4043
1,Vendor,CANADA COMPUTERS ONLIN,0.95,2,2386,2408,0.4228
2,Vendor,CANADA COMPUTERS ONLINE,0.95,2,2442,2465,0.4413
3,Vendor,TST-DOOR FIFTYFIVE,0.99,2,2500,2518,0.4607
4,Vendor,AMZN MKTP CA*ZP9B72SS1,0.99,2,2549,2571,0.4783
...,...,...,...,...,...,...,...
302,transaction_date,Jan 24,1.00,4,6427,6433,0.6832
303,closing_date,"Feb 03, 2025",0.99,1,629,641,0.1607
304,opening_date,"Jan 04, 2025",0.98,1,616,628,0.1615
305,points_earned,10486,0.99,7,14290,14296,0.2701


In [24]:
anchor_types = ["transaction_date", "posting_date", "Vendor", "amount", "location"]

# Number the entities of each type by their position in the document
# (ascending start_index). The n-th entity of every type shares row_id n, and
# the pivot in the next cell relies on that to line values up into one row.
anchored_parts = []
type_counts = {}

for type_ in anchor_types:
    df_type = (
        df_cleaned[df_cleaned["type"] == type_]
        .sort_values(by="start_index")
        .reset_index(drop=True)
    )
    df_type["row_id"] = range(len(df_type))
    type_counts[type_] = len(df_type)
    anchored_parts.append(df_type[["type", "value", "start_index", "row_id"]])

# Rank-based alignment is only valid when every type has the same number of
# entities. A single missing or extra entity (for example a 'location' dropped
# by the confidence filter) shifts all later values of that type by one row.
if len(set(type_counts.values())) > 1:
    warnings.warn(
        "Anchor types have different entity counts "
        f"({type_counts}); row_id alignment may pair values from different statement lines."
    )

df_anchor = pd.concat(anchored_parts).sort_values(by=["row_id", "type"]).reset_index(drop=True)

df_anchor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         300 non-null    object
 1   value        300 non-null    object
 2   start_index  300 non-null    int64 
 3   row_id       300 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 9.5+ KB


In [25]:
# Long -> wide: one row per row_id, one column per entity type. "first" picks
# the first value found for each (row_id, type) pair.
df_ouput = pd.pivot_table(
    df_anchor,
    values="value",
    index="row_id",
    columns="type",
    aggfunc="first",
).reset_index()

# Show the top of the reshaped table
df_ouput.head()


type,row_id,Vendor,amount,location,posting_date,transaction_date
0,0,PTZ INSURANCE SERVICES,21.27,OAKVILLE,Jan 4,Jan 3
1,1,CANADA COMPUTERS ONLIN,410.54,RICHMOND HILL,Jan 4,Jan 3
2,2,CANADA COMPUTERS ONLINE,-410.54,RICHMOND HILL,Jan 4,Jan 3
3,3,TST-DOOR FIFTYFIVE,67.9,MISSISSAUGA,Jan 5,Jan 3
4,4,AMZN MKTP CA*ZP9B72SS1,548.03,WWW.AMAZON.CA,Jan 8,Jan 3


In [26]:
# Helper for fields that are expected to occur once per statement
def extract_single_value(df, field_name):
    """Return the first ``value`` whose ``type`` equals ``field_name``, else None."""
    selected = df.loc[df["type"] == field_name, "value"]
    if len(selected) == 0:
        return None
    return selected.iloc[0]

# Statement-level dates, looked up once and stamped onto every row.
statement_period = {
    "closing_date": extract_single_value(df_cleaned, "closing_date"),
    "opening_date": extract_single_value(df_cleaned, "opening_date"),
}
closing_date = statement_period["closing_date"]
opening_date = statement_period["opening_date"]

for column_name, stamp in statement_period.items():
    df_ouput[column_name] = stamp

df_ouput.head()

type,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date
0,0,PTZ INSURANCE SERVICES,21.27,OAKVILLE,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
1,1,CANADA COMPUTERS ONLIN,410.54,RICHMOND HILL,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
2,2,CANADA COMPUTERS ONLINE,-410.54,RICHMOND HILL,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
3,3,TST-DOOR FIFTYFIVE,67.9,MISSISSAUGA,Jan 5,Jan 3,"Feb 03, 2025","Jan 04, 2025"
4,4,AMZN MKTP CA*ZP9B72SS1,548.03,WWW.AMAZON.CA,Jan 8,Jan 3,"Feb 03, 2025","Jan 04, 2025"


In [27]:
points_earned = extract_single_value(df_cleaned, "points_earned")
points_redeemed = extract_single_value(df_cleaned, "points_redeemed")

# Two extra rows carry the points totals: the field name goes in "Vendor", the
# value in "amount", and row ids continue after the current maximum.
next_row_id = df_ouput["row_id"].max() + 1

points_rows = pd.DataFrame([
    {
        "row_id": next_row_id + offset,
        "Vendor": label,
        "amount": points,
        "location": None,
        "posting_date": None,
        "transaction_date": None,
        "closing_date": closing_date,
        "opening_date": opening_date,
    }
    for offset, (label, points) in enumerate([
        ("points_earned", points_earned),
        ("points_redeemed", points_redeemed),
    ])
])

df_ouput = pd.concat([df_ouput, points_rows], ignore_index=True)

In [28]:
# Give the payment rows row ids that continue after the current maximum, stamp
# the statement dates on them, then append them to the output table.
base_row_id = df_ouput["row_id"].max() + 1
df_payment_rows = df_payment_rows.assign(
    row_id=range(base_row_id, base_row_id + len(df_payment_rows)),
    closing_date=closing_date,
    opening_date=opening_date,
)

df_ouput = pd.concat([df_ouput, df_payment_rows], ignore_index=True)

In [29]:
# Preview the combined output table (transactions, points rows, payments).
df_ouput.head()

Unnamed: 0,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date
0,0,PTZ INSURANCE SERVICES,21.27,OAKVILLE,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
1,1,CANADA COMPUTERS ONLIN,410.54,RICHMOND HILL,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
2,2,CANADA COMPUTERS ONLINE,-410.54,RICHMOND HILL,Jan 4,Jan 3,"Feb 03, 2025","Jan 04, 2025"
3,3,TST-DOOR FIFTYFIVE,67.9,MISSISSAUGA,Jan 5,Jan 3,"Feb 03, 2025","Jan 04, 2025"
4,4,AMZN MKTP CA*ZP9B72SS1,548.03,WWW.AMAZON.CA,Jan 8,Jan 3,"Feb 03, 2025","Jan 04, 2025"


In [30]:
# Local export path; the file name is derived from the input PDF name.
local_csv = rf"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\{pdf_prefix}_cleansed.csv"

# to_csv does not create missing directories, so create the target folder
# first (no-op when it already exists).
Path(local_csv).parent.mkdir(parents=True, exist_ok=True)

# Save locally
df_ouput.to_csv(local_csv, index=False)
print(f"Exported CSV saved to: {local_csv}")

Exported CSV saved to: C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2025-02-03_cleansed.csv
