In [14]:
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions
from google.cloud import storage
import pandas as pd
import time
from pathlib import Path
import json
import glob
import os


# === CONFIG ===
project_id = "vercillopersonal"
location = "us"
processor_id = "fe61eee8945a8018"

calendar_year = "2025"

# === INPUT/OUTPUT PATHS ===
# gcs_input_uri = "gs://vercillo_projects/transactions/amex/data/2025/2025-01-03_cleansed.csv"
# filename = Path(gcs_input_uri).name  
# gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"




## Google Cloud Storage Bucket

In [15]:
# === INPUT/OUTPUT PATHS ===
# gcs_input_uri = "gs://vercillo_projects/transactions/amex/data/2025/2025-01-03_cleansed.csv"
# filename = Path(gcs_input_uri).name  
# gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"

# df = pd.read_csv(gcs_input_uri)

## Local File Path

In [16]:
# filename = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2025\2025-02-03_cleansed.csv"
# filename = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\history\amex_staged_2022-2025.csv"
# output_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\amex_staged.csv"
# df = pd.read_csv(filename)
# df_hist = pd.read_csv(filename)

## Batch Local Path

In [17]:
# Path to your folder (with wildcard for CSV files)
folder_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\current"
# hist_folder_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\history"
output_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\amex_staged.csv"

# hist_files = glob.glob(os.path.join(hist_folder_path, "*.csv")) + glob.glob(os.path.join(hist_folder_path, "*.CSV"))  # Change to "*.xlsx" for Excel
all_files = glob.glob(os.path.join(folder_path, "*.csv"))  # Change to "*.xlsx" for Excel

# Fallback to .CSV (uppercase) only if no .csv files found
if not all_files:
    all_files = glob.glob(os.path.join(folder_path, "*.CSV"))

In [18]:
# Read and concatenate
df_list = []

for file in all_files:
    try:
        temp_df = pd.read_csv(file, encoding="ISO-8859-1")
        temp_df["source_file"] = os.path.basename(file)
        df_list.append(temp_df)
    except Exception as e:
        print(f"⚠️ Failed to read {file}: {e}")
# Combine all files into one DataFrame
df = pd.concat(df_list, ignore_index=True)

In [19]:
# Preview
print(df.shape)
df.head()

(49, 9)


Unnamed: 0,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date,source_file
0,0,UBER EATS,13.3,HTTPS://HELP.UB,May 4,May 4,"Jun 03, 2025","May 04, 2025",2025-06-03_cleansed.csv
1,1,SECURITY NATIONAL INSUR,111.64,MONTREAL,May 5,May 4,"Jun 03, 2025","May 04, 2025",2025-06-03_cleansed.csv
2,2,PTZ INSURANCE SERVICES,21.64,OAKVILLE,May 6,May 5,"Jun 03, 2025","May 04, 2025",2025-06-03_cleansed.csv
3,3,HORNER ESSO 0303,49.85,ETOBICOKE,May 8,May 6,"Jun 03, 2025","May 04, 2025",2025-06-03_cleansed.csv
4,4,AMZN MKTP CA*NI1XI4C60,86.82,WWW.AMAZON.CA,May 8,May 6,"Jun 03, 2025","May 04, 2025",2025-06-03_cleansed.csv


In [20]:
df['source_file'].unique()

array(['2025-06-03_cleansed.csv'], dtype=object)

In [21]:
# df = df[df["source_file"] == "2025-06-03_cleansed.csv"]
# df

In [28]:
date_cols = ["closing_date", "opening_date"]

# Fallback format: "Jan 04, 2025"
for col in date_cols:
    # First store original strings
    original_vals = df[col].copy()

    # First pass: "04-Jan-25"
    df.loc[:, col] = pd.to_datetime(df[col], format="%d-%b-%y", errors="coerce")

    # Second pass: for NaT rows where original string still exists, try "Jan 04, 2025"
    mask = df[col].isna() & original_vals.notna()
    df.loc[mask, col] = pd.to_datetime(original_vals[mask], format="%b %d, %Y", errors="coerce")

    # # Safely convert to date if column is datetime64[ns]
    # if pd.api.types.is_datetime64_any_dtype(df[col]):
    #     df[col] = df[col].dt.date

    df[col] = pd.to_datetime(df[col], errors="coerce")

    # Strip time component, keep only date
    df[col] = df[col].dt.date

# # Now extract the year
df["closing_year"] = pd.to_datetime(df["closing_date"]).dt.year.astype("Int64")
df.head(10)

Unnamed: 0,row_id,vendor,amount,location,posting_date,transaction_date,opening_date,closing_date,closing_year,calendar_year,source_file
0,0,UBER EATS,13.3,HTTPS://HELP.UB,2025-05-04,2025-05-04,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
1,1,SECURITY NATIONAL INSUR,111.64,MONTREAL,2025-05-05,2025-05-04,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
2,2,PTZ INSURANCE SERVICES,21.64,OAKVILLE,2025-05-06,2025-05-05,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
3,3,HORNER ESSO 0303,49.85,ETOBICOKE,2025-05-08,2025-05-06,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
4,4,AMZN MKTP CA*NI1XI4C60,86.82,WWW.AMAZON.CA,2025-05-08,2025-05-06,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
5,5,DC03 A-OK COMMISSARY &,14.69,Vaughan,2025-05-07,2025-05-07,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
6,6,DC03 A-OK COMMISSARY &,5.65,Vaughan,2025-05-07,2025-05-07,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
7,7,DC03 A-OK COMMISSARY &,5.65,Vaughan,2025-05-08,2025-05-08,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
8,8,LITTLE CAESARS #4999-00,9.03,TORONTO,2025-05-10,2025-05-08,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
9,9,LITTLE CAESARS #4999-00,1.68,TORONTO,2025-05-10,2025-05-08,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv


In [29]:
date_fields = ["posting_date", "transaction_date"]

for field in date_fields:
    # Keep original values before coercion
    original_vals = df[field].copy()

    # First try format like "04-Dec"
    df[field + "_full"] = pd.to_datetime(
        df[field].astype(str) + "-" + df["closing_year"].astype(str),
        format="%d-%b-%Y",
        errors="coerce"
    )

    # Fallback for format like "May 24"
    mask = df[field + "_full"].isna() & original_vals.notna()
    df.loc[mask, field + "_full"] = pd.to_datetime(
        original_vals[mask].astype(str) + " " + df.loc[mask, "closing_year"].astype(str),
        format="%b %d %Y",
        errors="coerce"
    )

    # Convert to string with only "day-month" format
    df[field] = df[field + "_full"].dt.strftime("%d-%b")

df.head(10)

Unnamed: 0,row_id,vendor,amount,location,posting_date,transaction_date,opening_date,closing_date,closing_year,calendar_year,source_file,posting_date_full,transaction_date_full
0,0,UBER EATS,13.3,HTTPS://HELP.UB,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
1,1,SECURITY NATIONAL INSUR,111.64,MONTREAL,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
2,2,PTZ INSURANCE SERVICES,21.64,OAKVILLE,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
3,3,HORNER ESSO 0303,49.85,ETOBICOKE,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
4,4,AMZN MKTP CA*NI1XI4C60,86.82,WWW.AMAZON.CA,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
5,5,DC03 A-OK COMMISSARY &,14.69,Vaughan,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
6,6,DC03 A-OK COMMISSARY &,5.65,Vaughan,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
7,7,DC03 A-OK COMMISSARY &,5.65,Vaughan,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
8,8,LITTLE CAESARS #4999-00,9.03,TORONTO,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT
9,9,LITTLE CAESARS #4999-00,1.68,TORONTO,,,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv,NaT,NaT


In [30]:
df["calendar_year"] = calendar_year  

df["transaction_date_full"] = pd.to_datetime(
    df["calendar_year"].astype(str) + "-" + df["transaction_date"],
    format="%Y-%d-%b", errors="coerce"
)

df["posting_date_full"] = pd.to_datetime(
    df["calendar_year"].astype(str) + "-" + df["posting_date"],
    format="%Y-%d-%b", errors="coerce"
)

#Clean up
df = df.drop(columns=["posting_date", "transaction_date"])

df = df.rename(columns={
    "transaction_date_full": "transaction_date",
    "posting_date_full": "posting_date",
    "Vendor": "vendor"
})


df = df[["row_id", "vendor", "amount", "location","posting_date", "transaction_date","opening_date", "closing_date", "closing_year","calendar_year","source_file"]]

df.head(10)

Unnamed: 0,row_id,vendor,amount,location,posting_date,transaction_date,opening_date,closing_date,closing_year,calendar_year,source_file
0,0,UBER EATS,13.3,HTTPS://HELP.UB,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
1,1,SECURITY NATIONAL INSUR,111.64,MONTREAL,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
2,2,PTZ INSURANCE SERVICES,21.64,OAKVILLE,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
3,3,HORNER ESSO 0303,49.85,ETOBICOKE,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
4,4,AMZN MKTP CA*NI1XI4C60,86.82,WWW.AMAZON.CA,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
5,5,DC03 A-OK COMMISSARY &,14.69,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
6,6,DC03 A-OK COMMISSARY &,5.65,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
7,7,DC03 A-OK COMMISSARY &,5.65,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
8,8,LITTLE CAESARS #4999-00,9.03,TORONTO,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
9,9,LITTLE CAESARS #4999-00,1.68,TORONTO,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv


In [31]:
df = df.drop_duplicates()
df.head(10)

Unnamed: 0,row_id,vendor,amount,location,posting_date,transaction_date,opening_date,closing_date,closing_year,calendar_year,source_file
0,0,UBER EATS,13.3,HTTPS://HELP.UB,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
1,1,SECURITY NATIONAL INSUR,111.64,MONTREAL,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
2,2,PTZ INSURANCE SERVICES,21.64,OAKVILLE,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
3,3,HORNER ESSO 0303,49.85,ETOBICOKE,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
4,4,AMZN MKTP CA*NI1XI4C60,86.82,WWW.AMAZON.CA,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
5,5,DC03 A-OK COMMISSARY &,14.69,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
6,6,DC03 A-OK COMMISSARY &,5.65,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
7,7,DC03 A-OK COMMISSARY &,5.65,Vaughan,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
8,8,LITTLE CAESARS #4999-00,9.03,TORONTO,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv
9,9,LITTLE CAESARS #4999-00,1.68,TORONTO,NaT,NaT,2025-05-04,2025-06-03,2025,2025,2025-06-03_cleansed.csv


In [26]:
df.to_csv(output_path, index=False)