In [148]:
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions
from google.cloud import storage
import pandas as pd
import time
from pathlib import Path
import json
import glob
import os


# === CONFIG ===
# Identifiers for the Google Cloud side of the pipeline. None of them is
# referenced by the visible cells below.
# NOTE(review): processor_id is presumably a Document AI processor — confirm.
processor_id = "fe61eee8945a8018"
location = "us"
project_id = "vercillopersonal"

# === INPUT/OUTPUT PATHS ===
# gcs_input_uri = "gs://vercillo_projects/transactions/amex/data/2025/2025-01-03_cleansed.csv"
# filename = Path(gcs_input_uri).name  
# gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"




## Google Cloud Storage Bucket

In [149]:
# === INPUT/OUTPUT PATHS ===
# gcs_input_uri = "gs://vercillo_projects/transactions/amex/data/2025/2025-01-03_cleansed.csv"
# filename = Path(gcs_input_uri).name  
# gcs_output_uri = "gs://vercillo_projects/transactions/amex/exports/"

# df = pd.read_csv(gcs_input_uri)

## Local File Path

In [150]:
# filename = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2025\2025-02-03_cleansed.csv"
# output_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\amex_staged.csv"
# df = pd.read_csv(filename)

## Batch Local Path

In [151]:

# Folder holding the per-statement CSVs, and the single combined export file
folder_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\2022"
output_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\test_files\export\amex_staged.csv"
# glob returns matches in arbitrary order; sort them so the concatenation
# order (and therefore the exported row order) is the same on every run.
all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))  # Change to "*.xlsx" for Excel

In [152]:
# Read each matched CSV, tag its rows with the originating file name,
# and stack everything into one frame.
df_list = []

for file in all_files:
    try:
        temp_df = pd.read_csv(file, encoding="ISO-8859-1")
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"⚠️ Failed to read {file}: {e}")
    else:
        temp_df["source_file"] = os.path.basename(file)
        df_list.append(temp_df)

# pd.concat raises a bare "No objects to concatenate" on an empty list;
# fail with enough context to tell "nothing matched" from "nothing readable".
if not df_list:
    raise ValueError(
        f"No CSV data loaded: {len(all_files)} file(s) matched and none could be read"
    )

# Combine all files into one DataFrame
df = pd.concat(df_list, ignore_index=True)

In [153]:
# Preview: print the (rows, columns) shape, then display the first five rows
print(df.shape)
df.head()

(419, 9)


Unnamed: 0,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date,source_file
0,0,AMAZON.CA*VH3BK5M53,48.08,AMAZON.CA,04-Dec,03-Dec,03-Jan-22,04-Dec-21,2022-01-03_cleansed.csv
1,1,UBER EATS,24.06,TORONTO,08-Dec,08-Dec,03-Jan-22,04-Dec-21,2022-01-03_cleansed.csv
2,2,UBER EATS,3.6,TORONTO,08-Dec,08-Dec,03-Jan-22,04-Dec-21,2022-01-03_cleansed.csv
3,3,MARSHALLS 701,40.65,MISSISSAUGA,11-Dec,10-Dec,03-Jan-22,04-Dec-21,2022-01-03_cleansed.csv
4,4,FIONN MACCOOLS BRITI,24.05,MISSISSAUGA,11-Dec,10-Dec,03-Jan-22,04-Dec-21,2022-01-03_cleansed.csv


In [154]:
# df = df[df["source_file"] == "2025-06-03_cleansed.csv"]
# df

In [155]:
# Statement-level date columns; each ends up holding plain date values
date_cols = ["closing_date", "opening_date"]

for col in date_cols:
    # Keep the raw values so the fallback pass can still see them
    raw_vals = df[col].copy()

    # Primary layout, e.g. "04-Jan-25"; anything else becomes NaT for now
    parsed = pd.to_datetime(raw_vals, format="%d-%b-%y", errors="coerce")

    # Secondary layout, e.g. "Jan 04, 2025", applied only to rows that had a
    # value but did not match the primary layout
    needs_fallback = parsed.isna() & raw_vals.notna()
    parsed.loc[needs_fallback] = pd.to_datetime(
        raw_vals[needs_fallback], format="%b %d, %Y", errors="coerce"
    )

    # Strip the time component, keep only the date
    df[col] = parsed.dt.date

# Year of the statement closing date, as a nullable integer
df["closing_year"] = pd.to_datetime(df["closing_date"]).dt.year.astype("Int64")
df

Unnamed: 0,row_id,Vendor,amount,location,posting_date,transaction_date,closing_date,opening_date,source_file,closing_year
0,0,AMAZON.CA*VH3BK5M53,48.08,AMAZON.CA,04-Dec,03-Dec,2022-01-03,2021-12-04,2022-01-03_cleansed.csv,2022
1,1,UBER EATS,24.06,TORONTO,08-Dec,08-Dec,2022-01-03,2021-12-04,2022-01-03_cleansed.csv,2022
2,2,UBER EATS,3.6,TORONTO,08-Dec,08-Dec,2022-01-03,2021-12-04,2022-01-03_cleansed.csv,2022
3,3,MARSHALLS 701,40.65,MISSISSAUGA,11-Dec,10-Dec,2022-01-03,2021-12-04,2022-01-03_cleansed.csv,2022
4,4,FIONN MACCOOLS BRITI,24.05,MISSISSAUGA,11-Dec,10-Dec,2022-01-03,2021-12-04,2022-01-03_cleansed.csv,2022
...,...,...,...,...,...,...,...,...,...,...
414,49,points_redeemed,0,,,,2022-12-03,2022-11-04,2022-12-03_cleansed.csv,2022
415,50,PAYMENT RECEIVED,-2000,,,,2022-12-03,2022-11-04,2022-12-03_cleansed.csv,2022
416,51,PAYMENT RECEIVED,-1500,,20-Nov,20-Nov,2022-12-03,2022-11-04,2022-12-03_cleansed.csv,2022
417,52,"INSTALLMENT PLAN FOR $4,098.48",-4098.48,,07-Nov,07-Nov,2022-12-03,2022-11-04,2022-12-03_cleansed.csv,2022


In [156]:
# Normalise the day-month columns to a single "DD-Mon" text layout.
date_fields = ["posting_date", "transaction_date"]

# The year is appended only so the day-month text can be parsed; strftime
# below discards it again. Note that closing_year is used even for rows that
# fall in the previous calendar year (e.g. a December posting on a statement
# closing in January).
year_text = df["closing_year"].astype(str)

for field in date_fields:
    raw_vals = df[field]

    # First try a layout like "04-Dec"
    parsed = pd.to_datetime(
        raw_vals.astype(str) + "-" + year_text,
        format="%d-%b-%Y",
        errors="coerce"
    )

    # Fallback for a layout like "May 24", only where a value exists but the
    # first layout did not match
    needs_fallback = parsed.isna() & raw_vals.notna()
    parsed.loc[needs_fallback] = pd.to_datetime(
        raw_vals[needs_fallback].astype(str) + " " + year_text[needs_fallback],
        format="%b %d %Y",
        errors="coerce"
    )

    # Back to text in "day-month" form; values matching neither layout end up missing
    df[field] = parsed.dt.strftime("%d-%b")

# Parsing happens on local Series, so there are no temporary "*_full" columns
# to drop or rename; only the vendor column needs renaming.
df = df.rename(columns={"Vendor": "vendor"})

# Keep and order only the columns that are exported
df = df[["row_id", "vendor", "amount", "location","posting_date", "transaction_date","opening_date", "closing_date", "closing_year","source_file"]]

In [157]:
# Display the frame as it stands before it is written to CSV
df

Unnamed: 0,row_id,vendor,amount,location,posting_date,transaction_date,opening_date,closing_date,closing_year,source_file
0,0,AMAZON.CA*VH3BK5M53,48.08,AMAZON.CA,04-Dec,03-Dec,2021-12-04,2022-01-03,2022,2022-01-03_cleansed.csv
1,1,UBER EATS,24.06,TORONTO,08-Dec,08-Dec,2021-12-04,2022-01-03,2022,2022-01-03_cleansed.csv
2,2,UBER EATS,3.6,TORONTO,08-Dec,08-Dec,2021-12-04,2022-01-03,2022,2022-01-03_cleansed.csv
3,3,MARSHALLS 701,40.65,MISSISSAUGA,11-Dec,10-Dec,2021-12-04,2022-01-03,2022,2022-01-03_cleansed.csv
4,4,FIONN MACCOOLS BRITI,24.05,MISSISSAUGA,11-Dec,10-Dec,2021-12-04,2022-01-03,2022,2022-01-03_cleansed.csv
...,...,...,...,...,...,...,...,...,...,...
414,49,points_redeemed,0,,,,2022-11-04,2022-12-03,2022,2022-12-03_cleansed.csv
415,50,PAYMENT RECEIVED,-2000,,,,2022-11-04,2022-12-03,2022,2022-12-03_cleansed.csv
416,51,PAYMENT RECEIVED,-1500,,20-Nov,20-Nov,2022-11-04,2022-12-03,2022,2022-12-03_cleansed.csv
417,52,"INSTALLMENT PLAN FOR $4,098.48",-4098.48,,07-Nov,07-Nov,2022-11-04,2022-12-03,2022,2022-12-03_cleansed.csv


In [158]:
# Write the frame to output_path as CSV, leaving out the index column
df.to_csv(output_path, index=False)