In [1]:
# Step 1: Imports
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [7]:
# Step 2: Read the raw journal entries from CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="/Users/garvsorout/FinanceRepo/ai-journal-auditor/data/journal_entries.csv",
)



In [11]:
# Feature engineering: derive the hour of day from each entry's timestamp.
# NOTE: this adds the "Hour" column to `df` itself, so later cells that slice
# `df` will also see it.
df["Hour"] = pd.to_datetime(df["Timestamp"]).dt.hour

# Columns fed to the model; .copy() keeps `features` independent of `df`.
features = df[["Account", "Debit/Credit", "Amount", "Preparer", "Approver", "Hour"]].copy()

# Train/test split (no labels, unsupervised!). The fixed random_state makes the
# 80/20 partition reproducible across runs.
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

In [12]:
# Column groups, split by how each one has to be prepared for the model
numeric_features = ["Amount", "Hour"]
categorical_features = ["Account", "Debit/Credit", "Preparer", "Approver"]

# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown="ignore" lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numeric_features),
])

In [13]:
# Step 5: Chain preprocessing and the anomaly detector into a single estimator,
# so fitting and predicting always apply the same transformations.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            IsolationForest(random_state=42, contamination=0.05, n_estimators=100),
        ),
    ]
)

In [15]:
# Train only on X_train
pipeline.fit(X_train)

# Predict on test set.
# X_test.index holds row *labels* carried over from df, so the matching rows
# must be selected with .loc; .iloc would only line up while df still has its
# default 0..n-1 index.
df_test = df.loc[X_test.index].copy()

# predict() returns a class label per row rather than a continuous score;
# -1 is the value treated as "anomaly" below. The column name "AnomalyScore"
# is kept so existing consumers of the exported CSV keep working.
df_test["AnomalyScore"] = pipeline.predict(X_test)
df_test["IsAnomaly"] = df_test["AnomalyScore"] == -1

# Select the flagged rows once and reuse them for both the export and the preview.
# IsAnomaly is already boolean, so it can be used directly as the mask.
flagged = df_test[df_test["IsAnomaly"]]

# Save flagged anomalies from test set
flagged.to_csv("/Users/garvsorout/FinanceRepo/ai-journal-auditor/data/test_flagged_anomalies.csv", index=False)

# Inspect a few
print(flagged.head())

           Date      Time           Account Debit/Credit   Amount  \
45   2025-04-03  23:10:59  Accounts Payable        Debit   488.57   
165  2025-04-01  23:56:40  Accounts Payable        Debit  8905.71   
93   2025-03-30  05:33:13     Sales Revenue       Credit   169.14   

                                           Description         Preparer  \
45   Its rock finish paper memory history office ef...      Noah Rhodes   
165              Such during open model how financial.  Gabrielle Davis   
93                    Society organization station TV.  Cristian Santos   

            Approver            Timestamp  Hour  AnomalyScore  IsAnomaly  
45        Gina Moore  2025-04-03 23:10:59    23            -1       True  
165  Connie Lawrence  2025-04-01 23:56:40    23            -1       True  
93   Connie Lawrence  2025-03-30 05:33:13     5            -1       True  


In [16]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact so
# it can be reloaded later without refitting.
joblib.dump(
    value=pipeline,
    filename="/Users/garvsorout/FinanceRepo/ai-journal-auditor/model/audit_model.pkl",
)
print("✅ Model saved to model/audit_model.pkl")

✅ Model saved to model/audit_model.pkl
