In [12]:
import sys
from pathlib import Path

# Dynamic project root: one level above the current working directory, which
# is the repository root when the notebook is run from notebooks/.
# Adjust depending on your notebook location.
# `.parent` is used rather than `parents[0]` so the cell does not raise
# IndexError if the working directory happens to be the filesystem root.
project_root = Path().resolve().parent
project_root_entry = str(project_root)

# Register the root only once: re-running this cell must not pile up
# duplicate entries in sys.path.
if project_root_entry not in sys.path:
    sys.path.append(project_root_entry)

# Confirm path is added
print(sys.path)

['C:\\Program Files\\Python311\\python311.zip', 'C:\\Program Files\\Python311\\DLLs', 'C:\\Program Files\\Python311\\Lib', 'C:\\Program Files\\Python311', 'c:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearning\\synthetic-finance-mlops\\venv', '', 'c:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearning\\synthetic-finance-mlops\\venv\\Lib\\site-packages', 'c:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearning\\synthetic-finance-mlops\\venv\\Lib\\site-packages\\win32', 'c:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearning\\synthetic-finance-mlops\\venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearning\\synthetic-finance-mlops\\venv\\Lib\\site-packages\\Pythonwin', 'C:\\Users\\hamza\\OneDrive\\Desktop\\InterviewPrepUSA\\UCSC_Extension\\IntroToMachineLearn

In [13]:
from src.Utils import data_cleaner
from src.Utils.data_cleaner import *
from src.Utils.data_merger import build_enriched_transactions
from src.Utils.data_enricher import compute_transaction_features

In [14]:
import pandas as pd

In [15]:
def preprocess_data(data_folder: str) -> pd.DataFrame:
    """
    Load raw CSV files, clean them, merge them into an enriched transactions
    DataFrame, and compute additional transaction-level features.

    Parameters
    ----------
    data_folder : str
        Path to the folder containing raw CSV files.
        The folder must contain:
        - transactions.csv
        - transaction_types.csv
        - accounts.csv
        - account_types.csv
        - account_statuses.csv
        - customers.csv
        - customer_types.csv
        - addresses.csv
        - branches.csv
        - loans.csv
        - loan_statuses.csv

    Returns
    -------
    pd.DataFrame
        The frame produced by ``build_enriched_transactions`` after
        ``compute_transaction_features`` has been applied and every remaining
        missing value has been replaced with 0.

    Notes
    -----
    Any error raised by ``pd.read_csv`` (for example when one of the expected
    files is absent) is propagated unchanged.
    """

    # === List of CSV names (must match filenames without .csv) ===
    csv_names = [
        "transactions", "transaction_types",
        "accounts", "account_types", "account_statuses",
        "customers", "customer_types", "addresses",
        "branches", "loans", "loan_statuses"
    ]

    # === Load CSVs into DataFrames ===
    # pathlib keeps the path handling independent of `os`, which is only
    # imported further down the notebook.
    raw_dir = Path(data_folder)
    dataframes = {name: pd.read_csv(raw_dir / f"{name}.csv") for name in csv_names}

    # === Clean raw DataFrames ===
    # Only these five tables have a dedicated cleaner; the rest go into the
    # merge exactly as loaded.
    dataframes["accounts"] = clean_accounts(dataframes["accounts"])
    dataframes["addresses"] = clean_addresses(dataframes["addresses"])
    dataframes["branches"] = clean_branches(dataframes["branches"])
    dataframes["customers"] = clean_customers(dataframes["customers"])
    dataframes["loans"] = clean_loans(dataframes["loans"])

    # === Merge and enrich ===
    df = build_enriched_transactions(
        dataframes["transactions"], dataframes["transaction_types"],
        dataframes["accounts"], dataframes["account_types"], dataframes["account_statuses"],
        dataframes["customers"], dataframes["customer_types"], dataframes["addresses"],
        dataframes["branches"], dataframes["loans"], dataframes["loan_statuses"]
    )

    # === Add engineered features ===
    # The docstring has always promised this step; without it the function
    # returned the merged frame with no engineered columns.
    df = compute_transaction_features(df)

    # === Fill missing values ===
    df = df.fillna(0)

    print("✅ Preprocessing complete. Data ready for model prep.", flush=True)

    return df

In [7]:
import os

In [16]:
from src.Utils.data_cleaner import *
from src.Utils.data_merger import build_enriched_transactions
from src.Utils.data_enricher import compute_transaction_features
import pandas as pd
import os

In [23]:
# Derive the raw-data folder from project_root (set in the first cell) rather
# than a machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes project_root resolves to the repository root that
# contains data/raw - confirm if the notebook is moved.
RAW_DATA_DIR = project_root / "data" / "raw"
df = preprocess_data(data_folder=str(RAW_DATA_DIR))

🔧 Computing transaction features...
  → Amount-based features
  → Account status features
  → Customer demographic features
  → Loan-related features
  → Temporal features
  → Anomaly detection flags
✅ Feature engineering completed!
✅ Preprocessing complete. Data ready for model prep.


In [24]:
# Save a three-row sample of the preprocessed frame for quick inspection.
# The target is derived from project_root instead of a machine-specific
# absolute path.
# NOTE(review): assumes project_root is the repository root and that it
# contains a notebooks/ folder - confirm if the layout changes.
SAMPLE_CSV_PATH = project_root / "notebooks" / "df1.csv"
df.head(3).to_csv(SAMPLE_CSV_PATH)

# Keep the preview as the last expression of the cell: Jupyter only renders
# the value of the final expression, so a leading `df.head()` shows nothing.
df.head()

In [22]:
def preprocess_data(data_folder: str) -> pd.DataFrame:
    """
    Build the model-ready transactions table from a folder of raw CSV exports.

    The function reads eleven CSV files, runs the dedicated cleaners on the
    tables that have one, merges everything with
    ``build_enriched_transactions``, adds engineered columns with
    ``compute_transaction_features`` and finally replaces every remaining
    missing value with 0.

    Parameters
    ----------
    data_folder : str
        Folder holding the raw files. Each of the following must be present:
        transactions.csv, transaction_types.csv, accounts.csv,
        account_types.csv, account_statuses.csv, customers.csv,
        customer_types.csv, addresses.csv, branches.csv, loans.csv,
        loan_statuses.csv

    Returns
    -------
    pd.DataFrame
        The merged, feature-enriched frame with missing values filled with 0.
    """
    # File stems, listed in the exact positional order expected by
    # build_enriched_transactions (the merge call below unpacks this order).
    table_names = (
        "transactions", "transaction_types",
        "accounts", "account_types", "account_statuses",
        "customers", "customer_types", "addresses",
        "branches", "loans", "loan_statuses",
    )

    # Read every table into memory, keyed by its file stem.
    frames = {
        table: pd.read_csv(os.path.join(data_folder, f"{table}.csv"))
        for table in table_names
    }

    # Only these five tables have a cleaner; the others are merged as loaded.
    cleaning_steps = (
        ("accounts", clean_accounts),
        ("addresses", clean_addresses),
        ("branches", clean_branches),
        ("customers", clean_customers),
        ("loans", clean_loans),
    )
    for table, cleaner in cleaning_steps:
        frames[table] = cleaner(frames[table])

    # Merge, engineer features, then fill the gaps.
    merged = build_enriched_transactions(*(frames[table] for table in table_names))
    enriched = compute_transaction_features(merged)
    result = enriched.fillna(0)

    print("✅ Preprocessing complete. Data ready for model prep.", flush=True)

    return result

In [26]:
# Show the column labels of `df`, the frame returned by preprocess_data.
df.columns

Index(['TransactionTypeID', 'Amount', 'Origin_AccountTypeID',
       'Origin_AccountStatusID', 'Origin_Balance', 'Dest_AccountTypeID',
       'Dest_AccountStatusID', 'Dest_Balance', 'Origin_CustomerTypeID',
       'Dest_CustomerTypeID', 'Origin_LoanCount', 'Origin_TotalPrincipal',
       'Origin_AvgInterestRate', 'Dest_LoanCount', 'Dest_TotalPrincipal',
       'Dest_AvgInterestRate', 'Origin_LoanStatus_Active',
       'Origin_LoanStatus_Overdue', 'Origin_LoanStatus_Paid Off',
       'Dest_LoanStatus_Active', 'Dest_LoanStatus_Overdue',
       'Dest_LoanStatus_Paid Off', 'Amount_to_OriginBalance',
       'Amount_to_DestBalance', 'Amount_to_AvgTransaction',
       'Origin_AccountInactive', 'Dest_AccountInactive', 'Age_Difference',
       'Origin_LoanLeverage', 'Dest_LoanLeverage', 'TransactionHour',
       'TransactionWeekday', 'TransactionMonth', 'TransactionQuarter',
       'IsWeekend', 'IsBusinessHours', 'IsNightTime', 'LargeTransferFlag',
       'VeryLargeTransferFlag', 'UnusualTiming

In [25]:
# Preview the first rows of `df`, the frame returned by preprocess_data.
df.head()

Unnamed: 0,TransactionTypeID,Amount,Origin_AccountTypeID,Origin_AccountStatusID,Origin_Balance,Dest_AccountTypeID,Dest_AccountStatusID,Dest_Balance,Origin_CustomerTypeID,Dest_CustomerTypeID,Origin_LoanCount,Origin_TotalPrincipal,Origin_AvgInterestRate,Dest_LoanCount,Dest_TotalPrincipal,Dest_AvgInterestRate,Origin_LoanStatus_Active,Origin_LoanStatus_Overdue,Origin_LoanStatus_Paid Off,Dest_LoanStatus_Active,Dest_LoanStatus_Overdue,Dest_LoanStatus_Paid Off,Amount_to_OriginBalance,Amount_to_DestBalance,Amount_to_AvgTransaction,Origin_AccountInactive,Dest_AccountInactive,Age_Difference,Origin_LoanLeverage,Dest_LoanLeverage,TransactionHour,TransactionWeekday,TransactionMonth,TransactionQuarter,IsWeekend,IsBusinessHours,IsNightTime,LargeTransferFlag,VeryLargeTransferFlag,UnusualTimingFlag,HighRiskFlag,CrossTypeTransfer
0,2,855.17,3.0,1.0,55889.89,3.0,2.0,65218.63,1.0,2.0,1.0,72030.09,0.0789,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015301,0.013112,0.341536,0,1,-7.0,1.288786,0.0,2.0,3.0,4.0,2.0,0,0,1,0,0,1,1,1
1,2,806.2,1.0,3.0,35239.9,4.0,1.0,53136.92,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022877,0.015172,0.321979,1,0,-9.0,0.0,0.0,15.0,1.0,8.0,3.0,0,1,0,0,0,0,1,1
2,1,1229.44,3.0,1.0,92795.9,2.0,1.0,49281.56,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013249,0.024947,0.491012,0,0,-11.0,0.0,0.0,3.0,6.0,8.0,3.0,1,0,1,0,0,1,0,1
3,4,4441.6,1.0,1.0,48854.26,4.0,1.0,80118.93,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090915,0.055438,1.773878,0,0,18.0,0.0,0.0,6.0,6.0,10.0,4.0,1,0,1,0,0,1,0,1
4,3,2526.2,4.0,1.0,59032.48,2.0,2.0,13489.63,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042793,0.18727,1.008909,0,1,-2.0,0.0,0.0,0.0,3.0,7.0,3.0,0,0,1,0,0,1,1,1
