In [5]:
import pandas as pd
from rapidfuzz import process

def clean_customers(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the customers dataframe by handling:
    - Missing values
    - Duplicates
    - Typos in categorical fields
    - Inconsistent formats
    - Mixed date formats
    - Non-sequential IDs
    - Future dates
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------   
    df["FirstName"] = df["FirstName"].fillna("Unknown")
    df["LastName"] = df["LastName"].fillna("Unknown")
    df["AddressID"] = df["AddressID"].fillna(0)
    df["DateOfBirth"] = pd.to_datetime(df["DateOfBirth"], errors="coerce")
    df["DateOfBirth"].fillna(pd.Timestamp("1970-01-01"))

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos (example: CustomerTypeID mapping)
    # -------------------------------
    # Suppose CustomerTypeID must map to {1, 2, 3}
    valid_types = [1, 2, 3]
    df = df[df["CustomerTypeID"].isin(valid_types)]

    # Example for country (if customers table had country info)
    # valid_countries = ["United States"]
    # df["Country"] = df["Country"].apply(
    #     lambda x: process.extractOne(x, valid_countries)[0]
    # )

    # -------------------------------
    # 4. Fix inconsistent formats
    # -------------------------------
    # Ensure AddressID is numeric
    df["AddressID"] = pd.to_numeric(df["AddressID"], errors="coerce")

    # -------------------------------
    # 5. Standardize date formats
    # -------------------------------
    df["DateOfBirth"] = pd.to_datetime(df["DateOfBirth"], errors="coerce")

    # -------------------------------
    # 6. Check non-sequential IDs
    # -------------------------------
    if not df["CustomerID"].is_unique:
        df = df.drop_duplicates(subset=["CustomerID"], keep="first")

    # -------------------------------
    # 7. Remove future dates
    # -------------------------------
    today = pd.Timestamp.today()
    df = df[df["DateOfBirth"] <= today]

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\customers.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\customers_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_customers(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned customers to {output_file}")


Original shape: (1111, 6)
Cleaned shape: (1058, 6)
✅ Saved cleaned customers to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\customers_cleaned.csv


In [7]:
import pandas as pd
from rapidfuzz import process

def clean_transactions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the transactions dataframe by handling:
    - Missing values
    - Duplicates
    - Inconsistent number/text formats
    - Mixed date formats
    - Non-sequential IDs
    - Future dates
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------
    # Amount and IDs must be numeric
    df["Amount"] = pd.to_numeric(df["Amount"], errors="coerce")
    df["AccountOriginID"] = pd.to_numeric(df["AccountOriginID"], errors="coerce")
    df["AccountDestinationID"] = pd.to_numeric(df["AccountDestinationID"], errors="coerce")
    df["BranchID"] = pd.to_numeric(df["BranchID"], errors="coerce")
    df["TransactionTypeID"] = pd.to_numeric(df["TransactionTypeID"], errors="coerce")

    # Fill missing Description
    df["Description"] = df["Description"].fillna("Unknown")

    # Drop rows with critical missing data (IDs or Amount)
    df=df.dropna(subset=["TransactionID", "AccountOriginID", "AccountDestinationID", "Amount"])

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos (example: TransactionType ID mapping)
    # -------------------------------
    # Suppose TransactionTypeID must map to {1, 2, 3,4}
    valid_types = [1, 2, 3,4]
    df = df[df["TransactionTypeID"].isin(valid_types)]
    
    # Normalize amounts with commas or 'k' notation
    def normalize_amount(x):
        if isinstance(x, str):
            x = x.replace(",", "").lower()
            if "k" in x: 
                x = float(x.replace("k", "")) * 1000
        return float(x)
    
    df["Amount"] = df["Amount"].apply(normalize_amount)

    # -------------------------------
    # 4. Standardize date formats
    # -------------------------------
    df["TransactionDate"] = pd.to_datetime(df["TransactionDate"], errors="coerce")

    # -------------------------------
    # 5. Check non-sequential IDs
    # -------------------------------
    if not df["TransactionID"].is_unique:
        df = df.drop_duplicates(subset=["TransactionID"], keep="first")
    df = df.sort_values("TransactionID").reset_index(drop=True)

    # -------------------------------
    # 6. Remove future dates
    # -------------------------------
    today = pd.Timestamp.today()
    df = df[df["TransactionDate"] <= today]

    # -------------------------------
    # 7. Remove future dates
    # -------------------------------
    today = pd.Timestamp.today()
    df = df[df["TransactionDate"] <= today]

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\transactions.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\transactions_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_transactions(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned transactions to {output_file}")


Original shape: (50000, 8)
Cleaned shape: (48042, 8)
✅ Saved cleaned transactions to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\transactions_cleaned.csv


In [10]:
import pandas as pd
from rapidfuzz import process

def clean_accounts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the accounts dataframe by handling:
    - Missing values
    - Duplicates
    - Typos in categorical fields
    - Inconsistent number/text formats
    - Mixed date formats
    - Non-sequential IDs
    - Future dates
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------
    # Numeric fields: Balance
    df["Balance"] = pd.to_numeric(df["Balance"], errors="coerce")
    #df["Balance"].fillna(df["Balance"].median(), inplace=True)

    # Categorical IDs: AccountTypeID, AccountStatusID
    df["AccountTypeID"] = pd.to_numeric(df["AccountTypeID"], errors="coerce")
    df["AccountStatusID"] = pd.to_numeric(df["AccountStatusID"], errors="coerce")

    # Dates
    df["OpeningDate"] = pd.to_datetime(df["OpeningDate"], errors="coerce")
    df["OpeningDate"]=df['OpeningDate'].fillna(pd.Timestamp("2000-01-01"))
    
    # Drop rows with critical missing data (IDs or Amount)
    df=df.dropna(subset=["AccountID", "CustomerID", "AccountTypeID", "AccountStatusID", "Balance"])  # placeholder for missing dates

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos in categorical fields
    # -------------------------------
    # Suppose valid AccountTypeIDs = [1,2,3,4,5] and AccountStatusIDs = [1,2,3]
    valid_account_types = [1, 2, 3, 4, 5]
    valid_statuses = [1, 2, 3]

    df = df[df["AccountTypeID"].isin(valid_account_types)]
    df = df[df["AccountStatusID"].isin(valid_statuses)]

    # -------------------------------
    # 4. Fix inconsistent number/text formats
    # -------------------------------
    # Ensure AccountID and CustomerID are numeric
    df["AccountID"] = pd.to_numeric(df["AccountID"], errors="coerce")
    df["CustomerID"] = pd.to_numeric(df["CustomerID"], errors="coerce")

    # -------------------------------
    # 5. Standardize date formats
    # -------------------------------
    df["OpeningDate"] = pd.to_datetime(df["OpeningDate"], errors="coerce", infer_datetime_format=True)

    # -------------------------------
    # 6. Check non-sequential IDs
    # -------------------------------
    if not df["AccountID"].is_unique:
        df = df.drop_duplicates(subset=["AccountID"], keep="first")

    df = df.sort_values("AccountID").reset_index(drop=True)

    # -------------------------------
    # 7. Remove future dates
    # -------------------------------
    today = pd.Timestamp.today()
    df = df[df["OpeningDate"] <= today]
    

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\accounts.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\accounts_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_accounts(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned accounts to {output_file}")


Original shape: (1667, 6)
Cleaned shape: (1635, 6)
✅ Saved cleaned accounts to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\accounts_cleaned.csv


  df["OpeningDate"] = pd.to_datetime(df["OpeningDate"], errors="coerce", infer_datetime_format=True)


In [14]:
import pandas as pd

def clean_loans(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the loans dataframe by handling:
    - Missing values (drop rows)
    - Duplicates
    - Inconsistent number/text formats
    - Mixed date formats
    - Non-sequential IDs
    - Future dates
    - Typos in categorical fields (LoanStatusID)
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------
    # Numeric fields
    df["PrincipalAmount"] = pd.to_numeric(df["PrincipalAmount"], errors="coerce")
    df["InterestRate"] = pd.to_numeric(df["InterestRate"], errors="coerce")
    df["LoanID"] = pd.to_numeric(df["LoanID"], errors="coerce")
    df["AccountID"] = pd.to_numeric(df["AccountID"], errors="coerce")
    df["LoanStatusID"] = pd.to_numeric(df["LoanStatusID"], errors="coerce")
    

    

    # Drop rows with missing critical fields
    df=df.dropna(subset=["LoanID", "AccountID", "LoanStatusID", "PrincipalAmount", "InterestRate"])

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos in categorical fields
    # -------------------------------
    # Valid LoanStatusIDs = [1, 2, 3]
    valid_statuses = [1, 2, 3]
    df = df[df["LoanStatusID"].isin(valid_statuses)]

    # -------------------------------
    # 4. Fix inconsistent number/text formats
    # -------------------------------
    # Already ensured numeric conversion above

    # -------------------------------
    # 5. Standardize date formats
    # -------------------------------
    # Dates
    df["StartDate"] = pd.to_datetime(df["StartDate"], errors="coerce")
    df["EstimatedEndDate"] = pd.to_datetime(df["EstimatedEndDate"], errors="coerce")

    # Drop rows with missing critical fields
    df=df.dropna(subset=["StartDate", "EstimatedEndDate"])

    # -------------------------------
    # 6. Check non-sequential IDs
    # -------------------------------
    if not df["LoanID"].is_unique:
        df = df.drop_duplicates(subset=["LoanID"], keep="first")
    df = df.sort_values("LoanID").reset_index(drop=True)

    # -------------------------------
    # 7. Remove future dates
    # -------------------------------
    today = pd.Timestamp.today()
    df = df[df["StartDate"] <= today]
    

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\loans.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\loans_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_loans(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned loans to {output_file}")


Original shape: (333, 7)
Cleaned shape: (316, 7)
✅ Saved cleaned loans to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\loans_cleaned.csv


In [15]:
import pandas as pd
from rapidfuzz import process

def clean_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the addresses dataframe by handling:
    - Missing values (drop rows)
    - Duplicates
    - Typos in text fields
    - Inconsistent text formats
    - Non-sequential IDs
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------
    df["Street"]=df["Street"].fillna("Unknown")
    df["City"]=df["City"].fillna("Unknown")
    df["Country"]=df["Country"].fillna("Unknown")

    # Drop rows with missing AddressID
    df["AddressID"] = pd.to_numeric(df["AddressID"], errors="coerce")
    df=df.dropna(subset=["AddressID"])

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos in text fields
    # -------------------------------
    # Example: standardize Country field
    valid_countries = ["United States"]  # expand if needed
    df["Country"] = df["Country"].apply(
        lambda x: process.extractOne(str(x), valid_countries)[0]
    )

    # -------------------------------
    # 4. Fix inconsistent text formats
    # -------------------------------
    df["Street"] = df["Street"].str.title().str.strip()
    df["City"] = df["City"].str.title().str.strip()
    df["Country"] = df["Country"].str.title().str.strip()

    # -------------------------------
    # 5. Standardize ID format
    # -------------------------------
    df["AddressID"] = df["AddressID"].astype(int)

    # -------------------------------
    # 6. Check non-sequential IDs
    # -------------------------------
    if not df["AddressID"].is_unique:
        df = df.drop_duplicates(subset=["AddressID"], keep="first")

    df = df.sort_values("AddressID").reset_index(drop=True)

    # -------------------------------
    # 7. Future dates
    # -------------------------------
    # Not applicable for Addresses table

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\addresses.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\addresses_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_addresses(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned addresses to {output_file}")


Original shape: (1222, 4)
Cleaned shape: (1210, 4)
✅ Saved cleaned addresses to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\addresses_cleaned.csv


In [16]:
import pandas as pd

def clean_branches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the branches dataframe by handling:
    - Missing values (drop rows)
    - Duplicates
    - Typos in text fields
    - Inconsistent number/text formats
    - Non-sequential IDs
    """

    # -------------------------------
    # 1. Handle missing values
    # -------------------------------
    df["BranchName"]=df["BranchName"].fillna("Unknown")

    # Numeric fields
    df["BranchID"] = pd.to_numeric(df["BranchID"], errors="coerce")
    df["AddressID"]=pd.to_numeric(df["AddressID"], errors="coerce")

    # Drop rows with missing critical fields
    df=df.dropna(subset=["BranchID", "AddressID"])

    # -------------------------------
    # 2. Remove duplicates
    # -------------------------------
    df = df.drop_duplicates()

    # -------------------------------
    # 3. Fix typos in text fields
    # -------------------------------
    # Example: normalize BranchName by stripping spaces
    df["BranchName"] = df["BranchName"].str.strip().str.title()

    # -------------------------------
    # 4. Fix inconsistent number/text formats
    # -------------------------------
    # Already handled above (numeric IDs)

    # -------------------------------
    # 5. Standardize date formats
    # -------------------------------
    # Not applicable

    # -------------------------------
    # 6. Check non-sequential IDs
    # -------------------------------
    if not df["BranchID"].is_unique:
        df = df.drop_duplicates(subset=["BranchID"], keep="first")
    df = df.sort_values("BranchID").reset_index(drop=True)

    # -------------------------------
    # 7. Future dates
    # -------------------------------
    # Not applicable

    return df


if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\raw\branches.csv"
    output_file = r"C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\branches_cleaned.csv"

    df = pd.read_csv(input_file)
    print(f"Original shape: {df.shape}")
    cleaned = clean_branches(df)
    print(f"Cleaned shape: {cleaned.shape}")

    cleaned.to_csv(output_file, index=False)
    print(f"✅ Saved cleaned branches to {output_file}")


Original shape: (50, 3)
Cleaned shape: (50, 3)
✅ Saved cleaned branches to C:\Users\hamza\OneDrive\Desktop\InterviewPrepUSA\UCSC_Extension\IntroToMachineLearning\synthetic-finance-mlops\data\processed\branches_cleaned.csv
