In [4]:
from faker import Faker
import pandas as pd
import random

In [5]:
# Module-level Faker instance shared by the generator below; it is created
# with the "en_BD" locale so names and addresses follow that locale's providers.
fake = Faker("en_BD")


def generate_full_application_data(
    num_records: int = 5,
    persona: str = "Businessman",
    *,
    seed: "int | None" = None,
) -> pd.DataFrame:
    """Generate synthetic loan application form data + GT for supporting documents.

    Every record holds the application form fields followed by ``GT_Doc_*``
    ground-truth columns: the values written for the supporting documents
    (NID, TIN, utility bill, bank statement, CIB report, trade license,
    payslip, employee ID, professional certificate).

    Personas:
    - Businessman
    - Service Holder
    - Self-Employed
    - random (mix of all)

    Note: Fields that are irrelevant for a given persona are set to None.

    Args:
        num_records: Number of records (rows) to generate.
        persona: One of the personas listed above.
        seed: Optional seed for reproducible output. When given, numeric and
            categorical draws come from a private ``random.Random(seed)`` and
            the module-level ``fake`` instance is re-seeded with the same
            value. When ``None`` (default), the global ``random`` module and
            the current state of ``fake`` are used, as before. Date fields
            are generated relative to the current date, so they can still
            differ between runs on different days.

    Returns:
        A DataFrame with one row per generated application.

    Raises:
        ValueError: If ``persona`` is not one of the supported values.
    """

    valid_personas = {"Businessman", "Service Holder", "Self-Employed", "random"}
    if persona not in valid_personas:
        raise ValueError(f"persona must be one of {sorted(valid_personas)}")

    # The `random` module exposes the same choice/randint/uniform/random API
    # as a Random instance, so it serves as the unseeded fallback.
    rng = random if seed is None else random.Random(seed)
    if seed is not None:
        fake.seed_instance(seed)

    data = []
    for _ in range(num_records):
        selected_persona = (
            rng.choice(["Businessman", "Service Holder", "Self-Employed"]) if persona == "random" else persona
        )

        # Generate common values first.
        # Gender is drawn before the name so the two stay consistent; the
        # name is reused verbatim in all name-bearing GT columns.
        gender = rng.choice(["Male", "Female"])
        full_name = fake.name_male() if gender == "Male" else fake.name_female()
        dob = fake.date_of_birth(minimum_age=30, maximum_age=60).strftime("%d-%m-%Y")
        nid_no = fake.numerify(text="###########")
        tin_no = fake.numerify(text="############")

        # Sync the address between Application + Utility Bill GT
        present_address = fake.address().replace("\n", ", ")

        is_married = rng.choice([True, False])

        # Common income components
        rental_income = rng.choice([0, 20000, 40000])
        interest_income = rng.randint(1000, 5000)
        other_income = rng.randint(0, 5000)

        # Persona switch (sets income + org_name + persona-specific GT).
        # Everything starts as None; each branch fills only what applies.
        income_salary = None
        income_business = None
        income_professional = None
        org_name = None

        gt_trade_org = None
        gt_trade_license_no = None

        gt_payslip_employer = None
        gt_payslip_net_pay = None

        # Service Holder specific (replaces NOC)
        gt_empid_no = None
        gt_empid_org = None
        gt_empid_designation = None
        gt_empid_name = None

        gt_prof_body = None
        gt_prof_reg_no = None
        gt_prof_name = None

        # Employment/Business columns (persona-specific; None for others)
        name_of_present_employer = None
        designation = None
        date_of_joining = None
        profession_self_employed = None
        years_in_practice = None
        clinic_chamber_name = None
        org_name_businessman = None
        org_address_businessman = None
        years_in_present_business = None
        total_length_of_business = None

        if selected_persona == "Businessman":
            income_salary = 0
            income_business = rng.randint(200000, 600000)

            org_name = fake.company()  # Business Name

            gt_trade_org = org_name
            gt_trade_license_no = fake.bothify(text="TL-#######")

            org_name_businessman = org_name
            org_address_businessman = fake.address().replace("\n", ", ")
            years_in_present_business = rng.randint(1, 15)
            # Total length is never shorter than the time in the present business.
            total_length_of_business = rng.randint(years_in_present_business, years_in_present_business + 20)

            primary_monthly_income = income_business

        elif selected_persona == "Service Holder":
            income_business = 0
            income_salary = rng.randint(30000, 250000)

            org_name = fake.company()  # Employer Name

            gt_payslip_employer = org_name
            # Net pay is the declared salary minus a 2-10% deduction.
            deductions_rate = rng.uniform(0.02, 0.10)
            gt_payslip_net_pay = int(round(income_salary * (1 - deductions_rate)))

            # Employee ID Card GT (replaces NOC)
            gt_empid_no = fake.bothify(text="EMP-#####")
            gt_empid_org = org_name
            gt_empid_designation = fake.job()
            gt_empid_name = full_name

            name_of_present_employer = org_name
            designation = gt_empid_designation
            date_of_joining = fake.date_between(start_date="-15y", end_date="-1y").strftime("%d-%m-%Y")

            primary_monthly_income = income_salary

        else:  # Self-Employed (Doctor/Lawyer)
            income_salary = 0
            income_professional = rng.randint(80000, 500000)

            practice_suffix = rng.choice(
                ["Chamber", "Law Chambers", "Associates", "Clinic", "Medical Center", "Consultancy"]
            )
            org_name = f"{fake.last_name()} {practice_suffix}"  # Chamber/Firm Name

            gt_prof_body = rng.choice(["BMDC", "Bangladesh Bar Council"])
            gt_prof_reg_no = (
                fake.bothify(text="BMDC-#######") if gt_prof_body == "BMDC" else fake.bothify(text="BAR-#######")
            )
            gt_prof_name = full_name

            # Profession follows the registering body chosen above.
            profession_self_employed = "Doctor" if gt_prof_body == "BMDC" else "Lawyer"
            years_in_practice = rng.randint(5, 25)
            clinic_chamber_name = org_name

            primary_monthly_income = income_professional

        # Monthly expenditure
        loan_repayment = rng.randint(0, 15000)
        other_loan = rng.randint(5000, 25000)
        rent_utilities = rng.randint(20000, 50000)
        living_expenses = rng.randint(40000, 80000)
        educational_expenses = rng.randint(5000, 20000)
        other_expenses = rng.randint(0, 5000)
        total_expenditure_y = (
            loan_repayment + other_loan + rent_utilities + living_expenses + educational_expenses + other_expenses
        )

        # Liabilities
        loan_from_banks = rng.randint(0, 1500000)
        credit_card = rng.randint(20000, 150000)
        other_liabilities = rng.randint(0, 500000)
        total_liability_y = loan_from_banks + credit_card + other_liabilities

        # Assets
        savings_ac = rng.randint(100000, 600000)
        current_ac = rng.randint(500000, 2500000)
        fixed_deposit = rng.randint(1000000, 5000000)
        bonds = rng.randint(0, 500000)
        shares = rng.randint(0, 1000000)
        land_building = rng.randint(10000000, 25000000)
        motor_vehicles = rng.randint(1500000, 4500000)
        biz_investment = rng.randint(5000000, 15000000)
        precious_metals = rng.randint(100000, 400000)
        total_assets_x = (
            savings_ac
            + current_ac
            + fixed_deposit
            + bonds
            + shares
            + land_building
            + motor_vehicles
            + biz_investment
            + precious_metals
        )

        # Bank account (used for Bank Statement GT)
        bank_acc_no = fake.numerify(text="##########")

        # Total income (X); persona-irrelevant components are None and count as 0.
        total_income_x = (
            (income_salary or 0)
            + (income_business or 0)
            + (income_professional or 0)
            + rental_income
            + interest_income
            + other_income
        )

        # Bank Statement GT: six months of the primary income, +/- 5%.
        gt_statement_total_credits = int(round(primary_monthly_income * 6 * rng.uniform(0.95, 1.05)))

        # CIB Report GT (Businessman only)
        cib_total_outstanding = None
        cib_overdue_amount = None
        cib_worst_status = None
        cib_trade_name = None
        cib_nid = None

        if selected_persona == "Businessman":
            cib_total_outstanding = total_liability_y
            cib_overdue_amount = 0
            if rng.random() < 0.10:  # ~10% defaulters
                cib_overdue_amount = rng.randint(1000, 200000)
            cib_worst_status = "DF" if cib_overdue_amount > 0 else "UC"

            # Strict match fields
            cib_trade_name = org_name
            cib_nid = nid_no

        record = {
            # Facility Details
            "Persona": selected_persona,
            "Amount sought (BDT)": rng.randint(500000, 5000000),
            "Tenure (Months)": rng.choice([12, 24, 36, 48, 60]),
            "Offered Security": rng.choice(["Vehicle", "Residence", "Land", "None"]),

            # Section A: Personal Information
            "Applicant Name": full_name,
            "Father's Name": fake.name_male(),
            "Mother's Name": fake.name_female(),
            "Marital Status": "Married" if is_married else "Unmarried",
            "Applicant's Date of Birth": dob,
            "Gender": gender,
            "Educational Level": rng.choice(["Below SSC", "SSC", "HSC", "Graduate", "Post Graduate"]),
            "Number of Dependents": rng.randint(0, 4),
            "TIN No.": tin_no,
            "NID No.": nid_no,

            # Contact Details
            "Present Address": present_address,
            "Permanent Address": fake.address().replace("\n", ", "),
            "Mobile": fake.phone_number(),
            "E-mail": fake.email(),

            # Employment / Business Details (persona-specific)
            "Name of Present Employer": name_of_present_employer,
            "Designation": designation,
            "Date of Joining (Present Employer)": date_of_joining,
            "Profession (Self-Employed)": profession_self_employed,
            "No. of years in Practice (Self-Employed)": years_in_practice,
            "Clinic/Chamber/Firm/Office Name (Self-Employed)": clinic_chamber_name,
            "Organization Name (Businessman)": org_name_businessman,
            "Organization Address (Businessman)": org_address_businessman,
            "Years in Present Business": years_in_present_business,
            "Total Length of Business": total_length_of_business,

            # Income & Expenses
            "Monthly Income: Salary": income_salary,
            "Monthly Income: Business income": income_business,
            "Monthly Income: Income from Self-Employment": income_professional,
            "Monthly Income: Rental Income": rental_income,
            "Monthly Income: Interest earning": interest_income,
            "Monthly Income: Any other source": other_income,
            "Total Income (X)": total_income_x,

            # Monthly Expenditure
            "Monthly Expenditure: Loan repayment": loan_repayment,
            "Monthly Expenditure: Other loan & credit Card Installments": other_loan,
            "Monthly Expenditure: Rent and Utilities": rent_utilities,
            "Monthly Expenditure: Living expenses (food, education, clothing, etc.)": living_expenses,
            "Monthly Expenditure: Educational Expenses": educational_expenses,
            "Monthly Expenditure: Any others": other_expenses,
            "Total Expenditure (Y)": total_expenditure_y,
            "Monthly uncommitted income in BDT (X-Y)": total_income_x - total_expenditure_y,

            # Liabilities
            "Liabilities: Loan from Banks": loan_from_banks,
            "Liabilities: Credit Card": credit_card,
            "Liabilities: Other Liabilities (including any guarantee given)": other_liabilities,
            "Total Liability (Y)": total_liability_y,

            # Assets
            "Assets: Savings A/C with Banks": savings_ac,
            "Assets: Current A/C with Banks": current_ac,
            "Assets: Fixed deposit with Banks": fixed_deposit,
            "Assets: Bonds": bonds,
            "Assets: Shares/certificates": shares,
            "Assets: Land & building": land_building,
            "Assets: Motor vehicles": motor_vehicles,
            "Assets: Investment in business": biz_investment,
            "Assets: Precious metals": precious_metals,
            "Total Assets (X)": total_assets_x,
            "Net Worth (X-Y)": total_assets_x - total_liability_y,

            # Bank Account Details - Depository Account
            "Bank Name": fake.company(),
            "Bank Branch": fake.city(),
            "Bank Account No.": bank_acc_no,
            "Bank Account Type": rng.choice(["Savings", "Current", "Fixed Deposit"]),
            "Bank Average Balance": rng.randint(50000, 1000000),

            # Loan with Any Bank/Financial Institute/Employer
            "Lender Name": fake.company(),
            "Lender Branch": fake.city(),
            "Lender Account No.": fake.numerify(text="##########"),
            "Lender Amount Outstanding": rng.randint(0, 1000000),
            "Lender Monthly Repayment": rng.randint(0, 50000),

            # Credit Card
            # NOTE(review): card details below are generated even when
            # "Has Credit Card" is "No" - confirm whether that is intended.
            "Has Credit Card": rng.choice(["Yes", "No"]),
            "Issuing Bank/Institution": fake.company(),
            "Credit Card No.": fake.numerify(text="####-####-####-####"),
            "Credit Card Limit": rng.randint(50000, 500000),
            "Credit Card Outstanding": rng.randint(0, 200000),

            # Reference / Guarantor Details
            "Reference Name": fake.name(),
            "Reference Relationship": rng.choice(["Friend", "Relative"]),
            "Reference Address": fake.address().replace("\n", ", "),
            "Reference Phone": fake.phone_number(),
            "Reference Profession": fake.job(),
            "Reference Email": fake.email(),

            # -----------------
            # Ground Truth (GT) Fields
            # -----------------

            # Common (all personas)
            "GT_Doc_NID_Name": full_name,
            "GT_Doc_NID_No": nid_no,
            "GT_Doc_NID_DOB": dob,
            "GT_Doc_TIN_Name": full_name,
            "GT_Doc_TIN_Number": tin_no,

            # Utility Bill GT (all personas; strict name + address match)
            "GT_Doc_Utility_Name": full_name,
            "GT_Doc_Utility_Address": present_address,

            # Bank Statement GT (all personas); the closing balance reuses
            # the declared current-account asset value.
            "GT_Doc_Statement_AccName": full_name,
            "GT_Doc_Statement_AccNo": bank_acc_no,
            "GT_Doc_Statement_TotalCredits": gt_statement_total_credits,
            "GT_Doc_Statement_ClosingBalance": current_ac,

            # CIB Report GT (Businessman only; None for others)
            "GT_Doc_CIB_TotalOutstanding": cib_total_outstanding,
            "GT_Doc_CIB_OverdueAmount": cib_overdue_amount,
            "GT_Doc_CIB_WorstStatus": cib_worst_status,
            "GT_Doc_CIB_TradeName": cib_trade_name,
            "GT_Doc_CIB_NID": cib_nid,

            # Businessman specific
            "GT_Doc_TradeLicense_OrgName": gt_trade_org,
            "GT_Doc_TradeLicense_LicenseNo": gt_trade_license_no,

            # Service Holder specific
            "GT_Doc_Payslip_EmployerName": gt_payslip_employer,
            "GT_Doc_Payslip_NetPay": gt_payslip_net_pay,
            "GT_Doc_EmpID_No": gt_empid_no,
            "GT_Doc_EmpID_OrgName": gt_empid_org,
            "GT_Doc_EmpID_Designation": gt_empid_designation,
            "GT_Doc_EmpID_Name": gt_empid_name,

            # Self-Employed specific
            "GT_Doc_ProfCert_BodyName": gt_prof_body,
            "GT_Doc_ProfCert_RegNo": gt_prof_reg_no,
            "GT_Doc_ProfCert_Name": gt_prof_name,
        }

        # Post-generation convenience field; added after the dict is built so
        # it stays the last column, duplicating the "(X-Y)" value above.
        record["Monthly Uncommitted Income"] = record["Total Income (X)"] - record["Total Expenditure (Y)"]
        data.append(record)

    return pd.DataFrame(data)


# Build 100 records with the persona picked at random per record.
df = generate_full_application_data(num_records=100, persona="random")

# Destination folder and primary CSV file name.
base_dir = r"C:\Users\user\OneDrive\Loan App Validation"
output_path = rf"{base_dir}\master_loan_application_with_gt.csv"

try:
    df.to_csv(output_path, index=False)
except PermissionError:
    # The target CSV is open/locked (common with OneDrive/Excel): write the
    # data to a sibling file carrying a random 4-digit suffix instead.
    fallback_suffix = random.randint(1000, 9999)
    alt_path = rf"{base_dir}\master_loan_application_with_gt_{fallback_suffix}.csv"
    df.to_csv(alt_path, index=False)
    wrote_path = alt_path
else:
    wrote_path = output_path

# Quick summary: sample addresses, where the file went, persona distribution.
print(df["Present Address"].head())
print(f"Wrote {len(df)} records to: {wrote_path}")
print(df["Persona"].value_counts(dropna=False))

0    Flat 94, Holding No. 3, Middle DaulatHar, Sona...
1    Holding No. 97, New BijoyPlaza, ManoharTala, N...
2    Apartment 3, House No. 50, Middle SubarnaGarh,...
3    Studio 66, Holding No. 547, QutubBari Station,...
4    Apartment 70, House No. 7, Old KeshavGan, Nali...
Name: Present Address, dtype: object
Wrote 100 records to: C:\Users\user\OneDrive\Loan App Validation\master_loan_application_with_gt.csv
Persona
Self-Employed     35
Businessman       33
Service Holder    32
Name: count, dtype: int64
