<a href="https://colab.research.google.com/github/JBlizzard-sketch/LoanIQ/blob/main/Untitled80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ========================== CELL 1 — ENVIRONMENT & SETUP ==========================

# ---------- Install dependencies ----------
import subprocess
import sys

def install(package):
    try:
        __import__(package)
    except ImportError:
        print(f"Installing {package} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", package])

# Packages (skip if installed)
packages = [
    "streamlit", "pandas", "numpy", "scikit-learn", "xgboost", "imbalanced-learn",
    "shap", "faker", "pyngrok", "matplotlib", "seaborn", "plotly", "reportlab", "python-dotenv"
]

for pkg in packages:
    install(pkg)

print("✅ Dependencies installed / confirmed.")

# ---------- Folder structure ----------
from pathlib import Path

ROOT = Path("/content/LoanIQ").resolve()
MODULES = ROOT / "modules"
DATA = ROOT / "data"
EXPORTS = ROOT / "exports"

FOLDERS = [
    MODULES / "synth",
    MODULES / "ml",
    MODULES / "app",
    MODULES / "auth",
    MODULES / "pipeline",
    MODULES / "schema",
    DATA / "uploads",
    DATA / "synthetic",
    EXPORTS / "reports",
    EXPORTS / "logs"
]

for folder in FOLDERS:
    folder.mkdir(parents=True, exist_ok=True)

# Create empty __init__.py for modules
import os
for subfolder in MODULES.glob("**/"):
    init_file = subfolder / "__init__.py"
    if not init_file.exists():
        init_file.touch()

print("✅ Folder structureready, __init__.py created.")

# ---------- Colab-safe import paths ----------
import sys
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print("✅ sys.path updated.")

# ---------- Hardcoded secrets ----------
ADMIN_USERNAME = "Admin"
ADMIN_PASSWORD = "Shady868"
NGROK_AUTHTOKEN = "31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF"

print(f"✅ Hardcoded admin credentials and ngrok token set.")

# ---------- Smoke test imports ----------
try:
    import modules.app.client_panel as client_panel
    import modules.app.app as app_module
    import modules.ml.engine as ml_engine
    import modules.pipeline.pipeline as pipeline
    import modules.schema.schema as schema
    import modules.auth.auth as auth
    import modules.synth.generator as generator
    print("✅ Smoke test: All modules imported successfully.")
except Exception as e:
    print("❌ Import failed:", e)

print("\n✅ CELL 1 complete — Environment setup ready for Colab.")

Installing streamlit ...
Installing scikit-learn ...
Installing imbalanced-learn ...
Installing faker ...
Installing pyngrok ...
Installing reportlab ...
Installing python-dotenv ...
✅ Dependencies installed / confirmed.
✅ Folder structureready, __init__.py created.
✅ sys.path updated.
✅ Hardcoded admin credentials and ngrok token set.
❌ Import failed: No module named 'modules.app.client_panel'

✅ CELL 1 complete — Environment setup ready for Colab.


In [None]:
!pip install streamlit

In [3]:
# ======== COLAB PERSISTENCE PATCH FOR LOANIQ ========

# 1️⃣ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2️⃣ Define persistent paths
import os, shutil, sys

BASE_DRIVE_PATH = '/content/drive/MyDrive/LoanIQ'
MODULES_DRIVE_PATH = os.path.join(BASE_DRIVE_PATH, 'modules')
DATA_DRIVE_PATH = os.path.join(BASE_DRIVE_PATH, 'data')
MODELS_DRIVE_PATH = os.path.join(BASE_DRIVE_PATH, 'models')

# Ensure directories exist
for path in [MODULES_DRIVE_PATH, DATA_DRIVE_PATH, MODELS_DRIVE_PATH]:
    os.makedirs(path, exist_ok=True)

print("✅ Persistent folders created for modules, data, and models")

# 3️⃣ Copy modules from Colab to Drive if not already there
SRC_MODULES_PATH = '/content/LoanIQ/modules'  # adjust if your code is elsewhere
if os.path.exists(SRC_MODULES_PATH):
    shutil.copytree(SRC_MODULES_PATH, MODULES_DRIVE_PATH, dirs_exist_ok=True)
    print("✅ Modules copied to Drive")

# 4️⃣ Ensure __init__.py exists in every module subfolder
for root, dirs, files in os.walk(MODULES_DRIVE_PATH):
    for d in dirs:
        init_file = os.path.join(root, d, '__init__.py')
        if not os.path.exists(init_file):
            open(init_file, 'w').close()
print("✅ __init__.py ensured in all module subfolders")

# 5️⃣ Add modules path to sys.path
if MODULES_DRIVE_PATH not in sys.path:
    sys.path.append(MODULES_DRIVE_PATH)
print("✅ Modules path added to sys.path")

# 6️⃣ Create empty placeholder files for datasets/models if they don't exist
# This avoids "not found" errors in later cells
synth_path = os.path.join(DATA_DRIVE_PATH, 'df_synth.csv')
if not os.path.exists(synth_path):
    import pandas as pd
    pd.DataFrame().to_csv(synth_path)
    print("✅ Placeholder synthetic dataset created")

model_placeholder = os.path.join(MODELS_DRIVE_PATH, 'global_model.pkl')
if not os.path.exists(model_placeholder):
    import joblib
    joblib.dump(None, model_placeholder)
    print("✅ Placeholder global model file created")

# 7️⃣ Hardcoded secrets (for now)
ADMIN_USER = 'Admin'
ADMIN_PASS = 'Shady868'
NGROK_AUTH_TOKEN = '31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF'
print("✅ Secrets configured (admin credentials and ngrok token)")

Mounted at /content/drive
✅ Persistent folders created for modules, data, and models
✅ Modules copied to Drive
✅ __init__.py ensured in all module subfolders
✅ Modules path added to sys.path
✅ Secrets configured (admin credentials and ngrok token)


In [4]:
# ====== Cell 3: Kenyan Faker & Robust Base Dataset Setup ======

# --- Imports ---
from faker import Faker
import random
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

# --- Faker setup ---
fake = Faker('en_US')  # Base English, customized for Kenyan context

# --- Kenyan Names ---
MALE_NAMES = ['Joseph', 'John', 'David', 'James', 'William', 'Peter', 'Brian', 'Jackson', 'Kamau', 'Mwangi', 'Onyango', 'Kipchoge', 'Juma', 'Baraka', 'Henry', 'Aiden', 'Kyalo', 'Muthui', 'Matu', 'Badru', 'Azizi']
FEMALE_NAMES = ['Sarah', 'Naomi', 'Irene', 'Mary', 'Anne', 'Elizabeth', 'Mercy', 'Faith', 'Caro', 'Lilian', 'Njeri', 'Wanjiku', 'Atieno', 'Jepkosgei', 'Zawadi', 'Amani', 'Ayana', 'Mumbi', 'Makena', 'Kioni', 'Kainda']
LAST_NAMES = ['Njuguna', 'Onyango', 'Aduda', 'Mwayi', 'Nthambi', 'Kyeli', 'Egadwa', 'Simiyu', 'Mwangi', 'Kamau', 'Otieno', 'Cheruiyot', 'Juma', 'Baraka']

# --- Branches & Regions ---
BRANCHES = {
    'Nairobi': ['Ruiru', 'Thika', 'Kikuyu', 'Ngong', 'Mavoko', 'Westlands', 'Dagoretti', 'Kilimani'],
    'Eastern': ['Machakos', 'Kitui', 'Meru', 'Embu', 'Kangundo', 'Maua'],
    'Coast': ['Mombasa', 'Malindi', 'Lamu', 'Watamu', 'Diani', 'Kilifi', 'Changamwe'],
    'Central': ['Nyeri', 'Kiambu', 'Kerugoya', 'Nyahururu', 'Murang"a', 'Karuri'],
    'Other': ['Nakuru', 'Molo', 'Ugunja', 'Mbale', "Moi's Bridge", 'Eldoret', 'Kisumu']
}

# --- Products & Statuses ---
PRODUCTS = ['INUKA 4 WEEKS', 'KUZA 4 WEEKS', 'KUZA 5 WEEKS', 'FADHILI WEEKS', 'INUKA 5 WEEKS']
STATUSES = ['Active', 'Pending Branch Approval', 'Rejected']
HEALTH_OPTIONS = ['Performing', 'Non-Performing']  # 90/10 distribution

# --- ID to Age Mapping ---
def infer_age_from_id(id_num):
    prefix = int(str(id_num)[:2])
    if 1 <= prefix <= 10: return random.randint(60, 80)
    elif 11 <= prefix <= 20: return random.randint(50, 60)
    elif 21 <= prefix <= 25: return random.randint(40, 50)
    elif 26 <= prefix <= 31: return random.randint(30, 40)
    elif 32 <= prefix <= 34: return random.randint(27, 30)
    else: return random.randint(18, 26)

# --- Generate Single Loan Record ---
def generate_loan():
    # Gender skew: 70% female
    is_female = random.choices([True, False], weights=[70, 30])[0]
    first = random.choice(FEMALE_NAMES if is_female else MALE_NAMES)
    last = random.choice(LAST_NAMES)
    name = f"{first.upper()} {last.upper()}"
    gender = 'Female' if is_female else 'Male'

    # ID & age
    id_num = random.randint(10000000, 39999999)
    age = infer_age_from_id(id_num)

    # Branch & product
    region = random.choices(['Nairobi', 'Eastern', 'Coast', 'Central', 'Other'], weights=[20,20,20,20,20])[0]
    branch = random.choice(BRANCHES[region])
    product = random.choice(PRODUCTS)

    # Loan amount & dates
    amount = random.randrange(4000, 15001, 1000)
    created_date = datetime.now() - timedelta(days=random.randint(0, 30))
    ref = created_date.strftime('%y%m%d') + str(random.randint(10000, 999999))

    # Status & health
    status = random.choices(STATUSES, weights=[50, 40, 10])[0]
    health = random.choices(HEALTH_OPTIONS, weights=[90, 10])[0]

    # Defaults (~20% default rate)
    default_flag = 1 if random.random() < 0.2 else 0

    # Income, occupation, collateral
    income = round(random.uniform(4000, 8000), 0)
    occupation = random.choices(['Small Business', 'Salaried'], weights=[80, 20])[0]
    collateral = 'None'

    # Risk placeholder (can be refined later)
    risk_score = round(random.uniform(0,1),2)

    return {
        'Customer Name': name,
        'Id/Reg Number': id_num,
        'Branch': branch.upper(),
        'Product': product,
        'Amount': f"{amount:,}",
        'Ref. Number': ref,
        'Loan Type': 'Normal',
        'Status': status,
        'Loan Health': health,
        'Created Date': created_date.strftime('%Y-%m-%d'),
        'Age': age,
        'Gender': gender,
        'Income (KES weekly)': income,
        'Occupation': occupation,
        'Collateral': collateral,
        'Default': default_flag,
        'Risk Score': risk_score
    }

# --- Batch Generation Function ---
def generate_batch(batch_size=20000):
    return pd.DataFrame([generate_loan() for _ in range(batch_size)])

def generate_multiple_batches(n_batches=20, batch_size=20000):
    return [generate_batch(batch_size) for _ in range(n_batches)]

# --- Generate Example Dataset (1 small batch for preview) ---
loan_df_preview = generate_batch(50)
loan_df_preview.head(10)

Unnamed: 0,Customer Name,Id/Reg Number,Branch,Product,Amount,Ref. Number,Loan Type,Status,Loan Health,Created Date,Age,Gender,Income (KES weekly),Occupation,Collateral,Default,Risk Score
0,IRENE ONYANGO,38751584,MBALE,INUKA 5 WEEKS,5000,250808328762,Normal,Rejected,Performing,2025-08-08,18,Female,7079.0,Salaried,,0,0.81
1,JOSEPH BARAKA,11892957,WATAMU,FADHILI WEEKS,15000,250808242964,Normal,Rejected,Performing,2025-08-08,53,Male,7190.0,Small Business,,1,0.15
2,MUTHUI NTHAMBI,28257643,EMBU,INUKA 5 WEEKS,14000,250814368909,Normal,Active,Performing,2025-08-14,36,Male,4400.0,Small Business,,0,0.6
3,MERCY MWANGI,23070213,KERUGOYA,KUZA 4 WEEKS,5000,250811856815,Normal,Active,Performing,2025-08-11,42,Female,7107.0,Salaried,,0,0.75
4,JACKSON ONYANGO,24351719,KARURI,FADHILI WEEKS,12000,250823338515,Normal,Active,Non-Performing,2025-08-23,48,Male,7555.0,Small Business,,1,0.79
5,ATIENO EGADWA,39074743,MAUA,KUZA 5 WEEKS,8000,250829497847,Normal,Pending Branch Approval,Performing,2025-08-29,19,Female,5694.0,Small Business,,0,0.87
6,WILLIAM ONYANGO,25848168,LAMU,INUKA 5 WEEKS,4000,250831131038,Normal,Active,Performing,2025-08-31,45,Male,6676.0,Salaried,,0,0.14
7,MERCY SIMIYU,38322910,THIKA,FADHILI WEEKS,5000,250804536958,Normal,Pending Branch Approval,Performing,2025-08-04,20,Female,6371.0,Small Business,,0,0.46
8,KAINDA JUMA,20991782,CHANGAMWE,KUZA 5 WEEKS,11000,250903947478,Normal,Pending Branch Approval,Performing,2025-09-03,58,Female,7172.0,Small Business,,0,0.42
9,WILLIAM NJUGUNA,31622733,NAKURU,INUKA 4 WEEKS,11000,250817710939,Normal,Active,Performing,2025-08-17,33,Male,6986.0,Small Business,,0,0.44


In [5]:
# ====== Cell 4: Schema Patch & Validation ======

import pandas as pd
from datetime import datetime

# --- Define expected schema columns and types ---
EXPECTED_SCHEMA = {
    'Customer Name': str,
    'Id/Reg Number': int,
    'Branch': str,
    'Product': str,
    'Amount': str,  # formatted as "x,xxx"
    'Ref. Number': str,
    'Loan Type': str,
    'Status': str,
    'Loan Health': str,
    'Created Date': str,  # can convert to datetime later
    'Age': int,
    'Gender': str,
    'Income (KES weekly)': float,
    'Occupation': str,
    'Collateral': str,
    'Default': int,
    'Risk Score': float
}

# --- Schema validation function ---
def validate_schema(df):
    missing_cols = [col for col in EXPECTED_SCHEMA.keys() if col not in df.columns]
    extra_cols = [col for col in df.columns if col not in EXPECTED_SCHEMA.keys()]

    if missing_cols:
        print(f"⚠️ Missing columns in uploaded/generated dataset: {missing_cols}")
    if extra_cols:
        print(f"⚠️ Extra columns found in dataset: {extra_cols}")

    # Type conversion
    for col, dtype in EXPECTED_SCHEMA.items():
        if col in df.columns:
            try:
                df[col] = df[col].astype(dtype)
            except Exception as e:
                print(f"⚠️ Could not convert column {col} to {dtype}: {e}")

    # Ensure dates are datetime
    if 'Created Date' in df.columns:
        df['Created Date'] = pd.to_datetime(df['Created Date'], errors='coerce')
        null_dates = df['Created Date'].isnull().sum()
        if null_dates > 0:
            print(f"⚠️ {null_dates} invalid dates found in 'Created Date', set to NaT")

    print("✅ Schema validation complete.")
    return df

# --- Example validation of preview dataset from Cell 3 ---
loan_df_validated = validate_schema(loan_df_preview)
loan_df_validated.head(10)

✅ Schema validation complete.


Unnamed: 0,Customer Name,Id/Reg Number,Branch,Product,Amount,Ref. Number,Loan Type,Status,Loan Health,Created Date,Age,Gender,Income (KES weekly),Occupation,Collateral,Default,Risk Score
0,IRENE ONYANGO,38751584,MBALE,INUKA 5 WEEKS,5000,250808328762,Normal,Rejected,Performing,2025-08-08,18,Female,7079.0,Salaried,,0,0.81
1,JOSEPH BARAKA,11892957,WATAMU,FADHILI WEEKS,15000,250808242964,Normal,Rejected,Performing,2025-08-08,53,Male,7190.0,Small Business,,1,0.15
2,MUTHUI NTHAMBI,28257643,EMBU,INUKA 5 WEEKS,14000,250814368909,Normal,Active,Performing,2025-08-14,36,Male,4400.0,Small Business,,0,0.6
3,MERCY MWANGI,23070213,KERUGOYA,KUZA 4 WEEKS,5000,250811856815,Normal,Active,Performing,2025-08-11,42,Female,7107.0,Salaried,,0,0.75
4,JACKSON ONYANGO,24351719,KARURI,FADHILI WEEKS,12000,250823338515,Normal,Active,Non-Performing,2025-08-23,48,Male,7555.0,Small Business,,1,0.79
5,ATIENO EGADWA,39074743,MAUA,KUZA 5 WEEKS,8000,250829497847,Normal,Pending Branch Approval,Performing,2025-08-29,19,Female,5694.0,Small Business,,0,0.87
6,WILLIAM ONYANGO,25848168,LAMU,INUKA 5 WEEKS,4000,250831131038,Normal,Active,Performing,2025-08-31,45,Male,6676.0,Salaried,,0,0.14
7,MERCY SIMIYU,38322910,THIKA,FADHILI WEEKS,5000,250804536958,Normal,Pending Branch Approval,Performing,2025-08-04,20,Female,6371.0,Small Business,,0,0.46
8,KAINDA JUMA,20991782,CHANGAMWE,KUZA 5 WEEKS,11000,250903947478,Normal,Pending Branch Approval,Performing,2025-09-03,58,Female,7172.0,Small Business,,0,0.42
9,WILLIAM NJUGUNA,31622733,NAKURU,INUKA 4 WEEKS,11000,250817710939,Normal,Active,Performing,2025-08-17,33,Male,6986.0,Small Business,,0,0.44


In [6]:
# --- Colab Patch: Setup modules and synthetic generator globally ---
import os
import sys

# --- Ensure folder structure exists ---
os.makedirs('modules/synth', exist_ok=True)

# --- Ensure __init__.py exists to make modules importable ---
for folder in ['modules', 'modules/synth']:
    init_file = os.path.join(folder, '__init__.py')
    if not os.path.exists(init_file):
        with open(init_file, 'w') as f:
            f.write("# Init for module")

# --- Add modules to sys.path if missing ---
if '/content/LoanIQ/modules' not in sys.path:
    sys.path.append('/content/LoanIQ/modules')

# --- Define synthetic generator directly if module import fails ---
try:
    from modules.synth import generate_synthetic
except ImportError:
    print("⚠️ modules.synth not found. Defining generate_synthetic inline for Colab.")
    import pandas as pd
    import random
    from datetime import datetime, timedelta

    def generate_synthetic(n_records=1000, default_rate=0.2):
        """Generate robust synthetic dataset for Colab fallback."""
        df = pd.DataFrame()
        names = ['Joseph Njuguna','Roselda Onyango','Josephine Aduda','Francis Mwayi',
                 'Cynthia Nthambi','Jacob Kyeli','Everlyne Egadwa','Edith Simiyu']
        statuses = ['Active','Pending Branch Approval','Rejected']
        amounts = [4000,5000,6000,8000,10000,15000]
        branches = ['MPEKETONI','UGUNJA','NAKURU','MOLO','CHANGAMWE','KANYANGI','MBALE',"MOI'S BRIDGE"]
        products = ['INUKA 4 WEEKS','KUZA 4 WEEKS','KUZA 5 WEEKS','FADHILI WEEKS','INUKA 5 WEEKS']

        data = []
        for _ in range(n_records):
            name = random.choice(names)
            id_num = random.randint(10000000,39999999)
            branch = random.choice(branches)
            product = random.choice(products)
            amount = random.choice(amounts)
            status = random.choices(statuses, weights=[0.5,0.4,0.1])[0]
            created_date = datetime.now() - timedelta(days=random.randint(0,30))
            ref = created_date.strftime('%y%m%d') + str(random.randint(10000,999999))
            age = random.randint(18,60)
            gender = random.choice(['Male','Female'])
            income = round(random.uniform(4000,8000),0)
            occupation = random.choice(['Small Business','Salaried'])
            collateral = 'None'

            default = 1 if status in ['Rejected','Pending Branch Approval'] else 0

            data.append([name,id_num,branch,product,amount,ref,'Normal',status,'Performing',
                         created_date.strftime('%Y-%m-%d'),age,gender,income,occupation,collateral,default])

        df = pd.DataFrame(data, columns=['Customer Name','Id/Reg Number','Branch','Product','Amount',
                                         'Ref. Number','Loan Type','Status','Loan Health','Created Date',
                                         'Age','Gender','Income (KES weekly)','Occupation','Collateral','Default'])
        return df

print("✅ Patch applied: modules and generate_synthetic ready for Colab.")

⚠️ modules.synth not found. Defining generate_synthetic inline for Colab.
✅ Patch applied: modules and generate_synthetic ready for Colab.


In [7]:
# === Cell 5: ML Engine / Baseline Global Model + Batch Generation ===

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import pickle
import random
from datetime import datetime, timedelta

# --- Ensure folders exist ---
os.makedirs('models', exist_ok=True)

# --- Synthetic dataset generator (inline for Colab) ---
def generate_synthetic(n_records=5000, default_rate=0.2):
    MALE_NAMES = ['Joseph','John','David','James','William','Peter','Brian','Jackson','Kamau','Mwangi','Onyango','Kipchoge','Juma','Baraka','Henry','Aiden','Kyalo','Muthui','Matu','Badru','Azizi']
    FEMALE_NAMES = ['Sarah','Naomi','Irene','Mary','Anne','Elizabeth','Mercy','Faith','Caro','Lilian','Njeri','Wanjiku','Atieno','Jepkosgei','Zawadi','Amani','Ayana','Mumbi','Makena','Kioni','Kainda']
    LAST_NAMES = ['Njuguna','Onyango','Aduda','Mwayi','Nthambi','Kyeli','Egadwa','Simiyu','Mwangi','Kamau','Otieno','Cheruiyot','Juma','Baraka']
    PRODUCTS = ['INUKA 4 WEEKS','KUZA 4 WEEKS','KUZA 5 WEEKS','FADHILI WEEKS','INUKA 5 WEEKS']
    BRANCHES = ['Ruiru','Thika','Kikuyu','Ngong','Mavoko','Westlands','Dagoretti','Kilimani','Machakos','Kitui','Meru','Embu','Kangundo','Maua','Mombasa','Malindi','Lamu','Watamu','Diani','Kilifi','Changamwe','Nyeri','Kiambu','Kerugoya','Nyahururu','Murang"a','Karuri','Nakuru','Molo','Ugunja','Mbale',"Moi's Bridge"]

    def infer_age_from_id(id_num):
        prefix = int(str(id_num)[:2])
        if 1 <= prefix <= 10: return random.randint(60,80)
        elif 11 <= prefix <= 20: return random.randint(50,60)
        elif 21 <= prefix <= 25: return random.randint(40,50)
        elif 26 <= prefix <= 31: return random.randint(30,40)
        elif 32 <= prefix <= 34: return random.randint(27,30)
        else: return random.randint(18,26)

    data = []
    for _ in range(n_records):
        is_female = random.choices([True,False], weights=[70,30])[0]
        first = random.choice(FEMALE_NAMES if is_female else MALE_NAMES)
        last = random.choice(LAST_NAMES)
        name = f"{first.upper()} {last.upper()}"
        gender = 'Female' if is_female else 'Male'
        id_num = random.randint(10000000,39999999)
        age = infer_age_from_id(id_num)
        branch = random.choice(BRANCHES)
        product = random.choice(PRODUCTS)
        amount = random.randrange(4000,15001,1000)
        income = round(random.uniform(4000,8000),0)
        status = random.choices(['Active','Pending Branch Approval','Rejected'], weights=[60,20,20])[0]
        health = 'Performing'
        created_date = datetime.now() - timedelta(days=random.randint(0,30))
        ref = created_date.strftime('%y%m%d') + str(random.randint(10000,999999))
        default = 1 if random.random() < default_rate else 0
        data.append({
            'Customer Name': name,
            'Id/Reg Number': id_num,
            'Branch': branch.upper(),
            'Product': product,
            'Amount': amount,  # Ensure numeric
            'Income (KES weekly)': income,
            'Status': status,
            'Loan Health': health,
            'Created Date': created_date.strftime('%Y-%m-%d'),
            'Age': age,
            'Gender': gender,
            'Default': default
        })
    return pd.DataFrame(data)

# --- Generate single dataset (5k) or batch of datasets ---
try:
    df_synth
except NameError:
    print("⚠️ df_synth not found, generating 5k records...")
    df_synth = generate_synthetic(n_records=5000, default_rate=0.2)
    print("✅ df_synth generated with 5,000 records")

# --- Optional: batch generation (20 batches × 20k records each) ---
BATCH_MODE = True
BATCH_SIZE = 20000
NUM_BATCHES = 20
df_batches = []
if BATCH_MODE:
    for i in range(NUM_BATCHES):
        print(f"Generating batch {i+1}/{NUM_BATCHES}...")
        df_batches.append(generate_synthetic(n_records=BATCH_SIZE, default_rate=0.2))
    print("✅ All batches generated.")

# --- Prepare features ---
feature_cols = ['Amount','Income (KES weekly)','Age']
X = df_synth[feature_cols].copy()
y = df_synth['Default']

# --- Convert numeric columns in case of formatting issues ---
for col in ['Amount','Income (KES weekly)','Age']:
    X[col] = pd.to_numeric(X[col], errors='coerce')
X = X.fillna(0)

# --- Scale ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# --- Train XGBoost baseline ---
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# --- Predictions & metrics ---
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:,1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("=== Baseline Global XGBoost Model Metrics ===")
print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")
print(f"F1 Score: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

# --- Save model and scaler ---
with open('models/global_xgb_model.pkl','wb') as f: pickle.dump(xgb_model,f)
with open('models/global_scaler.pkl','wb') as f: pickle.dump(scaler,f)

# --- Add predictions to df_synth ---
df_synth['Pred_Default_Proba'] = xgb_model.predict_proba(X_scaled)[:,1]
df_synth['Pred_Default'] = xgb_model.predict(X_scaled)

print("\n✅ Baseline global model trained and predictions added to dataset.")

⚠️ df_synth not found, generating 5k records...
✅ df_synth generated with 5,000 records
Generating batch 1/20...
Generating batch 2/20...
Generating batch 3/20...
Generating batch 4/20...
Generating batch 5/20...
Generating batch 6/20...
Generating batch 7/20...
Generating batch 8/20...
Generating batch 9/20...
Generating batch 10/20...
Generating batch 11/20...
Generating batch 12/20...
Generating batch 13/20...
Generating batch 14/20...
Generating batch 15/20...
Generating batch 16/20...
Generating batch 17/20...
Generating batch 18/20...
Generating batch 19/20...
Generating batch 20/20...
✅ All batches generated.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Baseline Global XGBoost Model Metrics ===
Accuracy: 0.792
AUC: 0.550
F1 Score: 0.046
Confusion Matrix:
 [[787  12]
 [196   5]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.98      0.88       799
           1       0.29      0.02      0.05       201

    accuracy                           0.79      1000
   macro avg       0.55      0.50      0.46      1000
weighted avg       0.70      0.79      0.71      1000


✅ Baseline global model trained and predictions added to dataset.


In [8]:
# === Cell: Global + Hybrid + Self-learning ML Engine ===

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import pickle

# --- Ensure folders exist ---
os.makedirs('models', exist_ok=True)

# --- Synthetic generator inline for Colab ---
def generate_synthetic(n_records=5000, default_rate=0.2):
    np.random.seed(42)
    df = pd.DataFrame({
        'User_ID': np.arange(1, n_records+1),
        'Branch': np.random.choice(['Thomasberg','Millerberg','Lake Brittany','Birdstad'], size=n_records),
        'Amount': np.round(np.random.uniform(1000, 10000, size=n_records), 2),
        'Income (KES weekly)': np.round(np.random.uniform(5000, 20000, size=n_records), 2),
        'Age': np.random.randint(18, 65, size=n_records),
        'Status': np.random.choice(['Active','Rejected','Pending Branch Approval'], size=n_records, p=[0.8,0.1,0.1])
    })
    # Default column based on fixed default_rate
    df['Default'] = (np.random.rand(n_records) < default_rate).astype(int)
    return df

# --- Load or generate df_synth ---
try:
    df_synth
except NameError:
    print("⚠️ df_synth not found, generating synthetic dataset inline for Colab...")
    df_synth = generate_synthetic(n_records=5000, default_rate=0.2)
    print(f"✅ df_synth generated with {len(df_synth)} records")

# --- Prepare features ---
feature_cols = ['Amount','Income (KES weekly)','Age']
X = df_synth[feature_cols].copy()
y = df_synth['Default']

# --- Convert numeric columns properly ---
for col in ['Amount','Income (KES weekly)']:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# --- Scale ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# --- Global XGBoost ---
global_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
global_model.fit(X_train, y_train)

# --- Predictions ---
y_pred = global_model.predict(X_test)
y_proba = global_model.predict_proba(X_test)[:,1]

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("=== Global XGBoost Model Metrics ===")
print(f"Accuracy: {acc:.3f}, AUC: {auc:.3f}, F1: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

# --- Save model and scaler ---
with open('models/global_xgb_model.pkl','wb') as f: pickle.dump(global_model,f)
with open('models/global_scaler.pkl','wb') as f: pickle.dump(scaler,f)

# --- Add predictions to df_synth ---
df_synth['Pred_Default_Proba'] = global_model.predict_proba(scaler.transform(X))[:,1]
df_synth['Pred_Default'] = global_model.predict(scaler.transform(X))

# --- Simple Fraud probability (synthetic) ---
df_synth['Pred_Fraud_Proba'] = np.clip(df_synth['Pred_Default_Proba'] + np.random.normal(0,0.05,len(df_synth)),0,1)

# --- Loan limit recommendation (example logic) ---
df_synth['Loan_Limit_Recommendation'] = (df_synth['Income (KES weekly)'] * 4 * (1 - df_synth['Pred_Default_Proba'])).round(0)

# --- Admin override placeholders ---
df_synth['Admin_Adjusted_Default_Proba'] = df_synth['Pred_Default_Proba']
df_synth['Admin_Override_Loan_Limit'] = df_synth['Loan_Limit_Recommendation']

print("\n✅ Global model trained, predictions, fraud probability, and loan limit added to dataset.")

# --- Batch simulation for self-learning ---
n_batches = 20
batch_size = len(df_synth)//n_batches
print(f"\nGenerating {n_batches} batches for self-learning updates...")
for i in range(n_batches):
    batch = df_synth.iloc[i*batch_size:(i+1)*batch_size]
    print(f"Batch {i+1}/{n_batches} ready ({len(batch)} records)")
print("✅ All batches generated.")

=== Global XGBoost Model Metrics ===
Accuracy: 0.797, AUC: 0.542, F1: 0.029
Confusion Matrix:
 [[794   5]
 [198   3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.99      0.89       799
           1       0.38      0.01      0.03       201

    accuracy                           0.80      1000
   macro avg       0.59      0.50      0.46      1000
weighted avg       0.71      0.80      0.71      1000


✅ Global model trained, predictions, fraud probability, and loan limit added to dataset.

Generating 20 batches for self-learning updates...
Batch 1/20 ready (250 records)
Batch 2/20 ready (250 records)
Batch 3/20 ready (250 records)
Batch 4/20 ready (250 records)
Batch 5/20 ready (250 records)
Batch 6/20 ready (250 records)
Batch 7/20 ready (250 records)
Batch 8/20 ready (250 records)
Batch 9/20 ready (250 records)
Batch 10/20 ready (250 records)
Batch 11/20 ready (250 records)
Batch 12/20 ready (250 records)
Batch 13/20 r

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# modules/admin_tools.py
import pandas as pd
import json
import traceback
from pathlib import Path
from datetime import datetime
import random

# Paths
MODEL_DIR = Path("/content/LoanIQ/models")
SCHEMA_BACKUPS = Path("/content/LoanIQ/schema_backups")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
SCHEMA_BACKUPS.mkdir(parents=True, exist_ok=True)

# -----------------------------
# Admin Utilities
# -----------------------------

def generate_synthetic(records=1000, branches=None, fraud_pct=0.02, default_rate=0.08, seed=None):
    """Generate synthetic loan dataset for testing or model training."""
    try:
        if seed is not None:
            random.seed(seed)
        if branches is None:
            branches = ["Nairobi", "Mombasa", "Kisumu", "Nakuru"]
        data = []
        for i in range(records):
            branch = random.choice(branches)
            loan_amount = random.randint(5000, 50000)
            income = random.randint(10000, 100000)
            fraud_flag = 1 if random.random() < fraud_pct else 0
            default_flag = 1 if random.random() < default_rate else 0
            data.append({
                "client_id": f"C{i+1:05d}",
                "branch": branch,
                "loan_amount": loan_amount,
                "income": income,
                "fraud_flag": fraud_flag,
                "loan_status": default_flag
            })
        df = pd.DataFrame(data)
        summary = {
            "records": len(df),
            "branches": df['branch'].unique().tolist(),
            "simulated_fraud": df['fraud_flag'].sum(),
            "avg_loan": df['loan_amount'].mean(),
            "median_income": df['income'].median()
        }
        return {"df": df, "summary": summary}
    except Exception as e:
        return {"error": str(e), "trace": traceback.format_exc()}

def retrain_model(df, target_col='loan_status', model_params=None, version_note="admin_retrain"):
    """Placeholder: trains model and saves to MODEL_DIR."""
    try:
        version_id = f"v{int(datetime.now().timestamp())}"
        model_path = MODEL_DIR / version_id
        model_path.mkdir(parents=True, exist_ok=True)
        # Dummy metrics
        result = {
            "version_id": version_id,
            "auc": round(random.uniform(0.7, 0.9), 2),
            "f1": round(random.uniform(0.6, 0.8), 2),
            "artifact": str(model_path)
        }
        return {"ok": True, "result": result}
    except Exception as e:
        return {"ok": False, "error": str(e), "trace": traceback.format_exc()}

def fraud_stress_test(base_records=500, branches=None, fraud_steps=None, detection_threshold=0.6):
    """Simulate fraud detection under different fraud injection levels."""
    try:
        if fraud_steps is None:
            fraud_steps = [0.02, 0.05, 0.1]
        if branches is None:
            branches = ["Nairobi", "Mombasa"]
        results = []
        for pct in fraud_steps:
            injected = int(base_records * pct)
            detected = int(injected * random.uniform(0.7, 0.9))
            results.append({
                "fraud_pct": pct,
                "injected": injected,
                "detected": detected,
                "detection_rate": round(detected / injected if injected > 0 else 0, 2)
            })
        return results
    except Exception as e:
        return {"error": str(e), "trace": traceback.format_exc()}

def inject_schema(new_expected_columns, tag=None):
    """Save a schema backup JSON file."""
    try:
        if tag is None:
            tag = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_file = SCHEMA_BACKUPS / f"schema_{tag}.json"
        payload = {
            "expected_columns": new_expected_columns,
            "tag": tag,
            "timestamp": datetime.now().isoformat()
        }
        with open(backup_file, "w") as f:
            json.dump(payload, f, indent=2)
        return {"ok": True, "backup": str(backup_file), "metadata": payload}
    except Exception as e:
        return {"ok": False, "error": str(e), "trace": traceback.format_exc()}

def impersonate_user(username):
    """Generate a dummy impersonation token for testing."""
    try:
        token = f"impersonate_{username}_{int(datetime.now().timestamp())}"
        return {
            "impersonated_user": username,
            "issued_at": int(datetime.now().timestamp()),
            "expires_in": 3600,
            "token": token
        }
    except Exception as e:
        return {"error": str(e), "trace": traceback.format_exc()}

def ensure_admin():
    """Placeholder: ensure admin exists in auth DB."""
    # In real app, call auth.init_db() and add_user if missing
    return True

In [10]:

import streamlit as st
import pandas as pd
import numpy as np
import time

# ----------------------
# INLINE ADMIN TOOLS
# ----------------------
class AdminTools:
    @staticmethod
    def ensure_admin():
        return True

    @staticmethod
    def generate_synthetic(records=500, branches=["Nairobi", "Mombasa"], fraud_pct=0.02, default_rate=0.08, seed=None):
        np.random.seed(seed or int(time.time()))
        df = pd.DataFrame({
            "client_id": range(records),
            "branch": np.random.choice(branches, size=records),
            "loan_amount": np.random.randint(1000, 50000, size=records),
            "income": np.random.randint(20000, 100000, size=records),
            "fraud": np.random.choice([0,1], size=records, p=[1-fraud_pct, fraud_pct]),
            "default": np.random.choice([0,1], size=records, p=[1-default_rate, default_rate])
        })
        summary = {
            "records": records,
            "branches": list(df["branch"].unique()),
            "simulated_fraud": df["fraud"].sum(),
            "avg_loan": df["loan_amount"].mean(),
            "median_income": df["income"].median()
        }
        return {"df": df, "summary": summary}

    @staticmethod
    def retrain_model(df, model_params=None, version_note="test"):
        time.sleep(1)
        return {"ok": True, "result": {"version_id": f"v{int(time.time())}", "auc": 0.85, "f1": 0.72}}

    @staticmethod
    def fraud_stress_test(base_records=500, fraud_steps=[0.02,0.05], detection_threshold=0.6):
        results = []
        for f in fraud_steps:
            injected = int(base_records*f)
            detected = int(injected * 0.8)
            results.append({"fraud_pct": f, "injected": injected, "detected": detected, "detection_rate": detected/injected})
        return results

    @staticmethod
    def inject_schema(new_expected_columns, tag="test"):
        backup_path = f"/content/schema_backups/schema_{tag}.json"
        return {"ok": True, "backup": backup_path}

    @staticmethod
    def impersonate_user(username="demo_user"):
        return {"impersonated_user": username, "token": f"token_{username}_{int(time.time())}"}

admin_tools = AdminTools()
admin_tools.ensure_admin()

# ----------------------
# STREAMLIT ADMIN PANEL
# ----------------------
st.set_page_config(page_title="LoanIQ Admin Sandbox", layout="wide")
st.title("📊 LoanIQ Admin Sandbox")

menu = st.sidebar.selectbox("Menu", ["Overview", "Generate Synthetic Dataset", "Retrain Model", "Fraud Stress Test", "Schema Management", "Impersonate User"])

# --------- OVERVIEW ---------
if menu == "Overview":
    st.subheader("Admin Overview")
    st.metric("Total Users", 5)
    st.metric("Models Trained", 3)
    st.metric("Schema Backups", 2)
    st.info("This is a placeholder overview; metrics can be dynamically loaded from DB or model list.")

# --------- GENERATE SYNTHETIC DATA ---------
elif menu == "Generate Synthetic Dataset":
    st.subheader("Generate Synthetic Dataset")
    records = st.slider("Number of Records", 100, 5000, 500, step=100)
    branches = st.multiselect("Branches", ["Nairobi", "Mombasa", "Kisumu", "Nakuru"], default=["Nairobi","Mombasa"])
    fraud_pct = st.slider("Fraud Percentage", 0.0, 0.3, 0.02, 0.01)
    default_rate = st.slider("Default Rate", 0.0, 0.3, 0.08, 0.01)
    seed = st.number_input("Random Seed (optional)", min_value=0, value=0, step=1)
    if st.button("Generate Dataset"):
        res = admin_tools.generate_synthetic(records, branches, fraud_pct, default_rate, seed)
        st.success("Synthetic dataset generated!")
        st.json(res["summary"])
        st.dataframe(res["df"].head(10))
        st.download_button("Download CSV", res["df"].to_csv(index=False), "synthetic_loans.csv")
        st.bar_chart(res["df"]["loan_amount"].value_counts(bins=10))

# --------- RETRAIN MODEL ---------
elif menu == "Retrain Model":
    st.subheader("Retrain Model")
    st.info("Using last generated synthetic dataset as example.")
    version_note = st.text_input("Version Note", "admin_test")
    model_params = st.text_area("Model Params (JSON)", '{"max_depth": 5, "learning_rate": 0.1}')
    if st.button("Train Model"):
        try:
            res = admin_tools.retrain_model(None, version_note=version_note)  # Dummy call
            st.success(f"Model trained: {res['result']['version_id']}, AUC: {res['result']['auc']}, F1: {res['result']['f1']}")
        except Exception as e:
            st.error(str(e))

# --------- FRAUD STRESS TEST ---------
elif menu == "Fraud Stress Test":
    st.subheader("Fraud Stress Test")
    base_records = st.slider("Base Records", 100, 2000, 500, 100)
    fraud_steps_input = st.text_input("Fraud Steps (comma-separated)", "0.02,0.05,0.1")
    detection_threshold = st.slider("Detection Threshold", 0.0, 1.0, 0.6, 0.05)
    if st.button("Run Stress Test"):
        fraud_steps = [float(x.strip()) for x in fraud_steps_input.split(",")]
        res = admin_tools.fraud_stress_test(base_records, fraud_steps, detection_threshold)
        df_res = pd.DataFrame(res)
        st.dataframe(df_res)
        st.line_chart(df_res.set_index('fraud_pct')['detection_rate'])

# --------- SCHEMA MANAGEMENT ---------
elif menu == "Schema Management":
    st.subheader("Schema Management")
    new_cols = st.text_area("New Columns (comma-separated)", "client_id,loan_amount")
    tag = st.text_input("Schema Tag", "custom_2025")
    if st.button("Save Schema"):
        cols = [c.strip() for c in new_cols.split(",")]
        res = admin_tools.inject_schema(cols, tag)
        st.success(f"Schema saved to {res['backup']}")

# --------- IMPERSONATE USER ---------
elif menu == "Impersonate User":
    st.subheader("Impersonate User")
    username = st.text_input("Username", "demo_user")
    if st.button("Impersonate"):
        res = admin_tools.impersonate_user(username)
        st.success(f"Now impersonating {res['impersonated_user']}, token: {res['token']}")

2025-09-03 12:45:16.203 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-09-03 12:45:16.212 Session state does not function when running a script without `streamlit run`


In [11]:

# modules/app/client_panel.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import shap
from sklearn.ensemble import IsolationForest
from modules import pipeline, engine, synth, schema
from pathlib import Path
import os

DATA_DIR = Path("/content/LoanIQ/data/uploads")
SCENARIO_DIR = Path("/content/LoanIQ/data/scenarios")
DATA_DIR.mkdir(parents=True, exist_ok=True)
SCENARIO_DIR.mkdir(parents=True, exist_ok=True)

def app():
    st.title("LoanIQ Client Panel 📊")

    # -------------------------------
    # Data Ingestion
    # -------------------------------
    st.subheader("📥 Data Ingestion")
    data_source = st.radio("Choose data source:", ["Upload CSV/Excel", "Simulate Dataset"])

    df = pd.DataFrame()
    if data_source == "Upload CSV/Excel":
        uploaded_file = st.file_uploader("Upload File", type=["csv", "xlsx"])
        if uploaded_file:
            try:
                if uploaded_file.name.endswith(".csv"):
                    df = pd.read_csv(uploaded_file)
                else:
                    df = pd.read_excel(uploaded_file)
                st.success(f"Uploaded {len(df)} records")
            except Exception as e:
                st.error(f"Error reading file: {e}")
    else:
        with st.form("simulate_form"):
            records = st.slider("Number of records", 100, 2000, 500, step=100)
            fraud_pct = st.slider("Fraud %", 0.0, 0.3, 0.05, step=0.01)
            default_rate = st.slider("Default %", 0.0, 0.3, 0.08, step=0.01)
            branches = st.multiselect("Branches", synth.DEFAULT_BRANCHES, default=synth.DEFAULT_BRANCHES)
            generate_btn = st.form_submit_button("Generate Dataset")
        if generate_btn:
            df = synth.generate_synthetic_data(records, fraud_pct, default_rate, branches)
            st.success(f"Generated {len(df)} synthetic records")

    if not df.empty:
        st.dataframe(df.head(10))
        with st.spinner("Processing data..."):
            df_proc = pipeline.add_features(schema.simple_preprocess(df))
        st.plotly_chart(px.histogram(df_proc, x='loan_amount', nbins=20, title="Loan Amount Distribution"))

        # -------------------------------
        # Portfolio Insights
        # -------------------------------
        st.subheader("📊 Portfolio Insights")
        branch_filter = st.selectbox("Filter by Branch", ["All"] + df_proc['branch'].unique().tolist())
        risk_filter = st.slider("Min Risk Score", 0.0, 1.0, 0.0)
        filtered_df = df_proc.copy()
        if branch_filter != "All":
            filtered_df = filtered_df[filtered_df['branch'] == branch_filter]
        filtered_df = filtered_df[filtered_df['risk_score'] >= risk_filter]

        col1, col2, col3, col4, col5 = st.columns(5)
        col1.metric("Total Loans", len(filtered_df))
        col2.metric("Total Loan Amount", round(filtered_df['loan_amount'].sum(), 2))
        col3.metric("Avg Risk Score", round(filtered_df['risk_score'].mean(), 2))
        col4.metric("Default Rate", f"{round(filtered_df['pred_default'].mean()*100, 2) if 'pred_default' in filtered_df.columns else 0}%")
        col5.metric("High-Risk Loans", len(filtered_df[filtered_df['risk_score'] > 0.7]))

        st.dataframe(filtered_df[['record_id', 'national_id', 'loan_amount', 'risk_score', 'loan_status']])
        st.plotly_chart(px.pie(filtered_df, names='loan_status', title="Loan Status Breakdown"))
        st.plotly_chart(px.density_heatmap(filtered_df, x='branch', y='risk_score', title="Risk by Branch"))
        st.plotly_chart(px.line(filtered_df.groupby('created_date').size(), title="Loans Over Time"))

        # -------------------------------
        # Per-Client Lookup
        # -------------------------------
        st.subheader("🔍 Per-Client Lookup")
        with st.form("client_form"):
            client_id = st.text_input("National ID or Phone")
            client_btn = st.form_submit_button("Search Client")
        if client_btn:
            client_df = df_proc[(df_proc['national_id'] == client_id) | (df_proc['phone'] == client_id)]
            if not client_df.empty:
                st.dataframe(client_df[['record_id', 'loan_amount', 'loan_status', 'risk_score', 'prob_default']])
                st.metric("Total Loans", len(client_df))
                st.metric("Avg Risk Score", round(client_df['risk_score'].mean(), 2))
                st.plotly_chart(px.line(client_df, x='created_date', y='loan_amount', title="Client Loan Timeline"))
                if st.button("Tag as High Risk"):
                    st.success(f"Client {client_id} tagged as High Risk")
                    # Optionally save to a database or CSV
                st.download_button("Export Client CSV", client_df.to_csv(index=False), file_name=f"client_{client_id}.csv")
            else:
                st.warning("Client not found")

        # -------------------------------
        # Risk Analysis
        # -------------------------------
        st.subheader("⚠️ Risk Analysis")
        top_branches = pipeline.top_risky_branches(df_proc, top_n=10)
        st.dataframe(top_branches)
        st.plotly_chart(px.bar(top_branches, x='branch', y='avg_risk', title="Top Risky Branches"))

        high_risk_loans = df_proc[df_proc['risk_score'] > 0.7][['national_id', 'loan_amount', 'risk_score']]
        st.dataframe(high_risk_loans)
        st.plotly_chart(px.scatter(df_proc, x='loan_amount', y='risk_score', color='branch', title="Loan Risk Scatter Plot"))
        st.download_button("Export Risk Report", high_risk_loans.to_csv(index=False), file_name="high_risk_loans.csv")

        # -------------------------------
        # ML Predictions & Anomaly Detection
        # -------------------------------
        st.subheader("📈 ML Predictions & Anomalies")
        model_version = st.selectbox("Model Version", engine.list_models())
        if st.button("Run Predictions"):
            df_proc = engine.predict(df_proc, model_version)
            X = engine.prepare_dataset(df_proc)
            iso = IsolationForest(contamination=0.05).fit_predict(X)
            st.write(f"Predicted Default Rate: {df_proc['pred_default'].mean()*100:.2f}%")
            anomalies = df_proc[iso == -1][['national_id', 'loan_amount', 'prob_default']]
            st.dataframe(anomalies.head(10))

            # SHAP Summary Plot
            model = engine.load_model(model_version)
            explainer = shap.Explainer(model, X)
            shap_values = explainer(X)
            st.pyplot(shap.summary_plot(shap_values, X, show=False))
            st.plotly_chart(px.histogram(df_proc, x='prob_default', color='pred_default', title="Predicted Probabilities"))
            st.download_button("Export Anomalies CSV", anomalies.to_csv(index=False), file_name="anomalies.csv")

        # -------------------------------
        # What-if Analysis
        # -------------------------------
        st.subheader("🔮 What-if Analysis")
        with st.form("whatif_form"):
            loan_adj = st.slider("Loan Size Adjustment (%)", -50, 50, 0)
            income_adj = st.slider("Income Adjustment (%)", -50, 50, 0)
            fraud_adj = st.slider("Fraud Multiplier", 0.5, 2.0, 1.0, step=0.1)
            whatif_btn = st.form_submit_button("Run What-if")
        if whatif_btn:
            df_whatif = df_proc.copy()
            df_whatif['loan_amount'] *= 1 + loan_adj/100
            df_whatif['income'] *= 1 + income_adj/100
            df_whatif['simulated_fraud'] = df_whatif.get('simulated_fraud', 0) * fraud_adj
            df_whatif = pipeline.add_features(df_whatif)
            st.metric("Adjusted Avg Risk Score", round(df_whatif['risk_score'].mean(), 3))
            box_df = pd.DataFrame({'Original': df_proc['risk_score'], 'Adjusted': df_whatif['risk_score']})
            st.plotly_chart(px.box(box_df, title="Original vs Adjusted Risk"))
            save_path = SCENARIO_DIR / "whatif_scenario.csv"
            df_whatif.to_csv(save_path, index=False)
            st.success(f"Scenario saved to {save_path}")

        # -------------------------------
        # Export Data
        # -------------------------------
        st.subheader("⬇️ Export Data")
        export_type = st.selectbox("Export Type", ["Full Dataset", "High-Risk Loans", "Client Lookup", "What-if Results"])
        if st.button("Download Export"):
            if export_type == "Full Dataset":
                data = df_proc.to_csv(index=False)
            elif export_type == "High-Risk Loans":
                data = df_proc[df_proc['risk_score'] > 0.7].to_csv(index=False)
            elif export_type == "Client Lookup" and client_id:
                data = client_df.to_csv(index=False) if not client_df.empty else ""
            elif export_type == "What-if Results":
                data = df_whatif.to_csv(index=False) if 'df_whatif' in locals() else ""
            else:
                data = ""
            st.download_button(f"Download {export_type}", data, file_name=f"{export_type.replace(' ', '_').lower()}.csv")

ImportError: cannot import name 'engine' from 'modules' (/content/LoanIQ/modules/__init__.py)

In [12]:
import os
import ast

modules_path = "/content/LoanIQ/modules"

def list_defs(file_path):
    """Parse a Python file and list its functions, classes, and imports."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            tree = ast.parse(f.read(), filename=file_path)
        funcs = [n.name for n in tree.body if isinstance(n, ast.FunctionDef)]
        classes = [n.name for n in tree.body if isinstance(n, ast.ClassDef)]
        imports = []
        for n in tree.body:
            if isinstance(n, ast.Import):
                imports.extend([alias.name for alias in n.names])
            elif isinstance(n, ast.ImportFrom):
                mod = n.module if n.module else ""
                imports.extend([f"{mod}.{alias.name}" for alias in n.names])
        return funcs, classes, imports
    except Exception as e:
        return [], [], [f"⚠️ Error parsing: {e}"]

print("📂 Inspecting:", modules_path)
for root, dirs, files in os.walk(modules_path):
    level = root.replace(modules_path, "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = " " * 4 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")
        if f.endswith(".py"):
            path = os.path.join(root, f)
            funcs, classes, imports = list_defs(path)
            if funcs:   print(f"{subindent}    🟦 Functions: {funcs}")
            if classes: print(f"{subindent}    🟨 Classes: {classes}")
            if imports: print(f"{subindent}    🟩 Imports: {imports}")

📂 Inspecting: /content/LoanIQ/modules
modules/
    __init__.py
    __pycache__/
        __init__.cpython-312.pyc
    ml/
        __init__.py
    schema/
        __init__.py
        __pycache__/
            __init__.cpython-312.pyc
    app/
        __init__.py
        __pycache__/
            __init__.cpython-312.pyc
    auth/
        __init__.py
    pipeline/
        __init__.py
        __pycache__/
            __init__.cpython-312.pyc
    synth/
        __init__.py
        __pycache__/
            __init__.cpython-312.pyc
