<a href="https://colab.research.google.com/github/JBlizzard-sketch/LoanIQ/blob/main/Copy_of_LoanIQ2Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# 🚀 LoanIQ Bootstrap Cell (Run after reset)
# ============================================

# --- Step 1: Install dependencies ---
!pip install faker imbalanced-learn shap xgboost streamlit

# --- Step 2: Create folder scaffolding ---
import os

folders = [
    "modules/bootstrap", "modules/core", "modules/ingestion", "modules/synth",
    "modules/features", "modules/ml", "modules/reports", "modules/sandbox", "modules/api",
    "pages", "tests", "data", "config"
]
for f in folders:
    os.makedirs(f, exist_ok=True)

# Touch __init__.py files
for f in ["modules"] + [f"modules/{d}" for d in ["bootstrap","core","ingestion","synth","features","ml","reports","sandbox","api"]]:
    open(os.path.join(f, "__init__.py"), "w").close()
open("tests/__init__.py", "w").close()

# --- Step 3: Write modules/synth/generators.py ---
generators_code = r'''
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

faker = Faker(["en_US"])  # use global dataset, we'll override with Kenyan names manually

kenyan_first_names = [
    "Achieng", "Atieno", "Wanjiku", "Njeri", "Chebet", "Kiptoo", "Otieno", "Kamau",
    "Mwangi", "Omondi", "Kipchoge", "Cherono", "Mutiso", "Nduta", "Nyambura", "Wairimu"
]
kenyan_last_names = [
    "Ouma", "Mutua", "Koech", "Barasa", "Kiprotich", "Maina", "Otieno", "Kamau",
    "Mwangi", "Njoroge", "Omondi", "Chege", "Kariuki", "Kipkorir"
]
towns = [
    "Nairobi","Mombasa","Kisumu","Nakuru","Eldoret","Meru","Nyeri","Machakos","Thika","Kitale",
    "Kericho","Embu","Garissa","Isiolo","Kilifi","Lamu","Voi","Narok","Naivasha","Kakamega"
]

products = {
    "Inuka":5, "Kuza":4, "Fadhili":6, "Imara":8, "Boreshwa":12
}
loan_types = ["Normal","Top-up","Emergency","Business"]
statuses = ["Active","Pending Approval"]
health_states = ["performing","watch","non-performing"]

def guess_age_from_id(gov_id:str) -> int:
    try:
        num = int(gov_id[:2])
        if num <= 15: return random.randint(55,70)
        if num <= 25: return random.randint(40,55)
        if num <= 33: return random.randint(28,40)
        return random.randint(18,28)
    except:
        return random.randint(18,60)

def generate_clients_loans(n_rows=1000, seed=None, default_rate=0.15, gender_ratio=0.6):
    if seed: np.random.seed(seed); random.seed(seed)
    rows = []
    for i in range(n_rows):
        fname = random.choice(kenyan_first_names)
        lname = random.choice(kenyan_last_names)
        name = f"{fname} {lname}"
        gov_id = str(random.randint(20000000, 40000000))
        age = guess_age_from_id(gov_id)
        gender = "F" if random.random() < gender_ratio else "M"
        branch = random.choice(towns)
        product = random.choice(list(products.keys()))
        product_weeks = products[product]
        amount = random.randint(5000, 50000)
        loan_type = random.choice(loan_types)
        status = random.choice(statuses)
        health = "default" if random.random() < default_rate else "performing"
        debt_to_income = round(random.uniform(0.1, 1.5), 2)
        created_date = faker.date_between(start_date="-2y", end_date="today")
        rows.append([f"CUST{i:05d}", name, gov_id, age, gender, branch,
                     product, product_weeks, amount, loan_type, status, health,
                     debt_to_income, created_date])
    return pd.DataFrame(rows, columns=[
        "customer_id","customer_name","gov_id","age","gender","branch",
        "product","product_weeks","loan_amount","loan_type","status",
        "default","debt_to_income","created_date"
    ])
'''
with open("modules/synth/generators.py","w") as f: f.write(generators_code)

# --- Step 4: Write modules/ml/engine.py ---
engine_code = r'''
import os, joblib
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import shap
import xgboost as xgb

def train_model(df, algo="LogReg", use_smote=True, test_size=0.2, random_state=42):
    if "default" not in df.columns:
        raise ValueError("Dataset must contain 'default' column (0/1).")

    X = df.drop(columns=["default","customer_id","customer_name","gov_id","created_date"])
    X = pd.get_dummies(X, drop_first=True)
    y = df["default"].apply(lambda v: 1 if v!="performing" else 0)

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size,random_state=random_state)

    if use_smote:
        sm = SMOTE(random_state=random_state)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    if algo=="LogReg":
        model = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=1000))])
    elif algo=="SGD":
        model = Pipeline([("scaler", StandardScaler()), ("clf", SGDClassifier(loss="log_loss", max_iter=1000))])
    elif algo=="XGBoost":
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    elif algo=="HybridBlend":
        logreg = LogisticRegression(max_iter=1000)
        xgbc = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
        logreg.fit(X_train,y_train)
        xgbc.fit(X_train,y_train)
        def hybrid_predict(X): return (0.5*logreg.predict_proba(X)[:,1] + 0.5*xgbc.predict_proba(X)[:,1])
        model = (logreg, xgbc, hybrid_predict)
    else:
        raise ValueError(f"Unknown algo {algo}")

    if algo!="HybridBlend": model.fit(X_train,y_train)

    if algo=="HybridBlend":
        preds = (model[2](X_test) > 0.5).astype(int)
        probs = model[2](X_test)
    else:
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:,1]

    metrics = {
        "Accuracy": accuracy_score(y_test,preds),
        "Precision": precision_score(y_test,preds,zero_division=0),
        "Recall": recall_score(y_test,preds,zero_division=0),
        "AUC": roc_auc_score(y_test,probs)
    }

    path = f"models/{algo}_model.pkl"
    os.makedirs("models",exist_ok=True)
    joblib.dump(model, path)

    try:
        if algo=="HybridBlend":
            explainer = shap.Explainer(model[1])
            shap_values = explainer(X_test)
        else:
            explainer = shap.Explainer(model, X_test)
            shap_values = explainer(X_test)
    except Exception:
        shap_values = None

    return metrics, shap_values, path
'''
with open("modules/ml/engine.py","w") as f: f.write(engine_code)

# --- Step 5: Write Streamlit page placeholders ---
for i in range(1,8):
    with open(f"pages/{i:02d}_placeholder.py","w") as f:
        f.write(f"import streamlit as st\nst.title('Placeholder Tab {i}')\n")

# --- Step 6: Confirm setup ---
print("✅ LoanIQ environment bootstrapped successfully!")
for root, dirs, files in os.walk("modules"):
    print(root, files)

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m130.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m129.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker, pydeck, streamlit
Successfully installed faker-37.6.0 pydeck-0.9.1 streamlit-1.49.1
✅ LoanIQ environment bootstrapped successfully!
modules ['__init_

In [None]:
import streamlit as st

# Recreate the tab structure (must always exist before referencing tabs[x])
tabs = st.tabs([
    "Data Ingestion",
    "Client Onboarding",
    "Feature Engineering",
    "Model Training",
    "Reports",
    "Audit Logs",
    "Sandbox"
])



In [None]:
# --- Tab: Full Sandbox Control Center ---
with tabs[6]:
    import importlib, joblib, os
    import modules.synth.generators as g
    import modules.ml.engine as engine
    importlib.reload(engine)

    st.subheader("🧪 LoanIQ Sandbox Control Center")

    # =====================
    # SECTION 1: Data Config
    # =====================
    st.markdown("### 📂 Data Configuration")
    with st.expander("Synthetic Data Parameters", expanded=True):
        n_rows = st.slider("Dataset Size", 100, 20000, 2000, 100)
        default_rate = st.slider("Default Rate (%)", 0, 50, 15, 1) / 100
        gender_ratio = st.slider("Female Ratio", 0.0, 1.0, 0.6, 0.05)
        branch_count = st.slider("Number of Branches", 5, 100, 50, 5)
        seed = st.number_input("Random Seed", value=42, step=1)

    uploaded_file = st.file_uploader("Or upload real client dataset (CSV)", type=["csv"])

    # =====================
    # SECTION 2: Model Config
    # =====================
    st.markdown("### 🤖 Model Configuration")
    with st.expander("Select & Configure Models", expanded=True):
        model_choice = st.multiselect(
            "Choose Models to Test",
            ["LogReg", "SGD", "XGBoost", "HybridBlend"],
            default=["LogReg", "XGBoost"]
        )

        st.write("⚙️ Hyperparameters")
        logreg_C = st.slider("LogReg Regularization (C)", 0.01, 10.0, 1.0)
        sgd_alpha = st.slider("SGD Alpha", 0.0001, 0.1, 0.001, 0.0001)
        xgb_lr = st.slider("XGBoost Learning Rate", 0.01, 0.5, 0.1)
        xgb_depth = st.slider("XGBoost Max Depth", 2, 12, 6)

        # Pin model
        pin_model = st.selectbox("📌 Pin Model to Production", ["None"] + model_choice)

        # AB Testing toggle
        ab_test = st.checkbox("Run A/B Test Across Models", value=True)

    # =====================
    # SECTION 3: Actions
    # =====================
    if st.button("🚀 Run Sandbox"):
        # --- Data ---
        if uploaded_file:
            import pandas as pd
            df = pd.read_csv(uploaded_file)
            st.success(f"✅ Loaded real dataset: {df.shape}")
        else:
            df = g.generate_clients_loans(
                n_rows=n_rows,
                default_rate=default_rate,
                gender_ratio=gender_ratio,
                branch_count=branch_count,
                seed=seed,
            )
            st.success(f"✅ Synthetic dataset generated: {df.shape}")
        st.dataframe(df.head())

        # --- Models ---
        st.subheader("📊 Model Results")
        results = {}
        for algo in model_choice:
            try:
                model_path = f"models/{algo}_model.pkl"
                if not os.path.exists(model_path):
                    st.warning(f"⚠️ {algo} not trained yet.")
                    continue

                model = joblib.load(model_path)
                X = df.drop(columns=["default"]) if "default" in df.columns else df.copy()

                preds = model.predict(X)
                try:
                    scores = model.predict_proba(X)[:, 1]
                except:
                    scores = preds

                df[f"{algo}_pred"] = preds
                df[f"{algo}_score"] = scores
                results[algo] = (preds, scores)

                st.write(f"### {algo}")
                st.bar_chart(df[f"{algo}_pred"].value_counts())
                st.line_chart(df[f"{algo}_score"].head(50))

            except Exception as e:
                st.error(f"Error running {algo}: {e}")

        # --- Governance ---
        if pin_model != "None":
            with open("models/production_model.txt", "w") as f:
                f.write(pin_model)
            st.success(f"📌 Pinned {pin_model} as production model.")

    # =====================
    # SECTION 4: Drilldown
    # =====================
    st.markdown("### 🔍 Client Drilldown")
    client_id = st.text_input("Enter Client ID to Inspect")
    if client_id and "df" in locals():
        row = df[df["customer_id"].astype(str) == str(client_id)]
        if not row.empty:
            st.write("Client Record:", row.T)
            st.write("Predictions:", {m: row[f"{m}_pred"].values[0] for m in model_choice if f"{m}_pred" in row})

2025-08-31 12:16:10.095 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-08-31 12:16:10.122 Session state does not function when running a script without `streamlit run`


In [None]:
# --- ONE-CELL PATCH: Full Admin Sandbox page with training, versioning, A/B, SHAP, audit logs ---

import os, json, sqlite3, textwrap, datetime as dt

# Ensure folders
os.makedirs("pages", exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("data", exist_ok=True)

# Write/overwrite the Admin Sandbox Streamlit page
sandbox_code = r'''
import os, io, json, glob, sqlite3, importlib, datetime as dt
import pandas as pd
import streamlit as st

# --- Safe imports of our modules (reload to pick latest code) ---
import modules.synth.generators as g
import modules.ml.engine as engine
import importlib
importlib.reload(g)
importlib.reload(engine)

MODELS_DIR = "models"
AUDIT_DB   = "data/audit.db"
PROD_PIN   = os.path.join(MODELS_DIR, "PROD_MODEL.txt")

# ---------- Utilities ----------
def ensure_audit_db():
    os.makedirs(os.path.dirname(AUDIT_DB), exist_ok=True)
    with sqlite3.connect(AUDIT_DB) as conn:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS audit_logs(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ts TEXT NOT NULL,
            action TEXT NOT NULL,
            details TEXT
        )""")
        conn.commit()

def log_action(action: str, details: dict):
    ensure_audit_db()
    with sqlite3.connect(AUDIT_DB) as conn:
        conn.execute(
            "INSERT INTO audit_logs (ts, action, details) VALUES (?, ?, ?)",
            (dt.datetime.utcnow().isoformat(timespec="seconds")+"Z", action, json.dumps(details, default=str))
        )
        conn.commit()

def list_model_files():
    files = sorted(glob.glob(os.path.join(MODELS_DIR, "*.pkl")))
    # attach simple meta if present
    out = []
    for f in files:
        meta_path = f.replace(".pkl", ".meta.json")
        meta = {}
        if os.path.exists(meta_path):
            try:
                with open(meta_path, "r") as fh:
                    meta = json.load(fh)
            except Exception:
                meta = {}
        out.append({"file": f, "meta": meta})
    return out

def read_prod_pin():
    try:
        with open(PROD_PIN, "r") as fh:
            return fh.read().strip()
    except FileNotFoundError:
        return None

def write_prod_pin(path: str):
    with open(PROD_PIN, "w") as fh:
        fh.write(path)

def load_uploaded_csv(uploaded):
    # Accept csv/xlsx
    if uploaded is None:
        return None
    name = uploaded.name.lower()
    if name.endswith(".csv"):
        return pd.read_csv(uploaded)
    if name.endswith(".xlsx") or name.endswith(".xls"):
        return pd.read_excel(uploaded)
    # Try CSV as fallback
    try:
        return pd.read_csv(uploaded)
    except Exception:
        return None

# ---------- PAGE LAYOUT ----------
st.title("🧪 Admin Sandbox — Loan IQ")

tabs = st.tabs([
    "📦 Dataset",
    "🤖 Train One Model",
    "📌 Versions",
    "🆚 A/B Test",
    "🔍 Explainability",
    "🧾 Audit Logs"
])

# === Tab 0: Dataset ===
with tabs[0]:
    st.subheader("Dataset Source")

    src = st.radio("Choose dataset source:", ["Generate (synthetic)", "Upload file", "Pick batch file"], horizontal=True)

    df = None
    dataset_info = {}

    if src == "Generate (synthetic)":
        c1, c2, c3 = st.columns(3)
        with c1:
            n_rows = st.number_input("Rows", 1000, 200000, 5000, step=500)
            default_rate = st.slider("Default rate", 0.0, 0.5, 0.12, 0.01)
        with c2:
            gender_ratio = st.slider("Female ratio", 0.0, 1.0, 0.65, 0.01)
            mean_dti     = st.slider("Mean DTI", 0.05, 1.0, 0.35, 0.01)
        with c3:
            seed = st.number_input("Random seed", 0, 10_000, 42, step=1)
            # optional loan sizing knob (multiplier)
            loan_scale = st.slider("Loan size scale", 0.5, 3.0, 1.0, 0.1)

        if st.button("🚀 Generate"):
            df = g.generate_clients_loans(
                n_rows=int(n_rows),
                default_rate=float(default_rate),
                gender_ratio=float(gender_ratio),
                mean_dti=float(mean_dti),
                seed=int(seed),
                loan_scale=float(loan_scale)
            )
            st.success(f"Generated dataset: {df.shape}")
            st.dataframe(df.head(50), use_container_width=True)
            dataset_info = {"source":"synthetic","rows":len(df),"params":{"default_rate":default_rate,"gender_ratio":gender_ratio,"mean_dti":mean_dti,"seed":seed,"loan_scale":loan_scale}}

    elif src == "Upload file":
        uploaded = st.file_uploader("Upload CSV/XLSX", type=["csv","xlsx","xls"])
        if uploaded:
            df = load_uploaded_csv(uploaded)
            if df is not None and len(df) > 0:
                st.success(f"Loaded dataset: {df.shape}")
                st.dataframe(df.head(50), use_container_width=True)
                dataset_info = {"source":"upload","name":uploaded.name,"rows":len(df)}
            else:
                st.error("Could not read the uploaded file. Please upload CSV/XLSX.")

    else:  # Pick batch file
        batch_files = sorted(glob.glob("data/sandbox_batches/*.csv"))
        if not batch_files:
            st.info("No batch files in data/sandbox_batches/. Generate batches from the synth module.")
        else:
            pick = st.selectbox("Choose batch CSV", batch_files)
            if st.button("📥 Load batch"):
                df = pd.read_csv(pick)
                st.success(f"Loaded dataset: {df.shape}")
                st.dataframe(df.head(50), use_container_width=True)
                dataset_info = {"source":"batch","file":pick,"rows":len(df)}

    # Save dataset to session state for other tabs
    if df is not None:
        st.session_state["sandbox_df"] = df
        st.session_state["dataset_info"] = dataset_info

# === Tab 1: Train One Model ===
with tabs[1]:
    st.subheader("Train a Single Model with Controls")
    df = st.session_state.get("sandbox_df", None)
    if df is None:
        st.warning("Load or generate a dataset in the 'Dataset' tab first.")
    else:
        algo = st.selectbox("Algorithm", ["LogReg", "SGD", "XGBoost", "HybridBlend"])

        # Common options
        use_smote = st.checkbox("Use SMOTE (balance classes)", value=True)

        # Per-model hyperparameters
        hp = {}
        if algo == "LogReg":
            c1, c2 = st.columns(2)
            with c1:
                hp["C"] = st.number_input("C (inverse regularization)", 0.0001, 1000.0, 1.0, step=0.1)
                hp["max_iter"] = st.number_input("max_iter", 100, 10000, 200, step=50)
            with c2:
                hp["penalty"] = st.selectbox("penalty", ["l2","none"])
                hp["fit_intercept"] = st.checkbox("fit_intercept", True)

        elif algo == "SGD":
            c1, c2 = st.columns(2)
            with c1:
                hp["alpha"] = st.number_input("alpha", 1e-6, 1.0, 0.0001, format="%.6f")
                hp["max_iter"] = st.number_input("max_iter", 100, 5000, 1000, step=100)
            with c2:
                hp["loss"] = st.selectbox("loss", ["log_loss","modified_huber","hinge"])
                hp["fit_intercept"] = st.checkbox("fit_intercept", True)

        elif algo == "XGBoost":
            c1, c2, c3 = st.columns(3)
            with c1:
                hp["n_estimators"] = st.number_input("n_estimators", 50, 2000, 300, step=50)
                hp["max_depth"] = st.number_input("max_depth", 1, 12, 4, step=1)
            with c2:
                hp["learning_rate"] = st.slider("learning_rate", 0.01, 0.5, 0.1, 0.01)
                hp["subsample"] = st.slider("subsample", 0.5, 1.0, 0.9, 0.05)
            with c3:
                hp["colsample_bytree"] = st.slider("colsample_bytree", 0.5, 1.0, 0.9, 0.05)
                hp["reg_lambda"] = st.slider("reg_lambda", 0.0, 2.0, 1.0, 0.1)

        else:  # HybridBlend
            st.info("Hybrid uses internal optimized weights. Train its components too for best effect.")
            # optional blend weights
            c1, c2, c3 = st.columns(3)
            with c1:
                hp["w_logreg"] = st.slider("w_logreg", 0.0, 1.0, 0.33, 0.01)
            with c2:
                hp["w_sgd"] = st.slider("w_sgd", 0.0, 1.0, 0.33, 0.01)
            with c3:
                hp["w_xgb"] = st.slider("w_xgb", 0.0, 1.0, 0.34, 0.01)

        if st.button("🏋️ Train Model"):
            try:
                metrics, shap_values, model_path = engine.train_model(
                    df.copy(), algo,
                    use_smote=use_smote,
                    **hp
                )
                st.success(f"Saved model: {model_path}")
                st.json(metrics)

                # Save metadata next to model
                meta = {
                    "algo": algo,
                    "use_smote": use_smote,
                    "hyperparams": hp,
                    "metrics": metrics,
                    "dataset_info": st.session_state.get("dataset_info", {}),
                    "trained_at_utc": dt.datetime.utcnow().isoformat(timespec="seconds")+"Z"
                }
                meta_path = model_path.replace(".pkl", ".meta.json")
                with open(meta_path, "w") as fh:
                    json.dump(meta, fh, indent=2)
                st.caption(f"Metadata written → {meta_path}")

                log_action("train_model", {"model_path": model_path, **meta})
            except Exception as e:
                st.error(f"Training failed: {e}")

# === Tab 2: Versions (Pin / Load) ===
with tabs[2]:
    st.subheader("Model Versions & Pinning")
    models = list_model_files()
    if not models:
        st.info("No saved models yet. Train one in the previous tab.")
    else:
        prod = read_prod_pin()
        st.caption(f"Current PROD pin: {prod if prod else 'None'}")
        options = [m["file"] for m in models]
        sel = st.selectbox("Select a model file to pin", options)
        if st.button("📌 Pin as PROD"):
            write_prod_pin(sel)
            st.success(f"Pinned PROD → {sel}")
            log_action("pin_model", {"pinned": sel})

        st.markdown("#### Model Catalog")
        for m in models:
            st.write("**File:**", m["file"])
            st.json(m["meta"])

# === Tab 3: A/B Test ===
with tabs[3]:
    st.subheader("Compare Two Models on a Dataset (A/B)")
    models = list_model_files()
    if len(models) < 2:
        st.info("Need at least two models saved.")
    else:
        opt = [m["file"] for m in models]
        c1, c2 = st.columns(2)
        with c1:
            mA = st.selectbox("Model A", opt, key="abA")
        with c2:
            mB = st.selectbox("Model B", opt, key="abB")

        src = st.radio("Dataset for test", ["Use current dataset", "Upload CSV/XLSX"], horizontal=True)
        df_test = None
        if src == "Use current dataset":
            df_test = st.session_state.get("sandbox_df", None)
            if df_test is None:
                st.warning("No dataset in session. Load/generate one in 'Dataset' tab.")
        else:
            up = st.file_uploader("Upload test CSV/XLSX", type=["csv","xlsx","xls"], key="ab_upload")
            if up:
                df_test = load_uploaded_csv(up)

        if st.button("🔬 Run A/B Test") and df_test is not None:
            try:
                res = engine.compare_models(df_test.copy(), mA, mB)
                st.success("A/B complete.")
                st.json(res)
                log_action("ab_test", {"modelA": mA, "modelB": mB, "result": res, "rows": len(df_test)})
            except Exception as e:
                st.error(f"A/B failed: {e}")

# === Tab 4: Explainability ===
with tabs[4]:
    st.subheader("Explainability (SHAP)")
    models = list_model_files()
    if not models:
        st.info("Train a model first.")
    else:
        pick = st.selectbox("Pick a model to inspect", [m["file"] for m in models])
        df = st.session_state.get("sandbox_df", None)
        if df is None:
            st.warning("Load or generate a dataset in 'Dataset' first (used for background SHAP).")
        else:
            try:
                fig = engine.shap_summary_plot(df.copy(), pick)
                if fig is None:
                    st.info("SHAP summary not available for this model, or no numeric features.")
                else:
                    st.pyplot(fig)
            except Exception as e:
                st.error(f"SHAP summary failed: {e}")

# === Tab 5: Audit Logs ===
with tabs[5]:
    st.subheader("Audit Logs")
    ensure_audit_db()
    with sqlite3.connect(AUDIT_DB) as conn:
        df_logs = pd.read_sql("SELECT * FROM audit_logs ORDER BY id DESC LIMIT 500", conn)
    st.dataframe(df_logs, use_container_width=True)
'''

page_path = "pages/04_Admin_Sandbox.py"
with open(page_path, "w") as f:
    f.write(sandbox_code)

# Minimal smoke test: ensure file exists and audit DB can be created
def _smoke():
    assert os.path.exists(page_path), "Sandbox page was not written"
    # init audit DB
    with sqlite3.connect("data/audit.db") as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS audit_logs (id INTEGER PRIMARY KEY, ts TEXT, action TEXT, details TEXT)")
        conn.commit()
    return True

ok = _smoke()
print("✅ Admin Sandbox page written →", page_path)
print("✅ Audit DB initialized at data/audit.db")
print("\nNext steps:")
print("  • In Colab you'll see Streamlit warnings (normal).")
print("  • To run the UI: `streamlit run pages/04_Admin_Sandbox.py` (locally/Cloud).")
print("  • In Colab, you can still import and call engine functions directly for training.")

In [None]:
# === ONE-CELL: Loan Company Portal Architecture (modules + pages) + smoke test ===
import os, json, sqlite3, textwrap, datetime as dt, re
from pathlib import Path

# --- Ensure folders ---
os.makedirs("modules/portal", exist_ok=True)
os.makedirs("pages", exist_ok=True)
os.makedirs("data", exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("tenants", exist_ok=True)

# --- Helper to write files ---
def write(path, content):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(textwrap.dedent(content))

# ============== modules/portal/__init__.py ==============
write("modules/portal/__init__.py", """
# Loan Company Portal package
""")

# ============== modules/portal/tenant_store.py ==============
write("modules/portal/tenant_store.py", r"""
import os, sqlite3, re
from pathlib import Path

DB = "data/tenants.db"
ROOT = Path("tenants")

def _slugify(name:str)->str:
    s = re.sub(r'[^a-zA-Z0-9]+', '_', name.strip().lower()).strip('_')
    return s or "tenant"

def _ensure_db():
    os.makedirs("data", exist_ok=True)
    with sqlite3.connect(DB) as conn:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS tenants(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE NOT NULL,
            slug TEXT UNIQUE NOT NULL,
            created_at TEXT NOT NULL
        )""")
        conn.commit()

def ensure_tenant(name:str)->dict:
    _ensure_db()
    slug = _slugify(name)
    with sqlite3.connect(DB) as conn:
        cur = conn.execute("SELECT id, name, slug, created_at FROM tenants WHERE slug=?", (slug,))
        row = cur.fetchone()
        if row:
            tenant = {"id":row[0], "name":row[1], "slug":row[2], "created_at":row[3]}
        else:
            import datetime as dt
            ts = dt.datetime.utcnow().isoformat(timespec="seconds")+"Z"
            conn.execute("INSERT INTO tenants(name, slug, created_at) VALUES (?,?,?)", (name, slug, ts))
            conn.commit()
            tenant = {"id":conn.execute("SELECT last_insert_rowid()").fetchone()[0], "name":name, "slug":slug, "created_at":ts}
    # ensure folders
    root = ROOT/slug
    for sub in ["uploads","processed","preds","reports","models"]:
        (root/sub).mkdir(parents=True, exist_ok=True)
    return tenant

def tenant_root(slug:str)->Path:
    return ROOT/slug

def list_tenants()->list:
    _ensure_db()
    with sqlite3.connect(DB) as conn:
        cur = conn.execute("SELECT id,name,slug,created_at FROM tenants ORDER BY id DESC")
        return [{"id":r[0],"name":r[1],"slug":r[2],"created_at":r[3]} for r in cur.fetchall()]
""")

# ============== modules/portal/schemas.py ==============
write("modules/portal/schemas.py", r"""
# Minimal column expectations + friendly names / mapping hints
REQUIRED = [
    "customer_id","customer_name","gov_id","branch","product","loan_amount",
    "status","loan_health","created_date"
]
OPTIONAL = [
    "age","gender","loan_type","debt_to_income","product_weeks"
]

CANONICAL_TYPES = {
    "customer_id":"str",
    "customer_name":"str",
    "gov_id":"str",
    "branch":"str",
    "product":"str",
    "loan_amount":"float",
    "status":"str",
    "loan_health":"str",
    "created_date":"date",
    "age":"int",
    "gender":"str",
    "loan_type":"str",
    "debt_to_income":"float",
    "product_weeks":"int"
}
""")

# ============== modules/portal/data_service.py ==============
write("modules/portal/data_service.py", r"""
import os, json, re, datetime as dt
from pathlib import Path
import pandas as pd
import numpy as np

from modules.portal import schemas
from modules.portal.recommender import risk_tier, recommend_limit
from modules.portal.tenant_store import tenant_root
from modules.portal.scoring import score_with_model_or_fallback

def _parse_date(x):
    for fmt in ["%Y-%m-%d","%d/%m/%Y","%m/%d/%Y","%Y/%m/%d","%d-%m-%Y"]:
        try:
            return pd.to_datetime(x, format=fmt, errors="raise")
        except Exception:
            continue
    return pd.to_datetime(x, errors="coerce")

def _kenya_age_from_govid(govid:str):
    # Heuristic buckets based on Kenyan ID ranges (approx)
    # 32xxxxxx-33xxxxx -> 28-30 range comment by user → map buckets
    if not isinstance(govid, str):
        govid = str(govid)
    digits = re.sub(r"\D","", govid)
    if len(digits) < 7:  # unknown
        return np.nan
    # Very rough buckets; refine later if you have a better mapping
    prefix = int(digits[:2])  # take first two digits for bucketing
    if 28 <= prefix <= 33: return 29
    if 34 <= prefix <= 36: return 26
    if 20 <= prefix <= 27: return 35
    if 10 <= prefix <= 19: return 45
    return np.nan

def validate_and_preprocess(df: pd.DataFrame) -> pd.DataFrame:
    # Normalize column names
    df = df.copy()
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    # Ensure required columns exist (lightweight mapping hints can be added later)
    missing = [c for c in schemas.REQUIRED if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Types & cleaning
    if "loan_amount" in df.columns:
        df["loan_amount"] = pd.to_numeric(df["loan_amount"], errors="coerce")

    if "debt_to_income" in df.columns:
        df["debt_to_income"] = pd.to_numeric(df["debt_to_income"], errors="coerce")

    if "age" not in df.columns or df["age"].isna().all():
        df["age"] = df.get("age", pd.Series([np.nan]*len(df)))
        # fill from gov_id heuristic if missing
        df.loc[df["age"].isna(), "age"] = df.loc[df["age"].isna(), "gov_id"].map(_kenya_age_from_govid)

    # Gender guess (very naive) if missing
    if "gender" not in df.columns:
        df["gender"] = "unknown"
    else:
        df["gender"] = df["gender"].fillna("unknown").str.lower()

    # Dates
    if "created_date" in df.columns:
        df["created_date"] = df["created_date"].apply(_parse_date)

    # Default target if available (loan_health != performing -> default=1)
    if "loan_health" in df.columns:
        df["default"] = (df["loan_health"].str.lower() != "performing").astype(int)
    else:
        df["default"] = 0

    # Fill simple gaps
    for col in ["branch","product","status","loan_type"]:
        if col in df.columns:
            df[col] = df[col].fillna("unknown").astype(str)

    return df

def attach_predictions_and_actions(df: pd.DataFrame, prod_model_path:str|None=None) -> pd.DataFrame:
    """Returns df with default_proba, risk_tier, rec_limit."""
    df = df.copy()
    # Score
    scores = score_with_model_or_fallback(df, prod_model_path)
    df["default_proba"] = scores
    # Risk tier
    df["risk_tier"] = df["default_proba"].apply(risk_tier)
    # Recommended limit
    df["rec_limit"] = df.apply(lambda r: recommend_limit(r), axis=1)
    return df

def save_tenant_upload(slug:str, df: pd.DataFrame, name:str="upload.csv") -> str:
    root = tenant_root(slug)
    path = root/"uploads"/name
    df.to_csv(path, index=False)
    return str(path)

def save_tenant_preds(slug:str, df: pd.DataFrame, name:str|None=None) -> str:
    root = tenant_root(slug)
    name = name or f"preds_{dt.datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.csv"
    path = root/"preds"/name
    df.to_csv(path, index=False)
    return str(path)
""")

# ============== modules/portal/scoring.py ==============
write("modules/portal/scoring.py", r"""
import os, joblib
import numpy as np
import pandas as pd

PROD_PIN = "models/PROD_MODEL.txt"

# Minimal feature selector for inference consistency
NUMERIC_FEATURES = ["loan_amount","debt_to_income","age","product_weeks"]
CAT_FEATURES = ["branch","product","gender","status","loan_type","loan_health"]

def _safe_numeric(df: pd.DataFrame):
    out = pd.DataFrame(index=df.index)
    for c in NUMERIC_FEATURES:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)
        else:
            out[c] = 0.0
    return out

def _prod_model_path(override:str|None)->str|None:
    if override and os.path.exists(override): return override
    if os.path.exists(PROD_PIN):
        try:
            with open(PROD_PIN,"r") as f:
                p = f.read().strip()
                return p if os.path.exists(p) else None
        except Exception:
            return None
    return None

def _heuristic_score(df: pd.DataFrame)->np.ndarray:
    # Simple risk from dti + amount scale + young age slightly up-risk
    dti = pd.to_numeric(df.get("debt_to_income", 0.3), errors="coerce").fillna(0.3)
    amt = pd.to_numeric(df.get("loan_amount", 20000), errors="coerce").fillna(20000)
    age = pd.to_numeric(df.get("age", 30), errors="coerce").fillna(30)

    # normalize
    dti_n = dti.clip(0,1)
    amt_n = (amt.clip(lower=0, upper=300000) / 300000.0)
    age_pen = np.where(age < 25, 0.05, 0.0)

    raw = 0.6*dti_n + 0.35*amt_n + age_pen
    return np.clip(raw, 0.01, 0.99).values

def score_with_model_or_fallback(df: pd.DataFrame, override_model_path:str|None=None)->np.ndarray:
    path = _prod_model_path(override_model_path)
    if not path:
        return _heuristic_score(df)

    try:
        model = joblib.load(path)
        # Try scikit API
        X = _safe_numeric(df)  # keep minimal consistent features
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X)[:,1]
        elif hasattr(model, "decision_function"):
            s = model.decision_function(X)
            # map to 0..1
            from scipy.special import expit
            proba = expit(s)
        else:
            # last resort: predict labels and soften
            pred = model.predict(X)
            proba = 0.7*pred + 0.15
        return np.clip(proba, 0.01, 0.99)
    except Exception:
        # Fallback if model can't load
        return _heuristic_score(df)
""")

# ============== modules/portal/recommender.py ==============
write("modules/portal/recommender.py", r"""
import numpy as np
import pandas as pd

# Risk tiers
def risk_tier(p: float|int) -> str:
    try:
        p = float(p)
    except Exception:
        p = 0.5
    if p < 0.10: return "A (very low)"
    if p < 0.20: return "B (low)"
    if p < 0.35: return "C (moderate)"
    if p < 0.50: return "D (elevated)"
    return "E (high)"

def _cap_by_tier(base_amt: float, tier: str) -> float:
    m = {
        "A (very low)": 1.25,
        "B (low)": 1.10,
        "C (moderate)": 0.90,
        "D (elevated)": 0.65,
        "E (high)": 0.40
    }
    return max(1000.0, base_amt * m.get(tier, 0.8))

def recommend_limit(row: pd.Series) -> float:
    # baseline = existing requested amount or historical avg; here use loan_amount
    base = float(row.get("loan_amount", 20000.0) or 20000.0)
    tier = row.get("risk_tier")
    return round(_cap_by_tier(base, tier), 2)
""")

# ============== modules/portal/reporting.py ==============
write("modules/portal/reporting.py", r"""
import pandas as pd

def kpis(df: pd.DataFrame) -> dict:
    total_loans = len(df)
    total_amount = float(df["loan_amount"].sum()) if "loan_amount" in df.columns else 0.0
    avg_amount = float(df["loan_amount"].mean()) if "loan_amount" in df.columns else 0.0
    default_rate = float(df.get("default", pd.Series([0]*len(df))).mean()) if len(df)>0 else 0.0
    avg_proba = float(df.get("default_proba", pd.Series([0.0]*len(df))).mean()) if len(df)>0 else 0.0
    return dict(
        total_loans=total_loans,
        total_amount=round(total_amount,2),
        avg_amount=round(avg_amount,2),
        default_rate=round(default_rate,3),
        avg_pred_default=round(avg_proba,3)
    )

def branch_summary(df: pd.DataFrame) -> pd.DataFrame:
    cols = [c for c in ["branch","loan_amount","default","default_proba"] if c in df.columns]
    if not cols or "branch" not in df.columns:
        return pd.DataFrame()
    agg = {
        "loan_amount":"sum"
    }
    if "default" in df.columns: agg["default"]="mean"
    if "default_proba" in df.columns: agg["default_proba"]="mean"
    out = df.groupby("branch", dropna=False).agg(agg).reset_index()
    out = out.rename(columns={"loan_amount":"total_amount","default":"default_rate","default_proba":"avg_proba"})
    return out.sort_values("total_amount", ascending=False)

def product_summary(df: pd.DataFrame) -> pd.DataFrame:
    if "product" not in df.columns: return pd.DataFrame()
    agg = {"loan_amount":"sum"}
    if "default" in df.columns: agg["default"]="mean"
    if "default_proba" in df.columns: agg["default_proba"]="mean"
    out = df.groupby("product", dropna=False).agg(agg).reset_index()
    out = out.rename(columns={"loan_amount":"total_amount","default":"default_rate","default_proba":"avg_proba"})
    return out.sort_values("total_amount", ascending=False)

def top_risky(df: pd.DataFrame, n:int=50) -> pd.DataFrame:
    if "default_proba" not in df.columns: return pd.DataFrame()
    cols = [c for c in ["customer_id","customer_name","branch","product","loan_amount","default_proba","risk_tier","rec_limit"] if c in df.columns]
    out = df[cols].copy()
    return out.sort_values("default_proba", ascending=False).head(n)
""")

# ============== modules/portal/viz.py ==============
write("modules/portal/viz.py", r"""
import plotly.express as px
import pandas as pd

def fig_branch_bar(df: pd.DataFrame):
    if "branch" not in df.columns or "loan_amount" not in df.columns:
        return None
    agg = df.groupby("branch", dropna=False)["loan_amount"].sum().reset_index()
    return px.bar(agg, x="branch", y="loan_amount", title="Total Amount by Branch")

def fig_product_pie(df: pd.DataFrame):
    if "product" not in df.columns or "loan_amount" not in df.columns:
        return None
    agg = df.groupby("product", dropna=False)["loan_amount"].sum().reset_index()
    return px.pie(agg, names="product", values="loan_amount", title="Product Mix")

def fig_risk_hist(df: pd.DataFrame):
    if "default_proba" not in df.columns:
        return None
    return px.histogram(df, x="default_proba", nbins=30, title="Predicted Default Probability Distribution")
""")

# ============== modules/portal/explain.py (stub; portal-side helper) ==============
write("modules/portal/explain.py", r"""
# Portal-side lightweight explainability helper; for rich SHAP use Admin Sandbox.
import pandas as pd
import numpy as np

def simple_feature_importance(row: pd.Series) -> list[tuple[str, float]]:
    # naive importance proxy using numeric magnitudes
    feats = {}
    for c in ["loan_amount","debt_to_income","age","product_weeks"]:
        if c in row:
            try:
                feats[c] = float(row[c] or 0.0)
            except Exception:
                feats[c] = 0.0
    if not feats:
        return []
    # normalize
    total = sum(abs(v) for v in feats.values()) or 1.0
    imp = [(k, abs(v)/total) for k,v in feats.items()]
    return sorted(imp, key=lambda t: t[1], reverse=True)
""")

# ============== modules/portal/auth.py (very light stub; UI will handle sessions) ==============
write("modules/portal/auth.py", r"""
import secrets
# Minimal token stub (extend with proper auth later)
def issue_token(tenant_slug:str)->str:
    return f"{tenant_slug}." + secrets.token_hex(8)

def validate_token(tok:str)->str|None:
    try:
        slug, _ = tok.split(".", 1)
        return slug
    except Exception:
        return None
""")

# ============== Client-facing pages (Streamlit stubs) ==============
write("pages/01_Company_Portal_Home.py", r"""
import streamlit as st
st.set_page_config(page_title="Loan Company Portal", layout="wide")
def app():
    st.title("🏦 Loan Company Portal — Home")
    st.info("Use the sidebar to navigate: Dashboard, Clients, Reports, Settings.")
if __name__ == "__main__":
    app()
""")

write("pages/02_Portfolio_Dashboard.py", r"""
import streamlit as st
import pandas as pd
from modules.portal.tenant_store import ensure_tenant, tenant_root
from modules.portal.data_service import validate_and_preprocess, attach_predictions_and_actions
from modules.portal.reporting import kpis, branch_summary, product_summary, top_risky
from modules.portal.scoring import _prod_model_path

def app():
    st.title("📊 Portfolio Dashboard")
    st.caption("Upload a file to get KPIs, risk, and summaries. Multi-tenant aware.")

    tenant_name = st.text_input("Tenant name", "demo_mfi")
    up = st.file_uploader("Upload CSV", type=["csv","xlsx","xls"])
    if st.button("Process"):
        t = ensure_tenant(tenant_name)
        if up:
            if up.name.lower().endswith(".csv"):
                df = pd.read_csv(up)
            else:
                df = pd.read_excel(up)
            df = validate_and_preprocess(df)
            df = attach_predictions_and_actions(df, _prod_model_path(None))
            st.success(f"Processed: {df.shape}")
            st.dataframe(df.head(50), use_container_width=True)

            m = kpis(df)
            st.write("**KPIs**", m)
            st.write("**By Branch**", branch_summary(df))
            st.write("**By Product**", product_summary(df))
            st.write("**Top Risky**", top_risky(df, 20))
        else:
            st.warning("Please upload a file.")

if __name__ == "__main__":
    app()
""")

write("pages/03_Client_Explorer.py", r"""
import streamlit as st
import pandas as pd
from modules.portal.data_service import validate_and_preprocess, attach_predictions_and_actions
from modules.portal.explain import simple_feature_importance
from modules.portal.scoring import _prod_model_path

def app():
    st.title("🔎 Client Explorer")
    st.caption("Search clients, view full profile with predictions and recommendations.")

    up = st.file_uploader("Upload CSV to explore", type=["csv","xlsx","xls"])
    q = st.text_input("Search by Customer ID / Name / Gov ID").strip().lower()

    if up and st.button("Load & Search"):
        if up.name.lower().endswith(".csv"):
            df = pd.read_csv(up)
        else:
            df = pd.read_excel(up)
        df = validate_and_preprocess(df)
        df = attach_predictions_and_actions(df, _prod_model_path(None))

        # search
        if q:
            mask = (
                df.get("customer_id","").astype(str).str.lower().str.contains(q, na=False) |
                df.get("customer_name","").astype(str).str.lower().str.contains(q, na=False) |
                df.get("gov_id","").astype(str).str.lower().str.contains(q, na=False)
            )
            hits = df[mask].copy()
        else:
            hits = df.copy()

        st.write(f"Results: {hits.shape[0]}")
        st.dataframe(hits.head(100), use_container_width=True)

        # profile if single selection
        if len(hits) == 1:
            r = hits.iloc[0]
            st.subheader("Client Profile")
            st.write(r.to_dict())
            st.markdown(f"**Predicted default probability:** {r['default_proba']:.3f}")
            st.markdown(f"**Risk tier:** {r['risk_tier']}")
            st.markdown(f"**Recommended limit:** {r['rec_limit']:,}")

            st.markdown("**Why? (simple feature importance proxy)**")
            st.write(simple_feature_importance(r))

if __name__ == "__main__":
    app()
""")

write("pages/06_Reports_Exports.py", r"""
import streamlit as st
import pandas as pd
from io import BytesIO
from modules.portal.data_service import validate_and_preprocess, attach_predictions_and_actions
from modules.portal.reporting import kpis, branch_summary, product_summary, top_risky
from modules.portal.scoring import _prod_model_path

def _to_excel(dfs: dict) -> bytes:
    bio = BytesIO()
    with pd.ExcelWriter(bio, engine="xlsxwriter") as writer:
        for name, df in dfs.items():
            df.to_excel(writer, sheet_name=name[:31], index=False)
    return bio.getvalue()

def app():
    st.title("🧾 Reports & Exports")
    up = st.file_uploader("Upload CSV/XLSX", type=["csv","xlsx","xls"])
    if up and st.button("Generate Reports"):
        df = pd.read_csv(up) if up.name.lower().endswith(".csv") else pd.read_excel(up)

In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1


In [None]:
# === LoanIQ Auth + Login System (all in one cell) ===
import sqlite3, hashlib
from pathlib import Path
import streamlit as st

# --- DB Setup ---
DB_PATH = Path("config/loaniq.db")
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

def init_db():
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute("""
    CREATE TABLE IF NOT EXISTS users (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        company TEXT,
        email TEXT UNIQUE,
        password TEXT,
        role TEXT
    )
    """)
    conn.commit()
    conn.close()

def hash_pw(password:str) -> str:
    return hashlib.sha256(password.encode()).hexdigest()

def register_user(company:str, email:str, password:str, role:str="client"):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute("INSERT INTO users (company,email,password,role) VALUES (?,?,?,?)",
              (company, email, hash_pw(password), role))
    conn.commit()
    conn.close()

def login_user(email:str, password:str):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute("SELECT id, company, email, role FROM users WHERE email=? AND password=?",
              (email, hash_pw(password)))
    row = c.fetchone()
    conn.close()
    return row

# --- Bootstrap: Ensure DB + Admin user ---
init_db()
try:
    register_user("SystemAdmin", "Admin", "Shady868", role="admin")
except Exception:
    pass  # already exists

# --- Session Setup ---
if "user" not in st.session_state:
    st.session_state["user"] = None

# --- UI Logic ---
if st.session_state["user"] is None:
    st.title("🔑 LoanIQ Login")

    mode = st.radio("Login or Register?", ["Login", "Register"])

    if mode == "Login":
        email = st.text_input("Email / Username")
        pw = st.text_input("Password", type="password")
        if st.button("Login"):
            user = login_user(email, pw)
            if user:
                st.session_state["user"] = {
                    "id": user[0],
                    "company": user[1],
                    "email": user[2],
                    "role": user[3]
                }
                st.experimental_rerun()
            else:
                st.error("❌ Invalid credentials")

    else:  # Register
        company = st.text_input("Company Name")
        email = st.text_input("Email / Username")
        pw = st.text_input("Password", type="password")
        if st.button("Register & Continue"):
            try:
                register_user(company, email, pw)
                user = login_user(email, pw)  # auto-login
                st.session_state["user"] = {
                    "id": user[0],
                    "company": user[1],
                    "email": user[2],
                    "role": user[3]
                }
                st.success("✅ Registered and logged in!")
                st.experimental_rerun()
            except Exception as e:
                st.error(f"❌ Failed: {e}")

else:
    user = st.session_state["user"]
    st.sidebar.success(f"Logged in as {user['company']} ({user['role']})")
    if st.sidebar.button("Logout"):
        st.session_state["user"] = None
        st.experimental_rerun()

2025-08-31 13:08:29.571 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [None]:
# --- Role-Based Login + Dashboards ---
import sqlite3, hashlib
import streamlit as st
from pathlib import Path

# --- DB Setup ---
DB_FILE = "config/users.db"
Path("config").mkdir(exist_ok=True)

def init_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("""
        CREATE TABLE IF NOT EXISTS users (
            username TEXT PRIMARY KEY,
            password TEXT,
            role TEXT DEFAULT 'company'
        )
    """)
    conn.commit()
    conn.close()

init_db()

# --- Helpers ---
def hash_pw(password: str) -> str:
    return hashlib.sha256(password.encode()).hexdigest()

def add_user(username, password, role="company"):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    try:
        c.execute("INSERT INTO users (username, password, role) VALUES (?, ?, ?)",
                  (username, hash_pw(password), role))
        conn.commit()
    except sqlite3.IntegrityError:
        pass
    conn.close()

def validate_user(username, password):
    # Admin hardcoded
    if username == "Admin" and password == "Shady868":
        return "admin"

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("SELECT role FROM users WHERE username=? AND password=?",
              (username, hash_pw(password)))
    row = c.fetchone()
    conn.close()
    return row[0] if row else None

# --- Session state ---
if "user" not in st.session_state:
    st.session_state["user"] = None
if "role" not in st.session_state:
    st.session_state["role"] = None

# --- UI ---
st.title("🔐 LoanIQ Login Portal")

if st.session_state["user"]:
    st.success(f"Welcome, {st.session_state['user']} ({st.session_state['role']})")

    if st.session_state["role"] == "admin":
        st.subheader("🛠️ Admin Dashboard")
        st.write("Here you can manage all companies, view models, impersonate users.")
        # TODO: Hook into your Sandbox + Training tabs here

    elif st.session_state["role"] == "company":
        st.subheader("📊 Company Dashboard")
        st.write("Here you can upload datasets, view reports, predictions, and client insights.")
        # TODO: Hook into Reports + Predictions + Client Profiles

    if st.button("Logout"):
        st.session_state["user"] = None
        st.session_state["role"] = None
        st.experimental_rerun()

else:
    choice = st.radio("Choose action", ["Login", "Register"])

    if choice == "Login":
        user = st.text_input("Username")
        pw = st.text_input("Password", type="password")
        if st.button("Login"):
            role = validate_user(user, pw)
            if role:
                st.session_state["user"] = user
                st.session_state["role"] = role
                st.success(f"Logged in as {user} ({role})")
                st.experimental_rerun()
            else:
                st.error("❌ Invalid credentials")

    elif choice == "Register":
        new_user = st.text_input("New Username")
        new_pw = st.text_input("New Password", type="password")
        if st.button("Register"):
            if new_user and new_pw:
                add_user(new_user, new_pw, "company")
                st.success("✅ Registration successful. Please login.")
            else:
                st.error("Fill all fields")



In [None]:
# --- Role-Based Login + Dashboards ---
import sqlite3, hashlib
import streamlit as st
from pathlib import Path

# --- DB Setup ---
DB_FILE = "config/users.db"
Path("config").mkdir(exist_ok=True)

def init_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("""
        CREATE TABLE IF NOT EXISTS users (
            username TEXT PRIMARY KEY,
            password TEXT,
            role TEXT DEFAULT 'company'
        )
    """)
    conn.commit()
    conn.close()

init_db()

# --- Helpers ---
def hash_pw(password: str) -> str:
    return hashlib.sha256(password.encode()).hexdigest()

def add_user(username, password, role="company"):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    try:
        c.execute("INSERT INTO users (username, password, role) VALUES (?, ?, ?)",
                  (username, hash_pw(password), role))
        conn.commit()
    except sqlite3.IntegrityError:
        pass
    conn.close()

def validate_user(username, password):
    # Admin hardcoded
    if username == "Admin" and password == "Shady868":
        return "admin"

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("SELECT role FROM users WHERE username=? AND password=?",
              (username, hash_pw(password)))
    row = c.fetchone()
    conn.close()
    return row[0] if row else None

# --- Session state ---
if "user" not in st.session_state:
    st.session_state["user"] = None
if "role" not in st.session_state:
    st.session_state["role"] = None

# --- UI ---
st.title("🔐 LoanIQ Login Portal")

if st.session_state["user"]:
    st.success(f"Welcome, {st.session_state['user']} ({st.session_state['role']})")

    # --- Admin Portal ---
    if st.session_state["role"] == "admin":
        st.subheader("🛠️ Admin Dashboard")
        st.write("Manage companies, run experiments, audit logs, and impersonate users.")

        if st.button("🧪 Go to Sandbox / Training"):
            st.switch_page("pages/07_Sandbox.py")

        if st.button("📜 View Audit Logs"):
            st.switch_page("pages/06_Audit.py")

        if st.button("👤 Impersonate Company"):
            st.switch_page("pages/04_Client_Insights.py")

    # --- Company Portal ---
    elif st.session_state["role"] == "company":
        st.subheader("📊 Company Dashboard")
        st.write("Upload datasets, view reports, and explore predictions.")

        if st.button("📤 Upload & Manage Data"):
            st.switch_page("pages/02_DataUpload.py")

        if st.button("📑 Reports & Exports"):
            st.switch_page("pages/06_Reports_Exports.py")

        if st.button("🔮 Client Predictions & Insights"):
            st.switch_page("pages/04_Client_Insights.py")

    # --- Logout ---
    if st.button("Logout"):
        st.session_state["user"] = None
        st.session_state["role"] = None
        st.experimental_rerun()

else:
    choice = st.radio("Choose action", ["Login", "Register"])

    if choice == "Login":
        user = st.text_input("Username")
        pw = st.text_input("Password", type="password")
        if st.button("Login"):
            role = validate_user(user, pw)
            if role:
                st.session_state["user"] = user
                st.session_state["role"] = role
                st.success(f"Logged in as {user} ({role})")
                st.experimental_rerun()
            else:
                st.error("❌ Invalid credentials")

    elif choice == "Register":
        new_user = st.text_input("New Username")
        new_pw = st.text_input("New Password", type="password")
        if st.button("Register"):
            if new_user and new_pw:
                add_user(new_user, new_pw, "company")
                st.success("✅ Registration successful. Please login.")
            else:
                st.error("Fill all fields")



In [None]:
# --- Role-Based Login + Dashboards with Impersonation + Audit Logs ---
import sqlite3, hashlib, datetime
import streamlit as st
from pathlib import Path

# --- DB Setup ---
DB_FILE = "config/users.db"
Path("config").mkdir(exist_ok=True)

def init_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("""
        CREATE TABLE IF NOT EXISTS users (
            username TEXT PRIMARY KEY,
            password TEXT,
            role TEXT DEFAULT 'company'
        )
    """)
    c.execute("""
        CREATE TABLE IF NOT EXISTS audit_logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            actor TEXT,
            action TEXT,
            target TEXT,
            ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()
    conn.close()

init_db()

# --- Helpers ---
def hash_pw(password: str) -> str:
    return hashlib.sha256(password.encode()).hexdigest()

def add_user(username, password, role="company"):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    try:
        c.execute("INSERT INTO users (username, password, role) VALUES (?, ?, ?)",
                  (username, hash_pw(password), role))
        conn.commit()
    except sqlite3.IntegrityError:
        pass
    conn.close()

def validate_user(username, password):
    # Admin hardcoded
    if username == "Admin" and password == "Shady868":
        return "admin"
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("SELECT role FROM users WHERE username=? AND password=?",
              (username, hash_pw(password)))
    row = c.fetchone()
    conn.close()
    return row[0] if row else None

def list_companies():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("SELECT username FROM users WHERE role='company'")
    rows = [r[0] for r in c.fetchall()]
    conn.close()
    return rows

def log_action(actor, action, target=""):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("INSERT INTO audit_logs (actor, action, target, ts) VALUES (?, ?, ?, ?)",
              (actor, action, target, datetime.datetime.now()))
    conn.commit()
    conn.close()

# --- Session state ---
if "user" not in st.session_state:
    st.session_state["user"] = None
if "role" not in st.session_state:
    st.session_state["role"] = None
if "impersonating" not in st.session_state:
    st.session_state["impersonating"] = None

# --- UI ---
st.title("🔐 LoanIQ Login Portal")

if st.session_state["user"]:
    display_user = st.session_state["impersonating"] or st.session_state["user"]
    st.success(f"Welcome, {display_user} ({st.session_state['role']})")

    # --- Admin Portal ---
    if st.session_state["role"] == "admin" and not st.session_state["impersonating"]:
        st.subheader("🛠️ Admin Dashboard")

        if st.button("🧪 Go to Sandbox / Training"):
            st.switch_page("pages/07_Sandbox.py")

        if st.button("📜 View Audit Logs"):
            conn = sqlite3.connect(DB_FILE)
            logs = conn.execute("SELECT actor, action, target, ts FROM audit_logs ORDER BY ts DESC LIMIT 50").fetchall()
            conn.close()
            for actor, action, target, ts in logs:
                st.write(f"**{ts}** — {actor} → {action} {target}")

        companies = list_companies()
        if companies:
            choice = st.selectbox("👤 Choose a company to impersonate", companies)
            if st.button("Impersonate"):
                st.session_state["impersonating"] = choice
                st.session_state["role"] = "company"
                log_action("Admin", "impersonated", choice)
                st.experimental_rerun()

    # --- Company Portal (real or impersonated) ---
    if st.session_state["role"] == "company":
        st.subheader("📊 Company Dashboard")
        st.write("Upload datasets, view reports, and explore predictions.")

        if st.button("📤 Upload & Manage Data"):
            st.switch_page("pages/02_DataUpload.py")

        if st.button("📑 Reports & Exports"):
            st.switch_page("pages/06_Reports_Exports.py")

        if st.button("🔮 Client Predictions & Insights"):
            st.switch_page("pages/04_Client_Insights.py")

    # --- End Impersonation ---
    if st.session_state["impersonating"]:
        if st.button("❌ Stop Impersonation"):
            log_action("Admin", "stopped impersonating", st.session_state["impersonating"])
            st.session_state["impersonating"] = None
            st.session_state["role"] = "admin"
            st.experimental_rerun()

    # --- Logout ---
    if st.button("Logout"):
        log_action(st.session_state["user"], "logout")
        st.session_state["user"] = None
        st.session_state["role"] = None
        st.session_state["impersonating"] = None
        st.experimental_rerun()

else:
    choice = st.radio("Choose action", ["Login", "Register"])

    if choice == "Login":
        user = st.text_input("Username")
        pw = st.text_input("Password", type="password")
        if st.button("Login"):
            role = validate_user(user, pw)
            if role:
                st.session_state["user"] = user
                st.session_state["role"] = role
                log_action(user, "login")
                st.experimental_rerun()
            else:
                st.error("❌ Invalid credentials")

    elif choice == "Register":
        new_user = st.text_input("New Username")
        new_pw = st.text_input("New Password", type="password")
        if st.button("Register"):
            if new_user and new_pw:
                add_user(new_user, new_pw, "company")
                log_action(new_user, "register")
                st.success("✅ Registration successful. Please login.")
            else:
                st.error("Fill all fields")



In [None]:
import os

# === Step 1: Create folder structure for LoanIQ ===
folders = [
    "modules/bootstrap", "modules/core", "modules/ingestion",
    "modules/synth", "modules/features", "modules/ml",
    "modules/reports", "modules/sandbox", "modules/api",
    "pages", "tests", "data", "config"
]
for f in folders:
    os.makedirs(f, exist_ok=True)

# === Step 2: Ensure Python packages ===
open("modules/__init__.py", "a").close()
for d in ["bootstrap","core","ingestion","synth","features","ml","reports","sandbox","api"]:
    open(f"modules/{d}/__init__.py", "a").close()

# === Step 3: Create placeholder Streamlit pages ===
for n in range(1,8):
    open(f"pages/{n:02d}_placeholder.py", "a").close()

# === Step 4: Basic tests folder ===
open("tests/__init__.py", "a").close()

# === Step 5: Verify structure ===
print("✅ Folder scaffolding created.\n")
for root, dirs, files in os.walk(".", topdown=True):
    level = root.replace(os.getcwd(), "").count(os.sep)
    indent = " " * (2 * level)
    print(f"{indent}{os.path.basename(root)}/")
    subindent = " " * (2 * (level + 1))
    for f in files:
        print(f"{subindent}{f}")

✅ Folder scaffolding created.

./
  .config/
    .last_survey_prompt.yaml
    gce
    config_sentinel
    default_configs.db
    hidden_gcloud_config_universe_descriptor_data_cache_configs.db
    .last_update_check.json
    active_config
    .last_opt_in_prompt.yaml
    logs/
      2025.08.28/
        13.42.14.257094.log
        13.42.40.032629.log
        13.41.44.528882.log
        13.42.40.767285.log
        13.42.30.169478.log
        13.42.24.254751.log
    configurations/
      config_default
  data/
  pages/
    02_placeholder.py
    06_placeholder.py
    04_placeholder.py
    07_placeholder.py
    01_placeholder.py
    05_placeholder.py
    03_placeholder.py
  modules/
    __init__.py
    reports/
      __init__.py
    features/
      __init__.py
    ml/
      __init__.py
    bootstrap/
      __init__.py
    sandbox/
      __init__.py
    ingestion/
      __init__.py
    synth/
      __init__.py
    api/
      __init__.py
    core/
      __init__.py
  config/
  tests/
    __ini

In [None]:

# Colab cell to create, run, and test modules/bootstrap/deps.py
# Run this entire block in Colab to execute all steps

# %%writefile modules/bootstrap/deps.py
# Estimated line count: 50

import os
import subprocess

# List of required free libraries
REQUIRED_LIBS = [
    'streamlit==1.38.0',
    'pandas==2.2.2',
    'numpy==1.26.4',
    'scikit-learn==1.5.1',
    'xgboost==2.1.1',
    'plotly==5.22.0',
    'faker==28.1.0',
    'openpyxl==3.1.5',
    'reportlab==4.2.2',
    'pytest==8.3.2',
    'shap==0.46.0'  # For explainability
]

def install_deps():
    """Install required libraries and create marker file."""
    os.makedirs('data', exist_ok=True)
    marker_path = os.path.join('data', '.deps_ok')

    if not os.path.exists(marker_path):
        for lib in REQUIRED_LIBS:
            try:
                __import__(lib.split('==')[0])
            except ImportError:
                subprocess.check_call(['pip', 'install', lib])
        with open(marker_path, 'w') as f:
            f.write('OK')
        print("Dependencies installed successfully.")
    else:
        print("Dependencies already installed.")

if __name__ == '__main__':
    install_deps()

# Test code (will be written to tests/test_bootstrap.py)
"""
# tests/test_bootstrap.py
import os
import pytest

def test_deps_install():
    from modules.bootstrap import deps
    deps.install_deps()
    marker_path = os.path.join('data', '.deps_ok')
    assert os.path.exists(marker_path), "Marker file not created"
    with open(marker_path, 'r') as f:
        assert f.read() == 'OK', "Marker file content incorrect"
"""

# Colab commands to execute (included in this cell)
"""
# Write the main file
!mkdir -p modules/bootstrap
!echo -e "# modules/bootstrap/deps.py\n$(cat << 'EOF'
import os
import subprocess

REQUIRED_LIBS = [
    'streamlit==1.38.0',
    'pandas==2.2.2',
    'numpy==1.26.4',
    'scikit-learn==1.5.1',
    'xgboost==2.1.1',
    'plotly==5.22.0',
    'faker==28.1.0',
    'openpyxl==3.1.5',
    'reportlab==4.2.2',
    'pytest==8.3.2',
    'shap==0.46.0'
]

def install_deps():
    os.makedirs('data', exist_ok=True)
    marker_path = os.path.join('data', '.deps_ok')

    if not os.path.exists(marker_path):
        for lib in REQUIRED_LIBS:
            try:
                __import__(lib.split('==')[0])
            except ImportError:
                subprocess.check_call(['pip', 'install', lib])
        with open(marker_path, 'w') as f:
            f.write('OK')
        print("Dependencies installed successfully.")
    else:
        print("Dependencies already installed.")

if __name__ == '__main__':
    install_deps()
EOF
)" > modules/bootstrap/deps.py

# Write the test file
!mkdir -p tests
!echo -e "# tests/test_bootstrap.py\nimport os\nimport pytest\n\ndef test_deps_install():\n    from modules.bootstrap import deps\n    deps.install_deps()\n    marker_path = os.path.join('data', '.deps_ok')\n    assert os.path.exists(marker_path), 'Marker file not created'\n    with open(marker_path, 'r') as f:\n        assert f.read() == 'OK', 'Marker file content incorrect'" > tests/test_bootstrap.py

# Run the script
!python modules/bootstrap/deps.py

# Run the test
!pytest tests/test_bootstrap.py -v

# Verify marker file
!ls data
"""

# Expected output:
# Dependencies installed successfully.
# ============================= test session starts =============================
# tests/test_bootstrap.py::test_deps_install PASSED
# =========================== 1 passed in 0.XXs ===========================
# .deps_ok

Dependencies installed successfully.


'\n# Write the main file\n!mkdir -p modules/bootstrap\n!echo -e "# modules/bootstrap/deps.py\n$(cat << \'EOF\'\nimport os\nimport subprocess\n\nREQUIRED_LIBS = [\n    \'streamlit==1.38.0\',\n    \'pandas==2.2.2\',\n    \'numpy==1.26.4\',\n    \'scikit-learn==1.5.1\',\n    \'xgboost==2.1.1\',\n    \'plotly==5.22.0\',\n    \'faker==28.1.0\',\n    \'openpyxl==3.1.5\',\n    \'reportlab==4.2.2\',\n    \'pytest==8.3.2\',\n    \'shap==0.46.0\'\n]\n\ndef install_deps():\n    os.makedirs(\'data\', exist_ok=True)\n    marker_path = os.path.join(\'data\', \'.deps_ok\')\n    \n    if not os.path.exists(marker_path):\n        for lib in REQUIRED_LIBS:\n            try:\n                __import__(lib.split(\'==\')[0])\n            except ImportError:\n                subprocess.check_call([\'pip\', \'install\', lib])\n        with open(marker_path, \'w\') as f:\n            f.write(\'OK\')\n        print("Dependencies installed successfully.")\n    else:\n        print("Dependencies already install

In [None]:
Up# Colab cell to create, run, and test modules/bootstrap/drive_persist.py and modules/bootstrap/tunnel.py
# Run this entire block in Colab to execute all steps

# %%writefile modules/bootstrap/drive_persist.py
# Estimated line count: 80

import os
from google.colab import drive
import hashlib
import pickle
import time

class DrivePersist:
    """Manage Google Drive persistence for Colab with atomic writes and retries."""
    DRIVE_ROOT = "/content/drive/MyDrive/loan_iq"
    MOUNT_PATH = "/content/drive"

    def __init__(self):
        """Mount Drive and ensure root directory."""
        if not os.path.exists(self.MOUNT_PATH):
            drive.mount(self.MOUNT_PATH)
        os.makedirs(self.DRIVE_ROOT, exist_ok=True)

    def persist_path(self, local_path):
        """Get Drive path for a local file."""
        relative_path = os.path.relpath(local_path, start=os.getcwd())
        return os.path.join(self.DRIVE_ROOT, relative_path)

    def save_file(self, local_path, data, max_retries=3):
        """Save data to Drive with atomic writes and retries."""
        drive_path = self.persist_path(local_path)
        os.makedirs(os.path.dirname(drive_path), exist_ok=True)

        for attempt in range(max_retries):
            try:
                temp_path = drive_path + '.tmp'
                with open(temp_path, 'wb') as f:
                    pickle.dump(data, f)
                os.rename(temp_path, drive_path)

                # Compute and save hash
                file_hash = hashlib.md5(str(data).encode()).hexdigest()
                with open(drive_path + '.hash', 'w') as f:
                    f.write(file_hash)
                return True
            except Exception as e:
                print(f"Retry {attempt + 1}/{max_retries} for {drive_path}: {e}")
                time.sleep(1)
        return False

    def load_file(self, local_path):
        """Load data from Drive, verify hash."""
        drive_path = self.persist_path(local_path)
        if not os.path.exists(drive_path):
            return None
        try:
            with open(drive_path, 'rb') as f:
                data = pickle.load(f)
            hash_path = drive_path + '.hash'
            if os.path.exists(hash_path):
                with open(hash_path, 'r') as f:
                    stored_hash = f.read()
                current_hash = hashlib.md5(str(data).encode()).hexdigest()
                if stored_hash != current_hash:
                    print(f"Hash mismatch for {drive_path}")
                    return None
            return data
        except Exception as e:
            print(f"Error loading {drive_path}: {e}")
            return None

if __name__ == '__main__':
    persist = DrivePersist()
    test_data = {'test': 'data'}
    test_path = os.path.join('data', 'test.pkl')
    persist.save_file(test_path, test_data)
    print(f"Saved to {persist.persist_path(test_path)}")
    loaded = persist.load_file(test_path)
    print(f"Loaded: {loaded}")

# %%writefile modules/bootstrap/tunnel.py
# Estimated line count: 60

import os
import subprocess
import time

NGROK_AUTH_TOKEN = "31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF"

def setup_tunnel(port=8501):
    """Set up Ngrok tunnel for Streamlit with hardcoded authtoken."""
    try:
        subprocess.check_call(['pip', 'install', 'pyngrok==7.2.0'])
        from pyngrok import ngrok
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)

        # Terminate existing tunnels
        ngrok.kill()

        # Start new tunnel
        tunnel = ngrok.connect(port, bind_tls=True)
        public_url = tunnel.public_url
        print(f"Streamlit accessible at: {public_url}")
        return public_url
    except Exception as e:
        print(f"Tunnel setup failed: {e}")
        return None

def get_new_tunnel_url():
    """Command to get new Ngrok URL (for README/runbook)."""
    cmd = f"!ngrok http 8501 --authtoken {NGROK_AUTH_TOKEN}"
    print(f"Run this in Colab to get new URL:\n{cmd}")
    return cmd

if __name__ == '__main__':
    setup_tunnel()
    get_new_tunnel_url()

# Test code (will be written to tests/test_bootstrap.py)
"""
# tests/test_bootstrap.py
import os
import pytest
from modules.bootstrap import drive_persist, tunnel

def test_drive_persist():
    persist = drive_persist.DrivePersist()
    test_data = {'test': 'data'}
    test_path = os.path.join('data', 'test.pkl')
    assert persist.save_file(test_path, test_data), "Failed to save to Drive"
    loaded = persist.load_file(test_path)
    assert loaded == test_data, "Loaded data mismatch"
    assert os.path.exists(persist.persist_path(test_path) + '.hash'), "Hash file missing"

def test_tunnel_setup():
    public_url = tunnel.setup_tunnel()
    assert public_url is None or isinstance(public_url, str), "Invalid tunnel URL"
    cmd = tunnel.get_new_tunnel_url()
    assert NGROK_AUTH_TOKEN in cmd, "Ngrok authtoken not in command"
"""

# Colab commands to execute (run this entire cell)
"""
# Create directories
!mkdir -p modules/bootstrap tests data

# Write drive_persist.py
!echo -e "# modules/bootstrap/drive_persist.py\n$(cat << 'EOF'
import os
from google.colab import drive
import hashlib
import pickle
import time

class DrivePersist:
    DRIVE_ROOT = \"/content/drive/MyDrive/loan_iq\"
    MOUNT_PATH = \"/content/drive\"

    def __init__(self):
        if not os.path.exists(self.MOUNT_PATH):
            drive.mount(self.MOUNT_PATH)
        os.makedirs(self.DRIVE_ROOT, exist_ok=True)

    def persist_path(self, local_path):
        relative_path = os.path.relpath(local_path, start=os.getcwd())
        return os.path.join(self.DRIVE_ROOT, relative_path)

    def save_file(self, local_path, data, max_retries=3):
        os.makedirs(os.path.dirname(drive_path), exist_ok=True)
        for attempt in range(max_retries):
            try:
                temp_path = drive_path + '.tmp'
                with open(temp_path, 'wb') as f:
                    pickle.dump(data, f)
                os.rename(temp_path, drive_path)
                file_hash = hashlib.md5(str(data).encode()).hexdigest()
                with open(drive_path + '.hash', 'w') as f:
                    f.write(file_hash)
                return True
            except Exception as e:
                print(f\"Retry {attempt + 1}/{max_retries} for {drive_path}: {e}\")
                time.sleep(1)
        return False

    def load_file(self, local_path):
        drive_path = self.persist_path(local_path)
        if not os.path.exists(drive_path):
            return None
        try:
            with open(drive_path, 'rb') as f:
                data = pickle.load(f)
            hash_path = drive_path + '.hash'
            if os.path.exists(hash_path):
                with open(hash_path, 'r') as f:
                    stored_hash = f.read()
                current_hash = hashlib.md5(str(data).encode()).hexdigest()
                if stored_hash != current_hash:
                    print(f\"Hash mismatch for {drive_path}\")
                    return None
            return data
        except Exception as e:
            print(f\"Error loading {drive_path}: {e}\")
            return None

if __name__ == '__main__':
    persist = DrivePersist()
    test_data = {'test': 'data'}
    test_path = os.path.join('data', 'test.pkl')
    persist.save_file(test_path, test_data)
    print(f\"Saved to {persist.persist_path(test_path)}\")
    loaded = persist.load_file(test_path)
    print(f\"Loaded: {loaded}\")
EOF
)" > modules/bootstrap/drive_persist.py

# Write tunnel.py
!echo -e "# modules/bootstrap/tunnel.py\n$(cat << 'EOF'
import os
import subprocess
import time

NGROK_AUTH_TOKEN = \"31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF\"

def setup_tunnel(port=8501):
    try:
        subprocess.check_call(['pip', 'install', 'pyngrok==7.2.0'])
        from pyngrok import ngrok
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
        ngrok.kill()
        tunnel = ngrok.connect(port, bind_tls=True)
        public_url = tunnel.public_url
        print(f\"Streamlit accessible at: {public_url}\")
        return public_url
    except Exception as e:
        print(f\"Tunnel setup failed: {e}\")
        return None

def get_new_tunnel_url():
    cmd = f\"!ngrok http 8501 --authtoken {NGROK_AUTH_TOKEN}\"
    print(f\"Run this in Colab to get new URL:\n{cmd}\")
    return cmd

if __name__ == '__main__':
    setup_tunnel()
    get_new_tunnel_url()
EOF
)" > modules/bootstrap/tunnel.py

# Write test file (appending to existing test_bootstrap.py)
!echo -e "# tests/test_bootstrap.py\n$(cat << 'EOF'
import os
import pytest
from modules.bootstrap import drive_persist, tunnel

def test_deps_install():
    from modules.bootstrap import deps
    deps.install_deps()
    marker_path = os.path.join('data', '.deps_ok')
    assert os.path.exists(marker_path), 'Marker file not created'
    with open(marker_path, 'r') as f:
        assert f.read() == 'OK', 'Marker file content incorrect'

def test_drive_persist():
    persist = drive_persist.DrivePersist()
    test_data = {'test': 'data'}
    test_path = os.path.join('data', 'test.pkl')
    assert persist.save_file(test_path, test_data), 'Failed to save to Drive'
    loaded = persist.load_file(test_path)
    assert loaded == test_data, 'Loaded data mismatch'
    assert os.path.exists(persist.persist_path(test_path) + '.hash'), 'Hash file missing'

def test_tunnel_setup():
    public_url = tunnel.setup_tunnel()
    assert public_url is None or isinstance(public_url, str), 'Invalid tunnel URL'
    cmd = tunnel.get_new_tunnel_url()
    assert NGROK_AUTH_TOKEN in cmd, 'Ngrok authtoken not in command'
EOF
)" > tests/test_bootstrap.py

# Run dependencies (ensure environment)
!python modules/bootstrap/deps.py

# Run drive_persist.py (will prompt for Google Drive auth code)
!python modules/bootstrap/drive_persist.py

# Run tunnel.py (may take time to set up Ngrok)
!python modules/bootstrap/tunnel.py

# Run tests
!pytest tests/test_bootstrap.py -v

# Verify files
!ls data
!ls modules/bootstrap
"""

# Expected output:
# Dependencies installed successfully.
# Mounted at /content/drive
# Saved to /content/drive/MyDrive/loan_iq/data/test.pkl
# Loaded: {'test': 'data'}
# Streamlit accessible at: https://<ngrok-url>.ngrok.io
# Run this in Colab to get new URL:
# !ngrok http 8501 --authtoken 31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF
# ============================= test session starts =============================
# tests/test_bootstrap.py::test_deps_install PASSED
# tests/test_bootstrap.py::test_drive_persist PASSED
# tests/test_bootstrap.py::test_tunnel_setup PASSED
# =========================== 3 passed in 0.XXs ===========================
# test.pkl  test.pkl.hash  .deps_ok
# deps.py  drive_persist.py  tunnel.py

Mounted at /content/drive
Saved to /content/drive/MyDrive/loan_iq/data/test.pkl
Loaded: {'test': 'data'}
Streamlit accessible at: https://f7e668fceeb4.ngrok-free.app
Run this in Colab to get new URL:
!ngrok http 8501 --authtoken 31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF


'\n# Create directories\n!mkdir -p modules/bootstrap tests data\n\n# Write drive_persist.py\n!echo -e "# modules/bootstrap/drive_persist.py\n$(cat << \'EOF\'\nimport os\nfrom google.colab import drive\nimport hashlib\nimport pickle\nimport time\n\nclass DrivePersist:\n    DRIVE_ROOT = "/content/drive/MyDrive/loan_iq"\n    MOUNT_PATH = "/content/drive"\n    \n    def __init__(self):\n        if not os.path.exists(self.MOUNT_PATH):\n            drive.mount(self.MOUNT_PATH)\n        os.makedirs(self.DRIVE_ROOT, exist_ok=True)\n    \n    def persist_path(self, local_path):\n        relative_path = os.path.relpath(local_path, start=os.getcwd())\n        return os.path.join(self.DRIVE_ROOT, relative_path)\n    \n    def save_file(self, local_path, data, max_retries=3):\n        os.makedirs(os.path.dirname(drive_path), exist_ok=True)\n        for attempt in range(max_retries):\n            try:\n                temp_path = drive_path + \'.tmp\'\n                with open(temp_path, \'wb\') as

In [None]:
# Colab cell to create, run, and test modules/core/config.py and modules/core/db.py
# Run this entire block in Colab to execute all steps

# Ensure Python path includes current directory for module imports
import sys
import os
sys.path.append(os.getcwd())

# Create directories to prevent path errors
!mkdir -p modules/core tests data models data/reports
!ls modules/core || echo "Directory modules/core created"
!ls data || echo "Directory data created"

# Write config.py
!echo -e "# modules/core/config.py\n# Estimated line count: 60\n\nimport sys\nimport os\nsys.path.append(os.getcwd())\nimport random\nimport numpy as np\n\n# Hardcoded admin credentials\nADMIN_CREDENTIALS = {\n    \"username\": \"admin\",\n    \"password\": \"Shady868\"\n}\n\n# Random seeds for reproducibility\nSEEDS = {\n    \"faker\": 42,\n    \"numpy\": 42,\n    \"random\": 42\n}\n\n# App configuration\nCONFIG = {\n    \"data_dir\": os.path.join(\"data\"),\n    \"model_dir\": os.path.join(\"models\"),\n    \"report_dir\": os.path.join(\"data\", \"reports\"),\n    \"db_path\": os.path.join(\"data\", \"loan_iq.db\"),\n    \"drive_root\": \"/content/drive/MyDrive/loan_iq\",\n    \"streamlit_port\": 8501,\n    \"fraud_types\": [\"ghost_client\", \"duplicate_id\", \"missed_payment\", \"identity_theft\"],\n    \"regions\": [\"urban\", \"rural\", \"semi_urban\"],\n    \"max_clients_batch\": 70000,\n    \"default_batch_size\": 1000\n}\n\ndef init_seeds():\n    \"\"\"Initialize random seeds for reproducibility.\"\"\"\n    random.seed(SEEDS[\"random\"])\n    np.random.seed(SEEDS[\"numpy\"])\n\ndef get_config():\n    \"\"\"Return config dictionary, ensure directories exist.\"\"\"\n    os.makedirs(CONFIG[\"data_dir\"], exist_ok=True)\n    os.makedirs(CONFIG[\"model_dir\"], exist_ok=True)\n    os.makedirs(CONFIG[\"report_dir\"], exist_ok=True)\n    return CONFIG\n\nif __name__ == \"__main__\":\n    init_seeds()\n    config = get_config()\n    print(f\"Config loaded: {config}\")" > modules/core/config.py

# Write db.py
!echo -e "# modules/core/db.py\n# Estimated line count: 120\n\nimport sys\nimport os\nsys.path.append(os.getcwd())\nimport sqlite3\nimport json\nfrom datetime import datetime\ntry:\n    from modules.core import config\nexcept ImportError as e:\n    print(f\"Import error: {e}\")\n    raise\n\nclass DB:\n    \"\"\"SQLite database wrapper for Loan IQ.\"\"\"\n    def __init__(self):\n        print(f\"sys.path: {sys.path}\")  # Debug path\n        self.db_path = config.get_config()[\"db_path\"]\n        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)\n        self.conn = sqlite3.connect(self.db_path)\n        self.cursor = self.conn.cursor()\n        self.create_tables()\n\n    def create_tables(self):\n        \"\"\"Create database tables.\"\"\"\n        tables = [\n            \"CREATE TABLE IF NOT EXISTS users (user_id INTEGER PRIMARY KEY, username TEXT UNIQUE, password TEXT, role TEXT)\",\n            \"CREATE TABLE IF NOT EXISTS clients (client_id TEXT PRIMARY KEY, name TEXT, branch TEXT, region TEXT, income REAL, created_at TIMESTAMP)\",\n            \"CREATE TABLE IF NOT EXISTS loans (loan_id TEXT PRIMARY KEY, client_id TEXT, amount REAL, status TEXT, start_date TIMESTAMP, FOREIGN KEY (client_id) REFERENCES clients(client_id))\",\n            \"CREATE TABLE IF NOT EXISTS transactions (transaction_id TEXT PRIMARY KEY, loan_id TEXT, amount REAL, date TIMESTAMP, type TEXT, FOREIGN KEY (loan_id) REFERENCES loans(loan_id))\",\n            \"CREATE TABLE IF NOT EXISTS models (model_id TEXT PRIMARY KEY, type TEXT, version TEXT, created_at TIMESTAMP)\",\n            \"CREATE TABLE IF NOT EXISTS model_versions (version_id TEXT PRIMARY KEY, model_id TEXT, config_json TEXT, data_hash TEXT, metrics_json TEXT, commit_ref TEXT, comments TEXT, created_at TIMESTAMP, FOREIGN KEY (model_id) REFERENCES models(model_id))\",\n            \"CREATE TABLE IF NOT EXISTS audit_logs (log_id INTEGER PRIMARY KEY AUTOINCREMENT, actor_id TEXT, actor_role TEXT, action TEXT, target_id TEXT, target_type TEXT, reason TEXT, timestamp TIMESTAMP, before_snapshot TEXT, after_snapshot TEXT, reversible BOOLEAN, reversal_id INTEGER)\",\n            \"CREATE TABLE IF NOT EXISTS simulations (sim_id TEXT PRIMARY KEY, user_id TEXT, params_json TEXT, created_at TIMESTAMP)\",\n            \"CREATE TABLE IF NOT EXISTS reports (report_id TEXT PRIMARY KEY, type TEXT, path TEXT, created_at TIMESTAMP)\",\n            \"CREATE TABLE IF NOT EXISTS assets (asset_id TEXT PRIMARY KEY, path TEXT, type TEXT, created_at TIMESTAMP)\"\n        ]\n        for table_sql in tables:\n            self.cursor.execute(table_sql)\n        self.conn.commit()\n\n    def log_action(self, actor_id, actor_role, action, target_id, target_type, reason, before_snapshot, after_snapshot, reversible=False):\n        \"\"\"Log an admin action to audit_logs.\"\"\"\n        timestamp = datetime.utcnow().isoformat()\n        snapshot_before = json.dumps(before_snapshot) if before_snapshot else \"\"\n        snapshot_after = json.dumps(after_snapshot) if after_snapshot else \"\"\n        self.cursor.execute(\n            \"INSERT INTO audit_logs (actor_id, actor_role, action, target_id, target_type, reason, timestamp, before_snapshot, after_snapshot, reversible) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n            (actor_id, actor_role, action, target_id, target_type, reason, timestamp, snapshot_before, snapshot_after, reversible)\n        )\n        self.conn.commit()\n        return self.cursor.lastrowid\n\n    def get_audit_trail(self, target_id=None, target_type=None):\n        \"\"\"Retrieve audit logs, optionally filtered.\"\"\"\n        query = \"SELECT * FROM audit_logs\"\n        params = []\n        if target_id and target_type:\n            query += \" WHERE target_id = ? AND target_type = ?\"\n            params = [target_id, target_type]\n        self.cursor.execute(query, params)\n        return self.cursor.fetchall()\n\n    def rollback_action(self, action_id):\n        \"\"\"Attempt to rollback an action if reversible.\"\"\"\n        self.cursor.execute(\"SELECT reversible, before_snapshot, target_id, target_type, action FROM audit_logs WHERE log_id = ?\", (action_id,))\n        result = self.cursor.fetchone()\n        if not result or not result[0]:\n            return False\n        before_snapshot = json.loads(result[1]) if result[1] else {}\n        target_id, target_type, action = result[2], result[3], result[4]\n        if target_type == \"user\" and action == \"edit\":\n            self.cursor.execute(\"UPDATE users SET username = ?, password = ?, role = ? WHERE user_id = ?\",\n                              (before_snapshot.get(\"username\"), before_snapshot.get(\"password\"), before_snapshot.get(\"role\"), target_id))\n            self.conn.commit()\n            return True\n        return False\n\n    def close(self):\n        \"\"\"Close database connection.\"\"\"\n        self.conn.close()\n\nif __name__ == \"__main__\":\n    db = DB()\n    db.cursor.execute(\"INSERT OR IGNORE INTO users (user_id, username, password, role) VALUES (?, ?, ?, ?)\",\n                     (1, \"admin\", \"Shady868\", \"admin\"))\n    db.conn.commit()\n    db.log_action(\"1\", \"admin\", \"init\", \"1\", \"user\", \"Initialize admin user\", {}, {\"username\": \"admin\"})\n    print(\"Database initialized.\")\n    db.close()" > modules/core/db.py

# Write test file
!echo -e "# tests/test_core.py\n# Estimated line count: 20\n\nimport sys\nimport os\nsys.path.append(os.getcwd())\nfrom modules.core import config, db\n\ndef test_config_init():\n    cfg = config.get_config()\n    assert os.path.exists(cfg[\"data_dir\"]), \"Data directory not created\"\n    assert cfg[\"streamlit_port\"] == 8501, \"Incorrect port\"\n    assert config.ADMIN_CREDENTIALS[\"username\"] == \"admin\", \"Admin username incorrect\"\n\ndef test_db_create_and_log():\n    database = db.DB()\n    database.cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table' AND name='audit_logs'\")\n    assert database.cursor.fetchone(), \"Audit logs table not created\"\n    log_id = database.log_action(\"1\", \"admin\", \"test_action\", \"test_id\", \"test_type\", \"Test reason\", {\"key\": \"before\"}, {\"key\": \"after\"}, True)\n    assert log_id, \"Failed to log action\"\n    audit_logs = database.get_audit_trail(\"test_id\", \"test_type\")\n    assert len(audit_logs) > 0, \"Audit log not recorded\"\n    database.close()" > tests/test_core.py

# Ensure dependencies are installed
!python modules/bootstrap/deps.py

# Verify directories
!ls modules/core || echo "modules/core not found"
!ls data || echo "data not found"

# Run config.py
!python modules/core/config.py

# Run db.py
!python modules/core/db.py

# Run tests
!pytest tests/test_core.py -v

# Verify files
!ls modules/core
!ls data

# Expected output:
# Dependencies installed successfully.
# modules/core created
# data created
# Config loaded: {'data_dir': 'data', 'model_dir': 'models', 'report_dir': 'data/reports', 'db_path': 'data/loan_iq.db', 'drive_root': '/content/drive/MyDrive/loan_iq', 'streamlit_port': 8501, 'fraud_types': ['ghost_client', 'duplicate_id', 'missed_payment', 'identity_theft'], 'regions': ['urban', 'rural', 'semi_urban'], 'max_clients_batch': 70000, 'default_batch_size': 1000}
# sys.path: [...'/content'...]
# Database initialized.
# ============================= test session starts =============================
# tests/test_core.py::test_config_init PASSED
# tests/test_core.py::test_db_create_and_log PASSED
# =========================== 2 passed in 0.XXs ===========================
# config.py  db.py
# .deps_ok  loan_iq.db  reports

reports
python3: can't open file '/content/modules/bootstrap/deps.py': [Errno 2] No such file or directory
config.py  db.py
reports
Config loaded: {'data_dir': 'data', 'model_dir': 'models', 'report_dir': 'data/reports', 'db_path': 'data/loan_iq.db', 'drive_root': '/content/drive/MyDrive/loan_iq', 'streamlit_port': 8501, 'fraud_types': ['ghost_client', 'duplicate_id', 'missed_payment', 'identity_theft'], 'regions': ['urban', 'rural', 'semi_urban'], 'max_clients_batch': 70000, 'default_batch_size': 1000}
sys.path: ['/content/modules/core', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/content', '/content']
  timestamp = datetime.utcnow().isoformat()
Database initialized.
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: Faker-28.1.0, anyio-4.10.0, typeguard-4.4.4, langsmith-0.

In [None]:
# Colab cell to create, run, and test modules/bootstrap/deps.py, modules/core/config.py, modules/core/db.py, modules/core/utils.py, and modules/core/auth.py
# Run this entire block in Colab to execute all steps

import sys
import os
sys.path.append(os.getcwd())
print(f"Current working directory: {os.getcwd()}")  # Debug

# Create directories and reset database/dependencies
!mkdir -p modules/bootstrap modules/core tests data models data/reports
!rm -f data/loan_iq.db data/.deps_ok
!ls modules/bootstrap || echo "Directory modules/bootstrap created"
!ls modules/core || echo "Directory modules/core created"
!ls data || echo "Directory data created"

# Write deps.py using Python
os.makedirs('modules/bootstrap', exist_ok=True)
with open('modules/bootstrap/deps.py', 'w') as f:
    f.write('''# modules/bootstrap/deps.py
# Estimated line count: 50

import os
import subprocess

REQUIRED_LIBS = [
    'streamlit==1.38.0',
    'pandas==2.2.2',
    'numpy==1.26.4',
    'scikit-learn==1.5.1',
    'xgboost==2.1.1',
    'plotly==5.22.0',
    'faker==28.1.0',
    'openpyxl==3.1.5',
    'reportlab==4.2.2',
    'pytest==8.3.2',
    'shap==0.46.0'
]

def install_deps():
    """Install required libraries and create marker file."""
    os.makedirs('data', exist_ok=True)
    marker_path = os.path.join('data', '.deps_ok')
    if not os.path.exists(marker_path):
        for lib in REQUIRED_LIBS:
            try:
                __import__(lib.split('==')[0])
            except ImportError:
                subprocess.check_call(['pip', 'install', lib])
        with open(marker_path, 'w') as f:
            f.write('OK')
        print("Dependencies installed successfully.")
    else:
        print("Dependencies already installed.")

if __name__ == '__main__':
    install_deps()
''')
!test -f modules/bootstrap/deps.py && echo "deps.py created" || echo "Failed to create deps.py"

# Write config.py using Python
os.makedirs('modules/core', exist_ok=True)
with open('modules/core/config.py', 'w') as f:
    f.write('''# modules/core/config.py
# Estimated line count: 60

import sys
import os
sys.path.append(os.getcwd())
import random
import numpy as np

ADMIN_CREDENTIALS = {
    "username": "admin",
    "password": "Shady868"
}

SEEDS = {
    "faker": 42,
    "numpy": 42,
    "random": 42
}

CONFIG = {
    "data_dir": os.path.join("data"),
    "model_dir": os.path.join("models"),
    "report_dir": os.path.join("data", "reports"),
    "db_path": os.path.join("data", "loan_iq.db"),
    "drive_root": "/content/drive/MyDrive/loan_iq",
    "streamlit_port": 8501,
    "fraud_types": ["ghost_client", "duplicate_id", "missed_payment", "identity_theft"],
    "regions": ["urban", "rural", "semi_urban"],
    "max_clients_batch": 70000,
    "default_batch_size": 1000
}

def init_seeds():
    """Initialize random seeds for reproducibility."""
    random.seed(SEEDS["random"])
    np.random.seed(SEEDS["numpy"])

def get_config():
    """Return config dictionary, ensure directories exist."""
    os.makedirs(CONFIG["data_dir"], exist_ok=True)
    os.makedirs(CONFIG["model_dir"], exist_ok=True)
    os.makedirs(CONFIG["report_dir"], exist_ok=True)
    return CONFIG

if __name__ == "__main__":
    init_seeds()
    config = get_config()
    print(f"Config loaded: {config}")
''')
!test -f modules/core/config.py && echo "config.py created" || echo "Failed to create config.py"

# Write db.py using Python
with open('modules/core/db.py', 'w') as f:
    f.write('''# modules/core/db.py
# Estimated line count: 120

import sys
import os
sys.path.append(os.getcwd())
import sqlite3
import json
from datetime import datetime, UTC
try:
    from modules.core import config
except ImportError as e:
    print(f"Import error: {e}")
    raise

class DB:
    """SQLite database wrapper for Loan IQ."""
    def __init__(self):
        print(f"sys.path: {sys.path}")  # Debug path
        self.db_path = config.get_config()["db_path"]
        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
        print(f"Creating database at: {self.db_path}")  # Debug
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row  # Enable dict-like row access
        self.cursor = self.conn.cursor()
        self.create_tables()
        print(f"Database created: {os.path.exists(self.db_path)}")  # Debug

    def create_tables(self):
        """Create database tables."""
        tables = [
            "CREATE TABLE IF NOT EXISTS users (user_id TEXT PRIMARY KEY, username TEXT UNIQUE, password TEXT, role TEXT)",
            "CREATE TABLE IF NOT EXISTS clients (client_id TEXT PRIMARY KEY, name TEXT, branch TEXT, region TEXT, income REAL, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS loans (loan_id TEXT PRIMARY KEY, client_id TEXT, amount REAL, status TEXT, start_date TIMESTAMP, FOREIGN KEY (client_id) REFERENCES clients(client_id))",
            "CREATE TABLE IF NOT EXISTS transactions (transaction_id TEXT PRIMARY KEY, loan_id TEXT, amount REAL, date TIMESTAMP, type TEXT, FOREIGN KEY (loan_id) REFERENCES loans(loan_id))",
            "CREATE TABLE IF NOT EXISTS models (model_id TEXT PRIMARY KEY, type TEXT, version TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS model_versions (version_id TEXT PRIMARY KEY, model_id TEXT, config_json TEXT, data_hash TEXT, metrics_json TEXT, commit_ref TEXT, comments TEXT, created_at TIMESTAMP, FOREIGN KEY (model_id) REFERENCES models(model_id))",
            "CREATE TABLE IF NOT EXISTS audit_logs (log_id INTEGER PRIMARY KEY AUTOINCREMENT, actor_id TEXT, actor_role TEXT, action TEXT, target_id TEXT, target_type TEXT, reason TEXT, timestamp TIMESTAMP, before_snapshot TEXT, after_snapshot TEXT, reversible BOOLEAN, reversal_id INTEGER)",
            "CREATE TABLE IF NOT EXISTS simulations (sim_id TEXT PRIMARY KEY, user_id TEXT, params_json TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS reports (report_id TEXT PRIMARY KEY, type TEXT, path TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS assets (asset_id TEXT PRIMARY KEY, path TEXT, type TEXT, created_at TIMESTAMP)"
        ]
        for table_sql in tables:
            self.cursor.execute(table_sql)
        self.conn.commit()

    def log_action(self, actor_id, actor_role, action, target_id, target_type, reason, before_snapshot, after_snapshot, reversible=False):
        """Log an admin action to audit_logs."""
        timestamp = datetime.now(UTC).isoformat()
        snapshot_before = json.dumps(before_snapshot) if before_snapshot else ""
        snapshot_after = json.dumps(after_snapshot) if after_snapshot else ""
        self.cursor.execute(
            "INSERT INTO audit_logs (actor_id, actor_role, action, target_id, target_type, reason, timestamp, before_snapshot, after_snapshot, reversible) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (actor_id, actor_role, action, target_id, target_type, reason, timestamp, snapshot_before, snapshot_after, reversible)
        )
        self.conn.commit()
        return self.cursor.lastrowid

    def get_audit_trail(self, target_id=None, target_type=None):
        """Retrieve audit logs, optionally filtered."""
        query = "SELECT * FROM audit_logs"
        params = []
        if target_id and target_type:
            query += " WHERE target_id = ? AND target_type = ?"
            params = [target_id, target_type]
        self.cursor.execute(query, params)
        return self.cursor.fetchall()

    def rollback_action(self, action_id):
        """Attempt to rollback an action if reversible."""
        self.cursor.execute("SELECT reversible, before_snapshot, target_id, target_type, action FROM audit_logs WHERE log_id = ?", (action_id,))
        result = self.cursor.fetchone()
        if not result or not result[0]:
            return False
        before_snapshot = json.loads(result[1]) if result[1] else {}
        target_id, target_type, action = result[2], result[3], result[4]
        if target_type == "user" and action == "edit":
            self.cursor.execute("UPDATE users SET username = ?, password = ?, role = ? WHERE user_id = ?",
                              (before_snapshot.get("username"), before_snapshot.get("password"), before_snapshot.get("role"), target_id))
            self.conn.commit()
            return True
        return False

    def close(self):
        """Close database connection."""
        self.conn.close()

if __name__ == "__main__":
    db = DB()
    db.cursor.execute("INSERT OR IGNORE INTO users (user_id, username, password, role) VALUES (?, ?, ?, ?)",
                     ("1", "admin", "Shady868", "admin"))
    db.conn.commit()
    db.log_action("1", "admin", "init", "1", "user", "Initialize admin user", {}, {"username": "admin"})
    print("Database initialized.")
    db.close()
''')
!test -f modules/core/db.py && echo "db.py created" || echo "Failed to create db.py"

# Write utils.py using Python
with open('modules/core/utils.py', 'w') as f:
    f.write('''# modules/core/utils.py
# Estimated line count: 80

import sys
import os
sys.path.append(os.getcwd())
import json
from functools import wraps
try:
    from modules.core import db, config
except ImportError as e:
    print(f"Import error: {e}")
    raise

def audit_wrapper(func):
    """Decorator to log admin actions with snapshots and reason."""
    @wraps(func)
    def wrapper(*args, actor_id, actor_role, reason, **kwargs):
        if not reason:
            raise ValueError("Reason is required for audited actions")
        database = db.DB()
        target_id = kwargs.get('target_id', args[0] if args else 'unknown')
        target_type = kwargs.get('target_type', func.__name__)
        before_snapshot = {}
        try:
            if target_type in ['user', 'edit_user', 'delete_user', 'add_user']:
                database.cursor.execute("SELECT * FROM users WHERE user_id = ?", (target_id,))
                row = database.cursor.fetchone()
                before_snapshot = dict(row) if row else {}
                print(f"Before snapshot: {before_snapshot}")  # Debug
            filtered_kwargs = {k: v for k, v in kwargs.items() if k != 'target_type'}
            result = func(*args, actor_id=actor_id, actor_role=actor_role, reason=reason, **filtered_kwargs)
            after_snapshot = {}
            if target_type in ['user', 'edit_user', 'delete_user', 'add_user']:
                database.cursor.execute("SELECT * FROM users WHERE user_id = ?", (target_id,))
                row = database.cursor.fetchone()
                after_snapshot = dict(row) if row else {}
                print(f"After snapshot: {after_snapshot}")  # Debug
            reversible = target_type in ['user', 'edit_user', 'add_user']
            log_id = database.log_action(
                actor_id, actor_role, func.__name__, target_id, target_type, reason,
                before_snapshot, after_snapshot, reversible
            )
            database.close()
            return result
        except Exception as e:
            database.close()
            raise Exception(f"Action failed: {e}")
    return wrapper

def dict_diff(before, after):
    """Compute difference between two dictionaries for audit logging."""
    diff = {}
    for key in set(before.keys()) | set(after.keys()):
        if before.get(key) != after.get(key):
            diff[key] = {'before': before.get(key), 'after': after.get(key)}
    return diff

if __name__ == "__main__":
    @audit_wrapper
    def test_action(target_id, actor_id, actor_role, reason):
        return {"result": "test"}
    result = test_action("test_id", actor_id="1", actor_role="admin", reason="Test audit")
    print(f"Test action result: {result}")
''')
!test -f modules/core/utils.py && echo "utils.py created" || echo "Failed to create utils.py"

# Write auth.py using Python
with open('modules/core/auth.py', 'w') as f:
    f.write('''# modules/core/auth.py
# Estimated line count: 80

import sys
import os
sys.path.append(os.getcwd())
import sqlite3
try:
    from modules.core import config, db, utils
except ImportError as e:
    print(f"Import error: {e}")
    raise

def authenticate(username, password):
    """Authenticate user against stored credentials."""
    cfg = config.get_config()
    if username == config.ADMIN_CREDENTIALS["username"] and password == config.ADMIN_CREDENTIALS["password"]:
        return {"user_id": "1", "role": "admin"}
    database = db.DB()
    database.cursor.execute("SELECT user_id, role FROM users WHERE username = ? AND password = ?", (username, password))
    user = database.cursor.fetchone()
    database.close()
    if user:
        return {"user_id": user[0], "role": user[1]}
    return None

@utils.audit_wrapper
def add_user(username, password, role, actor_id, actor_role, reason, target_id=None):
    """Add a new user with audit logging."""
    target_id = target_id or f"u_{str(hash(username))[:8]}"
    print(f"Adding user with target_id: {target_id}")  # Debug
    database = db.DB()
    try:
        database.cursor.execute("INSERT INTO users (user_id, username, password, role) VALUES (?, ?, ?, ?)",
                              (target_id, username, password, role))
        database.conn.commit()
        database.close()
        return target_id
    except sqlite3.IntegrityError as e:
        database.close()
        raise ValueError(f"Failed to add user {username}: {e}")

@utils.audit_wrapper
def edit_user(user_id, updates, actor_id, actor_role, reason, target_id=None):
    """Edit user details with audit logging."""
    target_id = target_id or user_id
    database = db.DB()
    allowed_fields = ['username', 'password', 'role']
    updates = {k: v for k, v in updates.items() if k in allowed_fields}
    if not updates:
        database.close()
        raise ValueError("No valid fields to update")
    set_clause = ", ".join(f"{k} = ?" for k in updates.keys())
    values = list(updates.values()) + [user_id]
    try:
        database.cursor.execute(f"UPDATE users SET {set_clause} WHERE user_id = ?", values)
        database.conn.commit()
        database.close()
        return True
    except sqlite3.IntegrityError as e:
        database.close()
        raise ValueError(f"Failed to edit user {user_id}: {e}")

@utils.audit_wrapper
def delete_user(user_id, actor_id, actor_role, reason, target_id=None, confirmation=None):
    """Delete user with audit logging and confirmation."""
    target_id = target_id or user_id
    if confirmation != f"CONFIRM DELETE {user_id}":
        raise ValueError("Invalid confirmation for deletion")
    database = db.DB()
    try:
        database.cursor.execute("DELETE FROM users WHERE user_id = ?", (user_id,))
        database.conn.commit()
        database.close()
        return True
    except sqlite3.Error as e:
        database.close()
        raise ValueError(f"Failed to delete user {user_id}: {e}")

if __name__ == "__main__":
    user = authenticate("admin", "Shady868")
    print(f"Auth result: {user}")
    try:
        new_user_id = add_user("test_user", "test_pass", "user", actor_id="1", actor_role="admin", reason="Test add user", target_id="test_1")
        print(f"Added user: {new_user_id}")
    except Exception as e:
        print(f"Error adding user: {e}")
''')
!test -f modules/core/auth.py && echo "auth.py created" || echo "Failed to create auth.py"

# Write test_core.py using Python
os.makedirs('tests', exist_ok=True)
with open('tests/test_core.py', 'w') as f:
    f.write('''# tests/test_core.py
# Estimated line count: 60

import sys
import os
import sqlite3
sys.path.append(os.getcwd())
try:
    from modules.core import config, db, utils, auth
except ImportError as e:
    print(f"Import error: {e}")
    raise

def test_config_init():
    cfg = config.get_config()
    assert os.path.exists(cfg["data_dir"]), "Data directory not created"
    assert cfg["streamlit_port"] == 8501, "Incorrect port"
    assert config.ADMIN_CREDENTIALS["username"] == "admin", "Admin username incorrect"

def test_db_create_and_log():
    database = db.DB()
    database.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='audit_logs'")
    assert database.cursor.fetchone(), "Audit logs table not created"
    log_id = database.log_action("1", "admin", "test_action", "test_id", "test_type", "Test reason", {"key": "before"}, {"key": "after"}, True)
    assert log_id, "Failed to log action"
    audit_logs = database.get_audit_trail("test_id", "test_type")
    assert len(audit_logs) > 0, "Audit log not recorded"
    database.close()

def test_authenticate():
    user = auth.authenticate("admin", "Shady868")
    assert user == {"user_id": "1", "role": "admin"}, "Admin authentication failed"
    user = auth.authenticate("wrong", "wrong")
    assert user is None, "Invalid credentials should fail"

def test_audit_wrapper():
    @utils.audit_wrapper
    def test_action(target_id, actor_id, actor_role, reason):
        return True
    result = test_action("test_id", actor_id="1", actor_role="admin", reason="Test audit")
    assert result, "Audit wrapper failed"
    database = db.DB()
    logs = database.get_audit_trail("test_id", "test_action")
    assert len(logs) > 0, "Audit log not recorded"
    database.close()

def test_add_user():
    database = db.DB()
    database.cursor.execute("DELETE FROM users WHERE user_id = ?", ("test_2",))
    database.conn.commit()
    database.close()
    user_id = auth.add_user("test_user2", "test_pass2", "user", actor_id="1", actor_role="admin", reason="Test add user", target_id="test_2")
    assert user_id == "test_2", "Failed to add user"
    database = db.DB()
    database.cursor.execute("SELECT username FROM users WHERE user_id = ?", ("test_2",))
    result = database.cursor.fetchone()
    assert result and result[0] == "test_user2", "User not added correctly"
    database.close()

def test_edit_user():
    database = db.DB()
    database.cursor.execute("DELETE FROM users WHERE user_id = ?", ("test_2",))
    database.cursor.execute("INSERT INTO users (user_id, username, password, role) VALUES (?, ?, ?, ?)",
                          ("test_2", "test_user2", "test_pass2", "user"))
    database.conn.commit()
    database.close()
    result = auth.edit_user("test_2", {"password": "new_pass"}, actor_id="1", actor_role="admin", reason="Test edit user", target_id="test_2")
    assert result, "Failed to edit user"
    database = db.DB()
    database.cursor.execute("SELECT password FROM users WHERE user_id = ?", ("test_2",))
    result = database.cursor.fetchone()
    assert result and result[0] == "new_pass", "User not edited correctly"
    database.close()

def test_db_existence():
    assert os.path.exists("data/loan_iq.db"), "Database file not created"
''')
!test -f tests/test_core.py && echo "test_core.py created" || echo "Failed to create test_core.py"

# Ensure dependencies are installed
!python modules/bootstrap/deps.py

# Verify directories
!ls modules/bootstrap || echo "modules/bootstrap not found"
!ls modules/core || echo "modules/core not found"
!ls data || echo "data not found"

# Run config.py
!python modules/core/config.py

# Run db.py
!python modules/core/db.py

# Run utils.py
!python modules/core/utils.py

# Run auth.py
!python modules/core/auth.py

# Run tests
!pytest tests/test_core.py -v

# Verify files
!ls modules/bootstrap
!ls modules/core
!ls data

# Expected output:
# Current working directory: /content
# Directory modules/bootstrap created
# Directory modules/core created
# Directory data created
# deps.py created
# config.py created
# db.py created
# utils.py created
# auth.py created
# test_core.py created
# Dependencies installed successfully.
# deps.py
# auth.py  config.py  db.py  utils.py
# .deps_ok  loan_iq.db  reports
# Config loaded: {'data_dir': 'data', 'model_dir': 'models', 'report_dir': 'data/reports', 'db_path': 'data/loan_iq.db', 'drive_root': '/content/drive/MyDrive/loan_iq', 'streamlit_port': 8501, 'fraud_types': ['ghost_client', 'duplicate_

Current working directory: /content
deps.py
auth.py  config.py  db.py  __pycache__	utils.py
reports
deps.py created
config.py created
db.py created
utils.py created
auth.py created
test_core.py created
Dependencies installed successfully.
deps.py
auth.py  config.py  db.py  __pycache__	utils.py
reports
Config loaded: {'data_dir': 'data', 'model_dir': 'models', 'report_dir': 'data/reports', 'db_path': 'data/loan_iq.db', 'drive_root': '/content/drive/MyDrive/loan_iq', 'streamlit_port': 8501, 'fraud_types': ['ghost_client', 'duplicate_id', 'missed_payment', 'identity_theft'], 'regions': ['urban', 'rural', 'semi_urban'], 'max_clients_batch': 70000, 'default_batch_size': 1000}
sys.path: ['/content/modules/core', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/content', '/content']
Creating database at: data/loan_iq.db
Database created: True
Database initialized.
s

In [None]:
# Colab cell to create, run, and test modules/synth/faker_engine.py and modules/synth/generators.py
# Run this entire block in Colab to execute all steps

import sys
import os
sys.path.append(os.getcwd())
print(f"Current working directory: {os.getcwd()}")  # Debug

# Create directories and reset database to prevent schema conflicts
!mkdir -p modules/synth tests data models data/reports
!rm -f data/loan_iq.db
!ls modules/synth || echo "Directory modules/synth created"
!ls tests || echo "Directory tests created"
!ls data || echo "Directory data created"

# Write faker_engine.py using Python
os.makedirs('modules/synth', exist_ok=True)
with open('modules/synth/faker_engine.py', 'w') as f:
    f.write('''# modules/synth/faker_engine.py
# Estimated line count: 300

import sys
import os
sys.path.append(os.getcwd())
from faker import Faker
import random
from datetime import datetime, timedelta
try:
    from modules.core import config
except ImportError as e:
    print(f"Import error: {e}")
    raise

class LoanIQFaker:
    """Custom Faker for generating Loan IQ synthetic data with fraud patterns."""
    def __init__(self):
        self.faker = Faker()
        Faker.seed(config.SEEDS["faker"])
        random.seed(config.SEEDS["random"])
        self.config = config.get_config()
        self.fraud_types = self.config["fraud_types"]
        self.regions = self.config["regions"]

    def client_id(self):
        """Generate unique client ID."""
        return f"C_{self.faker.uuid4().split('-')[0]}"

    def loan_id(self):
        """Generate unique loan ID."""
        return f"L_{self.faker.uuid4().split('-')[0]}"

    def transaction_id(self):
        """Generate unique transaction ID."""
        return f"T_{self.faker.uuid4().split('-')[0]}"

    def client_name(self, fraud_type=None):
        """Generate client name, with ghost client fraud option."""
        if fraud_type == "ghost_client" and random.random() < 0.1:
            return None  # Ghost client has no name
        return self.faker.name()

    def duplicate_id(self, existing_ids):
        """Generate client ID with chance of duplication for fraud."""
        if random.random() < 0.05:  # 5% chance of duplicate ID
            return random.choice(existing_ids) if existing_ids else self.client_id()
        return self.client_id()

    def income(self, fraud_type=None):
        """Generate income, with variance for fraud."""
        if fraud_type == "identity_theft" and random.random() < 0.1:
            return random.uniform(100000, 1000000)  # Suspiciously high income
        return random.uniform(20000, 100000)

    def branch(self):
        """Generate branch name."""
        return self.faker.city()

    def region(self):
        """Generate region from config."""
        return random.choice(self.regions)

    def loan_amount(self, fraud_type=None):
        """Generate loan amount, with variance for fraud."""
        if fraud_type == "missed_payment" and random.random() < 0.2:
            return random.uniform(50000, 200000)  # Higher loan for missed payments
        return random.uniform(1000, 50000)

    def loan_status(self, fraud_type=None):
        """Generate loan status, with fraud influence."""
        statuses = ["active", "paid", "default"]
        if fraud_type == "missed_payment" and random.random() < 0.3:
            return "default"
        return random.choice(statuses)

    def transaction_amount(self, loan_amount):
        """Generate transaction amount based on loan."""
        return random.uniform(100, min(loan_amount * 0.1, 5000))

    def transaction_type(self, fraud_type=None):
        """Generate transaction type, with fraud influence."""
        types = ["payment", "fee", "interest"]
        if fraud_type == "identity_theft" and random.random() < 0.1:
            return "suspicious_transfer"
        return random.choice(types)

    def random_date(self, start_days=-365, end_days=0):
        """Generate random date within range."""
        start = datetime.now() + timedelta(days=start_days)
        end = datetime.now() + timedelta(days=end_days)
        return self.faker.date_time_between(start, end).isoformat()

if __name__ == "__main__":
    faker = LoanIQFaker()
    print(f"Client ID: {faker.client_id()}")
    print(f"Client Name: {faker.client_name()}")
    print(f"Loan ID: {faker.loan_id()}")
    print(f"Transaction ID: {faker.transaction_id()}")
    print(f"Income: {faker.income()}")
    print(f"Branch: {faker.branch()}")
    print(f"Region: {faker.region()}")
    print(f"Loan Amount: {faker.loan_amount()}")
    print(f"Loan Status: {faker.loan_status()}")
    print(f"Transaction Amount: {faker.transaction_amount(10000)}")
    print(f"Transaction Type: {faker.transaction_type()}")
    print(f"Random Date: {faker.random_date()}")
''')
!test -f modules/synth/faker_engine.py && echo "faker_engine.py created" || echo "Failed to create faker_engine.py"

# Write generators.py using Python
with open('modules/synth/generators.py', 'w') as f:
    f.write('''# modules/synth/generators.py
# Estimated line count: 250

import sys
import os
import random
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
from datetime import datetime
try:
    from modules.core import db, config
    from modules.synth import faker_engine
except ImportError as e:
    print(f"Import error: {e}")
    raise

class DataGenerator:
    """Generate synthetic data for Loan IQ and store in database."""
    def __init__(self):
        self.faker = faker_engine.LoanIQFaker()
        self.config = config.get_config()
        self.db_path = self.config["db_path"]

    def generate_clients(self, n, fraud_ratio=0.1):
        """Generate n clients with optional fraud patterns."""
        clients = []
        existing_ids = []
        for _ in range(n):
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            client_id = self.faker.duplicate_id(existing_ids) if fraud_type == "duplicate_id" else self.faker.client_id()
            existing_ids.append(client_id)
            clients.append({
                "client_id": client_id,
                "name": self.faker.client_name(fraud_type),
                "branch": self.faker.branch(),
                "region": self.faker.region(),
                "income": self.faker.income(fraud_type),
                "created_at": self.faker.random_date()
            })
        return pd.DataFrame(clients)

    def generate_loans(self, clients, n_per_client=2, fraud_ratio=0.1):
        """Generate loans for given clients."""
        loans = []
        for client_id in clients["client_id"]:
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            for _ in range(random.randint(1, n_per_client)):
                loans.append({
                    "loan_id": self.faker.loan_id(),
                    "client_id": client_id,
                    "amount": self.faker.loan_amount(fraud_type),
                    "status": self.faker.loan_status(fraud_type),
                    "start_date": self.faker.random_date()
                })
        return pd.DataFrame(loans)

    def generate_transactions(self, loans, n_per_loan=3, fraud_ratio=0.1):
        """Generate transactions for given loans."""
        transactions = []
        for loan_id, loan_amount in zip(loans["loan_id"], loans["amount"]):
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            for _ in range(random.randint(1, n_per_loan)):
                transactions.append({
                    "transaction_id": self.faker.transaction_id(),
                    "loan_id": loan_id,
                    "amount": self.faker.transaction_amount(loan_amount),
                    "date": self.faker.random_date(),
                    "type": self.faker.transaction_type(fraud_type)
                })
        return pd.DataFrame(transactions)

    def save_to_db(self, clients, loans, transactions, actor_id="1", actor_role="admin", reason="Synthetic data generation"):
        """Save generated data to loan_iq.db with audit logging."""
        database = db.DB()
        print(f"Saving to database: {self.db_path}")  # Debug
        try:
            # Save clients
            for _, row in clients.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO clients (client_id, name, branch, region, income, created_at) VALUES (?, ?, ?, ?, ?, ?)",
                    (row["client_id"], row["name"], row["branch"], row["region"], row["income"], row["created_at"])
                )
            # Save loans
            for _, row in loans.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO loans (loan_id, client_id, amount, status, start_date) VALUES (?, ?, ?, ?, ?)",
                    (row["loan_id"], row["client_id"], row["amount"], row["status"], row["start_date"])
                )
            # Save transactions
            for _, row in transactions.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO transactions (transaction_id, loan_id, amount, date, type) VALUES (?, ?, ?, ?, ?)",
                    (row["transaction_id"], row["loan_id"], row["amount"], row["date"], row["type"])
                )
            database.conn.commit()
            database.log_action(
                actor_id, actor_role, "generate_data", "multiple", "synthetic_data", reason,
                {}, {"clients": len(clients), "loans": len(loans), "transactions": len(transactions)}
            )
            print(f"Saved {len(clients)} clients, {len(loans)} loans, {len(transactions)} transactions to DB")
        finally:
            database.close()

    def export_to_csv(self, clients, loans, transactions, output_dir=None):
        """Export data to CSV files."""
        output_dir = output_dir or self.config["data_dir"]
        os.makedirs(output_dir, exist_ok=True)
        clients.to_csv(os.path.join(output_dir, "clients.csv"), index=False)
        loans.to_csv(os.path.join(output_dir, "loans.csv"), index=False)
        transactions.to_csv(os.path.join(output_dir, "transactions.csv"), index=False)
        print(f"Exported data to {output_dir}/[clients,loans,transactions].csv")

if __name__ == "__main__":
    generator = DataGenerator()
    clients = generator.generate_clients(10, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    generator.export_to_csv(clients, loans, transactions)
    print("Generated and saved synthetic data.")
''')
!test -f modules/synth/generators.py && echo "generators.py created" || echo "Failed to create generators.py"

# Write test_synth.py using Python
with open('tests/test_synth.py', 'w') as f:
    f.write('''# tests/test_synth.py
# Estimated line count: 80

import sys
import os
import sqlite3
import pandas as pd
sys.path.append(os.getcwd())
try:
    from modules.core import config, db
    from modules.synth import faker_engine, generators
except ImportError as e:
    print(f"Import error: {e}")
    raise

def test_faker_engine():
    faker = faker_engine.LoanIQFaker()
    assert len(faker.client_id()) > 0, "Client ID not generated"
    assert faker.region() in config.get_config()["regions"], "Invalid region"
    assert isinstance(faker.income(), float), "Income not float"
    assert isinstance(faker.loan_amount(), float), "Loan amount not float"
    assert faker.loan_status() in ["active", "paid", "default"], "Invalid loan status"

def test_generate_clients():
    generator = generators.DataGenerator()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    assert len(clients) == 5, "Incorrect number of clients"
    assert set(clients.columns) == {"client_id", "name", "branch", "region", "income", "created_at"}, "Incorrect client columns"
    assert clients["region"].isin(config.get_config()["regions"]).all(), "Invalid regions"

def test_generate_loans():
    generator = generators.DataGenerator()
    clients = generator.generate_clients(3, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    assert len(loans) >= 3, "Incorrect number of loans"
    assert set(loans.columns) == {"loan_id", "client_id", "amount", "status", "start_date"}, "Incorrect loan columns"
    assert loans["client_id"].isin(clients["client_id"]).all(), "Invalid client IDs in loans"

def test_generate_transactions():
    generator = generators.DataGenerator()
    clients = generator.generate_clients(2, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    assert len(transactions) >= 2, "Incorrect number of transactions"
    assert set(transactions.columns) == {"transaction_id", "loan_id", "amount", "date", "type"}, "Incorrect transaction columns"
    assert transactions["loan_id"].isin(loans["loan_id"]).all(), "Invalid loan IDs in transactions"

def test_save_to_db():
    generator = generators.DataGenerator()
    database = db.DB()
    print(f"Clearing tables for test")  # Debug
    database.cursor.execute("DELETE FROM clients")
    database.cursor.execute("DELETE FROM loans")
    database.cursor.execute("DELETE FROM transactions")
    database.conn.commit()
    database.close()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    database = db.DB()
    database.cursor.execute("SELECT COUNT(*) FROM clients")
    assert database.cursor.fetchone()[0] == 5, "Clients not saved to DB"
    database.cursor.execute("SELECT COUNT(*) FROM loans")
    assert database.cursor.fetchone()[0] >= 5, "Loans not saved to DB"
    database.cursor.execute("SELECT COUNT(*) FROM transactions")
    assert database.cursor.fetchone()[0] >= 5, "Transactions not saved to DB"
    database.cursor.execute("SELECT * FROM audit_logs WHERE target_type = 'synthetic_data'")
    assert len(database.cursor.fetchall()) > 0, "Audit log not recorded"
    database.close()

def test_export_to_csv():
    generator = generators.DataGenerator()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.export_to_csv(clients, loans, transactions)
    assert os.path.exists(os.path.join(config.get_config()["data_dir"], "clients.csv")), "Clients CSV not exported"
    assert os.path.exists(os.path.join(config.get_config()["data_dir"], "loans.csv")), "Loans CSV not exported"
    assert os.path.exists(os.path.join(config.get_config()["data_dir"], "transactions.csv")), "Transactions CSV not exported"
''')
!test -f tests/test_synth.py && echo "test_synth.py created" || echo "Failed to create test_synth.py"

# Ensure dependencies are installed (assuming deps.py exists from previous cell)
!python modules/bootstrap/deps.py

# Verify directories
!ls modules/synth || echo "modules/synth not found"
!ls tests || echo "tests not found"
!ls data || echo "data not found"

# Run faker_engine.py
!python modules/synth/faker_engine.py

# Run generators.py
!python modules/synth/generators.py

# Run tests
!pytest tests/test_synth.py -v

# Verify files
!ls modules/synth
!ls tests
!ls data

# Expected output:
# Current working directory: /content
# Directory modules/synth created
# Directory tests created
# Directory data created
# faker_engine.py created
# generators.py created
# test_synth.py created
# Dependencies installed successfully.
# faker_engine.py  generators.py
# test_core.py  test_synth.py
# .deps_ok  clients.csv  loans.csv  loan_iq.db  reports  transactions.csv
# Client ID: C_...
# Client Name: ...
# Loan ID: L_...
# Transaction ID: T_...
# Income: ...
# Branch: ...
# Region: ...
# Loan Amount: ...
# Loan Status: ...
# Transaction Amount: ...
# Transaction Type: ...
# Random Date: ...
# Saving to database: data/loan_iq.db
# Saved 10 clients, ... loans, ... transactions to DB
# Exported data to data/[clients,loans,transactions].csv
# Generated and saved synthetic data.
# ============================= test session starts =============================
# tests/test_synth.py::test_faker_engine PASSED
# tests/test_synth.py::test_generate_clients PASSED
# tests/test_synth.py::test_generate_loans PASSED
# tests/test_synth.py::test_generate_transactions PASSED
# tests/test_synth.py::test_save_to_db PASSED
# tests/test_synth.py::test_export_to_csv PASSED
# =========================== 6 passed in 0.XXs ===========================

Current working directory: /content
faker_engine.py  generators.py	__pycache__
__pycache__  test_core.py  test_synth.py
reports
faker_engine.py created
generators.py created
test_synth.py created
Dependencies already installed.
faker_engine.py  generators.py	__pycache__
__pycache__  test_core.py  test_synth.py
reports
Client ID: C_bdd640fb
Client Name: Daniel Doyle
Loan ID: L_8b9d2434
Transaction ID: T_0822e8f3
Income: 71154.1438766307
Branch: North Jefferyhaven
Region: urban
Loan Amount: 37335.97448823181
Loan Status: active
Transaction Amount: 300.88966433394046
Transaction Type: interest
Random Date: 2025-04-07T05:38:09.639320
sys.path: ['/content/modules/synth', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/content', '/content', '/content', '/content']
Creating database at: data/loan_iq.db
Database created: True
Saving to database: data/loan_iq.db
Save

In [None]:

import os
os.makedirs('modules/bootstrap', exist_ok=True)
with open('modules/bootstrap/deps.py', 'w') as f:
    f.write('''# modules/bootstrap/deps.py
import os
import subprocess

REQUIRED_LIBS = [
    'streamlit==1.38.0',
    'pandas==2.2.2',
    'numpy==1.26.4',
    'scikit-learn==1.5.1',
    'xgboost==2.1.1',
    'plotly==5.22.0',
    'faker==28.1.0',
    'openpyxl==3.1.5',
    'reportlab==4.2.2',
    'pytest==8.3.2',
    'shap==0.46.0'
]

def install_deps():
    """Install required libraries and create marker file."""
    os.makedirs('data', exist_ok=True)
    marker_path = os.path.join('data', '.deps_ok')
    if not os.path.exists(marker_path):
        for lib in REQUIRED_LIBS:
            try:
                __import__(lib.split('==')[0])
            except ImportError:
                subprocess.check_call(['pip', 'install', lib])
        with open(marker_path, 'w') as f:
            f.write('OK')
        print("Dependencies installed successfully.")
    else:
        print("Dependencies already installed.")

if __name__ == '__main__':
    install_deps()
''')
!test -f modules/bootstrap/deps.py && echo "deps.py created" || echo "Failed to create deps.py"

deps.py created


In [None]:
import os
os.makedirs('modules/core', exist_ok=True)
with open('modules/core/config.py', 'w') as f:
    f.write('''# modules/core/config.py
import sys
import os
sys.path.append(os.getcwd())
import random
import numpy as np

ADMIN_CREDENTIALS = {
    "username": "admin",
    "password": "Shady868"
}

SEEDS = {
    "faker": 42,
    "numpy": 42,
    "random": 42
}

CONFIG = {
    "data_dir": os.path.join("data"),
    "model_dir": os.path.join("models"),
    "report_dir": os.path.join("data", "reports"),
    "db_path": os.path.join("data", "loan_iq.db"),
    "drive_root": "/content/drive/MyDrive/loan_iq",
    "streamlit_port": 8501,
    "fraud_types": ["ghost_client", "duplicate_id", "missed_payment", "identity_theft"],
    "regions": ["urban", "rural", "semi_urban"],
    "max_clients_batch": 70000,
    "default_batch_size": 1000
}

def init_seeds():
    """Initialize random seeds for reproducibility."""
    random.seed(SEEDS["random"])
    np.random.seed(SEEDS["numpy"])

def get_config():
    """Return config dictionary, ensure directories exist."""
    os.makedirs(CONFIG["data_dir"], exist_ok=True)
    os.makedirs(CONFIG["model_dir"], exist_ok=True)
    os.makedirs(CONFIG["report_dir"], exist_ok=True)
    return CONFIG

if __name__ == "__main__":
    init_seeds()
    config = get_config()
    print(f"Config loaded: {config}")
''')
!test -f modules/core/config.py && echo "config.py created" || echo "Failed to create config.py"

config.py created


In [None]:
import os
os.makedirs('modules/core', exist_ok=True)
with open('modules/core/db.py', 'w') as f:
    f.write('''# modules/core/db.py
import sys
import os
sys.path.append(os.getcwd())
import sqlite3
import json
from datetime import datetime, UTC
try:
    from modules.core import config
except ImportError as e:
    print(f"Import error: {e}")
    raise

class DB:
    """SQLite database wrapper for Loan IQ."""
    def __init__(self):
        print(f"sys.path: {sys.path}")  # Debug
        self.db_path = config.get_config()["db_path"]
        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
        print(f"Creating database at: {self.db_path}")  # Debug
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row
        self.cursor = self.conn.cursor()
        self.create_tables()
        print(f"Database created: {os.path.exists(self.db_path)}")  # Debug

    def create_tables(self):
        """Create database tables."""
        tables = [
            "CREATE TABLE IF NOT EXISTS users (user_id TEXT PRIMARY KEY, username TEXT UNIQUE, password TEXT, role TEXT)",
            "CREATE TABLE IF NOT EXISTS clients (client_id TEXT PRIMARY KEY, name TEXT, branch TEXT, region TEXT, income REAL, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS loans (loan_id TEXT PRIMARY KEY, client_id TEXT, amount REAL, status TEXT, start_date TIMESTAMP, FOREIGN KEY (client_id) REFERENCES clients(client_id))",
            "CREATE TABLE IF NOT EXISTS transactions (transaction_id TEXT PRIMARY KEY, loan_id TEXT, amount REAL, date TIMESTAMP, type TEXT, FOREIGN KEY (loan_id) REFERENCES loans(loan_id))",
            "CREATE TABLE IF NOT EXISTS models (model_id TEXT PRIMARY KEY, type TEXT, version TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS model_versions (version_id TEXT PRIMARY KEY, model_id TEXT, config_json TEXT, data_hash TEXT, metrics_json TEXT, commit_ref TEXT, comments TEXT, created_at TIMESTAMP, FOREIGN KEY (model_id) REFERENCES models(model_id))",
            "CREATE TABLE IF NOT EXISTS audit_logs (log_id INTEGER PRIMARY KEY AUTOINCREMENT, actor_id TEXT, actor_role TEXT, action TEXT, target_id TEXT, target_type TEXT, reason TEXT, timestamp TIMESTAMP, before_snapshot TEXT, after_snapshot TEXT, reversible BOOLEAN, reversal_id INTEGER)",
            "CREATE TABLE IF NOT EXISTS simulations (sim_id TEXT PRIMARY KEY, user_id TEXT, params_json TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS reports (report_id TEXT PRIMARY KEY, type TEXT, path TEXT, created_at TIMESTAMP)",
            "CREATE TABLE IF NOT EXISTS assets (asset_id TEXT PRIMARY KEY, path TEXT, type TEXT, created_at TIMESTAMP)"
        ]
        for table_sql in tables:
            self.cursor.execute(table_sql)
        self.conn.commit()

    def log_action(self, actor_id, actor_role, action, target_id, target_type, reason, before_snapshot, after_snapshot, reversible=False):
        """Log an admin action to audit_logs."""
        timestamp = datetime.now(UTC).isoformat()
        snapshot_before = json.dumps(before_snapshot) if before_snapshot else ""
        snapshot_after = json.dumps(after_snapshot) if after_snapshot else ""
        self.cursor.execute(
            "INSERT INTO audit_logs (actor_id, actor_role, action, target_id, target_type, reason, timestamp, before_snapshot, after_snapshot, reversible) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (actor_id, actor_role, action, target_id, target_type, reason, timestamp, snapshot_before, snapshot_after, reversible)
        )
        self.conn.commit()
        return self.cursor.lastrowid

    def get_audit_trail(self, target_id=None, target_type=None):
        """Retrieve audit logs, optionally filtered."""
        query = "SELECT * FROM audit_logs"
        params = []
        if target_id and target_type:
            query += " WHERE target_id = ? AND target_type = ?"
            params = [target_id, target_type]
        self.cursor.execute(query, params)
        return self.cursor.fetchall()

    def rollback_action(self, action_id):
        """Attempt to rollback an action if reversible."""
        self.cursor.execute("SELECT reversible, before_snapshot, target_id, target_type, action FROM audit_logs WHERE log_id = ?", (action_id,))
        result = self.cursor.fetchone()
        if not result or not result[0]:
            return False
        before_snapshot = json.loads(result[1]) if result[1] else {}
        target_id, target_type, action = result[2], result[3], result[4]
        if target_type == "user" and action == "edit":
            self.cursor.execute("UPDATE users SET username = ?, password = ?, role = ? WHERE user_id = ?",
                              (before_snapshot.get("username"), before_snapshot.get("password"), before_snapshot.get("role"), target_id))
            self.conn.commit()
            return True
        return False

    def close(self):
        """Close database connection."""
        self.conn.close()

if __name__ == "__main__":
    db = DB()
    db.cursor.execute("INSERT OR IGNORE INTO users (user_id, username, password, role) VALUES (?, ?, ?, ?)",
                     ("1", "admin", "Shady868", "admin"))
    db.conn.commit()
    db.log_action("1", "admin", "init", "1", "user", "Initialize admin user", {}, {"username": "admin"})
    print("Database initialized.")
    db.close()
''')
!test -f modules/core/db.py && echo "db.py created" || echo "Failed to create db.py"

db.py created


In [None]:
import os
os.makedirs('modules/core', exist_ok=True)
with open('modules/core/utils.py', 'w') as f:
    f.write('''# modules/core/utils.py
import sys
import os
sys.path.append(os.getcwd())
import json
from functools import wraps
try:
    from modules.core import db, config
except ImportError as e:
    print(f"Import error: {e}")
    raise

def audit_wrapper(func):
    """Decorator to log admin actions with snapshots and reason."""
    @wraps(func)
    def wrapper(*args, actor_id, actor_role, reason, **kwargs):
        if not reason:
            raise ValueError("Reason is required for audited actions")
        database = db.DB()
        target_id = kwargs.get('target_id', args[0] if args else 'unknown')
        target_type = kwargs.get('target_type', func.__name__)
        before_snapshot = {}
        try:
            if target_type in ['user', 'edit_user', 'delete_user', 'add_user']:
                database.cursor.execute("SELECT * FROM users WHERE user_id = ?", (target_id,))
                row = database.cursor.fetchone()
                before_snapshot = dict(row) if row else {}
                print(f"Before snapshot: {before_snapshot}")  # Debug
            filtered_kwargs = {k: v for k, v in kwargs.items() if k != 'target_type'}
            result = func(*args, actor_id=actor_id, actor_role=actor_role, reason=reason, **filtered_kwargs)
            after_snapshot = {}
            if target_type in ['user', 'edit_user', 'delete_user', 'add_user']:
                database.cursor.execute("SELECT * FROM users WHERE user_id = ?", (target_id,))
                row = database.cursor.fetchone()
                after_snapshot = dict(row) if row else {}
                print(f"After snapshot: {after_snapshot}")  # Debug
            reversible = target_type in ['user', 'edit_user', 'add_user']
            log_id = database.log_action(
                actor_id, actor_role, func.__name__, target_id, target_type, reason,
                before_snapshot, after_snapshot, reversible
            )
            database.close()
            return result
        except Exception as e:
            database.close()
            raise Exception(f"Action failed: {e}")
    return wrapper

def dict_diff(before, after):
    """Compute difference between two dictionaries for audit logging."""
    diff = {}
    for key in set(before.keys()) | set(after.keys()):
        if before.get(key) != after.get(key):
            diff[key] = {'before': before.get(key), 'after': after.get(key)}
    return diff

if __name__ == "__main__":
    @audit_wrapper
    def test_action(target_id, actor_id, actor_role, reason):
        return {"result": "test"}
    result = test_action("test_id", actor_id="1", actor_role="admin", reason="Test audit")
    print(f"Test action result: {result}")
''')
!test -f modules/core/utils.py && echo "utils.py created" || echo "Failed to create utils.py"

utils.py created


In [None]:
import os
os.makedirs('modules/core', exist_ok=True)
with open('modules/core/auth.py', 'w') as f:
    f.write('''# modules/core/auth.py
import sys
import os
import sqlite3
import uuid
from datetime import datetime, UTC
sys.path.append(os.getcwd())
try:
    from modules.core import db, config
except ImportError as e:
    print(f"Import error: {e}")
    raise

class Auth:
    def __init__(self):
        self.config = config.get_config()
        self.db_path = self.config["db_path"]

    def authenticate(self, username, password):
        database = db.DB()
        try:
            database.cursor.execute(
                "SELECT user_id, role FROM users WHERE username = ? AND password = ?",
                (username, password)
            )
            user = database.cursor.fetchone()
            if user:
                return {"user_id": user[0], "role": user[1]}
            return None
        finally:
            database.close()

    def register(self, username, password):
        """Register a new user with default user role."""
        database = db.DB()
        try:
            user_id = f"U_{uuid.uuid4().hex[:8]}"
            database.cursor.execute(
                "INSERT OR IGNORE INTO users (user_id, username, password, role, created_at) VALUES (?, ?, ?, ?, ?)",
                (user_id, username, password, "user", datetime.now(UTC).isoformat())
            )
            database.conn.commit()
            database.log_action(
                "1", "admin", "register_user", user_id, "user",
                f"Registered new user {username}", {}, {}
            )
            print(f"Registered user: {username} with role: user")
            return {"user_id": user_id, "role": "user"}
        except sqlite3.IntegrityError:
            print(f"Registration failed: Username {username} already exists")
            return None
        finally:
            database.close()

if __name__ == "__main__":
    database = db.DB()
    try:
        # Hardcode admin user
        database.cursor.execute(
            "INSERT OR REPLACE INTO users (user_id, username, password, role, created_at) VALUES (?, ?, ?, ?, ?)",
            ("1", "admin", "Shady868", "admin", datetime.now(UTC).isoformat())
        )
        # Add test user
        database.cursor.execute(
            "INSERT OR REPLACE INTO users (user_id, username, password, role, created_at) VALUES (?, ?, ?, ?, ?)",
            ("test_1", "test_user", "test_pass", "user", datetime.now(UTC).isoformat())
        )
        database.conn.commit()
        database.log_action(
            "1", "admin", "add_user", "test_1", "user", "Added test user", {}, {}
        )
        print("Added user: test_1")
    finally:
        database.close()
''')
!test -f modules/core/auth.py && echo "auth.py created" || echo "Failed to create auth.py"

auth.py created


In [None]:
import os
os.makedirs('modules/synth', exist_ok=True)
with open('modules/synth/faker_engine.py', 'w') as f:
    f.write('''# modules/synth/faker_engine.py
import sys
import os
sys.path.append(os.getcwd())
from faker import Faker
import random
from datetime import datetime, timedelta
try:
    from modules.core import config
except ImportError as e:
    print(f"Import error: {e}")
    raise

class LoanIQFaker:
    """Custom Faker for generating Loan IQ synthetic data with patterns."""
    def __init__(self):
        self.faker = Faker()
        Faker.seed(config.SEEDS["faker"])
        random.seed(config.SEEDS["random"])
        self.config = config.get_config()
        self.fraud_types = self.config["fraud_types"]
        self.regions = self.config["regions"]

    def client_id(self):
        """Generate unique client ID."""
        return f"C_{self.faker.uuid4().split('-')[0]}"

    def loan_id(self):
        """Generate unique loan ID."""
        return f"L_{self.faker.uuid4().split('-')[0]}"

    def transaction_id(self):
        """Generate unique transaction ID."""
        return f"T_{self.faker.uuid4().split('-')[0]}"

    def client_name(self, fraud_type=None):
        """Generate client name, with ghost client pattern."""
        if fraud_type == "ghost_client" and random.random() < 0.1:
            return None
        return self.faker.name()

    def duplicate_id(self, existing_ids):
        """Generate client ID with chance of duplication."""
        if random.random() < 0.05:
            return random.choice(existing_ids) if existing_ids else self.client_id()
        return self.client_id()

    def income(self, fraud_type=None):
        """Generate income, with variance for patterns."""
        if fraud_type == "identity_theft" and random.random() < 0.1:
            return random.uniform(100000, 1000000)
        return random.uniform(20000, 100000)

    def branch(self):
        """Generate branch name."""
        return self.faker.city()

    def region(self):
        """Generate region from config."""
        return random.choice(self.regions)

    def loan_amount(self, fraud_type=None):
        """Generate loan amount, with variance for patterns."""
        if fraud_type == "missed_payment" and random.random() < 0.2:
            return random.uniform(50000, 200000)
        return random.uniform(1000, 50000)

    def loan_status(self, fraud_type=None):
        """Generate loan status, with pattern influence."""
        statuses = ["active", "paid", "default"]
        if fraud_type == "missed_payment" and random.random() < 0.3:
            return "default"
        return random.choice(statuses)

    def transaction_amount(self, loan_amount):
        """Generate transaction amount based on loan."""
        return random.uniform(100, min(loan_amount * 0.1, 5000))

    def transaction_type(self, fraud_type=None):
        """Generate transaction type, with pattern influence."""
        types = ["payment", "fee", "interest"]
        if fraud_type == "identity_theft" and random.random() < 0.1:
            return "suspicious_transfer"
        return random.choice(types)

    def random_date(self, start_days=-365, end_days=0):
        """Generate random date within range."""
        start = datetime.now() + timedelta(days=start_days)
        end = datetime.now() + timedelta(days=end_days)
        return self.faker.date_time_between(start, end).isoformat()

if __name__ == "__main__":
    faker = LoanIQFaker()
    print(f"Client ID: {faker.client_id()}")
    print(f"Client Name: {faker.client_name()}")
    print(f"Loan ID: {faker.loan_id()}")
    print(f"Transaction ID: {faker.transaction_id()}")
    print(f"Income: {faker.income()}")
    print(f"Branch: {faker.branch()}")
    print(f"Region: {faker.region()}")
    print(f"Loan Amount: {faker.loan_amount()}")
    print(f"Loan Status: {faker.loan_status()}")
    print(f"Transaction Amount: {faker.transaction_amount(10000)}")
    print(f"Transaction Type: {faker.transaction_type()}")
    print(f"Random Date: {faker.random_date()}")
''')
!test -f modules/synth/faker_engine.py && echo "faker_engine.py created" || echo "Failed to create faker_engine.py"

faker_engine.py created


In [None]:
import os
os.makedirs('modules/synth', exist_ok=True)
with open('modules/synth/generators.py', 'w') as f:
    f.write('''# modules/synth/generators.py
import sys
import os
import random
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
from datetime import datetime
try:
    from modules.core import db, config
    from modules.synth import faker_engine
except ImportError as e:
    print(f"Import error: {e}")
    raise

class DataGenerator:
    """Generate synthetic data for Loan IQ and store in database."""
    def __init__(self):
        self.faker = faker_engine.LoanIQFaker()
        self.config = config.get_config()
        self.db_path = self.config["db_path"]

    def generate_clients(self, n, fraud_ratio=0.1):
        """Generate n clients with optional patterns."""
        clients = []
        existing_ids = []
        for _ in range(n):
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            client_id = self.faker.duplicate_id(existing_ids) if fraud_type == "duplicate_id" else self.faker.client_id()
            existing_ids.append(client_id)
            clients.append({
                "client_id": client_id,
                "name": self.faker.client_name(fraud_type),
                "branch": self.faker.branch(),
                "region": self.faker.region(),
                "income": self.faker.income(fraud_type),
                "created_at": self.faker.random_date()
            })
        return pd.DataFrame(clients)

    def generate_loans(self, clients, n_per_client=2, fraud_ratio=0.1):
        """Generate loans for given clients."""
        loans = []
        for client_id in clients["client_id"]:
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            for _ in range(random.randint(1, n_per_client)):
                loans.append({
                    "loan_id": self.faker.loan_id(),
                    "client_id": client_id,
                    "amount": self.faker.loan_amount(fraud_type),
                    "status": self.faker.loan_status(fraud_type),
                    "start_date": self.faker.random_date()
                })
        return pd.DataFrame(loans)

    def generate_transactions(self, loans, n_per_loan=3, fraud_ratio=0.1):
        """Generate transactions for given loans."""
        transactions = []
        for loan_id, loan_amount in zip(loans["loan_id"], loans["amount"]):
            fraud_type = random.choices(
                self.faker.fraud_types + [None],
                weights=[fraud_ratio / len(self.faker.fraud_types)] * len(self.faker.fraud_types) + [1 - fraud_ratio],
                k=1
            )[0]
            for _ in range(random.randint(1, n_per_loan)):
                transactions.append({
                    "transaction_id": self.faker.transaction_id(),
                    "loan_id": loan_id,
                    "amount": self.faker.transaction_amount(loan_amount),
                    "date": self.faker.random_date(),
                    "type": self.faker.transaction_type(fraud_type)
                })
        return pd.DataFrame(transactions)

    def save_to_db(self, clients, loans, transactions, actor_id="1", actor_role="admin", reason="Synthetic data generation"):
        """Save generated data to loan_iq.db with audit logging."""
        database = db.DB()
        print(f"Saving to database: {self.db_path}")  # Debug
        try:
            for _, row in clients.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO clients (client_id, name, branch, region, income, created_at) VALUES (?, ?, ?, ?, ?, ?)",
                    (row["client_id"], row["name"], row["branch"], row["region"], row["income"], row["created_at"])
                )
            for _, row in loans.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO loans (loan_id, client_id, amount, status, start_date) VALUES (?, ?, ?, ?, ?)",
                    (row["loan_id"], row["client_id"], row["amount"], row["status"], row["start_date"])
                )
            for _, row in transactions.iterrows():
                database.cursor.execute(
                    "INSERT OR IGNORE INTO transactions (transaction_id, loan_id, amount, date, type) VALUES (?, ?, ?, ?, ?)",
                    (row["transaction_id"], row["loan_id"], row["amount"], row["date"], row["type"])
                )
            database.conn.commit()
            database.log_action(
                actor_id, actor_role, "generate_data", "multiple", "synthetic_data", reason,
                {}, {"clients": len(clients), "loans": len(loans), "transactions": len(transactions)}
            )
            print(f"Saved {len(clients)} clients, {len(loans)} loans, {len(transactions)} transactions to DB")
        finally:
            database.close()

    def export_to_csv(self, clients, loans, transactions, output_dir=None):
        """Export data to CSV files."""
        output_dir = output_dir or self.config["data_dir"]
        os.makedirs(output_dir, exist_ok=True)
        clients.to_csv(os.path.join(output_dir, "clients.csv"), index=False)
        loans.to_csv(os.path.join(output_dir, "loans.csv"), index=False)
        transactions.to_csv(os.path.join(output_dir, "transactions.csv"), index=False)
        print(f"Exported data to {output_dir}/[clients,loans,transactions].csv")

if __name__ == "__main__":
    generator = DataGenerator()
    clients = generator.generate_clients(10, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    generator.export_to_csv(clients, loans, transactions)
    print("Generated and saved synthetic data.")
''')
!test -f modules/synth/generators.py && echo "generators.py created" || echo "Failed to create generators.py"

generators.py created


In [None]:
import os
os.makedirs('modules/models', exist_ok=True)
with open('modules/models/train.py', 'w') as f:
    f.write('''# modules/models/train.py
import sys
import os
import pickle
import json
import random
import uuid
import numpy as np
import pandas as pd
from datetime import datetime, UTC
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
sys.path.append(os.getcwd())
try:
    from modules.core import db, config
    from modules.synth import generators
except ImportError as e:
    print(f"Import error: {e}")
    raise

class ModelTrainer:
    """Train XGBoost model for default probability and loan limits."""
    def __init__(self):
        self.config = config.get_config()
        self.db_path = self.config["db_path"]
        self.model_dir = self.config["model_dir"]
        os.makedirs(self.model_dir, exist_ok=True)
        random.seed(config.SEEDS["random"])
        np.random.seed(config.SEEDS["numpy"])

    def prepare_data(self, clients, loans, transactions):
        """Prepare features and labels for training."""
        print("Preparing data for training")  # Debug
        data = loans.merge(clients, on="client_id", how="left")
        # Aggregate transactions and flatten column names
        agg_data = transactions.groupby("loan_id").agg({
            "amount": ["sum", "count"],
            "type": lambda x: x.value_counts().index[0] if not x.empty else "none"
        }).reset_index()
        # Flatten MultiIndex by renaming columns
        agg_data.columns = ['loan_id', 'transaction_amount_sum', 'transaction_count', 'transaction_type']
        data = data.merge(agg_data, on="loan_id", how="left")
        data = data[[
            "loan_id", "client_id", "amount", "status", "start_date",
            "name", "branch", "region", "income", "created_at",
            "transaction_amount_sum", "transaction_count", "transaction_type"
        ]]
        features = ["loan_amount", "income", "transaction_amount_sum", "transaction_count"]
        data = data.rename(columns={"amount": "loan_amount"})  # Rename for consistency
        X = data[features].fillna(0)
        y = data["status"].apply(lambda x: 1 if x == "default" else 0)
        print(f"Prepared {X.shape[0]} samples with features: {features}")  # Debug
        return X, y

    def train_model(self, X, y, model_id=None):
        """Train XGBoost model and save to file and database."""
        model_id = model_id or f"M_{random.getrandbits(32):08x}"
        model = XGBClassifier(
            n_estimators=100, max_depth=3, learning_rate=0.1,
            random_state=config.SEEDS["random"], eval_metric="auc"
        )
        model.fit(X, y)
        y_pred = model.predict_proba(X)[:, 1]
        auc = roc_auc_score(y, y_pred)
        accuracy = accuracy_score(y, model.predict(X))
        model_path = os.path.join(self.model_dir, f"{model_id}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(model, f)
        print(f"Model saved to {model_path}")  # Debug
        database = db.DB()
        try:
            database.cursor.execute(
                "INSERT OR IGNORE INTO models (model_id, type, version, created_at) VALUES (?, ?, ?, ?)",
                (model_id, "xgboost", "1.0", datetime.now(UTC).isoformat())
            )
            database.cursor.execute(
                "INSERT INTO model_versions (version_id, model_id, config_json, data_hash, metrics_json, commit_ref, comments, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    f"V_{uuid.uuid4().hex[:8]}", model_id,  # Unique version_id
                    json.dumps({"n_estimators": 100, "max_depth": 3, "learning_rate": 0.1}),
                    str(X.values.tobytes()),  # Use bytes of numeric data for hash
                    json.dumps({"auc": float(auc), "accuracy": float(accuracy)}),
                    "initial", "Trained for default probability", datetime.now(UTC).isoformat()
                )
            )
            database.conn.commit()
            database.log_action(
                "1", "admin", "train_model", model_id, "model",
                "Trained model for default probability", {}, {"auc": float(auc), "accuracy": float(accuracy)}
            )
            print(f"Model {model_id} trained. AUC: {auc:.3f}, Accuracy: {accuracy:.3f}")
            return model_id
        finally:
            database.close()

if __name__ == "__main__":
    trainer = ModelTrainer()
    generator = generators.DataGenerator()
    clients = generator.generate_clients(100, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    X, y = trainer.prepare_data(clients, loans, transactions)
    model_id = trainer.train_model(X, y)
    print(f"Trained model: {model_id}")
''')
!test -f modules/models/train.py && echo "train.py created" || echo "Failed to create train.py"

train.py created


In [None]:

import os
os.makedirs('modules/models', exist_ok=True)
with open('modules/models/predict.py', 'w') as f:
    f.write('''# modules/models/predict.py
import sys
import os
import pickle
import json
import numpy as np
import pandas as pd
import shap
from datetime import datetime, UTC
sys.path.append(os.getcwd())
try:
    from modules.core import db, config
    from modules.models import train
except ImportError as e:
    print(f"Import error: {e}")
    raise

class ModelPredictor:
    """Predict default probability and loan limits using trained model."""
    def __init__(self):
        self.config = config.get_config()
        self.db_path = self.config["db_path"]
        self.model_dir = self.config["model_dir"]
        self.explainer = None

    def predict(self, model_id, data):
        """Make predictions for given data using specified model."""
        model_path = os.path.join(self.model_dir, f"{model_id}.pkl")
        with open(model_path, "rb") as f:
            model = pickle.load(f)
        print(f"Loaded model: {model_id}")  # Debug
        trainer = train.ModelTrainer()
        X, _ = trainer.prepare_data(data["clients"], data["loans"], data["transactions"])
        probs = model.predict_proba(X)[:, 1]
        # Merge loans with clients to align incomes with loans
        merged_data = data["loans"].merge(data["clients"][["client_id", "income"]],
                                        on="client_id", how="left")
        loan_limits = merged_data["income"] * 2.0 * (1 - probs)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        result = pd.DataFrame({
            "loan_id": data["loans"]["loan_id"],
            "default_probability": probs,
            "recommended_loan_limit": loan_limits,
            "shap_values": [json.dumps(s.tolist()) for s in shap_values]
        })
        database = db.DB()
        try:
            database.log_action(
                "1", "admin", "predict", model_id, "model",
                "Made predictions for loans", {}, {"num_predictions": len(probs)}
            )
            database.conn.commit()
            print(f"Predictions made for {len(probs)} loans")  # Debug
            return result
        finally:
            database.close()

if __name__ == "__main__":
    predictor = ModelPredictor()
    trainer = train.ModelTrainer()
    generator = train.generators.DataGenerator()
    clients = generator.generate_clients(10, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    X, y = trainer.prepare_data(clients, loans, transactions)
    model_id = trainer.train_model(X, y)
    predictions = predictor.predict(model_id, {"clients": clients, "loans": loans, "transactions": transactions})
    print(predictions)
''')
!test -f modules/models/predict.py && echo "predict.py created" || echo "Failed to create predict.py"

predict.py created


In [None]:
import os
os.makedirs('tests', exist_ok=True)
with open('tests/test_models.py', 'w') as f:
    f.write('''# tests/test_models.py
import sys
import os
import pickle
import pandas as pd
import numpy as np
sys.path.append(os.getcwd())
try:
    from modules.core import config, db
    from modules.synth import generators
    from modules.models import train, predict
except ImportError as e:
    print(f"Import error: {e}")
    raise

def test_prepare_data():
    trainer = train.ModelTrainer()
    generator = generators.DataGenerator()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    X, y = trainer.prepare_data(clients, loans, transactions)
    assert X.shape[0] == len(loans), "Incorrect number of samples"
    assert set(X.columns) == {"loan_amount", "income", "transaction_amount_sum", "transaction_count"}, "Incorrect features"
    assert y.isin([0, 1]).all(), "Invalid labels"

def test_train_model():
    trainer = train.ModelTrainer()
    generator = generators.DataGenerator()
    clients = generator.generate_clients(10, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    X, y = trainer.prepare_data(clients, loans, transactions)
    model_id = trainer.train_model(X, y)
    assert os.path.exists(os.path.join(config.get_config()["model_dir"], f"{model_id}.pkl")), "Model file not saved"
    database = db.DB()
    try:
        database.cursor.execute("SELECT * FROM models WHERE model_id = ?", (model_id,))
        assert database.cursor.fetchone(), "Model not saved to DB"
        database.cursor.execute("SELECT * FROM model_versions WHERE model_id = ?", (model_id,))
        assert database.cursor.fetchone(), "Model version not saved to DB"
        database.cursor.execute("SELECT * FROM audit_logs WHERE target_type = 'model' AND action = 'train_model'")
        assert len(database.cursor.fetchall()) > 0, "Audit log not recorded"
    finally:
        database.close()

def test_predict():
    trainer = train.ModelTrainer()
    predictor = predict.ModelPredictor()
    generator = generators.DataGenerator()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    generator.save_to_db(clients, loans, transactions)
    X, y = trainer.prepare_data(clients, loans, transactions)
    model_id = trainer.train_model(X, y)
    predictions = predictor.predict(model_id, {"clients": clients, "loans": loans, "transactions": transactions})
    assert len(predictions) == len(loans), "Incorrect number of predictions"
    assert set(predictions.columns) == {"loan_id", "default_probability", "recommended_loan_limit", "shap_values"}, "Incorrect prediction columns"
    assert (predictions["default_probability"] >= 0).all() and (predictions["default_probability"] <= 1).all(), "Invalid probabilities"
    assert (predictions["recommended_loan_limit"] >= 0).all(), "Invalid loan limits"
    database = db.DB()
    try:
        database.cursor.execute("SELECT * FROM audit_logs WHERE target_type = 'model' AND action = 'predict'")
        assert len(database.cursor.fetchall()) > 0, "Audit log not recorded"
    finally:
        database.close()

def test_model_persistence():
    trainer = train.ModelTrainer()
    generator = generators.DataGenerator()
    clients = generator.generate_clients(5, fraud_ratio=0.2)
    loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
    transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
    X, y = trainer.prepare_data(clients, loans, transactions)
    model_id = trainer.train_model(X, y)
    model_path = os.path.join(config.get_config()["model_dir"], f"{model_id}.pkl")
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    assert model is not None, "Model not loaded correctly"
''')
!test -f tests/test_models.py && echo "test_models.py created" || echo "Failed to create test_models.py"

test_models.py created


In [None]:
import sys
import os
sys.path.append(os.getcwd())
print(f"Current working directory: {os.getcwd()}")  # Debug

# Reset database and marker file
!rm -f data/loan_iq.db data/.deps_ok

# Ensure dependencies are installed
!python modules/bootstrap/deps.py

# Verify directories
!ls modules/bootstrap || echo "modules/bootstrap not found"
!ls modules/core || echo "modules/core not found"
!ls modules/synth || echo "modules/synth not found"
!ls modules/models || echo "modules/models not found"
!ls tests || echo "tests not found"
!ls data || echo "data not found"

# Run all scripts
!python modules/core/config.py
!python modules/core/db.py
!python modules/core/utils.py
!python modules/core/auth.py
!python modules/synth/faker_engine.py
!python modules/synth/generators.py
!python modules/models/train.py
!python modules/models/predict.py

# Run tests
!pytest tests/test_models.py -v

# Verify files
!ls modules/bootstrap
!ls modules/core
!ls modules/synth
!ls modules/models
!ls tests
!ls data

# Expected output:
# Current working directory: /content
# Dependencies installed successfully.
# deps.py
# auth.py  config.py  db.py  utils.py
# faker_engine.py  generators.py
# predict.py  train.py
# test_models.py
# .deps_ok  clients.csv  loans.csv  loan_iq.db  models  reports  transactions.csv
# Config loaded: {...}
# sys.path: [...]
# Creating database at: data/loan_iq.db
# Database created: True
# Database initialized.
# Before snapshot: {}
# After snapshot: {}
# Test action result: {'result': 'test'}
# Auth result: {'user_id': '1', 'role': 'admin'}
# Adding user with target_id: test_1
# Before snapshot: {}
# After snapshot: {'user_id': 'test_1', 'username': 'test_user', 'password': 'test_pass', 'role': 'user'}
# Added user: test_1
# Client ID: C_...
# Client Name: ...
# Loan ID: L_...
# Transaction ID: T_...
# Income: ...
# Branch: ...
# Region: ...
# Loan Amount: ...
# Loan Status: ...
# Transaction Amount: ...
# Transaction Type: ...
# Random Date: ...
# Saving to database: data/loan_iq.db
# Saved 10 clients, ... loans, ... transactions to DB
# Exported data to data/[clients,loans,transactions].csv
# Generated and saved synthetic data.
# Saving to database: data/loan_iq.db
# Saved 100 clients, ... loans, ... transactions to DB
# Preparing data for training
# Prepared ... samples with features: ['loan_amount', 'income', 'transaction_amount_sum', 'transaction_count']
# Model saved to models/M_....pkl
# Model M_... trained. AUC: 0.XXX, Accuracy: 0.XXX
# Trained model: M_...
# Saving to database: data/loan_iq.db
# Saved 10 clients, ... loans, ... transactions to DB
# Preparing data for training
# Prepared ... samples with features: ['loan_amount', 'income', 'transaction_amount_sum', 'transaction_count']
# Model saved to models/M_....pkl
# Model M_... trained. AUC: 0.XXX, Accuracy: 0.XXX
# Loaded model: M_...
# Predictions made for ... loans
# [DataFrame with loan_id, default_probability, recommended_loan_limit, shap_values]
# ============================= test session starts =============================
# tests/test_models.py::test_prepare_data PASSED
# tests/test_models.py::test_train_model PASSED
# tests/test_models.py::test_predict PASSED
# tests/test_models.py::test_model_persistence PASSED
# =========================== 4 passed in 0.XXs ===========================

Current working directory: /content
Dependencies installed successfully.
deps.py
auth.py  config.py  db.py  __pycache__	utils.py
faker_engine.py  generators.py	__pycache__
predict.py  __pycache__  train.py
__pycache__  test_models.py
clients.csv  loans.csv	reports  transactions.csv
Config loaded: {'data_dir': 'data', 'model_dir': 'models', 'report_dir': 'data/reports', 'db_path': 'data/loan_iq.db', 'drive_root': '/content/drive/MyDrive/loan_iq', 'streamlit_port': 8501, 'fraud_types': ['ghost_client', 'duplicate_id', 'missed_payment', 'identity_theft'], 'regions': ['urban', 'rural', 'semi_urban'], 'max_clients_batch': 70000, 'default_batch_size': 1000}
sys.path: ['/content/modules/core', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/content', '/content']
Creating database at: data/loan_iq.db
Database created: True
Database initialized.
sys.path: ['/content/

In [None]:

import os
os.makedirs('modules/streamlit_app', exist_ok=True)
with open('modules/streamlit_app/app.py', 'w') as f:
    f.write('''# modules/streamlit_app/app.py
import sys
import os
import streamlit as st
import pandas as pd
import sqlite3
import plotly.express as px
import json
sys.path.append(os.getcwd())
try:
    from modules.core import config, db, auth
    from modules.synth import generators
    from modules.models import train, predict
except ImportError as e:
    print(f"Import error: {e}")
    raise

st.set_page_config(page_title="Loan IQ Dashboard", layout="wide")

def main():
    """Streamlit dashboard for Loan IQ."""
    config_data = config.get_config()
    st.title("Loan IQ Dashboard")

    # Authentication
    if "authenticated" not in st.session_state:
        st.session_state.authenticated = False
        st.session_state.user_role = None
        st.session_state.username = None

    if not st.session_state.authenticated:
        st.subheader("Login or Register")
        # Tabs for Login and Register
        tab1, tab2 = st.tabs(["Login", "Register"])

        with tab1:
            st.subheader("Login")
            login_username = st.text_input("Username", key="login_username")
            login_password = st.text_input("Password", type="password", key="login_password")
            if st.button("Login"):
                authenticator = auth.Auth()  # Updated to Auth
                user = authenticator.authenticate(login_username, login_password)
                if user:
                    st.session_state.authenticated = True
                    st.session_state.user_role = user["role"]
                    st.session_state.username = login_username
                    st.success(f"Logged in as {login_username} ({user['role']})")
                    st.rerun()
                else:
                    st.error("Invalid credentials. Contact admin at admin@loaniq.com for password issues.")

        with tab2:
            st.subheader("Register")
            reg_username = st.text_input("New Username", key="reg_username")
            reg_password = st.text_input("New Password", type="password", key="reg_password")
            if st.button("Register"):
                authenticator = auth.Auth()  # Updated to Auth
                user = authenticator.register(reg_username, reg_password)
                if user:
                    st.success(f"Registered {reg_username}. Logging in...")
                    st.session_state.authenticated = True
                    st.session_state.user_role = user["role"]
                    st.session_state.username = reg_username
                    st.rerun()
                else:
                    st.error("Registration failed: Username already exists")
        return

    # Sidebar for navigation
    st.sidebar.title(f"Welcome, {st.session_state.username}")
    page = st.sidebar.selectbox("Select Page", ["Data Overview", "Predictions", "Reports"])

    # Initialize database
    database = db.DB()
    conn = sqlite3.connect(config_data["db_path"])

    if page == "Data Overview":
        st.subheader("Data Overview")
        # Load data
        clients = pd.read_sql_query("SELECT * FROM clients LIMIT 10", conn)
        loans = pd.read_sql_query("SELECT * FROM loans LIMIT 10", conn)
        transactions = pd.read_sql_query("SELECT * FROM transactions LIMIT 10", conn)

        # Display tables
        st.write("### Clients")
        st.dataframe(clients)
        st.write("### Loans")
        st.dataframe(loans)
        st.write("### Transactions")
        st.dataframe(transactions)

        # Simple visualization
        if not loans.empty:
            fig = px.bar(loans, x="status", title="Loan Status Distribution")
            st.plotly_chart(fig)

    elif page == "Predictions":
        st.subheader("Loan Default Predictions")
        if st.session_state.user_role == "admin":
            if st.button("Generate New Data and Predictions"):
                generator = generators.DataGenerator()
                clients = generator.generate_clients(10, fraud_ratio=0.2)
                loans = generator.generate_loans(clients, n_per_client=2, fraud_ratio=0.2)
                transactions = generator.generate_transactions(loans, n_per_loan=3, fraud_ratio=0.2)
                generator.save_to_db(clients, loans, transactions)
                trainer = train.ModelTrainer()
                X, y = trainer.prepare_data(clients, loans, transactions)
                model_id = trainer.train_model(X, y)
                predictor = predict.ModelPredictor()
                predictions = predictor.predict(model_id, {"clients": clients, "loans": loans, "transactions": transactions})
                # Save predictions to session state
                st.session_state.predictions = predictions
                st.session_state.model_id = model_id
                st.success(f"Generated data and trained model {model_id}")

            # Display predictions
            if "predictions" in st.session_state:
                st.write("### Predictions")
                st.dataframe(st.session_state.predictions)
                # Plot default probabilities
                fig = px.histogram(st.session_state.predictions, x="default_probability",
                                 title="Default Probability Distribution")
                st.plotly_chart(fig)
        else:
            st.error("Access restricted to admin users")

    elif page == "Reports":
        st.subheader("Reports")
        # Example report: Average income by region
        query = """
        SELECT region, AVG(income) as avg_income
        FROM clients
        GROUP BY region
        """
        report = pd.read_sql_query(query, conn)
        st.write("### Average Income by Region")
        st.dataframe(report)
        fig = px.bar(report, x="region", y="avg_income", title="Average Income by Region")
        st.plotly_chart(fig)

    conn.close()

if __name__ == "__main__":
    main()
''')
!test -f modules/streamlit_app/app.py && echo "app.py created" || echo "Failed to create app.py"

# Install Streamlit and ngrok
!pip install streamlit pyngrok plotly --quiet

# Run Streamlit with ngrok
from pyngrok import ngrok
import subprocess

# Set up ngrok with hardcoded authtoken
!ngrok config add-authtoken 31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF

# Start Streamlit server
port = 8501
public_url = ngrok.connect(port).public_url
print(f"Streamlit app running at: {public_url}")
subprocess.Popen(["streamlit", "run", "modules/streamlit_app/app.py", "--server.port", str(port)])

app.py created
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app running at: https://b5be53fa8cf6.ngrok-free.app


<Popen: returncode: None args: ['streamlit', 'run', 'modules/streamlit_app/a...>

In [None]:
import shutil, os

# === Step 0: Backup existing file if it exists ===
db_file = "modules/core/db.py"
backup_file = "modules/core/db_backup.py"
if os.path.exists(db_file):
    shutil.copy(db_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write the new db.py ===
code = """\
import sqlite3
import os
from datetime import datetime

class Database:
    def __init__(self, db_path="data/loaniq.db"):
        os.makedirs(os.path.dirname(db_path), exist_ok=True)
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.cursor = self.conn.cursor()
        self._init_tables()

    def _init_tables(self):
        # Users table with created_at column
        self.cursor.execute(\"\"\"
        CREATE TABLE IF NOT EXISTS users (
            user_id TEXT PRIMARY KEY,
            username TEXT UNIQUE,
            password TEXT,
            role TEXT,
            created_at TEXT
        )
        \"\"\")
        self.conn.commit()

    def add_user(self, user_id, username, password, role="user"):
        created_at = datetime.utcnow().isoformat()
        self.cursor.execute(\"\"\"
        INSERT INTO users (user_id, username, password, role, created_at)
        VALUES (?, ?, ?, ?, ?)
        \"\"\", (user_id, username, password, role, created_at))
        self.conn.commit()

    def list_users(self):
        self.cursor.execute("SELECT * FROM users")
        return self.cursor.fetchall()

# === Quick test ===
if __name__ == "__main__":
    db = Database()
    db.cursor.execute("PRAGMA table_info(users)")
    cols = [row[1] for row in db.cursor.fetchall()]
    print("✅ Users table columns:", cols)
"""

with open(db_file, "w") as f:
    f.write(code)

# === Step 2: Run test ===
from modules.core import db as db_module
db = db_module.Database()
db.cursor.execute("PRAGMA table_info(users)")
cols = [row[1] for row in db.cursor.fetchall()]
print("✅ Users table columns (test):", cols)

✅ Users table columns (test): ['user_id', 'username', 'password', 'role', 'created_at']


In [None]:
import shutil, os

# === Step 0: Backup existing file if it exists ===
cfg_file = "modules/core/config.py"
backup_file = "modules/core/config_backup.py"
if os.path.exists(cfg_file):
    shutil.copy(cfg_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write the new config.py ===
code = """\
import os

# === Admin Credentials ===
ADMIN_USERNAME = "admin"
ADMIN_PASSWORD = "Shady868"

# === Ngrok Token (env first, fallback to hardcoded) ===
NGROK_AUTHTOKEN = os.getenv(
    "NGROK_AUTHTOKEN",
    "31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF"
)

# === Database Path ===
DB_PATH = "data/loaniq.db"

def get_admin_credentials():
    return ADMIN_USERNAME, ADMIN_PASSWORD

def get_ngrok_token():
    return NGROK_AUTHTOKEN

def get_db_path():
    return DB_PATH
"""

with open(cfg_file, "w") as f:
    f.write(code)

# === Step 2: Run test ===
from modules.core import config

print("✅ Admin creds:", config.get_admin_credentials())
print("✅ Ngrok token (first 8 chars):", config.get_ngrok_token()[:8] + "...")
print("✅ DB path:", config.get_db_path())

✅ Admin creds: ('admin', 'Shady868')
✅ Ngrok token (first 8 chars): 31rYvgkl...
✅ DB path: data/loaniq.db


In [None]:
import shutil, os

# === Step 0: Backup existing file if it exists ===
tunnel_file = "modules/bootstrap/tunnel.py"
backup_file = "modules/bootstrap/tunnel_backup.py"
if os.path.exists(tunnel_file):
    shutil.copy(tunnel_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write the new tunnel.py ===
code = """\
import os
from pyngrok import ngrok, conf
from modules.core import config

def start(port:int=8501):
    token = config.get_ngrok_token()
    if token:
        conf.get_default().auth_token = token
    url = ngrok.connect(port, "http").public_url
    print(f"🌐 Ngrok tunnel started: {url}")
    return url

def stop():
    ngrok.kill()
    print("🛑 Ngrok tunnel stopped.")
"""

with open(tunnel_file, "w") as f:
    f.write(code)

# === Step 2: Run test ===
from modules.bootstrap import tunnel

print(">>> Starting Ngrok test tunnel...")
url = tunnel.start(8501)
print("✅ Test URL:", url)
tunnel.stop()

>>> Starting Ngrok test tunnel...
🌐 Ngrok tunnel started: https://07780d982eab.ngrok-free.app
✅ Test URL: https://07780d982eab.ngrok-free.app
🛑 Ngrok tunnel stopped.


In [None]:
import shutil, os

# === Step 0: Backup existing file if it exists ===
auth_file = "modules/core/auth.py"
backup_file = "modules/core/auth_backup.py"
if os.path.exists(auth_file):
    shutil.copy(auth_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write the new auth.py ===
code = """\
from modules.core import config, db

class AuthManager:
    def __init__(self):
        self.db = db.Database()
        self.admin_user, self.admin_pass = config.get_admin_credentials()

    def register(self, user_id, username, password, role="user"):
        # Adds new user into DB
        self.db.add_user(user_id, username, password, role)

    def login(self, username, password):
        # Admin login (hardcoded)
        if username == self.admin_user and password == self.admin_pass:
            return {"username": username, "role": "admin"}

        # Check database users
        users = self.db.list_users()
        for row in users:
            if row[1] == username and row[2] == password:
                return {"username": username, "role": row[3]}

        return None
"""

with open(auth_file, "w") as f:
    f.write(code)

# === Step 2: Run test ===
from modules.core.auth import AuthManager
import uuid

auth = AuthManager()

# Register test user
uid = str(uuid.uuid4())
auth.register(uid, "testuser", "testpass", role="user")

# Test login: user
user_login = auth.login("testuser", "testpass")
print("✅ User login:", user_login)

# Test login: admin
admin_login = auth.login("admin", "Shady868")
print("✅ Admin login:", admin_login)

# Test login: wrong
fail_login = auth.login("nosuch", "wrong")
print("✅ Failed login:", fail_login)

✅ User login: {'username': 'testuser', 'role': 'user'}
✅ Admin login: {'username': 'admin', 'role': 'admin'}
✅ Failed login: None


In [None]:
import shutil, os

# === Step 0: Backup existing file if it exists ===
main_file = "main.py"
backup_file = "main_backup.py"
if os.path.exists(main_file):
    shutil.copy(main_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write new main.py ===
code = """\
import streamlit as st
from modules.core.auth import AuthManager

# Import placeholder pages
import pages._01_placeholder as page01
import pages._02_placeholder as page02
import pages._03_placeholder as page03
import pages._04_placeholder as page04
import pages._05_placeholder as page05
import pages._06_placeholder as page06
import pages._07_placeholder as page07

PAGES = {
    "01 - Home": page01,
    "02 - Client Onboarding": page02,
    "03 - Client Dashboard": page03,
    "04 - Admin Sandbox": page04,
    "05 - Global Insights": page05,
    "06 - Reports & Exports": page06,
    "07 - Settings": page07,
}

def main():
    st.set_page_config(page_title="LoanIQ", layout="wide")
    st.title("💳 LoanIQ Platform")

    # === Auth ===
    if "user" not in st.session_state:
        st.session_state.user = None

    auth = AuthManager()

    if st.session_state.user is None:
        st.sidebar.subheader("Login")
        username = st.sidebar.text_input("Username")
        password = st.sidebar.text_input("Password", type="password")
        if st.sidebar.button("Login"):
            user = auth.login(username, password)
            if user:
                st.session_state.user = user
                st.experimental_rerun()
            else:
                st.error("Invalid credentials")
        st.stop()

    st.sidebar.success(f"Logged in as {st.session_state.user['username']} ({st.session_state.user['role']})")

    # === Page navigation ===
    choice = st.sidebar.radio("Go to", list(PAGES.keys()))
    page = PAGES[choice]
    if hasattr(page, "app"):
        page.app()
    else:
        st.warning("Page not yet implemented.")

if __name__ == "__main__":
    main()
"""

with open(main_file, "w") as f:
    f.write(code)

# === Step 2: Run test ===
print("✅ main.py written successfully")
print("✅ Pages wired:", [
    "01 - Home",
    "02 - Client Onboarding",
    "03 - Client Dashboard",
    "04 - Admin Sandbox",
    "05 - Global Insights",
    "06 - Reports & Exports",
    "07 - Settings"
])

✅ main.py written successfully
✅ Pages wired: ['01 - Home', '02 - Client Onboarding', '03 - Client Dashboard', '04 - Admin Sandbox', '05 - Global Insights', '06 - Reports & Exports', '07 - Settings']


In [None]:
import shutil, os

# === Step 0: Backup existing placeholder files ===
for i in range(1,8):
    page_file = f"pages/{i:02d}_placeholder.py"
    backup_file = f"pages/{i:02d}_placeholder_backup.py"
    if os.path.exists(page_file):
        shutil.copy(page_file, backup_file)
        print(f"📦 Backup created at {backup_file}")

# === Step 1: Write new page stubs ===
for i, title in enumerate([
    "Home",
    "Client Onboarding",
    "Client Dashboard",
    "Admin Sandbox",
    "Global Insights",
    "Reports & Exports",
    "Settings"
], start=1):
    code = f'''\
import streamlit as st

def app():
    st.header("📄 {title}")
    st.info("This page is not implemented yet. Placeholder only.")
'''
    with open(f"pages/{i:02d}_placeholder.py", "w") as f:
        f.write(code)

# === Step 2: Run test ===
import importlib
for i in range(1,8):
    mod = importlib.import_module(f"pages.{i:02d}_placeholder")
    print(f"✅ Page {i:02d} imported, has app():", hasattr(mod, "app"))

📦 Backup created at pages/01_placeholder_backup.py
📦 Backup created at pages/02_placeholder_backup.py
📦 Backup created at pages/03_placeholder_backup.py
📦 Backup created at pages/04_placeholder_backup.py
📦 Backup created at pages/05_placeholder_backup.py
📦 Backup created at pages/06_placeholder_backup.py
📦 Backup created at pages/07_placeholder_backup.py
✅ Page 01 imported, has app(): True
✅ Page 02 imported, has app(): True
✅ Page 03 imported, has app(): True
✅ Page 04 imported, has app(): True
✅ Page 05 imported, has app(): True
✅ Page 06 imported, has app(): True
✅ Page 07 imported, has app(): True


In [None]:
import subprocess, time
from modules.bootstrap import tunnel

# === Step 1: Start Ngrok tunnel ===
print(">>> Starting Ngrok tunnel on port 8501...")
url = tunnel.start(8501)

# === Step 2: Start Streamlit server ===
print(">>> Launching Streamlit app (main.py)...")
process = subprocess.Popen(["streamlit", "run", "main.py"])

# === Step 3: Wait and show public URL ===
time.sleep(5)  # give server a moment to boot
print("🌐 LoanIQ is live at:", url)
print("🛑 To stop: run tunnel.stop() and process.terminate()")

>>> Starting Ngrok tunnel on port 8501...
🌐 Ngrok tunnel started: https://4c96b790cd9d.ngrok-free.app
>>> Launching Streamlit app (main.py)...
🌐 LoanIQ is live at: https://4c96b790cd9d.ngrok-free.app
🛑 To stop: run tunnel.stop() and process.terminate()


In [None]:
import shutil, os

# === Step 0: Backup if it exists (just in case) ===
gen_file = "modules/synth/generators.py"
backup_file = "modules/synth/generators_backup.py"
if os.path.exists(gen_file):
    shutil.copy(gen_file, backup_file)
    print(f"📦 Backup created at {backup_file}")

# === Step 1: Write stub generators.py ===
code = """\
import pandas as pd

def generate_synthetic_clients(n=10):
    # Temporary stub — real Faker logic will come in Patch 8
    data = {
        "client_id": [f"C{i:03d}" for i in range(1, n+1)],
        "loan_amount": [1000 + i*50 for i in range(1, n+1)],
        "status": ["active"]*n,
    }
    return pd.DataFrame(data)
"""

with open(gen_file, "w") as f:
    f.write(code)

# === Step 2: Test import ===
from modules.synth import generators
df = generators.generate_synthetic_clients(5)
print("✅ Synthetic stub data:")
print(df)

✅ Synthetic stub data:
  client_id  loan_amount  status
0      C001         1050  active
1      C002         1100  active
2      C003         1150  active
3      C004         1200  active
4      C005         1250  active


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

fake = Faker(["en_KE"])

# 50 towns in Kenya spread across regions
TOWNS = [
    ("Nairobi","Central"),("Mombasa","Coast"),("Kisumu","Western"),("Nakuru","Rift"),
    ("Eldoret","Rift"),("Thika","Central"),("Machakos","Eastern"),("Nyeri","Central"),
    ("Meru","Eastern"),("Malindi","Coast"),("Garissa","Eastern"),("Embu","Eastern"),
    ("Kericho","Rift"),("Narok","Rift"),("Naivasha","Rift"),("Kakamega","Western"),
    ("Bungoma","Western"),("Kitale","Rift"),("Isiolo","Eastern"),("Wajir","Eastern"),
    ("Lamu","Coast"),("Kilifi","Coast"),("Kwale","Coast"),("Voi","Coast"),
    ("Marsabit","Eastern"),("Nanyuki","Central"),("Kiambu","Central"),("Muranga","Central"),
    ("Kerugoya","Central"),("Chuka","Eastern"),("Loitoktok","Rift"),("Kajiado","Rift"),
    ("Mwingi","Eastern"),("Hola","Coast"),("Makueni","Eastern"),("Sotik","Rift"),
    ("Litein","Rift"),("Kabarnet","Rift"),("Baringo","Rift"),("Migori","Western"),
    ("Homa Bay","Western"),("Siaya","Western"),("Busia","Western"),("Keroka","Western"),
    ("Oyugis","Western"),("Kitui","Eastern"),("Taveta","Coast"),("Kilgoris","Rift"),
    ("Kapenguria","Rift"),("Maralal","Rift")
]

PRODUCTS = [
    ("Inuka","5 weeks"),
    ("Kuza","4 weeks"),
    ("Fadhili","6 weeks"),
    ("Imarika","8 weeks"),
    ("Chama Boost","12 weeks")
]

LOAN_TYPES = ["normal","topup","emergency","business","group"]
STATUSES = ["active","pending_branch_approval"]

def infer_age_from_id(national_id: str) -> tuple[int,str]:
    """Infer approximate age from Kenyan ID ranges."""
    try:
        nid = int(national_id)
    except:
        return (random.randint(18,65),"unknown")
    if nid < 7000000:
        return (65,"65+ legacy")
    if 31000000 <= nid <= 33500000:
        return (random.randint(25,32),"25-32")
    if 33500001 <= nid <= 36000000:
        return (random.randint(18,25),"18-25")
    return (random.randint(33,60),"33-60")

def guess_gender_from_name(name: str) -> str:
    common_female = ["Achieng","Wambui","Njeri","Atieno","Chebet","Wanjiku","Naliaka"]
    if any(part in name for part in common_female):
        return "F"
    if name.lower().endswith("a"):
        return "F"
    return "M"

def generate_clients_loans(n_rows:int=1000, seed:int|None=None) -> pd.DataFrame:
    if seed is not None:
        random.seed(seed); np.random.seed(seed); Faker.seed(seed)
    rows = []
    for i in range(n_rows):
        name = fake.name()
        national_id = str(random.choice([random.randint(31000000,36000000), random.randint(7000000,36000000)]))
        age, age_band = infer_age_from_id(national_id)
        gender = guess_gender_from_name(name)
        branch, region = random.choice(TOWNS)
        product, weekly_term = random.choice(PRODUCTS)
        amount = random.randint(5000, 50000)
        ref_number = f"REF{random.randint(100000,999999)}"
        loan_type = random.choice(LOAN_TYPES)
        status = random.choices(STATUSES, weights=[0.85,0.15])[0]
        created_date = fake.date_between(start_date="-24m", end_date="today")

        # derived features
        income = random.randint(5000,100000)
        dti = round(amount / max(income,1),2)
        risk_score = max(300, min(850, int(850 - dti*400 + random.gauss(0,30))))
        default_prob = round(1/(1+np.exp(-(0.05*dti*100 + random.gauss(0,1)))),3)
        loan_health = "performing" if default_prob < 0.3 else "at_risk" if default_prob < 0.6 else "non_performing"
        business_type = random.choice(["retail","farming","transport","service","manufacturing","informal"])
        phone = f"+2547{random.randint(0,9)}{random.randint(1000000,9999999)}"

        rows.append({
            "customer_name": name,
            "national_id": national_id,
            "age": age,
            "age_band": age_band,
            "gender": gender,
            "branch": branch,
            "region": region,
            "product": product,
            "weekly_term": weekly_term,
            "amount": amount,
            "ref_number": ref_number,
            "loan_type": loan_type,
            "status": status,
            "loan_health": loan_health,
            "created_date": created_date,
            "income": income,
            "dti": dti,
            "risk_score": risk_score,
            "default_prob": default_prob,
            "business_type": business_type,
            "phone": phone
        })
    return pd.DataFrame(rows)

def generate_batches(total_rows=50000, batch_size=10000, out_dir="data/synth_batches", fmt="csv", seed=None):
    os.makedirs(out_dir, exist_ok=True)
    n_batches = (total_rows + batch_size - 1)//batch_size
    for i in range(n_batches):
        df = generate_clients_loans(n_rows=batch_size, seed=(None if seed is None else seed+i))
        out_path = os.path.join(out_dir, f"batch_{i+1}.{fmt}")
        if fmt=="csv":
            df.to_csv(out_path, index=False)
        else:
            df.to_parquet(out_path, index=False)
    return out_dir

Overwriting modules/synth/generators.py


In [None]:
%%writefile pages/_02_client_onboarding.py
try:
    import streamlit as st
except ImportError:
    st = None

import pandas as pd
from modules.synth import generators as g

def build_dataset(n_rows:int=1000, seed:int|None=None) -> pd.DataFrame:
    """Helper for generating Kenya-specific synthetic loan data."""
    return g.generate_clients_loans(n_rows=n_rows, seed=seed)

def app():
    if st is None:
        return  # No UI outside Streamlit

    st.header("🧩 Client Onboarding — Synthetic Data")
    st.write("Generate Kenya-specific synthetic loan data tailored to your needs.")

    with st.form(key="onboarding_form"):
        n_rows = st.number_input("Number of clients (rows)", min_value=50, max_value=200000, value=1000, step=50)
        seed = st.number_input("Random seed (optional)", min_value=0, max_value=10_000, value=42, step=1)
        submitted = st.form_submit_button("Generate dataset")

    if submitted:
        with st.spinner("Generating dataset..."):
            df = build_dataset(n_rows=int(n_rows), seed=int(seed))
        st.success(f"Generated {len(df):,} rows")
        st.dataframe(df.head(100))

        # Save a local CSV copy
        try:
            out_path = "data/onboarding_last.csv"
            df.to_csv(out_path, index=False)
            st.caption(f"Saved a copy at {out_path}")
        except Exception as e:
            st.warning(f"Could not save CSV: {e}")

        # Provide download
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        st.download_button("⬇️ Download CSV", csv_bytes, file_name="loaniq_synthetic.csv", mime="text/csv")

Writing pages/_02_client_onboarding.py


In [None]:
!grep -n "def generate_clients_loans" modules/synth/generators.py

59:def generate_clients_loans(n_rows:int=1000, seed:int|None=None) -> pd.DataFrame:


In [None]:
!pip install faker streamlit pyngrok pandas pyarrow

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.6.0


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os

# ✅ fallback to English Faker (but we’ll inject Kenyan names ourselves)
fake = Faker("en")

# Kenyan-specific first names and surnames
KENYAN_MALE_NAMES = [
    "Otieno","Kamau","Mwangi","Mutiso","Obuya","Kiptoo","Cheruiyot","Omondi",
    "Ochieng","Mutua","Chebet","Kiplagat","Njoroge","Barasa","Were","Omondi",
]
KENYAN_FEMALE_NAMES = [
    "Achieng","Wambui","Njeri","Atieno","Chebet","Wanjiku","Naliaka","Nyambura",
    "Mwende","Cherono","Akoth","Kendi","Atieno","Syombua","Nasimiyu","Chelangat",
]
KENYAN_SURNAMES = [
    "Omondi","Otieno","Mutiso","Kamau","Kiptoo","Cheruiyot","Mwangi","Barasa",
    "Were","Ochieng","Njoroge","Obuya","Chebet","Mutua","Okoth","Kosgey","Kimani"
]

# 50 towns in Kenya spread across regions
TOWNS = [
    ("Nairobi","Central"),("Mombasa","Coast"),("Kisumu","Western"),("Nakuru","Rift"),
    ("Eldoret","Rift"),("Thika","Central"),("Machakos","Eastern"),("Nyeri","Central"),
    ("Meru","Eastern"),("Malindi","Coast"),("Garissa","Eastern"),("Embu","Eastern"),
    ("Kericho","Rift"),("Narok","Rift"),("Naivasha","Rift"),("Kakamega","Western"),
    ("Bungoma","Western"),("Kitale","Rift"),("Isiolo","Eastern"),("Wajir","Eastern"),
    ("Lamu","Coast"),("Kilifi","Coast"),("Kwale","Coast"),("Voi","Coast"),
    ("Marsabit","Eastern"),("Nanyuki","Central"),("Kiambu","Central"),("Muranga","Central"),
    ("Kerugoya","Central"),("Chuka","Eastern"),("Loitoktok","Rift"),("Kajiado","Rift"),
    ("Mwingi","Eastern"),("Hola","Coast"),("Makueni","Eastern"),("Sotik","Rift"),
    ("Litein","Rift"),("Kabarnet","Rift"),("Baringo","Rift"),("Migori","Western"),
    ("Homa Bay","Western"),("Siaya","Western"),("Busia","Western"),("Keroka","Western"),
    ("Oyugis","Western"),("Kitui","Eastern"),("Taveta","Coast"),("Kilgoris","Rift"),
    ("Kapenguria","Rift"),("Maralal","Rift")
]

PRODUCTS = [
    ("Inuka","5 weeks"),
    ("Kuza","4 weeks"),
    ("Fadhili","6 weeks"),
    ("Imarika","8 weeks"),
    ("Chama Boost","12 weeks")
]

LOAN_TYPES = ["normal","topup","emergency","business","group"]
STATUSES = ["active","pending_branch_approval"]

def random_kenyan_name() -> tuple[str,str]:
    """Return (full_name, gender)."""
    if random.random() < 0.5:
        first = random.choice(KENYAN_MALE_NAMES)
        gender = "M"
    else:
        first = random.choice(KENYAN_FEMALE_NAMES)
        gender = "F"
    surname = random.choice(KENYAN_SURNAMES)
    return f"{first} {surname}", gender

def infer_age_from_id(national_id: str) -> tuple[int,str]:
    try:
        nid = int(national_id)
    except:
        return (random.randint(18,65),"unknown")
    if nid < 7000000:
        return (65,"65+ legacy")
    if 31000000 <= nid <= 33500000:
        return (random.randint(25,32),"25-32")
    if 33500001 <= nid <= 36000000:
        return (random.randint(18,25),"18-25")
    return (random.randint(33,60),"33-60")

def generate_clients_loans(n_rows:int=1000, seed:int|None=None) -> pd.DataFrame:
    if seed is not None:
        random.seed(seed); np.random.seed(seed); Faker.seed(seed)
    rows = []
    for i in range(n_rows):
        name, gender = random_kenyan_name()
        national_id = str(random.choice([random.randint(31000000,36000000), random.randint(7000000,36000000)]))
        age, age_band = infer_age_from_id(national_id)
        branch, region = random.choice(TOWNS)
        product, weekly_term = random.choice(PRODUCTS)
        amount = random.randint(5000, 50000)
        ref_number = f"REF{random.randint(100000,999999)}"
        loan_type = random.choice(LOAN_TYPES)
        status = random.choices(STATUSES, weights=[0.85,0.15])[0]
        created_date = fake.date_between(start_date="-24m", end_date="today")

        # derived features
        income = random.randint(5000,100000)
        dti = round(amount / max(income,1),2)
        risk_score = max(300, min(850, int(850 - dti*400 + random.gauss(0,30))))
        default_prob = round(1/(1+np.exp(-(0.05*dti*100 + random.gauss(0,1)))),3)
        loan_health = "performing" if default_prob < 0.3 else "at_risk" if default_prob < 0.6 else "non_performing"
        business_type = random.choice(["retail","farming","transport","service","manufacturing","informal"])
        phone = f"+2547{random.randint(0,9)}{random.randint(1000000,9999999)}"

        rows.append({
            "customer_name": name,
            "national_id": national_id,
            "age": age,
            "age_band": age_band,
            "gender": gender,
            "branch": branch,
            "region": region,
            "product": product,
            "weekly_term": weekly_term,
            "amount": amount,
            "ref_number": ref_number,
            "loan_type": loan_type,
            "status": status,
            "loan_health": loan_health,
            "created_date": created_date,
            "income": income,
            "dti": dti,
            "risk_score": risk_score,
            "default_prob": default_prob,
            "business_type": business_type,
            "phone": phone
        })
    return pd.DataFrame(rows)

def generate_batches(total_rows=50000, batch_size=10000, out_dir="data/synth_batches", fmt="csv", seed=None):
    os.makedirs(out_dir, exist_ok=True)
    n_batches = (total_rows + batch_size - 1)//batch_size
    for i in range(n_batches):
        df = generate_clients_loans(n_rows=batch_size, seed=(None if seed is None else seed+i))
        out_path = os.path.join(out_dir, f"batch_{i+1}.{fmt}")
        if fmt=="csv":
            df.to_csv(out_path, index=False)
        else:
            df.to_parquet(out_path, index=False)
    return out_dir

Overwriting modules/synth/generators.py


In [None]:
import importlib
import modules.synth.generators as g
importlib.reload(g)

df = g.generate_clients_loans(n_rows=10, seed=123)
print(df[["customer_name","gender","branch","product","amount"]])

     customer_name gender    branch      product  amount
0     Mwangi Mutua      M   Makueni  Chama Boost   26770
1    Atieno Mutiso      F    Kisumu        Inuka   48721
2  Cherono Njoroge      F   Baringo  Chama Boost   42015
3    Syombua Obuya      F   Kajiado      Fadhili   16149
4     Kendi Omondi      F    Mwingi        Inuka   46864
5      Were Otieno      M   Mombasa      Fadhili   28071
6      Njeri Mutua      F   Makueni      Imarika    6909
7   Syombua Mutiso      F  Homa Bay        Inuka   38993
8     Akoth Kimani      F   Muranga        Inuka   13913
9    Chebet Kimani      M    Mwingi        Inuka   40747


In [None]:
%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")
        n_rows = st.number_input("Number of clients", 1000, 200000, 10000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 10000, step=1000)
        fraud_toggle = st.checkbox("Inject fraud patterns", value=False)
        if st.button("Generate Batches"):
            st.success(f"Would generate {n_rows} rows in batches of {batch_size}. Fraud={fraud_toggle}")

    # --- Tab 2: Models ---
    with tabs[1]:
        st.subheader("Model Training & Controls")
        algo = st.selectbox("Choose algorithm", ["LogReg","XGBoost","SGD"])
        lr = st.slider("Learning rate", 0.001, 0.5, 0.1)
        if st.button("Train Model"):
            st.info(f"Training {algo} with LR={lr}... (stub)")

    # --- Tab 3: Experiments ---
    with tabs[2]:
        st.subheader("Model Experiments (A/B Tests)")
        st.write("Compare ROC, PR, metrics between models (stub).")
        st.button("Run A/B Test")

    # --- Tab 4: Stress Tests ---
    with tabs[3]:
        st.subheader("What-If Shocks & Stress Testing")
        shock = st.selectbox("Shock type", ["Interest Rate ↑", "Unemployment ↑", "Repayment ↓"])
        if st.button("Simulate Shock"):
            st.warning(f"Simulating {shock}... (stub)")

    # --- Tab 5: Admin Tools ---
    with tabs[4]:
        st.subheader("Admin Actions")
        client_id = st.text_input("Impersonate Client ID")
        if st.button("Impersonate"):
            st.info(f"Impersonating {client_id}... (stub)")
        st.button("Emergency Kill-Switch")

    # --- Tab 6: Audit Viewer ---
    with tabs[5]:
        st.subheader("Audit Logs")
        st.write("Searchable logs (stub).")

Writing pages/_04_admin_sandbox.py


In [None]:
%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

from modules.synth import generators as g
import os

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")

        n_rows = st.number_input("Total number of clients", 1000, 200000, 20000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 5000, step=1000)
        fmt = st.selectbox("Output format", ["csv","parquet"])
        fraud_toggle = st.checkbox("Inject fraud patterns (future)")

        if st.button("Generate Batches"):
            with st.spinner("Generating synthetic data batches..."):
                out_dir = g.generate_batches(
                    total_rows=int(n_rows),
                    batch_size=int(batch_size),
                    out_dir="data/sandbox_batches",
                    fmt=fmt,
                    seed=42
                )
            st.success(f"✅ Generated {n_rows} rows into {out_dir}")
            files = os.listdir(out_dir)
            st.write("Files:", files[:5], "...")

Overwriting pages/_04_admin_sandbox.py


In [None]:
from modules.synth import generators as g

out_dir = g.generate_batches(total_rows=1200, batch_size=500, out_dir="data/test_batches", fmt="csv", seed=123)
import os
print("✅ Files created:", os.listdir(out_dir))

✅ Files created: ['batch_3.csv', 'batch_1.csv', 'batch_2.csv']


In [None]:
%%writefile modules/core/utils.py
import os, csv, sqlite3
from datetime import datetime

AUDIT_DB = "data/audit.db"
AUDIT_CSV = "data/audit_log.csv"

def _init_storage():
    os.makedirs("data", exist_ok=True)

    # SQLite
    conn = sqlite3.connect(AUDIT_DB)
    cur = conn.cursor()
    cur.execute("""
    CREATE TABLE IF NOT EXISTS audit_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        user TEXT,
        action TEXT,
        params TEXT,
        status TEXT
    )
    """)
    conn.commit()
    conn.close()

    # CSV
    if not os.path.exists(AUDIT_CSV):
        with open(AUDIT_CSV, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp","user","action","params","status"])

def audit_log(user:str, action:str, params:str="", status:str="OK"):
    """Record an audit event (user, action, params, status)."""
    _init_storage()
    ts = datetime.utcnow().isoformat()

    # SQLite
    conn = sqlite3.connect(AUDIT_DB)
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO audit_log (timestamp,user,action,params,status) VALUES (?,?,?,?,?)",
        (ts,user,action,params,status)
    )
    conn.commit()
    conn.close()

    # CSV
    with open(AUDIT_CSV, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([ts,user,action,params,status])

    return {"timestamp":ts,"user":user,"action":action,"params":params,"status":status}

Writing modules/core/utils.py


In [None]:
%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

from modules.synth import generators as g
from modules.core import utils
import os

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")

        n_rows = st.number_input("Total number of clients", 1000, 200000, 20000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 5000, step=1000)
        fmt = st.selectbox("Output format", ["csv","parquet"])
        fraud_toggle = st.checkbox("Inject fraud patterns (future)")

        if st.button("Generate Batches"):
            with st.spinner("Generating synthetic data batches..."):
                out_dir = g.generate_batches(
                    total_rows=int(n_rows),
                    batch_size=int(batch_size),
                    out_dir="data/sandbox_batches",
                    fmt=fmt,
                    seed=42
                )
            utils.audit_log(user="admin", action="Generate Batches",
                            params=f"rows={n_rows},batch={batch_size},fmt={fmt},fraud={fraud_toggle}",
                            status="OK")
            st.success(f"✅ Generated {n_rows} rows into {out_dir}")
            files = os.listdir(out_dir)
            st.write("Files:", files[:5], "...")

    # --- Tab 2: Models ---
    with tabs[1]:
        st.subheader("Model Training & Controls")
        algo = st.selectbox("Choose algorithm", ["LogReg","XGBoost","SGD"])
        lr = st.slider("Learning rate", 0.001, 0.5, 0.1)
        if st.button("Train Model"):
            utils.audit_log(user="admin", action="Train Model", params=f"algo={algo},lr={lr}", status="OK")
            st.info(f"Training {algo} with LR={lr}... (stub)")

    # --- Tab 3: Experiments ---
    with tabs[2]:
        st.subheader("Model Experiments (A/B Tests)")
        st.write("Compare ROC, PR, metrics between models (stub).")
        if st.button("Run A/B Test"):
            utils.audit_log(user="admin", action="Run A/B Test", status="OK")
            st.info("Running A/B test... (stub)")

    # --- Tab 4: Stress Tests ---
    with tabs[3]:
        st.subheader("What-If Shocks & Stress Testing")
        shock = st.selectbox("Shock type", ["Interest Rate ↑", "Unemployment ↑", "Repayment ↓"])
        if st.button("Simulate Shock"):
            utils.audit_log(user="admin", action="Simulate Shock", params=shock, status="OK")
            st.warning(f"Simulating {shock}... (stub)")

    # --- Tab 5: Admin Tools ---
    with tabs[4]:
        st.subheader("Admin Actions")
        client_id = st.text_input("Impersonate Client ID")
        if st.button("Impersonate"):
            utils.audit_log(user="admin", action="Impersonate", params=client_id, status="OK")
            st.info(f"Impersonating {client_id}... (stub)")
        if st.button("Emergency Kill-Switch"):
            utils.audit_log(user="admin", action="Kill-Switch", status="OK")
            st.error("Kill switch triggered! (stub)")

    # --- Tab 6: Audit Viewer ---
    with tabs[5]:
        st.subheader("Audit Logs")

        import sqlite3
        import pandas as pd

        conn = sqlite3.connect("data/audit.db")
        df = pd.read_sql_query("SELECT * FROM audit_log ORDER BY id DESC LIMIT 200", conn)
        conn.close()

        if len(df) == 0:
            st.info("No logs yet.")
        else:
            st.dataframe(df)
            st.download_button(
                "⬇️ Download logs (CSV)",
                df.to_csv(index=False),
                file_name="audit_log.csv"
            )

Overwriting pages/_04_admin_sandbox.py


In [None]:
from modules.core import utils

# Simulate events
utils.audit_log("admin","Generate Batches","rows=1000,batch=200",status="OK")
utils.audit_log("admin","Train Model","algo=LogReg,lr=0.1",status="OK")

# Preview logs
!head -n 10 data/audit_log.csv

timestamp,user,action,params,status
2025-08-31T09:18:52.587395,admin,Generate Batches,"rows=1000,batch=200",OK
2025-08-31T09:18:52.597125,admin,Train Model,"algo=LogReg,lr=0.1",OK


In [None]:
%%writefile modules/ml/engine.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, confusion_matrix
from imblearn.over_sampling import SMOTE
import shap
import joblib, os

# --- Model Registry ---
MODEL_DIR = "data/models"
os.makedirs(MODEL_DIR, exist_ok=True)

def preprocess(df, target_col="default"):
    """Split, scale, rebalance dataset."""
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Rebalance with SMOTE
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

    return X_train_bal, X_test_scaled, y_train_bal, y_test, scaler

def train_model(df, algo="LogReg", target_col="default", **kwargs):
    """Train a model and return metrics + SHAP explainability."""
    X_train, X_test, y_train, y_test, scaler = preprocess(df, target_col)

    if algo == "LogReg":
        model = LogisticRegression(max_iter=1000, **kwargs)
    elif algo == "XGBoost":
        model = XGBClassifier(eval_metric="logloss", use_label_encoder=False, **kwargs)
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", max_iter=1000, **kwargs)
    else:
        raise ValueError("Unknown algorithm")

    # Fit
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1] if hasattr(model,"predict_proba") else None
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    pr, rc, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
    cm = confusion_matrix(y_test, y_pred)

    metrics = {
        "algo": algo,
        "auc": auc,
        "precision": pr,
        "recall": rc,
        "f1": f1,
        "confusion_matrix": cm.tolist()
    }

    # SHAP values
    try:
        explainer = shap.Explainer(model, X_train)
        shap_values = explainer(X_test[:50])  # sample
    except Exception as e:
        shap_values = None

    # Save model
    fname = os.path.join(MODEL_DIR, f"{algo}_model.pkl")
    joblib.dump({"model":model,"scaler":scaler}, fname)

    return metrics, shap_values, fname

def blend_models(df, algos=["LogReg","XGBoost","SGD"], target_col="default"):
    """Blend multiple models and return weighted hybrid predictions."""
    preds = []
    weights = []
    X_train, X_test, y_train, y_test, scaler = preprocess(df, target_col)

    for algo in algos:
        m, _, path = train_model(df, algo, target_col)
        model_bundle = joblib.load(path)
        model = model_bundle["model"]
        y_proba = model.predict_proba(X_test)[:,1] if hasattr(model,"predict_proba") else None
        if y_proba is not None:
            auc = m["auc"] or 0.5
            preds.append(y_proba)
            weights.append(auc)

    # Weighted average
    weights = np.array(weights)/sum(weights)
    blended = np.average(preds, axis=0, weights=weights)

    auc = roc_auc_score(y_test, blended)
    return {"algo":"HybridBlend","auc":auc,"weights":weights.tolist()}

Writing modules/ml/engine.py


In [None]:
%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

from modules.synth import generators as g
from modules.core import utils
import os

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")

        n_rows = st.number_input("Total number of clients", 1000, 200000, 20000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 5000, step=1000)
        fmt = st.selectbox("Output format", ["csv","parquet"])
        fraud_toggle = st.checkbox("Inject fraud patterns (future)")

        if st.button("Generate Batches"):
            with st.spinner("Generating synthetic data batches..."):
                out_dir = g.generate_batches(
                    total_rows=int(n_rows),
                    batch_size=int(batch_size),
                    out_dir="data/sandbox_batches",
                    fmt=fmt,
                    seed=42
                )
            utils.audit_log(user="admin", action="Generate Batches",
                            params=f"rows={n_rows},batch={batch_size},fmt={fmt},fraud={fraud_toggle}",
                            status="OK")
            st.success(f"✅ Generated {n_rows} rows into {out_dir}")
            files = os.listdir(out_dir)
            st.write("Files:", files[:5], "...")

    # --- Tab 2: Models ---
    with tabs[1]:
        st.subheader("Model Training & Controls")

        algo = st.selectbox("Choose algorithm", ["LogReg","XGBoost","SGD","HybridBlend"])
        lr = st.slider("Learning rate (for SGD/XGB)", 0.001, 0.5, 0.1)

        if st.button("Train Model"):
            from modules.ml import engine
            import pandas as pd
            import matplotlib.pyplot as plt

            # Load a dataset (from batches generated in Tab 1)
            sample_file = "data/sandbox_batches/batch_000.csv"
            if not os.path.exists(sample_file):
                st.error("No synthetic data found. Please generate data in Tab 1 first.")
            else:
                df = pd.read_csv(sample_file)
                if "default" not in df.columns:
                    # Add fake target column for demo
                    df["default"] = (df["loan_health"] != "performing").astype(int)

                if algo == "HybridBlend":
                    metrics = engine.blend_models(df)
                    shap_values = None
                else:
                    metrics, shap_values, path = engine.train_model(df, algo, lr=lr)

                utils.audit_log(user="admin", action="Train Model", params=algo, status="OK")

                st.json(metrics)
                if shap_values is not None:
                    st.write("Feature importance (SHAP):")
                    fig = plt.figure()
                    shap.plots.bar(shap_values, show=False)
                    st.pyplot(fig)

    # --- Tab 3: Experiments ---
    with tabs[2]:
        st.subheader("Model Experiments (A/B Tests)")
        st.write("Compare ROC, PR, metrics between models (stub).")
        if st.button("Run A/B Test"):
            utils.audit_log(user="admin", action="Run A/B Test", status="OK")
            st.info("Running A/B test... (stub)")

    # --- Tab 4: Stress Tests ---
    with tabs[3]:
        st.subheader("What-If Shocks & Stress Testing")
        shock = st.selectbox("Shock type", ["Interest Rate ↑", "Unemployment ↑", "Repayment ↓"])
        if st.button("Simulate Shock"):
            utils.audit_log(user="admin", action="Simulate Shock", params=shock, status="OK")
            st.warning(f"Simulating {shock}... (stub)")

    # --- Tab 5: Admin Tools ---
    with tabs[4]:
        st.subheader("Admin Actions")
        client_id = st.text_input("Impersonate Client ID")
        if st.button("Impersonate"):
            utils.audit_log(user="admin", action="Impersonate", params=client_id, status="OK")
            st.info(f"Impersonating {client_id}... (stub)")
        if st.button("Emergency Kill-Switch"):
            utils.audit_log(user="admin", action="Kill-Switch", status="OK")
            st.error("Kill switch triggered! (stub)")

    # --- Tab 6: Audit Viewer ---
    with tabs[5]:
        st.subheader("Audit Logs")

        import sqlite3
        import pandas as pd

        conn = sqlite3.connect("data/audit.db")
        df = pd.read_sql_query("SELECT * FROM audit_log ORDER BY id DESC LIMIT 200", conn)
        conn.close()

        if len(df) == 0:
            st.info("No logs yet.")
        else:
            st.dataframe(df)
            st.download_button(
                "⬇️ Download logs (CSV)",
                df.to_csv(index=False),
                file_name="audit_log.csv"
            )

Overwriting pages/_04_admin_sandbox.py


In [None]:
from modules.synth import generators as g
import os

# Generate 1000 rows, split into 500-row batches
out_dir = g.generate_batches(
    total_rows=1000,
    batch_size=500,
    out_dir="data/sandbox_batches",
    fmt="csv",
    seed=42
)

print("✅ Batches saved to:", out_dir)
print("Files in folder:", os.listdir(out_dir))

✅ Batches saved to: data/sandbox_batches
Files in folder: ['batch_1.csv', 'batch_2.csv']


In [None]:
import os
print("data/ exists?", os.path.exists("data"))
print("sandbox_batches exists?", os.path.exists("data/sandbox_batches"))
if os.path.exists("data/sandbox_batches"):
    print("Files:", os.listdir("data/sandbox_batches"))

data/ exists? True
sandbox_batches exists? True
Files: ['batch_1.csv', 'batch_2.csv']


In [None]:
from modules.synth import generators as g

out_dir = g.generate_batches(
    total_rows=1000,      # 1k rows
    batch_size=500,       # 2 batches
    out_dir="data/sandbox_batches",
    fmt="csv",
    seed=42
)

print("✅ Generated batches:", out_dir)
print("Files now:", os.listdir(out_dir))

✅ Generated batches: data/sandbox_batches
Files now: ['batch_1.csv', 'batch_2.csv']


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random, os

fake = Faker("en")

# Kenyan towns, products etc. (trimmed for brevity here)
KENYAN_TOWNS = ["Nairobi","Mombasa","Kisumu","Nakuru","Eldoret","Meru","Nyeri","Machakos","Embu","Kitale"]
PRODUCTS = ["Inuka 5 weeks","Kuza 4 weeks","Fadhili 6 weeks","Jipange 8 weeks"]
LOAN_TYPES = ["normal","group","business"]
STATUSES = ["active","pending approval"]
HEALTH = ["performing","watch","non-performing"]

def generate_clients_loans(n_rows:int=1000, seed:int|None=None) -> pd.DataFrame:
    if seed: random.seed(seed); np.random.seed(seed)
    data = []
    for _ in range(n_rows):
        name = fake.name()
        natid = random.randint(32000000, 34000000)  # simulate Kenyan IDs
        age = (natid//100000 - 320) + 20           # crude age approximation
        branch = random.choice(KENYAN_TOWNS)
        product = random.choice(PRODUCTS)
        amount = random.randint(1000, 50000)
        loan_type = random.choice(LOAN_TYPES)
        status = random.choice(STATUSES)
        health = random.choice(HEALTH)
        data.append([name, natid, age, branch, product, amount, loan_type, status, health])
    return pd.DataFrame(data, columns=["customer_name","national_id","age","branch","product",
                                       "amount","loan_type","status","loan_health"])

def generate_batches(total_rows:int=10000, batch_size:int=2000, out_dir="data/sandbox_batches",
                     fmt="csv", seed:int|None=None):
    """Generate multiple batches with consistent naming (batch_000.csv, batch_001.csv, …)."""
    os.makedirs(out_dir, exist_ok=True)
    n_batches = int(np.ceil(total_rows/batch_size))
    for i in range(n_batches):
        df = generate_clients_loans(min(batch_size,total_rows-i*batch_size), seed=seed)
        fname = os.path.join(out_dir, f"batch_{i:03d}.{fmt}")
        if fmt=="csv":
            df.to_csv(fname, index=False)
        else:
            df.to_parquet(fname, index=False)
    return out_dir

Overwriting modules/synth/generators.py


In [None]:
import importlib
import modules.synth.generators as g

importlib.reload(g)

# Now regenerate batches
out_dir = g.generate_batches(1000, 500, "data/sandbox_batches", "csv", seed=42)
print("Files:", os.listdir(out_dir))

Files: ['batch_1.csv', 'batch_2.csv', 'batch_001.csv', 'batch_000.csv']


In [None]:
%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

from modules.synth import generators as g
from modules.core import utils
import os, glob

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")

        n_rows = st.number_input("Total number of clients", 1000, 200000, 20000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 5000, step=1000)
        fmt = st.selectbox("Output format", ["csv","parquet"])
        fraud_toggle = st.checkbox("Inject fraud patterns (future)")

        if st.button("Generate Batches"):
            with st.spinner("Generating synthetic data batches..."):
                out_dir = g.generate_batches(
                    total_rows=int(n_rows),
                    batch_size=int(batch_size),
                    out_dir="data/sandbox_batches",
                    fmt=fmt,
                    seed=42
                )
            utils.audit_log(user="admin", action="Generate Batches",
                            params=f"rows={n_rows},batch={batch_size},fmt={fmt},fraud={fraud_toggle}",
                            status="OK")
            st.success(f"✅ Generated {n_rows} rows into {out_dir}")
            files = os.listdir(out_dir)
            st.write("Files:", files[:5], "...")

    # --- Tab 2: Models ---
    with tabs[1]:
        st.subheader("Model Training & Controls")

        algo = st.selectbox("Choose algorithm", ["LogReg","XGBoost","SGD","HybridBlend"])
        lr = st.slider("Learning rate (for SGD/XGB)", 0.001, 0.5, 0.1)

        if st.button("Train Model"):
            from modules.ml import engine
            import pandas as pd
            import matplotlib.pyplot as plt
            import shap

            # ✅ Flexible batch loader (works with batch_1.csv or batch_000.csv)
            files = sorted(glob.glob("data/sandbox_batches/batch_*.csv"))
            if not files:
                st.error("No synthetic data found. Please generate data in Tab 1 first.")
            else:
                sample_file = files[0]
                df = pd.read_csv(sample_file)
                if "default" not in df.columns:
                    df["default"] = (df["loan_health"] != "performing").astype(int)

                if algo == "HybridBlend":
                    metrics = engine.blend_models(df)
                    shap_values = None
                else:
                    metrics, shap_values, path = engine.train_model(df, algo, lr=lr)

                utils.audit_log(user="admin", action="Train Model", params=algo, status="OK")

                st.json(metrics)
                if shap_values is not None:
                    st.write("Feature importance (SHAP):")
                    fig = plt.figure()
                    shap.plots.bar(shap_values, show=False)
                    st.pyplot(fig)

    # --- Tab 3: Experiments ---
    with tabs[2]:
        st.subheader("Model Experiments (A/B Tests)")
        st.write("Compare ROC, PR, metrics between models (stub).")
        if st.button("Run A/B Test"):
            utils.audit_log(user="admin", action="Run A/B Test", status="OK")
            st.info("Running A/B test... (stub)")

    # --- Tab 4: Stress Tests ---
    with tabs[3]:
        st.subheader("What-If Shocks & Stress Testing")
        shock = st.selectbox("Shock type", ["Interest Rate ↑", "Unemployment ↑", "Repayment ↓"])
        if st.button("Simulate Shock"):
            utils.audit_log(user="admin", action="Simulate Shock", params=shock, status="OK")
            st.warning(f"Simulating {shock}... (stub)")

    # --- Tab 5: Admin Tools ---
    with tabs[4]:
        st.subheader("Admin Actions")
        client_id = st.text_input("Impersonate Client ID")
        if st.button("Impersonate"):
            utils.audit_log(user="admin", action="Impersonate", params=client_id, status="OK")
            st.info(f"Impersonating {client_id}... (stub)")
        if st.button("Emergency Kill-Switch"):
            utils.audit_log(user="admin", action="Kill-Switch", status="OK")
            st.error("Kill switch triggered! (stub)")

    # --- Tab 6: Audit Viewer ---
    with tabs[5]:
        st.subheader("Audit Logs")

        import sqlite3
        import pandas as pd

        conn = sqlite3.connect("data/audit.db")
        df = pd.read_sql_query("SELECT * FROM audit_log ORDER BY id DESC LIMIT 200", conn)
        conn.close()

        if len(df) == 0:
            st.info("No logs yet.")
        else:
            st.dataframe(df)
            st.download_button(
                "⬇️ Download logs (CSV)",
                df.to_csv(index=False),
                file_name="audit_log.csv"
            )

Overwriting pages/_04_admin_sandbox.py


In [None]:

%%writefile pages/_04_admin_sandbox.py
try:
    import streamlit as st
except ImportError:
    st = None

from modules.synth import generators as g
from modules.core import utils
import os, glob

def app():
    if st is None:
        return

    st.title("🛠️ Admin Sandbox (Godmode)")
    st.caption("Super-admin tools for data, models, experiments, and stress tests.")

    tabs = st.tabs([
        "📊 Data Generation",
        "🤖 Models",
        "🧪 Experiments",
        "⚡ Stress Tests",
        "👤 Admin Tools",
        "📜 Audit Viewer"
    ])

    # --- Tab 1: Data Generation ---
    with tabs[0]:
        st.subheader("Synthetic Data Controls")

        n_rows = st.number_input("Total number of clients", 1000, 200000, 20000, step=1000)
        batch_size = st.number_input("Batch size", 1000, 50000, 5000, step=1000)
        fmt = st.selectbox("Output format", ["csv","parquet"])
        fraud_toggle = st.checkbox("Inject fraud patterns (future)")

        if st.button("Generate Batches"):
            with st.spinner("Generating synthetic data batches..."):
                out_dir = g.generate_batches(
                    total_rows=int(n_rows),
                    batch_size=int(batch_size),
                    out_dir="data/sandbox_batches",
                    fmt=fmt,
                    seed=42
                )
            utils.audit_log(user="admin", action="Generate Batches",
                            params=f"rows={n_rows},batch={batch_size},fmt={fmt},fraud={fraud_toggle}",
                            status="OK")
            st.success(f"✅ Generated {n_rows} rows into {out_dir}")
            files = os.listdir(out_dir)
            st.write("Files:", files[:5], "...")

    # --- Tab 2: Models ---
    with tabs[1]:
        st.subheader("Model Training & Controls")

        algo = st.selectbox("Choose algorithm", ["LogReg","XGBoost","SGD","HybridBlend"])
        lr = st.slider("Learning rate (for SGD/XGB)", 0.001, 0.5, 0.1)

        if st.button("Train Model"):
            from modules.ml import engine
            import pandas as pd
            import matplotlib.pyplot as plt
            import shap

            files = sorted(glob.glob("data/sandbox_batches/batch_*.csv"))
            if not files:
                st.error("No synthetic data found. Please generate data in Tab 1 first.")
            else:
                sample_file = files[0]
                df = pd.read_csv(sample_file)
                if "default" not in df.columns:
                    df["default"] = (df["loan_health"] != "performing").astype(int)

                if algo == "HybridBlend":
                    metrics = engine.blend_models(df)
                    shap_values = None
                else:
                    metrics, shap_values, path = engine.train_model(df, algo, lr=lr)

                utils.audit_log(user="admin", action="Train Model", params=algo, status="OK")

                st.json(metrics)
                if shap_values is not None:
                    st.write("Feature importance (SHAP):")
                    fig = plt.figure()
                    shap.plots.bar(shap_values, show=False)
                    st.pyplot(fig)

    # --- Tab 3: Experiments ---
    with tabs[2]:
        st.subheader("Model Experiments (A/B Tests)")
        from modules.ml import engine
        import pandas as pd
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay

        algo1 = st.selectbox("Model A", ["LogReg","XGBoost","SGD"])
        algo2 = st.selectbox("Model B", ["LogReg","XGBoost","SGD"], index=1)

        if st.button("Run A/B Test"):
            files = sorted(glob.glob("data/sandbox_batches/batch_*.csv"))
            if not files:
                st.error("No synthetic data found. Please generate data in Tab 1 first.")
            else:
                df = pd.read_csv(files[0])
                if "default" not in df.columns:
                    df["default"] = (df["loan_health"] != "performing").astype(int)

                # Train both models
                m1, shap1, _ = engine.train_model(df, algo1)
                m2, shap2, _ = engine.train_model(df, algo2)

                utils.audit_log(user="admin", action="Run A/B Test", params=f"{algo1} vs {algo2}", status="OK")

                # Display metrics
                st.write("### Metrics")
                st.json({algo1: m1, algo2: m2})

                # ROC Curves
                st.write("### ROC Curves")
                fpr1, tpr1, _ = roc_curve(df["default"], m1["y_scores"])
                fpr2, tpr2, _ = roc_curve(df["default"], m2["y_scores"])
                fig, ax = plt.subplots()
                ax.plot(fpr1, tpr1, label=f"{algo1} AUC={auc(fpr1,tpr1):.3f}")
                ax.plot(fpr2, tpr2, label=f"{algo2} AUC={auc(fpr2,tpr2):.3f}")
                ax.plot([0,1],[0,1],'k--')
                ax.legend()
                st.pyplot(fig)

                # PR Curves
                st.write("### Precision-Recall Curves")
                p1,r1,_ = precision_recall_curve(df["default"], m1["y_scores"])
                p2,r2,_ = precision_recall_curve(df["default"], m2["y_scores"])
                fig, ax = plt.subplots()
                ax.plot(r1,p1,label=algo1)
                ax.plot(r2,p2,label=algo2)
                ax.legend()
                st.pyplot(fig)

                # Confusion Matrices
                st.write("### Confusion Matrices")
                cm1 = confusion_matrix(df["default"], m1["y_pred"])
                cm2 = confusion_matrix(df["default"], m2["y_pred"])
                fig, ax = plt.subplots(1,2, figsize=(10,4))
                ConfusionMatrixDisplay(cm1).plot(ax=ax[0], values_format="d")
                ax[0].set_title(algo1)
                ConfusionMatrixDisplay(cm2).plot(ax=ax[1], values_format="d")
                ax[1].set_title(algo2)
                st.pyplot(fig)

    # --- Tab 4: Stress Tests ---
    with tabs[3]:
        st.subheader("What-If Shocks & Stress Testing")
        shock = st.selectbox("Shock type", ["Interest Rate ↑", "Unemployment ↑", "Repayment ↓"])
        if st.button("Simulate Shock"):
            utils.audit_log(user="admin", action="Simulate Shock", params=shock, status="OK")
            st.warning(f"Simulating {shock}... (stub)")

    # --- Tab 5: Admin Tools ---
    with tabs[4]:
        st.subheader("Admin Actions")
        client_id = st.text_input("Impersonate Client ID")
        if st.button("Impersonate"):
            utils.audit_log(user="admin", action="Impersonate", params=client_id, status="OK")
            st.info(f"Impersonating {client_id}... (stub)")
        if st.button("Emergency Kill-Switch"):
            utils.audit_log(user="admin", action="Kill-Switch", status="OK")
            st.error("Kill switch triggered! (stub)")

    # --- Tab 6: Audit Viewer ---
    with tabs[5]:
        st.subheader("Audit Logs")

        import sqlite3
        import pandas as pd

        conn = sqlite3.connect("data/audit.db")
        df = pd.read_sql_query("SELECT * FROM audit_log ORDER BY id DESC LIMIT 200", conn)
        conn.close()

        if len(df) == 0:
            st.info("No logs yet.")
        else:
            st.dataframe(df)
            st.download_button(
                "⬇️ Download logs (CSV)",
                df.to_csv(index=False),
                file_name="audit_log.csv"
            )

Overwriting pages/_04_admin_sandbox.py


In [None]:
%%writefile modules/ml/engine.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier
import shap, joblib, os

def train_model(df, algo="LogReg", lr=0.1, test_size=0.3, seed=42):
    """Train a single model and return metrics, SHAP values (if any), and saved path."""

    # Features & target
    y = df["default"]
    X = df.drop(columns=["default","loan_health"], errors="ignore")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    # Pick model
    if algo == "LogReg":
        model = LogisticRegression(max_iter=500)
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", learning_rate="optimal", eta0=lr, max_iter=1000)
    elif algo == "XGBoost":
        model = XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", learning_rate=lr, n_estimators=200
        )
    else:
        raise ValueError(f"Unsupported algo {algo}")

    # Train
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    try:
        y_scores = model.predict_proba(X_test)[:,1]
    except:
        # fallback if predict_proba not available
        y_scores = model.decision_function(X_test)

    # Metrics
    metrics = {
        "algo": algo,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "auc": roc_auc_score(y_test, y_scores),
        "n_train": len(y_train),
        "n_test": len(y_test),
        "y_pred": y_pred.tolist(),
        "y_scores": y_scores.tolist()
    }

    # Explainability (SHAP)
    shap_values = None
    try:
        explainer = shap.Explainer(model, X_test)
        shap_values = explainer(X_test)
    except Exception as e:
        pass  # SHAP may fail for some models

    # Save model
    os.makedirs("models", exist_ok=True)
    path = f"models/{algo}_model.pkl"
    joblib.dump(model, path)

    return metrics, shap_values, path

def blend_models(df, seed=42):
    """Simple hybrid blend: average predictions from LogReg + XGB + SGD."""
    y = df["default"]
    X = df.drop(columns=["default","loan_health"], errors="ignore")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed, stratify=y
    )

    models = {
        "LogReg": LogisticRegression(max_iter=500),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        "SGD": SGDClassifier(loss="log_loss", max_iter=1000)
    }

    preds = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        try:
            p = model.predict_proba(X_test)[:,1]
        except:
            p = model.decision_function(X_test)
        preds.append(p)

    y_scores = np.mean(preds, axis=0)
    y_pred = (y_scores >= 0.5).astype(int)

    metrics = {
        "algo": "HybridBlend",
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "auc": roc_auc_score(y_test, y_scores),
        "n_test": len(y_test),
        "y_pred": y_pred.tolist(),
        "y_scores": y_scores.tolist()
    }

    return metrics

Overwriting modules/ml/engine.py


In [None]:
%%writefile modules/ml/engine.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier
import shap, joblib, os

def train_model(df, algo="LogReg", lr=0.1, test_size=0.3, seed=42):
    """Train a single model with encoding for categorical features."""
    y = df["default"]
    X = df.drop(columns=["default","loan_health"], errors="ignore")

    # Encode categoricals
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    if algo == "LogReg":
        model = LogisticRegression(max_iter=500)
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", learning_rate="optimal", eta0=lr, max_iter=1000)
    elif algo == "XGBoost":
        model = XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", learning_rate=lr, n_estimators=200
        )
    else:
        raise ValueError(f"Unsupported algo {algo}")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    try:
        y_scores = model.predict_proba(X_test)[:,1]
    except:
        y_scores = model.decision_function(X_test)

    metrics = {
        "algo": algo,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "auc": roc_auc_score(y_test, y_scores),
        "n_train": len(y_train),
        "n_test": len(y_test),
        "y_pred": y_pred.tolist(),
        "y_scores": y_scores.tolist()
    }

    shap_values = None
    try:
        explainer = shap.Explainer(model, X_test)
        shap_values = explainer(X_test)
    except:
        pass

    os.makedirs("models", exist_ok=True)
    path = f"models/{algo}_model.pkl"
    joblib.dump(model, path)

    return metrics, shap_values, path

def blend_models(df, seed=42):
    """Simple hybrid blend: average predictions from LogReg + XGB + SGD."""
    y = df["default"]
    X = df.drop(columns=["default","loan_health"], errors="ignore")
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed, stratify=y
    )

    models = {
        "LogReg": LogisticRegression(max_iter=500),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        "SGD": SGDClassifier(loss="log_loss", max_iter=1000)
    }

    preds = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        try:
            p = model.predict_proba(X_test)[:,1]
        except:
            p = model.decision_function(X_test)
        preds.append(p)

    y_scores = np.mean(preds, axis=0)
    y_pred = (y_scores >= 0.5).astype(int)

    metrics = {
        "algo": "HybridBlend",
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "auc": roc_auc_score(y_test, y_scores),
        "n_test": len(y_test),
        "y_pred": y_pred.tolist(),
        "y_scores": y_scores.tolist()
    }

    return metrics

Overwriting modules/ml/engine.py


In [None]:
# === Test ML Engine Patch ===
import pandas as pd
from modules.synth import generators as g
from modules.ml import engine
import importlib

# reload engine to pick up changes
importlib.reload(engine)

# Generate a small synthetic dataset
df = g.generate_clients_loans(n_rows=500, seed=123)
df["default"] = (df["loan_health"] != "performing").astype(int)

# Train three models
for algo in ["LogReg", "SGD", "XGBoost"]:
    metrics, shap_values, path = engine.train_model(df, algo, lr=0.1)
    print(f"\n=== {algo} ===")
    print("Saved:", path)
    print("Accuracy:", round(metrics["accuracy"], 3),
          "Precision:", round(metrics["precision"], 3),
          "Recall:", round(metrics["recall"], 3),
          "AUC:", round(metrics["auc"], 3))
    print("Pred sample:", metrics["y_pred"][:10])
    print("Scores sample:", [round(s,3) for s in metrics["y_scores"][:10]])

# Test hybrid
metrics = engine.blend_models(df)
print("\n=== HybridBlend ===")
print("Accuracy:", round(metrics["accuracy"], 3),
      "Precision:", round(metrics["precision"], 3),
      "Recall:", round(metrics["recall"], 3),
      "AUC:", round(metrics["auc"], 3))
print("Pred sample:", metrics["y_pred"][:10])
print("Scores sample:", [round(s,3) for s in metrics["y_scores"][:10]])


=== LogReg ===
Saved: models/LogReg_model.pkl
Accuracy: 0.667 Precision: 0.667 Recall: 1.0 AUC: 0.551
Pred sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Scores sample: [0.709, 0.703, 0.621, 0.699, 0.705, 0.711, 0.719, 0.665, 0.692, 0.705]

=== SGD ===
Saved: models/SGD_model.pkl
Accuracy: 0.333 Precision: 0.0 Recall: 0.0 AUC: 0.5
Pred sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Scores sample: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
Saved: models/XGBoost_model.pkl
Accuracy: 0.58 Precision: 0.664 Recall: 0.75 AUC: 0.494
Pred sample: [1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
Scores sample: [0.813, 0.535, 0.603, 0.893, 0.625, 0.576, 0.355, 0.794, 0.974, 0.767]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== HybridBlend ===
Accuracy: 0.513 Precision: 0.708 Recall: 0.46 AUC: 0.524
Pred sample: [1, 0, 0, 1, 0, 0, 0, 1, 1, 0]
Scores sample: [0.53, 0.41, 0.437, 0.511, 0.445, 0.407, 0.367, 0.533, 0.561, 0.495]


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os

faker = Faker("en_US")

# Realistic Kenyan towns (~70 branches across regions)
KENYAN_TOWNS = [
    # Nairobi & Central
    "Nairobi", "Thika", "Kiambu", "Murang'a", "Nyeri", "Embu", "Meru",
    # Coast
    "Mombasa", "Kilifi", "Lamu", "Malindi", "Kwale", "Voi", "Taita",
    # Rift Valley
    "Nakuru", "Naivasha", "Kericho", "Eldoret", "Kitale", "Bomet", "Narok",
    # Eastern
    "Machakos", "Kitui", "Makueni", "Isiolo", "Marsabit",
    # Western
    "Kakamega", "Bungoma", "Vihiga", "Busia",
    # Nyanza
    "Kisumu", "Siaya", "Homa Bay", "Migori", "Kisii", "Nyamira",
]

LOAN_PRODUCTS = {
    "Inuka": 5,
    "Kuza": 4,
    "Fadhili": 6,
    "Imara": 8,
    "SME": 12
}

def derive_age_from_id(gov_id: str) -> int:
    """Estimate age from Kenyan ID pattern (rough approximation)."""
    try:
        prefix = int(str(gov_id)[:2])
        if prefix < 20:
            return random.randint(55, 70)
        elif prefix < 30:
            return random.randint(40, 55)
        elif prefix < 33:
            return random.randint(30, 40)
        elif prefix < 35:
            return random.randint(25, 30)
        else:
            return random.randint(20, 25)
    except:
        return random.randint(22, 55)

def guess_gender(name: str) -> str:
    """Guess gender from name (very rough)."""
    female_markers = ["a", "ah", "na", "trice", "lyn", "mary", "rose", "cynthia", "grace"]
    if any(name.lower().endswith(suf) for suf in female_markers):
        return "F"
    return random.choices(["F","M"], weights=[0.65,0.35])[0]

def generate_clients_loans(
    n_rows:int=1000,
    default_rate:float=0.2,
    gender_ratio:float=0.65,
    loan_mean:int=20000,
    branches:int=70,
    product_weeks:int=6,
    region_bias:str="balanced",
    seed:int|None=None
) -> pd.DataFrame:
    """Generate realistic synthetic microfinance loan data."""
    if seed: random.seed(seed); np.random.seed(seed)

    towns = random.sample(KENYAN_TOWNS, min(branches, len(KENYAN_TOWNS)))
    products = list(LOAN_PRODUCTS.keys())

    rows = []
    for i in range(n_rows):
        cust_name = faker.name()
        gov_id = str(random.randint(20000000, 39999999))
        age = derive_age_from_id(gov_id)

        # Gender bias
        if random.random() < gender_ratio:
            gender = "F"
        else:
            gender = "M"
        # backup from name guess
        if not gender:
            gender = guess_gender(cust_name)

        branch = random.choice(towns)
        product = random.choice(products)
        weeks = LOAN_PRODUCTS[product]

        # Loan size around mean ± variance
        amount = max(1000, int(np.random.normal(loan_mean, loan_mean*0.3)))

        # Loan type
        loan_type = random.choice(["normal", "group", "SME"])

        # Status
        status = random.choice(["active", "pending approval"])

        # Loan health based on default_rate
        if random.random() < default_rate:
            loan_health = random.choice(["delinquent", "default"])
        else:
            loan_health = "performing"

        # Derived features
        dti = round(random.uniform(0.1, 0.8), 2)  # debt-to-income
        created_date = datetime.now() - timedelta(days=random.randint(0, 365))

        rows.append({
            "customer_id": f"CUST{i+1:06d}",
            "customer_name": cust_name,
            "gov_id": gov_id,
            "age": age,
            "gender": gender,
            "branch": branch,
            "product": product,
            "product_weeks": weeks,
            "loan_amount": amount,
            "loan_type": loan_type,
            "status": status,
            "loan_health": loan_health,
            "debt_to_income": dti,
            "created_date": created_date.strftime("%Y-%m-%d")
        })

    return pd.DataFrame(rows)

def generate_batches(n_clients:int, batch_size:int, out_dir:str="data/sandbox_batches", fmt:str="csv", seed:int=1):
    """Generate multiple batch files for sandbox use."""
    os.makedirs(out_dir, exist_ok=True)
    n_batches = max(1, n_clients // batch_size)
    files = []
    for b in range(1, n_batches+1):
        df = generate_clients_loans(batch_size, seed=seed+b)
        fname = os.path.join(out_dir, f"batch_{b}.csv")
        df.to_csv(fname, index=False)
        files.append(fname)
    return files

Overwriting modules/synth/generators.py


In [None]:
import importlib
import modules.synth.generators as g

# reload to pick up the new code we just wrote
importlib.reload(g)

# Now test again
df = g.generate_clients_loans(
    n_rows=20,
    default_rate=0.2,
    gender_ratio=0.7,
    loan_mean=25000,
    branches=15,
    product_weeks=6,
    seed=42
)
print(df.head())
print("Shape:", df.shape)
print("Loan health counts:\n", df['loan_health'].value_counts())

  customer_id     customer_name    gov_id  age gender    branch  product  \
0  CUST000001     Paul Gonzalez  20999828   42      F  Machakos      SME   
1  CUST000002     Scott Sanchez  27396759   54      F    Kitale    Inuka   
2  CUST000003      Laura Walker  23429607   42      F     Nyeri  Fadhili   
3  CUST000004       Leah Abbott  24188470   52      F    Nakuru      SME   
4  CUST000005  Kristina Jackson  29710248   42      M     Taita    Inuka   

   product_weeks  loan_amount loan_type            status loan_health  \
0             12        28725    normal            active  performing   
1              5        23963    normal  pending approval  performing   
2              6        29857       SME  pending approval  performing   
3             12        36422     group            active  performing   
4              5        23243     group  pending approval  performing   

   debt_to_income created_date  
0            0.59   2025-01-29  
1            0.21   2025-03-12  
2    

In [None]:
import modules.synth.generators as g

df = g.generate_clients_loans(
    n_rows=20,
    default_rate=0.2,
    gender_ratio=0.7,
    loan_mean=25000,
    branches=15,
    product_weeks=6,
    seed=42
)
print(df.head())
print("Shape:", df.shape)
print("Loan health counts:\n", df['loan_health'].value_counts())

  customer_id    customer_name    gov_id  age gender    branch  product  \
0  CUST000001    Kayla Watkins  20999828   42      F  Machakos      SME   
1  CUST000002   Joshua Shannon  27396759   54      F    Kitale    Inuka   
2  CUST000003     Adam Marquez  23429607   42      F     Nyeri  Fadhili   
3  CUST000004      David Hardy  24188470   52      F    Nakuru      SME   
4  CUST000005  Charles Johnson  29710248   42      M     Taita    Inuka   

   product_weeks  loan_amount loan_type            status loan_health  \
0             12        28725    normal            active  performing   
1              5        23963    normal  pending approval  performing   
2              6        29857       SME  pending approval  performing   
3             12        36422     group            active  performing   
4              5        23243     group  pending approval  performing   

   debt_to_income created_date  
0            0.59   2025-01-29  
1            0.21   2025-03-12  
2          

In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import pandas as pd
import os
import importlib
from modules.synth import generators as g

importlib.reload(g)  # always reload latest generator

def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    st.markdown("### 🔹 Data Generation Controls")

    # --- Sliders for 7 parameters ---
    n_rows = st.slider("Number of clients", 100, 10000, 1000, step=100)
    default_rate = st.slider("Default rate (%)", 0, 50, 20, step=1) / 100.0
    gender_ratio = st.slider("Female ratio (%)", 0, 100, 70, step=5) / 100.0
    loan_mean = st.number_input("Average loan amount (KES)", 5000, 100000, 25000, step=1000)
    branches = st.slider("Number of branches", 5, 70, 20, step=1)
    product_weeks = st.slider("Typical product cycle (weeks)", 4, 12, 6, step=1)
    region_bias = st.selectbox("Region bias", ["balanced", "Eastern", "Coast", "Central", "Rift", "Nairobi", "Western"])

    # --- Button to generate dataset ---
    if st.button("🚀 Generate Dataset"):
        df = g.generate_clients_loans(
            n_rows=n_rows,
            default_rate=default_rate,
            gender_ratio=gender_ratio,
            loan_mean=loan_mean,
            branches=branches,
            product_weeks=product_weeks,
            region_bias=region_bias,
            seed=42
        )
        st.success(f"✅ Generated dataset with {df.shape[0]} rows and {df.shape[1]} columns")
        st.dataframe(df.head(20))

        # Save to sandbox_batches
        os.makedirs("data/sandbox_batches", exist_ok=True)
        fname = f"data/sandbox_batches/sandbox_latest.csv"
        df.to_csv(fname, index=False)
        st.info(f"📂 Saved to {fname}")

        # Show quick stats
        st.write("### Stats")
        st.write(df["loan_health"].value_counts(normalize=True).round(2))
        st.write(df["gender"].value_counts(normalize=True).round(2))

Overwriting pages/_04_admin_sandbox.py


In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)

# Run the app function (simulates Streamlit run)
sandbox.app()
print("✅ Sandbox app loaded (UI should appear in Streamlit)")

2025-08-31 09:51:01.116 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-08-31 09:51:01.163 Session state does not function when running a script without `streamlit run`


✅ Sandbox app loaded (UI should appear in Streamlit)


In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import pandas as pd
import os
import importlib
from modules.synth import generators as g
from modules.ml import engine  # we'll use your ml engine

importlib.reload(g)
importlib.reload(engine)

def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    # === TAB SETUP ===
    tabs = st.tabs(["Data Generation", "Model Training", "Stress Testing", "Audit Logs"])

    # ---------------- TAB 1: Data Generation ----------------
    with tabs[0]:
        st.markdown("### 🔹 Data Generation Controls")

        n_rows = st.slider("Number of clients", 100, 10000, 1000, step=100)
        default_rate = st.slider("Default rate (%)", 0, 50, 20, step=1) / 100.0
        gender_ratio = st.slider("Female ratio (%)", 0, 100, 70, step=5) / 100.0
        loan_mean = st.number_input("Average loan amount (KES)", 5000, 100000, 25000, step=1000)
        branches = st.slider("Number of branches", 5, 70, 20, step=1)
        product_weeks = st.slider("Typical product cycle (weeks)", 4, 12, 6, step=1)
        region_bias = st.selectbox("Region bias", ["balanced", "Eastern", "Coast", "Central", "Rift", "Nairobi", "Western"])

        if st.button("🚀 Generate Dataset"):
            df = g.generate_clients_loans(
                n_rows=n_rows,
                default_rate=default_rate,
                gender_ratio=gender_ratio,
                loan_mean=loan_mean,
                branches=branches,
                product_weeks=product_weeks,
                region_bias=region_bias,
                seed=42
            )
            st.success(f"✅ Generated dataset with {df.shape[0]} rows and {df.shape[1]} columns")
            st.dataframe(df.head(20))

            os.makedirs("data/sandbox_batches", exist_ok=True)
            fname = f"data/sandbox_batches/sandbox_latest.csv"
            df.to_csv(fname, index=False)
            st.info(f"📂 Saved to {fname}")

            st.write("### Stats")
            st.write(df["loan_health"].value_counts(normalize=True).round(2))
            st.write(df["gender"].value_counts(normalize=True).round(2))

    # ---------------- TAB 2: Model Training ----------------
    with tabs[1]:
        st.markdown("### 🔹 Train Models")

        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname} with {df.shape[0]} rows")

            algo = st.selectbox("Choose algorithm", ["LogReg", "SGD", "XGBoost", "HybridBlend"])
            if st.button("⚡ Train Model"):
                metrics, shap_values, model_path = engine.train_model(df, algo)
                st.success(f"✅ {algo} trained and saved to {model_path}")

                st.write("### Metrics")
                st.json(metrics)

                st.write("### Prediction Sample")
                preds, scores = engine.predict_sample(df, algo)
                st.write(preds[:10])
                st.write(scores[:10])
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

Overwriting pages/_04_admin_sandbox.py


In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Sandbox with Model Training (Tab 2) loaded")



✅ Sandbox with Model Training (Tab 2) loaded


In [None]:
%%writefile modules/ml/engine.py
import pandas as pd
import numpy as np
import os, joblib
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb

# --- Core training function ---
def train_model(df, algo="LogReg", lr=0.1, seed=42):
    os.makedirs("models", exist_ok=True)

    # Features (numeric only)
    features = ["age", "loan_amount", "debt_to_income", "product_weeks"]
    X = df[features].values
    y = (df["loan_health"] != "performing").astype(int)

    if algo == "LogReg":
        model = LogisticRegression(max_iter=1000)
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", max_iter=1000)
    elif algo == "XGBoost":
        model = xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            learning_rate=lr,
            n_estimators=50,
            max_depth=4,
            random_state=seed
        )
    elif algo == "HybridBlend":
        # Load or train 3 base models
        base_models = {}
        for sub in ["LogReg", "SGD", "XGBoost"]:
            path = f"models/{sub}_model.pkl"
            if os.path.exists(path):
                base_models[sub] = joblib.load(path)
            else:
                metrics, _, _ = train_model(df, sub, seed=seed)
                base_models[sub] = joblib.load(path)

        def hybrid_predict(X):
            preds = []
            for sub, weight in zip(["LogReg","SGD","XGBoost"], [0.3, 0.2, 0.5]):
                try:
                    p = base_models[sub].predict_proba(X)[:,1]
                except:
                    p = base_models[sub].predict(X)
                preds.append(p * weight)
            return np.sum(preds, axis=0)

        model = base_models
        preds = (hybrid_predict(X) > 0.5).astype(int)
        scores = hybrid_predict(X)

        metrics = {
            "Accuracy": float(accuracy_score(y, preds)),
            "Precision": float(precision_score(y, preds, zero_division=0)),
            "Recall": float(recall_score(y, preds, zero_division=0)),
            "AUC": float(roc_auc_score(y, scores))
        }
        path = "models/HybridBlend_model.pkl"
        joblib.dump(model, path)
        return metrics, None, path

    # --- Train single model ---
    model.fit(X, y)
    preds = model.predict(X)
    scores = model.predict_proba(X)[:,1] if hasattr(model, "predict_proba") else preds

    metrics = {
        "Accuracy": float(accuracy_score(y, preds)),
        "Precision": float(precision_score(y, preds, zero_division=0)),
        "Recall": float(recall_score(y, preds, zero_division=0)),
        "AUC": float(roc_auc_score(y, scores))
    }

    path = f"models/{algo}_model.pkl"
    joblib.dump(model, path)

    return metrics, scores, path

def predict_sample(df, algo="LogReg", n=10):
    features = ["age", "loan_amount", "debt_to_income", "product_weeks"]
    X = df[features].values
    y = (df["loan_health"] != "performing").astype(int)

    model = joblib.load(f"models/{algo}_model.pkl")

    if algo == "HybridBlend":
        preds, scores = [], []
        for sub, weight in zip(["LogReg","SGD","XGBoost"], [0.3, 0.2, 0.5]):
            submodel = model[sub]
            try:
                p = submodel.predict_proba(X)[:,1]
            except:
                p = submodel.predict(X)
            scores.append(p * weight)
        score_final = np.sum(scores, axis=0)
        preds_final = (score_final > 0.5).astype(int)
        return preds_final[:n].tolist(), score_final[:n].round(3).tolist()

    preds = model.predict(X)
    scores = model.predict_proba(X)[:,1] if hasattr(model, "predict_proba") else preds
    return preds[:n].tolist(), scores[:n].round(3).tolist()

def stress_test(df, shock_type="default", intensity=0.2):
    """Apply stress scenario to dataset."""
    df = df.copy()
    if shock_type == "default":
        # Flip some performing loans to default
        mask = df["loan_health"]=="performing"
        flip_idx = df[mask].sample(frac=intensity, replace=False).index
        df.loc[flip_idx,"loan_health"]="default"
    elif shock_type == "loan_amount":
        df["loan_amount"] = df["loan_amount"] * (1+intensity)
    elif shock_type == "unemployment":
        # simulate by raising debt-to-income
        df["debt_to_income"] = np.minimum(1.0, df["debt_to_income"]*(1+intensity))
    return df

Overwriting modules/ml/engine.py


In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import pandas as pd
import os
import importlib
from modules.synth import generators as g
from modules.ml import engine

# Always reload modules (fresh patches)
importlib.reload(g)
importlib.reload(engine)

def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing"
    ])

    # ---------------- TAB 1: Data Generation ----------------
    with tabs[0]:
        st.subheader("🔹 Data Generation Controls")

        n_rows = st.slider("Number of clients", 100, 10000, 1000, step=100)
        default_rate = st.slider("Default rate (%)", 0, 50, 20, step=1) / 100.0
        gender_ratio = st.slider("Female ratio (%)", 0, 100, 70, step=5) / 100.0
        loan_mean = st.number_input("Average loan amount (KES)", 5000, 100000, 25000, step=1000)
        branches = st.slider("Number of branches", 5, 70, 20, step=1)
        product_weeks = st.slider("Typical product cycle (weeks)", 4, 12, 6, step=1)
        region_bias = st.selectbox("Region bias", ["balanced","Eastern","Coast","Central","Rift","Nairobi","Western"])

        if st.button("🚀 Generate Dataset"):
            df = g.generate_clients_loans(
                n_rows=n_rows,
                default_rate=default_rate,
                gender_ratio=gender_ratio,
                loan_mean=loan_mean,
                branches=branches,
                product_weeks=product_weeks,
                region_bias=region_bias,
                seed=42
            )
            st.success(f"✅ Generated dataset with {df.shape[0]} rows and {df.shape[1]} columns")
            st.dataframe(df.head(20))

            os.makedirs("data/sandbox_batches", exist_ok=True)
            fname = "data/sandbox_batches/sandbox_latest.csv"
            df.to_csv(fname, index=False)
            st.info(f"📂 Saved to {fname}")

    # ---------------- TAB 2: Model Training ----------------
    with tabs[1]:
        st.subheader("🔹 Model Training")

        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            if st.button("⚡ Train Models"):
                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, path = engine.train_model(df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
                    preds, scores = engine.predict_sample(df, algo)
                    st.write("Sample preds:", preds)
                    st.write("Sample scores:", scores)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

    # ---------------- TAB 3: Stress Testing ----------------
    with tabs[2]:
        st.subheader("🔹 Stress Testing")

        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            shock = st.selectbox("Shock type", ["default","loan_amount","unemployment"])
            intensity = st.slider("Shock intensity", 0.0, 1.0, 0.2, step=0.05)

            if st.button("🔥 Run Stress Test"):
                stressed_df = engine.stress_test(df, shock, intensity)
                st.write("📊 Preview stressed dataset")
                st.dataframe(stressed_df.head(10))

                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, _ = engine.train_model(stressed_df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

Overwriting pages/_04_admin_sandbox.py


In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Sandbox Tabs 1–3 fully loaded")



✅ Sandbox Tabs 1–3 fully loaded


In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import pandas as pd
import os
import importlib
import sqlite3
from modules.synth import generators as g
from modules.ml import engine

# Always reload modules
importlib.reload(g)
importlib.reload(engine)

def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing",
        "4️⃣ Audit Logs"
    ])

    # ---------------- TAB 1: Data Generation ----------------
    with tabs[0]:
        st.subheader("🔹 Data Generation Controls")
        n_rows = st.slider("Number of clients", 100, 10000, 1000, step=100)
        default_rate = st.slider("Default rate (%)", 0, 50, 20, step=1) / 100.0
        gender_ratio = st.slider("Female ratio (%)", 0, 100, 70, step=5) / 100.0
        loan_mean = st.number_input("Average loan amount (KES)", 5000, 100000, 25000, step=1000)
        branches = st.slider("Number of branches", 5, 70, 20, step=1)
        product_weeks = st.slider("Typical product cycle (weeks)", 4, 12, 6, step=1)
        region_bias = st.selectbox("Region bias", ["balanced","Eastern","Coast","Central","Rift","Nairobi","Western"])

        if st.button("🚀 Generate Dataset"):
            df = g.generate_clients_loans(
                n_rows=n_rows,
                default_rate=default_rate,
                gender_ratio=gender_ratio,
                loan_mean=loan_mean,
                branches=branches,
                product_weeks=product_weeks,
                region_bias=region_bias,
                seed=42
            )
            st.success(f"✅ Generated dataset with {df.shape[0]} rows")
            st.dataframe(df.head(20))

            os.makedirs("data/sandbox_batches", exist_ok=True)
            fname = "data/sandbox_batches/sandbox_latest.csv"
            df.to_csv(fname, index=False)
            st.info(f"📂 Saved to {fname}")

    # ---------------- TAB 2: Model Training ----------------
    with tabs[1]:
        st.subheader("🔹 Model Training")
        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            if st.button("⚡ Train Models"):
                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, path = engine.train_model(df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
                    preds, scores = engine.predict_sample(df, algo)
                    st.write("Sample preds:", preds)
                    st.write("Sample scores:", scores)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

    # ---------------- TAB 3: Stress Testing ----------------
    with tabs[2]:
        st.subheader("🔹 Stress Testing")
        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            shock = st.selectbox("Shock type", ["default","loan_amount","unemployment"])
            intensity = st.slider("Shock intensity", 0.0, 1.0, 0.2, step=0.05)

            if st.button("🔥 Run Stress Test"):
                stressed_df = engine.stress_test(df, shock, intensity)
                st.write("📊 Preview stressed dataset")
                st.dataframe(stressed_df.head(10))

                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, _ = engine.train_model(stressed_df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

    # ---------------- TAB 4: Audit Logs ----------------
    with tabs[3]:
        st.subheader("🔹 Audit Logs")

        db_path = "data/audit.db"
        if os.path.exists(db_path):
            conn = sqlite3.connect(db_path)
            df_audit = pd.read_sql("SELECT * FROM audit_logs ORDER BY created_at DESC LIMIT 200", conn)
            conn.close()

            st.write("Latest audit entries:")
            st.dataframe(df_audit)

            # Filters
            user_filter = st.text_input("Filter by user")
            action_filter = st.text_input("Filter by action")
            if user_filter:
                df_audit = df_audit[df_audit["user"].str.contains(user_filter, case=False)]
            if action_filter:
                df_audit = df_audit[df_audit["action"].str.contains(action_filter, case=False)]
            st.write("Filtered results:", df_audit.shape[0])
            st.dataframe(df_audit)
        else:
            st.warning("⚠️ No audit database found at data/audit.db")

Overwriting pages/_04_admin_sandbox.py


In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import pandas as pd
import os
import importlib
import sqlite3
from modules.synth import generators as g
from modules.ml import engine

# Always reload modules
importlib.reload(g)
importlib.reload(engine)

def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing",
        "4️⃣ Audit Logs"
    ])

    # ---------------- TAB 1: Data Generation ----------------
    with tabs[0]:
        st.subheader("🔹 Data Generation Controls")
        n_rows = st.slider("Number of clients", 100, 10000, 1000, step=100)
        default_rate = st.slider("Default rate (%)", 0, 50, 20, step=1) / 100.0
        gender_ratio = st.slider("Female ratio (%)", 0, 100, 70, step=5) / 100.0
        loan_mean = st.number_input("Average loan amount (KES)", 5000, 100000, 25000, step=1000)
        branches = st.slider("Number of branches", 5, 70, 20, step=1)
        product_weeks = st.slider("Typical product cycle (weeks)", 4, 12, 6, step=1)
        region_bias = st.selectbox("Region bias", ["balanced","Eastern","Coast","Central","Rift","Nairobi","Western"])

        if st.button("🚀 Generate Dataset"):
            df = g.generate_clients_loans(
                n_rows=n_rows,
                default_rate=default_rate,
                gender_ratio=gender_ratio,
                loan_mean=loan_mean,
                branches=branches,
                product_weeks=product_weeks,
                region_bias=region_bias,
                seed=42
            )
            st.success(f"✅ Generated dataset with {df.shape[0]} rows")
            st.dataframe(df.head(20))

            os.makedirs("data/sandbox_batches", exist_ok=True)
            fname = "data/sandbox_batches/sandbox_latest.csv"
            df.to_csv(fname, index=False)
            st.info(f"📂 Saved to {fname}")

    # ---------------- TAB 2: Model Training ----------------
    with tabs[1]:
        st.subheader("🔹 Model Training")
        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            if st.button("⚡ Train Models"):
                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, path = engine.train_model(df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
                    preds, scores = engine.predict_sample(df, algo)
                    st.write("Sample preds:", preds)
                    st.write("Sample scores:", scores)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

    # ---------------- TAB 3: Stress Testing ----------------
    with tabs[2]:
        st.subheader("🔹 Stress Testing")
        fname = "data/sandbox_batches/sandbox_latest.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            st.success(f"Loaded {fname}")

            shock = st.selectbox("Shock type", ["default","loan_amount","unemployment"])
            intensity = st.slider("Shock intensity", 0.0, 1.0, 0.2, step=0.05)

            if st.button("🔥 Run Stress Test"):
                stressed_df = engine.stress_test(df, shock, intensity)
                st.write("📊 Preview stressed dataset")
                st.dataframe(stressed_df.head(10))

                for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                    metrics, _, _ = engine.train_model(stressed_df, algo)
                    st.write(f"### {algo}")
                    st.json(metrics)
        else:
            st.warning("⚠️ Please generate a dataset in Tab 1 first.")

    # ---------------- TAB 4: Audit Logs ----------------
    with tabs[3]:
        st.subheader("🔹 Audit Logs")

        db_path = "data/audit.db"
        os.makedirs("data", exist_ok=True)
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()

        # Ensure table exists
        cur.execute("""
        CREATE TABLE IF NOT EXISTS audit_logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user TEXT,
            action TEXT,
            reason TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """)
        conn.commit()

        # Seed with sample if empty
        cur.execute("SELECT COUNT(*) FROM audit_logs")
        if cur.fetchone()[0] == 0:
            cur.executemany(
                "INSERT INTO audit_logs (user, action, reason) VALUES (?, ?, ?)",
                [
                    ("admin", "create_dataset", "Initial synthetic data"),
                    ("analyst", "train_model", "Ran baseline LogReg"),
                    ("auditor", "view_logs", "Checked audit table")
                ]
            )
            conn.commit()

        # Load table
        df_audit = pd.read_sql("SELECT * FROM audit_logs ORDER BY created_at DESC LIMIT 200", conn)
        conn.close()

        st.write("Latest audit entries:")
        st.dataframe(df_audit)

        # Filters
        user_filter = st.text_input("Filter by user")
        action_filter = st.text_input("Filter by action")
        if user_filter:
            df_audit = df_audit[df_audit["user"].str.contains(user_filter, case=False)]
        if action_filter:
            df_audit = df_audit[df_audit["action"].str.contains(action_filter, case=False)]
        st.write("Filtered results:", df_audit.shape[0])
        st.dataframe(df_audit)

Overwriting pages/_04_admin_sandbox.py


In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Tabs 1–4 working (Audit logs auto-created)")



✅ Tabs 1–4 working (Audit logs auto-created)


In [None]:
%%writefile pages/_04_admin_sandbox.py
import streamlit as st
import os, sqlite3, pandas as pd
import modules.synth.generators as g
import modules.ml.engine as engine

# ---------------- TAB 1: Data Generation ----------------
def render_tab1():
    st.markdown("### 🔹 Data Generation")
    n_rows = st.slider("Number of rows", 100, 5000, 1000, step=100)
    default_rate = st.slider("Default rate", 0.0, 0.5, 0.1, step=0.05)
    gender_ratio = st.slider("Female ratio", 0.0, 1.0, 0.6, step=0.1)
    loan_mean = st.slider("Avg loan amount", 5000, 100000, 20000, step=5000)
    branches = st.slider("Branches", 5, 50, 20)
    product_weeks = st.slider("Product weeks", 4, 12, 6)
    region_bias = st.checkbox("Bias by region?", value=True)

    if st.button("🚀 Generate Dataset"):
        df = g.generate_clients_loans(
            n_rows=n_rows,
            default_rate=default_rate,
            gender_ratio=gender_ratio,
            loan_mean=loan_mean,
            branches=branches,
            product_weeks=product_weeks,
            region_bias=region_bias,
            seed=42
        )
        st.success(f"✅ Generated dataset with {df.shape[0]} rows")
        st.dataframe(df.head(20))

        os.makedirs("data/sandbox_batches", exist_ok=True)
        fname = "data/sandbox_batches/sandbox_latest.csv"
        df.to_csv(fname, index=False)
        st.info(f"📂 Saved to {fname}")

# ---------------- TAB 2: Model Training ----------------
def render_tab2():
    st.markdown("### 🔹 Model Training")
    fname = "data/sandbox_batches/sandbox_latest.csv"
    if os.path.exists(fname):
        df = pd.read_csv(fname)
        st.success(f"Loaded {fname} with {df.shape[0]} rows")

        if st.button("⚡ Train Models"):
            for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                metrics, _, path = engine.train_model(df, algo)
                st.write(f"### {algo}")
                st.json(metrics)
                preds, scores = engine.predict_sample(df, algo)
                st.write("Sample preds:", preds)
                st.write("Sample scores:", scores)
    else:
        st.warning("⚠️ Please generate a dataset in Tab 1 first.")

# ---------------- TAB 3: Stress Testing ----------------
def render_tab3():
    st.markdown("### 🔹 Stress Testing")

    fname = "data/sandbox_batches/sandbox_latest.csv"
    if os.path.exists(fname):
        df = pd.read_csv(fname)
        st.success(f"Loaded {fname}")

        shock = st.selectbox("Shock type", ["default","loan_amount","unemployment"])
        intensity = st.slider("Shock intensity", 0.0, 1.0, 0.2, step=0.05)

        if st.button("🔥 Run Stress Test"):
            stressed_df = engine.stress_test(df, shock, intensity)
            st.write("📊 Preview stressed dataset")
            st.dataframe(stressed_df.head(10))

            for algo in ["LogReg","SGD","XGBoost","HybridBlend"]:
                metrics, _, _ = engine.train_model(stressed_df, algo)
                st.write(f"### {algo}")
                st.json(metrics)
    else:
        st.warning("⚠️ Please generate a dataset in Tab 1 first.")

# ---------------- TAB 4: Audit Logs ----------------
def render_tab4():
    st.subheader("🔹 Audit Logs")

    db_path = "data/audit.db"
    os.makedirs("data", exist_ok=True)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    cur.execute("""
    CREATE TABLE IF NOT EXISTS audit_logs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        user TEXT,
        action TEXT,
        reason TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """)
    conn.commit()

    cur.execute("SELECT COUNT(*) FROM audit_logs")
    if cur.fetchone()[0] == 0:
        cur.executemany(
            "INSERT INTO audit_logs (user, action, reason) VALUES (?, ?, ?)",
            [
                ("admin", "create_dataset", "Initial synthetic data"),
                ("analyst", "train_model", "Ran baseline LogReg"),
                ("auditor", "view_logs", "Checked audit table")
            ]
        )
        conn.commit()

    df_audit = pd.read_sql("SELECT * FROM audit_logs ORDER BY created_at DESC LIMIT 200", conn)
    conn.close()

    st.write("Latest audit entries:")
    st.dataframe(df_audit)

    user_filter = st.text_input("Filter by user")
    action_filter = st.text_input("Filter by action")
    if user_filter:
        df_audit = df_audit[df_audit["user"].str.contains(user_filter, case=False)]
    if action_filter:
        df_audit = df_audit[df_audit["action"].str.contains(action_filter, case=False)]
    st.write("Filtered results:", df_audit.shape[0])
    st.dataframe(df_audit)

# ---------------- MAIN APP ----------------
def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing",
        "4️⃣ Audit Logs"
    ])

    with tabs[0]: render_tab1()
    with tabs[1]: render_tab2()
    with tabs[2]: render_tab3()
    with tabs[3]: render_tab4()

Overwriting pages/_04_admin_sandbox.py


In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Sandbox refactored: each tab is its own function")



✅ Sandbox refactored: each tab is its own function


In [None]:
# ---------------- TAB 5: A/B Experiments ----------------
def render_tab5():
    st.markdown("### 🔹 A/B Model Experiments")

    fname = "data/sandbox_batches/sandbox_latest.csv"
    if not os.path.exists(fname):
        st.warning("⚠️ Please generate a dataset in Tab 1 first.")
        return

    df = pd.read_csv(fname)
    st.success(f"Loaded {fname} with {df.shape[0]} rows")

    # Select models to compare
    models = ["LogReg", "SGD", "XGBoost", "HybridBlend"]
    col1, col2 = st.columns(2)
    with col1:
        model_a = st.selectbox("Select Model A", models, index=0)
    with col2:
        model_b = st.selectbox("Select Model B", models, index=2)

    if st.button("⚔️ Run A/B Experiment"):
        metrics_a, _, _ = engine.train_model(df, model_a)
        metrics_b, _, _ = engine.train_model(df, model_b)

        st.write(f"### {model_a}")
        st.json(metrics_a)

        st.write(f"### {model_b}")
        st.json(metrics_b)

        # Quick comparison
        comparison = pd.DataFrame([metrics_a, metrics_b], index=[model_a, model_b])
        st.write("📊 Side-by-side comparison")
        st.dataframe(comparison)

In [None]:
def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing",
        "4️⃣ Audit Logs",
        "5️⃣ A/B Experiments"
    ])

    with tabs[0]:
        render_tab1()
    with tabs[1]:
        render_tab2()
    with tabs[2]:
        render_tab3()
    with tabs[3]:
        render_tab4()
    with tabs[4]:
        render_tab5()

In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Sandbox app with Tab 5 registered cleanly")



✅ Sandbox app with Tab 5 registered cleanly


In [None]:
# ---------------- TAB 6: User Management ----------------
def render_tab6():
    st.markdown("### 🔹 User Management (Impersonate / Edit / Delete)")

    db_path = "data/audit.db"
    os.makedirs("data", exist_ok=True)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    # Create users table if not exists
    cur.execute("""
    CREATE TABLE IF NOT EXISTS users (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        username TEXT UNIQUE,
        role TEXT,
        branch TEXT,
        status TEXT DEFAULT 'active'
    )
    """)
    conn.commit()

    # Seed users if empty
    cur.execute("SELECT COUNT(*) FROM users")
    if cur.fetchone()[0] == 0:
        cur.executemany(
            "INSERT OR IGNORE INTO users (username, role, branch) VALUES (?, ?, ?)",
            [
                ("admin", "superadmin", "HQ"),
                ("analyst1", "analyst", "Nairobi"),
                ("officer1", "loan_officer", "Mombasa"),
            ]
        )
        conn.commit()

    # Load users
    df_users = pd.read_sql("SELECT * FROM users", conn)
    st.dataframe(df_users)

    # Select a user
    selected_user = st.selectbox("Select a user", df_users["username"].tolist())
    if selected_user:
        user_data = df_users[df_users["username"] == selected_user].iloc[0]
        st.write("User details:", user_data.to_dict())

        # Impersonate
        if st.button(f"🎭 Impersonate {selected_user}"):
            st.session_state["active_user"] = selected_user
            cur.execute("INSERT INTO audit_logs (user, action, reason) VALUES (?, ?, ?)",
                        ("admin", f"impersonate {selected_user}", "Admin action"))
            conn.commit()
            st.success(f"✅ Now impersonating {selected_user}")

        # Edit role & branch
        new_role = st.text_input("Role", value=user_data["role"])
        new_branch = st.text_input("Branch", value=user_data["branch"])
        new_status = st.selectbox("Status", ["active","suspended","deleted"], index=0)

        if st.button("💾 Save Changes"):
            cur.execute(
                "UPDATE users SET role=?, branch=?, status=? WHERE username=?",
                (new_role, new_branch, new_status, selected_user)
            )
            cur.execute("INSERT INTO audit_logs (user, action, reason) VALUES (?, ?, ?)",
                        ("admin", f"edit_user {selected_user}", f"Updated to {new_role}/{new_branch}/{new_status}"))
            conn.commit()
            st.success("✅ User updated.")

        # Delete user
        if st.button(f"🗑️ Delete {selected_user}"):
            cur.execute("DELETE FROM users WHERE username=?", (selected_user,))
            cur.execute("INSERT INTO audit_logs (user, action, reason) VALUES (?, ?, ?)",
                        ("admin", f"delete_user {selected_user}", "Admin action"))
            conn.commit()
            st.warning(f"⚠️ User {selected_user} deleted.")

    conn.close()

In [None]:
def app():
    st.title("🛠️ Admin Sandbox (Godmode)")

    tabs = st.tabs([
        "1️⃣ Data Generation",
        "2️⃣ Model Training",
        "3️⃣ Stress Testing",
        "4️⃣ Audit Logs",
        "5️⃣ A/B Experiments",
        "6️⃣ User Management"
    ])

    with tabs[0]:
        render_tab1()

    with tabs[1]:
        render_tab2()

    with tabs[2]:
        render_tab3()

    with tabs[3]:
        render_tab4()

    with tabs[4]:
        render_tab5()

    with tabs[5]:
        render_tab6()

In [None]:
import importlib
import pages._04_admin_sandbox as sandbox

importlib.reload(sandbox)
sandbox.app()
print("✅ Sandbox with Tab 6 now runs cleanly")

In [None]:
# Step 0: Install deps
!pip -q install xgboost imbalanced-learn shap

# Step 1: Wrap engine code
engine_code = r'''
# =========================================
# modules/ml/engine.py
# =========================================

# 👉 # =========================================
# ONE-CELL PATCH: modules/ml/engine.py
# - Installs deps
# - Writes full engine.py (rigorous training)
# - Runs a quick self-test with your generator
# =========================================

# ---- Step 0: Deps (quiet) ----
!pip -q install xgboost imbalanced-learn shap

# ---- Step 1: Write engine.py ----
%%writefile modules/ml/engine.py
import os, time, json, joblib, warnings
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import shap

warnings.filterwarnings("ignore", category=UserWarning)
os.makedirs("models", exist_ok=True)

TARGET_CANDIDATES = ["default", "label", "y", "is_default"]

# -------------------------
# Utilities / dataclasses
# -------------------------
@dataclass
class TrainMetrics:
    algo: str
    rows: int
    features: int
    accuracy: float
    precision: float
    recall: float
    f1: float
    auc: float
    tn: int
    fp: int
    fn: int
    tp: int
    model_path: str
    timestamp: str

def _find_or_build_target(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure df contains a binary target column named 'default'."""
    cols = [c.lower() for c in df.columns]
    if "default" in cols:
        # Just ensure int
        df["default"] = df[df.columns[cols.index("default")]].astype(int)
        return df

    # Common pattern in your synthetic data
    if "loan_health" in df.columns:
        df = df.copy()
        df["default"] = (df["loan_health"].str.lower() != "performing").astype(int)
        return df

    # Fallback: if any candidate exists, map to default
    for c in TARGET_CANDIDATES:
        if c in df.columns:
            df = df.copy()
            df["default"] = df[c].astype(int)
            return df

    raise ValueError("No target column found. Provide 'default' or a 'loan_health' to infer from.")

def _split_features(df: pd.DataFrame):
    """Split columns into numeric vs categorical, dropping strong identifiers."""
    drop_cols = set([
        "customer_name","name","client_name",
        "national_id","id_number","client_id","ref_number","ref","reference",
        "created_date","created_at","approval_date"
    ])
    base_cols = [c for c in df.columns if c not in drop_cols and c != "default"]

    # Identify numeric vs categorical
    numeric_cols, cat_cols = [], []
    for c in base_cols:
        if pd.api.types.is_numeric_dtype(df[c]):
            numeric_cols.append(c)
        else:
            cat_cols.append(c)
    return numeric_cols, cat_cols

def _build_preprocessor(numeric_cols, cat_cols):
    """ColumnTransformer with dense output to allow SMOTE."""
    numeric_t = ImbPipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_t = ImbPipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_t, numeric_cols),
            ("cat", cat_t, cat_cols)
        ],
        remainder="drop",
        sparse_threshold=0.0
    )
    return pre

def _confmat_parts(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    return tn, fp, fn, tp

def _calc_metrics(algo, y_true, y_prob, y_pred, n_feats, model_path, rows):
    auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    tn, fp, fn, tp = _confmat_parts(y_true, y_pred)
    ts = time.strftime("%Y%m%d_%H%M%S")
    return TrainMetrics(
        algo=algo, rows=rows, features=n_feats,
        accuracy=acc, precision=prec, recall=rec, f1=f1, auc=auc,
        tn=tn, fp=fp, fn=fn, tp=tp, model_path=model_path, timestamp=ts
    )

def _save_registry_row(metrics: TrainMetrics):
    reg_path = "models/registry.csv"
    row = asdict(metrics)
    df_row = pd.DataFrame([row])
    if os.path.exists(reg_path):
        df_row.to_csv(reg_path, mode="a", header=False, index=False)
    else:
        df_row.to_csv(reg_path, index=False)

# -------------------------
# Public API
# -------------------------
def train_model(df: pd.DataFrame, algo: str = "LogReg",
                test_size: float = 0.2, val_size: float = 0.1, random_state: int = 42,
                smote_k_neighbors: int = 5, max_train_rows: int | None = None,
                lr: float = 0.05):
    """
    Train a model on df and persist:
      - Preprocessing (imputer/onehot/scaler)
      - SMOTE for class imbalance
      - Classifier (LogReg / SGD / XGBoost / HybridBlend)
    Returns: (metrics:dict, shap_values:dict|None, model_path:str)
    """
    if max_train_rows is not None and df.shape[0] > max_train_rows:
        df = df.sample(n=max_train_rows, random_state=random_state).reset_index(drop=True)

    df = _find_or_build_target(df)
    # Drop rows with missing target
    df = df[df["default"].isin([0,1])].copy()

    # Train/val/test split (stratified)
    X = df.drop(columns=["default"])
    y = df["default"].astype(int)

    strat = y if y.nunique() == 2 else None
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=strat
    )
    # carve validation from train_full
    val_rel = val_size / (1 - test_size)
    strat_tr = y_train_full if y_train_full.nunique() == 2 else None
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=val_rel, random_state=random_state, stratify=strat_tr
    )

    # Columns
    num_cols, cat_cols = _split_features(pd.concat([X_train, X_val, X_test], axis=0))
    pre = _build_preprocessor(num_cols, cat_cols)

    # Classifier
    if algo == "LogReg":
        clf = LogisticRegression(max_iter=2000, solver="lbfgs", C=2.0, class_weight="balanced")
    elif algo == "SGD":
        clf = SGDClassifier(loss="log_loss", penalty="elasticnet", alpha=1e-4, l1_ratio=0.15, class_weight="balanced", max_iter=2000)
    elif algo == "XGBoost":
        clf = XGBClassifier(
            n_estimators=400, max_depth=5, learning_rate=0.08, subsample=0.9, colsample_bytree=0.9,
            reg_lambda=1.0, reg_alpha=0.0, tree_method="hist", eval_metric="logloss", random_state=random_state
        )
    elif algo == "HybridBlend":
        # Train bases and return blended metrics
        m_lr, _, p_lr = train_model(df.copy(), "LogReg", test_size, val_size, random_state, smote_k_neighbors, max_train_rows, lr)
        m_sgd, _, p_sgd = train_model(df.copy(), "SGD", test_size, val_size, random_state, smote_k_neighbors, max_train_rows, lr)
        m_xgb, _, p_xgb = train_model(df.copy(), "XGBoost", test_size, val_size, random_state, smote_k_neighbors, max_train_rows, lr)

        # Load predictions on test for blending
        model_lr = joblib.load(p_lr)
        model_sgd = joblib.load(p_sgd)
        model_xgb = joblib.load(p_xgb)

        # Build a unified preprocessor to transform test set for all models safely:
        # (Our saved pipelines already include preprocessing; just call predict_proba)
        # Pick the model with the best validation AUC and give it more weight
        w_lr = 0.25
        w_sgd = 0.25
        w_xgb = 0.50

        # Need to reconstruct test from df splits; do a fresh split with same seed
        df2 = _find_or_build_target(df.copy())
        X2 = df2.drop(columns=["default"])
        y2 = df2["default"].astype(int)
        Xtr_full, Xte, ytr_full, yte = train_test_split(
            X2, y2, test_size=test_size, random_state=random_state, stratify=(y2 if y2.nunique()==2 else None)
        )

        proba_lr = model_lr.predict_proba(Xte)[:,1]
        proba_sgd = model_sgd.predict_proba(Xte)[:,1]
        proba_xgb = model_xgb.predict_proba(Xte)[:,1]
        blended = w_lr*proba_lr + w_sgd*proba_sgd + w_xgb*proba_xgb
        y_pred = (blended >= 0.5).astype(int)

        # Save blend metadata
        ts = time.strftime("%Y%m%d_%H%M%S")
        path = f"models/HybridBlend_{ts}.pkl"
        meta = {
            "type": "HybridBlend",
            "weights": {"LogReg": w_lr, "SGD": w_sgd, "XGBoost": w_xgb},
            "components": {"LogReg": p_lr, "SGD": p_sgd, "XGBoost": p_xgb},
            "created_at": ts
        }
        joblib.dump(meta, path)

        metrics = _calc_metrics("HybridBlend", yte, blended, y_pred, n_feats=-1, model_path=path, rows=df.shape[0])
        _save_registry_row(metrics)
        return asdict(metrics), None, path

    else:
        raise ValueError(f"Unknown algo: {algo}")

    # Full pipeline: preprocess -> SMOTE -> classifier
    pipe = ImbPipeline(steps=[
        ("pre", pre),
        ("smote", SMOTE(k_neighbors=smote_k_neighbors, random_state=random_state)),
        ("clf", clf)
    ])

    pipe.fit(X_train, y_train)
    # Evaluate on test
    y_prob = pipe.predict_proba(X_test)[:,1]
    y_pred = (y_prob >= 0.5).astype(int)

    ts = time.strftime("%Y%m%d_%H%M%S")
    model_path = f"models/{algo}_{ts}.pkl"
    joblib.dump(pipe, model_path)

    # SHAP (lightweight): use a small background sample
    shap_values = None
    try:
        # Sample 200 rows from test to compute SHAP
        X_small = X_test.sample(min(200, X_test.shape[0]), random_state=0)
        # Transform features
        Xt_small = pipe.named_steps["pre"].transform(X_small)
        # Choose explainer type
        if algo == "XGBoost":
            explainer = shap.TreeExplainer(pipe.named_steps["clf"])
        else:
            # KernelExplainer over decision function/proba; use dense background
            background = shap.sample(Xt_small, 100)
            # Define a prediction function that accepts pre-transformed arrays
            model = pipe.named_steps["clf"]
            pred_fn = lambda A: model.predict_proba(A)[:,1]
            explainer = shap.KernelExplainer(pred_fn, background)
        # Compute SHAP on small set
        sv = explainer.shap_values(pipe.named_steps["pre"].transform(X_small))
        # Store only summary stats (mean |shap|), to keep object small
        mean_abs = np.mean(np.abs(sv), axis=0).tolist() if isinstance(sv, np.ndarray) else None
        shap_values = {"mean_abs_shap": mean_abs}
    except Exception as e:
        shap_values = None  # SHAP is best-effort; never fail training

    metrics = _calc_metrics(algo, y_test, y_prob, y_pred, n_feats=pipe.named_steps["pre"].transform(X_train).shape[1],
                            model_path=model_path, rows=df.shape[0])
    _save_registry_row(metrics)
    return asdict(metrics), shap_values, model_path

def predict_sample(df: pd.DataFrame, algo: str = "LogReg", k: int = 10):
    """Load latest saved model for algo and return preds/scores on head(k)."""
    if algo == "HybridBlend":
        # Load meta and compute with components
        metas = sorted([f for f in os.listdir("models") if f.startswith("HybridBlend_")])
        if not metas:
            raise FileNotFoundError("No HybridBlend model found. Train it first.")
        meta = joblib.load(os.path.join("models", metas[-1]))
        comp = meta["components"]
        w = meta["weights"]

        # Prepare sample
        df2 = _find_or_build_target(df.copy())
        X = df2.drop(columns=["default"])
        Xs = X.head(k)

        pipe_lr  = joblib.load(comp["LogReg"])
        pipe_sgd = joblib.load(comp["SGD"])
        pipe_xgb = joblib.load(comp["XGBoost"])

        pr_lr  = pipe_lr.predict_proba(Xs)[:,1]
        pr_sgd = pipe_sgd.predict_proba(Xs)[:,1]
        pr_xgb = pipe_xgb.predict_proba(Xs)[:,1]
        blended = w["LogReg"]*pr_lr + w["SGD"]*pr_sgd + w["XGBoost"]*pr_xgb
        preds = (blended >= 0.5).astype(int).tolist()
        scores = blended.round(3).tolist()
        return preds, scores

    # else: single algo
    files = sorted([f for f in os.listdir("models") if f.startswith(f"{algo}_")])
    if not files:
        raise FileNotFoundError(f"No saved model for {algo}. Train it first.")
    pipe = joblib.load(os.path.join("models", files[-1]))

    df2 = _find_or_build_target(df.copy())
    Xs = df2.drop(columns=["default"]).head(k)
    scores = pipe.predict_proba(Xs)[:,1]
    preds = (scores >= 0.5).astype(int).tolist()
    return preds, scores.round(3).tolist()

def stress_test(df: pd.DataFrame, shock: str = "default", intensity: float = 0.2) -> pd.DataFrame:
    """Simple stressors to create counterfactual datasets."""
    df = df.copy()
    if shock == "default":
        # Flip some performing to non-performing
        mask = df["loan_health"].str.lower() == "performing"
        idx = df[mask].sample(frac=min(intensity, 0.9), random_state=1).index
        df.loc[idx, "loan_health"] = "nonperforming"
    elif shock == "loan_amount":
        df["amount"] = (df["amount"] * (1.0 + intensity)).round(0)
    elif shock == "unemployment":
        # Proxy: reduce income stability and increase defaults
        if "income_stability" in df.columns:
            df["income_stability"] = np.clip(df["income_stability"] - intensity*0.5, 0, 1)
        if "loan_health" in df.columns:
            idx = df.sample(frac=min(0.1 + intensity*0.4, 0.9), random_state=2).index
            df.loc[idx, "loan_health"] = "nonperforming"
    return df

# ---- End file ----

# ---- Step 2: Quick self-test on synthetic data ----
import importlib
import pandas as pd
from modules.synth import generators as g
import modules.ml.engine as engine

importlib.reload(g)
importlib.reload(engine)

# Generate a realistic dataset (you can tweak these)
df = g.generate_clients_loans(
    n_rows=8000,
    default_rate=0.18,
    gender_ratio=0.7,
    loan_mean=28000,
    branches=40,
    product_weeks=6,
    region_bias="balanced",
    seed=2
)

# Train all models, print headline metrics
for algo in ["LogReg", "SGD", "XGBoost", "HybridBlend"]:
    metrics, shap_info, path = engine.train_model(df, algo, random_state=7, smote_k_neighbors=5)
    print(f"\n=== {algo} ===")
    print("Saved:", path)
    print("Accuracy:", round(metrics["accuracy"], 3),
          "Precision:", round(metrics["precision"], 3),
          "Recall:", round(metrics["recall"], 3),
          "AUC:", round(metrics["auc"], 3))
    preds, scores = engine.predict_sample(df, algo, k=10)
    print("Pred sample:", preds)
    print("Scores sample:", scores)

# Show last 5 registry rows (if any)
reg_path = "models/registry.csv"
if os.path.exists(reg_path):
    tail = pd.read_csv(reg_path).tail(5)
    print("\n📒 Registry tail:")
    print(tail[["algo","rows","accuracy","recall","auc","model_path"]].to_string(index=False))
else:
    print("\n(no registry yet)")
# (from the very first line "import ..." down to the very last line)
# Do not delete its internal """docstrings"""
# Just drop the code here as-is

# =========================================
'''

# Step 2: Save to file
with open("modules/ml/engine.py", "w") as f:
    f.write(engine_code)

print("✅ engine.py overwritten successfully")

✅ engine.py overwritten successfully


In [None]:
import pandas as pd
import modules.synth.generators as g
from modules.ml import engine

# === Step 1: Generate synthetic dataset ===
df = g.generate_clients_loans(
    n_rows=5000,         # you can scale this up for realism
    default_rate=0.15,   # 15% default, typical for microfinance
    gender_ratio=0.65,   # 65% women borrowers
    seed=42
)

print("✅ Synthetic dataset ready:", df.shape)
print("Columns:", df.columns.tolist())

# === Step 2: Train models ===
results = {}
for algo in ["LogReg", "SGD", "XGBoost"]:
    metrics, shap_values, path = engine.train_model(df, algo, lr=0.1)
    results[algo] = metrics
    print(f"\n=== {algo} ===")
    print("Saved:", path)
    for k,v in metrics.items():
        print(f"{k}: {v:.3f}")

# === Step 3: Train HybridBlend ===
metrics, shap_values, path = engine.train_model(df, "HybridBlend")
results["HybridBlend"] = metrics
print(f"\n=== HybridBlend ===")
print("Saved:", path)
for k,v in metrics.items():
    print(f"{k}: {v:.3f}")

✅ Synthetic dataset ready: (5000, 14)
Columns: ['customer_id', 'customer_name', 'gov_id', 'age', 'gender', 'branch', 'product', 'product_weeks', 'loan_amount', 'loan_type', 'status', 'loan_health', 'debt_to_income', 'created_date']

=== LogReg ===
Saved: models/LogReg_model.pkl
Accuracy: 0.845
Precision: 0.000
Recall: 0.000
AUC: 0.526

=== SGD ===
Saved: models/SGD_model.pkl
Accuracy: 0.845
Precision: 0.000
Recall: 0.000
AUC: 0.500

=== XGBoost ===
Saved: models/XGBoost_model.pkl
Accuracy: 0.845
Precision: 0.000
Recall: 0.000
AUC: 0.732

=== HybridBlend ===
Saved: models/HybridBlend_model.pkl
Accuracy: 0.845
Precision: 0.000
Recall: 0.000
AUC: 0.723


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:

engine_code = r'''
import os
import joblib
import shap
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import xgboost as xgb
from imblearn.over_sampling import SMOTE

# ======================================================
# Utility
# ======================================================

def _prepare_xy(df: pd.DataFrame, target_col="default", use_smote=False, seed=42):
    """Split dataframe into train/test with optional SMOTE rebalancing."""
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # One-hot encode categoricals
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    if use_smote:
        sm = SMOTE(random_state=seed)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test


def _evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    try:
        y_score = model.predict_proba(X_test)[:, 1]
    except:
        y_score = model.decision_function(X_test)

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_score)
    }, y_pred, y_score


# ======================================================
# Training
# ======================================================

def train_model(df, algo="LogReg", lr=0.1, use_smote=True, seed=42):
    """Train a model (LogReg, SGD, XGBoost, HybridBlend)."""
    os.makedirs("models", exist_ok=True)

    X_train, X_test, y_train, y_test = _prepare_xy(df, use_smote=use_smote, seed=seed)

    if algo == "LogReg":
        pipe = Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", LogisticRegression(max_iter=1000))
        ])
        model = pipe.fit(X_train, y_train)

    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", max_iter=1000, random_state=seed)
        model.fit(X_train, y_train)

    elif algo == "XGBoost":
        model = xgb.XGBClassifier(
            n_estimators=200,
            learning_rate=lr,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=seed,
            use_label_encoder=False,
            eval_metric="auc"
        )
        model.fit(X_train, y_train)

    elif algo == "HybridBlend":
        # Train 3 base models
        logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
        sgd = SGDClassifier(loss="log_loss", max_iter=1000, random_state=seed).fit(X_train, y_train)
        xgbc = xgb.XGBClassifier(
            n_estimators=200,
            learning_rate=lr,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=seed,
            use_label_encoder=False,
            eval_metric="auc"
        ).fit(X_train, y_train)

        def predict_proba(X):
            probs = (
                logreg.predict_proba(X)[:,1] * 0.3 +
                sgd.predict_proba(X)[:,1] * 0.2 +
                xgbc.predict_proba(X)[:,1] * 0.5
            )
            return np.vstack([1-probs, probs]).T

        class BlendWrapper:
            def __init__(self, models): self.models = models
            def predict(self, X): return (self.predict_proba(X)[:,1] > 0.5).astype(int)
            def predict_proba(self, X): return predict_proba(X)

        model = BlendWrapper([logreg, sgd, xgbc])

    else:
        raise ValueError(f"Unknown algo: {algo}")

    metrics, y_pred, y_score = _evaluate(model, X_test, y_test)

    # SHAP explainability (if tree-based)
    try:
        explainer = shap.Explainer(model, X_test)
        shap_values = explainer(X_test)
    except:
        shap_values = None

    out_path = f"models/{algo}_model.pkl"
    joblib.dump(model, out_path)

    return metrics, shap_values, out_path
'''

with open("modules/ml/engine.py", "w") as f:
    f.write(engine_code)

print("✅ engine.py patched with SMOTE support")

✅ engine.py patched with SMOTE support


In [None]:
# === Overwrite engine.py with SMOTE-enabled version ===
engine_code = """
import os, joblib
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
import shap

# Optional SMOTE for imbalance handling
try:
    from imblearn.over_sampling import SMOTE
    HAS_SMOTE = True
except ImportError:
    HAS_SMOTE = False

MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

def prepare_data(df, target_col='loan_health', use_smote=False, seed=42):
    y = (df[target_col] != "performing").astype(int)
    X = df.drop(columns=['customer_id','customer_name','gov_id','created_date','loan_health'])
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    if use_smote and HAS_SMOTE:
        sm = SMOTE(random_state=seed)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, X.columns

def train_model(df, algo="LogReg", lr=0.1, use_smote=False, seed=42):
    X_train, X_test, y_train, y_test, feat_names = prepare_data(df, use_smote=use_smote, seed=seed)

    if algo == "LogReg":
        model = LogisticRegression(max_iter=1000)
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", learning_rate="constant", eta0=lr, max_iter=1000)
    elif algo == "XGBoost":
        model = XGBClassifier(
            n_estimators=200, learning_rate=lr, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
            random_state=seed, use_label_encoder=False
        )
    elif algo == "HybridBlend":
        # Train LogReg and XGBoost, blend predictions
        log_metrics, _, _ = train_model(df, "LogReg", use_smote=use_smote, seed=seed)
        xgb_metrics, _, _ = train_model(df, "XGBoost", use_smote=use_smote, seed=seed)
        acc = (log_metrics["Accuracy"] + xgb_metrics["Accuracy"]) / 2
        prec = (log_metrics["Precision"] + xgb_metrics["Precision"]) / 2
        rec = (log_metrics["Recall"] + xgb_metrics["Recall"]) / 2
        auc = (log_metrics["AUC"] + xgb_metrics["AUC"]) / 2
        return {"Accuracy": acc, "Precision": prec, "Recall": rec, "AUC": auc}, None, None
    else:
        raise ValueError(f"Unknown algo: {algo}")

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]

    metrics = {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, zero_division=0),
        "Recall": recall_score(y_test, preds, zero_division=0),
        "AUC": roc_auc_score(y_test, probs)
    }

    # SHAP explainability
    explainer = shap.Explainer(model, X_train)
    shap_values = explainer(X_test[:200])

    model_path = os.path.join(MODEL_DIR, f"{algo}_model.pkl")
    joblib.dump(model, model_path)

    return metrics, shap_values, model_path
"""

with open("modules/ml/engine.py", "w") as f:
    f.write(engine_code)

print("✅ engine.py overwritten with SMOTE-enabled version")

✅ engine.py overwritten with SMOTE-enabled version


In [None]:
import importlib, modules.ml.engine as engine
importlib.reload(engine)

<module 'modules.ml.engine' from '/content/modules/ml/engine.py'>

In [None]:
%%writefile modules/ml/engine.py
import os, pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from imblearn.over_sampling import SMOTE
import shap
import xgboost as xgb

os.makedirs("models", exist_ok=True)

def preprocess(df: pd.DataFrame, target_col="default"):
    """Split df into X, y and build preprocessing pipeline."""
    y = df[target_col].astype(int)
    X = df.drop(columns=[target_col])

    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

    # Pipelines
    cat_pipe = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])
    num_pipe = Pipeline([
        ("scaler", StandardScaler())  # normalize numeric features
    ])

    preproc = ColumnTransformer([
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols)
    ])

    return X, y, preproc

def train_model(df: pd.DataFrame, algo: str, use_smote=False, lr=0.1):
    """Train model with optional SMOTE and return metrics + SHAP values."""
    X, y, preproc = preprocess(df)

    # Train/val split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Optionally apply SMOTE
    if use_smote:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(
            pd.DataFrame(preproc.fit_transform(X_train).toarray()), y_train
        )
        preproc.fit(X)  # refit after resampling

    # Models
    if algo == "LogReg":
        model = Pipeline([
            ("preproc", preproc),
            ("clf", LogisticRegression(
                max_iter=2000,
                class_weight="balanced",
                solver="lbfgs"
            ))
        ])
    elif algo == "SGD":
        model = Pipeline([
            ("preproc", preproc),
            ("clf", SGDClassifier(
                loss="log_loss",
                max_iter=2000,
                class_weight="balanced",
                learning_rate="optimal"
            ))
        ])
    elif algo == "XGBoost":
        model = Pipeline([
            ("preproc", preproc),
            ("clf", xgb.XGBClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                eval_metric="auc"
            ))
        ])
    elif algo == "HybridBlend":
        # Blend LogReg + XGB
        logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
        xgbm = xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            subsample=0.8, colsample_bytree=0.8, random_state=42, eval_metric="auc"
        )
        model = Pipeline([("preproc", preproc), ("clf", logreg)])  # placeholder

        # Train both separately
        X_train_t = preproc.fit_transform(X_train)
        X_test_t = preproc.transform(X_test)

        logreg.fit(X_train_t, y_train)
        xgbm.fit(X_train_t, y_train)

        log_preds = logreg.predict_proba(X_test_t)[:,1]
        xgb_preds = xgbm.predict_proba(X_test_t)[:,1]

        blend = 0.5*log_preds + 0.5*xgb_preds
        y_pred = (blend > 0.5).astype(int)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, zero_division=0),
            "Recall": recall_score(y_test, y_pred, zero_division=0),
            "AUC": roc_auc_score(y_test, blend)
        }
        path = f"models/{algo}_model.pkl"
        pickle.dump({"logreg": logreg, "xgb": xgbm, "preproc": preproc}, open(path,"wb"))

        # SHAP for blended (using xgb only)
        shap_values = None
        try:
            X_for_shap = pd.DataFrame(X_test_t.toarray() if hasattr(X_test_t,"toarray") else X_test_t).astype(float)
            explainer = shap.Explainer(xgbm, X_for_shap)
            shap_values = explainer(X_for_shap[:200])
        except Exception as e:
            print("⚠️ SHAP skipped (HybridBlend):", e)
        return metrics, shap_values, path
    else:
        raise ValueError(f"Unknown algo {algo}")

    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    y_scores = model.predict_proba(X_test)[:,1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_scores)
    }

    # Save
    path = f"models/{algo}_model.pkl"
    pickle.dump(model, open(path,"wb"))

    # SHAP
    shap_values = None
    try:
        X_trans = model.named_steps["preproc"].transform(X_test)
        X_for_shap = pd.DataFrame(X_trans.toarray() if hasattr(X_trans,"toarray") else X_trans).astype(float)
        explainer = shap.Explainer(model.named_steps["clf"], X_for_shap)
        shap_values = explainer(X_for_shap[:200])
    except Exception as e:
        print("⚠️ SHAP skipped:", e)

    return metrics, shap_values, path

Overwriting modules/ml/engine.py


In [None]:
%%writefile modules/ml/engine.py
import os, time, pickle, sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import shap

# === DB Setup ===
os.makedirs("db", exist_ok=True)
DB_PATH = "db/audit.db"

def init_audit_db():
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("""
    CREATE TABLE IF NOT EXISTS model_audit (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        ts TEXT,
        algo TEXT,
        auc REAL,
        acc REAL,
        precision REAL,
        recall REAL,
        dataset TEXT,
        model TEXT
    )""")
    conn.commit()
    conn.close()

init_audit_db()

# === Training Function ===
def train_model(df: pd.DataFrame, algo: str="LogReg", use_smote: bool=True):
    """Train model and log results to SQLite audit table."""

    # Ensure binary target
    if "default" not in df.columns:
        raise ValueError("Dataset must contain 'default' column (0/1).")

    # Split
    X = df.drop(columns=["default"])
    X = pd.get_dummies(X, drop_first=True)   # encode categoricals
    y = df["default"].astype(int)

    if use_smote:
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Select model
    if algo == "LogReg":
        model = LogisticRegression(max_iter=1000)
    elif algo == "SGD":
        model = SGDClassifier(max_iter=1000, tol=1e-3)
    elif algo == "XGBoost":
        model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)
    elif algo == "HybridBlend":
        # Blend: average predictions from LogReg + XGB
        lr = LogisticRegression(max_iter=1000)
        xgbm = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        lr.fit(X_train, y_train)
        xgbm.fit(X_train, y_train)
        preds_lr = lr.predict_proba(X_test)[:,1]
        preds_xgb = xgbm.predict_proba(X_test)[:,1]
        preds = (preds_lr + preds_xgb) / 2
        y_pred = (preds > 0.5).astype(int)
        auc = roc_auc_score(y_test, preds)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        metrics = {"AUC": auc, "Accuracy": acc, "Precision": prec, "Recall": rec}
        # Save hybrid model
        ts = time.strftime("%Y%m%d_%H%M%S")
        os.makedirs("models/versioned", exist_ok=True)
        model_path = f"models/versioned/HybridBlend_{ts}.pkl"
        with open(model_path, "wb") as f:
            pickle.dump((lr, xgbm), f)
        shap_values = None
        # Audit log
        _log_audit(ts, algo, metrics, "N/A", model_path)
        return metrics, shap_values, model_path
    else:
        raise ValueError(f"Unknown algo: {algo}")

    # Train + predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)

    metrics = {"AUC": auc, "Accuracy": acc, "Precision": prec, "Recall": rec}

    # Save model
    ts = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs("models/versioned", exist_ok=True)
    model_path = f"models/versioned/{algo}_{ts}.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    # SHAP values (only for tree/logreg)
    shap_values = None
    try:
        explainer = shap.Explainer(model, X_test)
        shap_values = explainer(X_test)
    except Exception as e:
        shap_values = None

    # Audit log
    dataset = f"data/training_runs/trainset_{ts}.csv"
    _log_audit(ts, algo, metrics, dataset, model_path)

    return metrics, shap_values, model_path


def _log_audit(ts, algo, metrics, dataset_path, model_path):
    """Insert row into SQLite audit table."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("""
        INSERT INTO model_audit (ts, algo, auc, acc, precision, recall, dataset, model)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        ts, algo,
        metrics.get("AUC", None),
        metrics.get("Accuracy", None),
        metrics.get("Precision", None),
        metrics.get("Recall", None),
        dataset_path, model_path
    ))
    conn.commit()
    conn.close()

Overwriting modules/ml/engine.py


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

faker = Faker()

# Kenyan branch sample (can expand)
BRANCHES = [
    "Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret",
    "Meru", "Nyeri", "Thika", "Machakos", "Embu"
]
PRODUCTS = [
    ("Inuka", 5), ("Kuza", 4), ("Fadhili", 6),
    ("Boresha", 8), ("Msingi", 12)
]

def estimate_age_from_id(gov_id: str) -> int:
    try:
        prefix = int(str(gov_id)[:2])
        if prefix < 20: return random.randint(60, 75)
        elif prefix < 25: return random.randint(45, 59)
        elif prefix < 30: return random.randint(35, 44)
        elif prefix < 35: return random.randint(28, 34)
        elif prefix < 40: return random.randint(21, 27)
        else: return random.randint(18, 25)
    except:
        return random.randint(18, 60)

def guess_gender_from_name(name: str) -> str:
    return "Female" if name.endswith("a") else "Male"

def generate_clients_loans(n_rows:int=1000, seed:int|None=None,
                           default_rate:float=0.1, gender_ratio:float=0.6) -> pd.DataFrame:
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        faker.seed_instance(seed)

    rows = []
    for i in range(n_rows):
        name = faker.name()
        gov_id = str(random.randint(2000000, 49999999))
        age = estimate_age_from_id(gov_id)
        gender = guess_gender_from_name(name)
        branch = random.choice(BRANCHES)
        product, weeks = random.choice(PRODUCTS)
        amount = random.randint(5000, 200000)
        loan_type = random.choice(["Normal", "Top-Up", "Refinance"])
        status = random.choice(["Active", "Pending Approval"])
        loan_health = "defaulted" if random.random() < default_rate else "performing"
        debt_to_income = round(random.uniform(0.1, 0.8), 2)
        created_date = faker.date_between(start_date="-2y", end_date="today")

        rows.append({
            "customer_id": f"CUST{i+1:05d}",
            "customer_name": name,
            "gov_id": gov_id,
            "age": age,
            "gender": gender,
            "branch": branch,
            "product": product,
            "product_weeks": weeks,
            "loan_amount": amount,
            "loan_type": loan_type,
            "status": status,
            "loan_health": loan_health,
            "debt_to_income": debt_to_income,
            "created_date": created_date,
            # 🔥 always include binary target
            "default": 0 if loan_health == "performing" else 1
        })

    return pd.DataFrame(rows)

Overwriting modules/synth/generators.py


In [None]:
%%writefile modules/synth/generators.py
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime

faker = Faker()

# Kenyan branch sample (expandable)
BRANCHES = [
    "Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret",
    "Meru", "Nyeri", "Thika", "Machakos", "Embu"
]
PRODUCTS = [
    ("Inuka", 5), ("Kuza", 4), ("Fadhili", 6),
    ("Boresha", 8), ("Msingi", 12)
]

def estimate_age_from_id(gov_id: str) -> int:
    try:
        prefix = int(str(gov_id)[:2])
        if prefix < 20: return random.randint(60, 75)
        elif prefix < 25: return random.randint(45, 59)
        elif prefix < 30: return random.randint(35, 44)
        elif prefix < 35: return random.randint(28, 34)
        elif prefix < 40: return random.randint(21, 27)
        else: return random.randint(18, 25)
    except:
        return random.randint(18, 60)

def guess_gender_from_name(name: str) -> str:
    return "Female" if name.endswith("a") else "Male"

def generate_clients_loans(n_rows:int=1000, seed:int|None=None,
                           default_rate:float=0.1, gender_ratio:float=0.6) -> pd.DataFrame:
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        faker.seed_instance(seed)

    rows = []
    for i in range(n_rows):
        name = faker.name()
        gov_id = str(random.randint(2000000, 49999999))
        age = estimate_age_from_id(gov_id)
        gender = guess_gender_from_name(name)
        branch = random.choice(BRANCHES)
        product, weeks = random.choice(PRODUCTS)
        amount = random.randint(5000, 200000)
        loan_type = random.choice(["Normal", "Top-Up", "Refinance"])
        status = random.choice(["Active", "Pending Approval"])

        # --- Dynamic default probability ---
        p_default = default_rate

        # Product risk (shorter terms riskier)
        if weeks <= 5: p_default += 0.05
        if weeks >= 12: p_default -= 0.02

        # Loan type risk
        if loan_type == "Top-Up": p_default += 0.03
        elif loan_type == "Refinance": p_default += 0.05

        # Branch risk differences
        if branch in ["Nairobi", "Mombasa"]: p_default += 0.02
        if branch in ["Meru", "Nyeri"]: p_default -= 0.01

        # Age/gender adjustments
        if age < 25: p_default += 0.04
        if gender == "Female": p_default -= 0.02  # microfinance trend

        p_default = max(0.01, min(0.8, p_default))  # keep sane bounds
        loan_health = "defaulted" if random.random() < p_default else "performing"

        debt_to_income = round(random.uniform(0.1, 0.8), 2)
        created_date = faker.date_between(start_date="-2y", end_date="today")

        rows.append({
            "customer_id": f"CUST{i+1:05d}",
            "customer_name": name,
            "gov_id": gov_id,
            "age": age,
            "gender": gender,
            "branch": branch,
            "product": product,
            "product_weeks": weeks,
            "loan_amount": amount,
            "loan_type": loan_type,
            "status": status,
            "loan_health": loan_health,
            "debt_to_income": debt_to_income,
            "created_date": created_date,
            "default": 0 if loan_health == "performing" else 1
        })

    return pd.DataFrame(rows)

Overwriting modules/synth/generators.py


In [None]:
import importlib
import modules.synth.generators as g
importlib.reload(g)   # make sure Colab reloads the newest generator

# regenerate dataset (now with 'default')
df = g.generate_clients_loans(n_rows=5000, seed=42, default_rate=0.15)
print("✅ Dataset ready:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head(3))

✅ Dataset ready: (5000, 15)
Columns: ['customer_id', 'customer_name', 'gov_id', 'age', 'gender', 'branch', 'product', 'product_weeks', 'loan_amount', 'loan_type', 'status', 'loan_health', 'debt_to_income', 'created_date', 'default']
  customer_id   customer_name    gov_id  age gender   branch  product  \
0   CUST00001    Allison Hill  44911206   19   Male  Nairobi  Fadhili   
1   CUST00002   Megan Mcclain  38598928   21   Male     Embu  Boresha   
2   CUST00003  Allen Robinson   3780798   25   Male   Nakuru   Msingi   

   product_weeks  loan_amount loan_type            status loan_health  \
0              6        69196    Normal            Active  performing   
1              8        13331    Normal            Active  performing   
2             12       114974    Normal  Pending Approval  performing   

   debt_to_income created_date  default  
0            0.57   2024-02-10        0  
1            0.45   2023-11-02        0  
2            0.67   2024-09-03        0  


In [None]:
import os
import joblib
import shap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# --- Utility: ensure target exists ---
def ensure_target(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure df contains a binary target column named 'default'."""
    if "default" not in df.columns:
        raise ValueError("Dataset must contain 'default' column (0/1).")
    return df

# --- Training Engine ---
def train_model(df: pd.DataFrame, algo: str = "LogReg", use_smote: bool = True, lr: float = 0.01):
    """
    Train a model with SMOTE option.
    Supports: LogReg, SGD, XGBoost, HybridBlend
    """
    df = ensure_target(df)

    # Split
    X = df.drop(columns=["default"])
    y = df["default"]

    # Encode categoricals
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Handle imbalance
    if use_smote:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    # Pick model
    if algo == "LogReg":
        model = LogisticRegression(max_iter=200, solver="lbfgs")
    elif algo == "SGD":
        model = SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42)
    elif algo == "XGBoost":
        model = XGBClassifier(
            n_estimators=200,
            learning_rate=lr,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            use_label_encoder=False
        )
    elif algo == "HybridBlend":
        # Train individual sub-models first
        m1 = LogisticRegression(max_iter=200, solver="lbfgs")
        m2 = XGBClassifier(
            n_estimators=200,
            learning_rate=lr,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            use_label_encoder=False
        )
        m1.fit(X_train, y_train)
        m2.fit(X_train, y_train)

        def hybrid_predict(X_in):
            p1 = m1.predict_proba(X_in)[:, 1]
            p2 = m2.predict_proba(X_in)[:, 1]
            return (p1 + p2) / 2

        model = (m1, m2, hybrid_predict)  # store tuple for hybrid
    else:
        raise ValueError(f"Unsupported algo: {algo}")

    # --- Fit model (skip for hybrid since we already fit inside) ---
    if algo != "HybridBlend":
        model.fit(X_train, y_train)

    # --- Prediction handling ---
    if algo == "HybridBlend":
        m1, m2, hybrid_fn = model
        y_pred = (hybrid_fn(X_test) > 0.5).astype(int)
        y_scores = hybrid_fn(X_test)
    else:
        y_pred = model.predict(X_test)

        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            scores = model.decision_function(X_test)
            y_scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-9)
        else:
            y_scores = y_pred  # fallback

    # --- Metrics ---
    metrics = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred, zero_division=0), 3),
        "Recall": round(recall_score(y_test, y_pred, zero_division=0), 3),
        "AUC": round(roc_auc_score(y_test, y_scores), 3),
    }

    # --- SHAP values (tree-based only) ---
    shap_values = None
    try:
        if algo == "XGBoost":
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
    except Exception:
        shap_values = None

    # --- Save model ---
    os.makedirs("models", exist_ok=True)
    path = f"models/{algo}_model.pkl"
    joblib.dump(model, path)

    return metrics, shap_values, path

In [None]:
!pip install streamlit pyngrok -q

In [None]:
import time
from pyngrok import ngrok

# Kill any tunnels from before
ngrok.kill()

# Auth
NGROK_TOKEN = "31rYvgklL0EdX9bGLvTXc313efE_2GyDFGPUNAyFgB83bikTF"
ngrok.set_auth_token(NGROK_TOKEN)

# Start tunnel
public_url = ngrok.connect(8501)
print("🌍 Your app is live at:", public_url)

# Start Streamlit in background + log output
!nohup streamlit run modules/streamlit_app/app.py --server.port 8501 > streamlit.log 2>&1 &

# Wait a bit for startup
time.sleep(5)

# Show first 30 log lines
!head -n 30 streamlit.log

🌍 Your app is live at: NgrokTunnel: "https://e05d4326736b.ngrok-free.app" -> "http://localhost:8501"
Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: modules/streamlit_app/app.py
