<a href="https://colab.research.google.com/github/JBlizzard-sketch/LoanAI/blob/main/LoanAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# =========================
# LoanIQ — Cell 1/4
# Scaffold, config, README, entrypoint (app.py)
# =========================

import os, textwrap, json, pathlib

ROOT = pathlib.Path(".")
def w(path, content):
    """Write *content* to *path* as UTF-8, creating parent folders as needed.

    The text is dedented and its leading whitespace removed, so callers can
    pass an indented triple-quoted literal that starts with a newline.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    cleaned = textwrap.dedent(content).lstrip()
    path.write_text(cleaned, encoding="utf-8")

# --- .replit (Streamlit entry) ---
# Run command for Replit: serves app.py with Streamlit on port 3000, bound to
# all interfaces (0.0.0.0).
w(ROOT/".replit", """
run = "streamlit run app.py --server.port 3000 --server.address 0.0.0.0"
""")

# --- requirements.txt ---
# Minimum versions of the third-party packages imported by the generated
# modules (UI, data handling, model families, artifact storage, PDF export,
# plotting).
w(ROOT/"requirements.txt", """
streamlit>=1.33.0
pandas>=2.1.0
numpy>=1.26.0
scikit-learn>=1.3.0
xgboost>=1.7.6
lightgbm>=4.3.0
joblib>=1.3.0
reportlab>=4.0.8
matplotlib>=3.8.0
""")

# --- README ---
w(ROOT/"README.md", """
# LoanIQ (Streamlit) — Kenya-focused Credit Scoring

Roles:
- **Client**: upload/generate data → predictions, risk score, insights, CSV/PDF export.
- **Admin**: protected Sandbox (generator controls, train/retrain, six model families, versioning/activation, schema editor, backups).

Defaults:
- Admin credentials kept as requested: `admin / Shady868...`
- Synthetic data: Kenya-specific distributions (70 branches; Nairobi, Central, Eastern, Rift, Coast concentration; small-loan skew; female ~62%; occupations: mama mboga/shop owner/boda boda/etc.)
- Target ≈ 0.80 AUC baseline depending on sample size.

Run locally:
```bash
pip install -r requirements.txt
streamlit run app.py

In [None]:
---

# 🧠 Cell 2/4 — Core modules (auth, DB, ML pipeline, Kenyan generator, preprocessing)

```python
# =========================
# LoanIQ — Cell 2/4
# Core modules
# =========================
import textwrap, pathlib
ROOT = pathlib.Path(".")

def w(path, content):
    """Materialise one generated file on disk.

    Missing parent directories are created, then the dedented,
    left-stripped *content* is written to *path* using UTF-8.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    text = textwrap.dedent(content)
    text = text.lstrip()
    path.write_text(text, encoding="utf-8")

# --- auth.py ---
w(ROOT/"auth.py", """
import hashlib
from typing import Optional, Dict
from database.db_manager import DatabaseManager

import hmac
import os

# NOTE: docstrings here use triple single quotes because this module's text is
# emitted from inside a triple-double-quoted literal.

ADMIN_EMAIL = "admin"
# The default is kept for backward compatibility. Set LOANIQ_ADMIN_PASS before
# the admin account is first created to avoid shipping the well-known default.
ADMIN_PASS = os.environ.get("LOANIQ_ADMIN_PASS", "Shady868...")

_PBKDF2_TAG = "pbkdf2_sha256"
PBKDF2_ITERATIONS = 200_000


def _hash(pw: str, iterations: int = PBKDF2_ITERATIONS) -> str:
    '''Return a salted PBKDF2-HMAC-SHA256 record for *pw*.

    The record has the form ``pbkdf2_sha256$<iterations>$<salt hex>$<digest hex>``.
    A fresh random salt is used on every call, so two records for the same
    password differ; check passwords with ``_verify`` rather than by
    comparing records.
    '''
    salt = os.urandom(16)
    digest = hashlib.pbkdf2_hmac("sha256", pw.encode("utf-8"), salt, iterations)
    return f"{_PBKDF2_TAG}${iterations}${salt.hex()}${digest.hex()}"


def _verify(pw: str, stored: str) -> bool:
    '''Return True when *pw* matches the *stored* password record.

    Both the PBKDF2 record format produced by ``_hash`` and the legacy
    unsalted SHA-256 hex digest are accepted, so accounts created before the
    format change can still sign in. Malformed records never match.
    '''
    if not stored:
        return False
    secret = pw.encode("utf-8")
    if stored.startswith(_PBKDF2_TAG + "$"):
        try:
            _, iterations, salt_hex, digest_hex = stored.split("$")
            expected = bytes.fromhex(digest_hex)
            candidate = hashlib.pbkdf2_hmac(
                "sha256", secret, bytes.fromhex(salt_hex), int(iterations)
            )
        except (ValueError, OverflowError):
            return False
        return hmac.compare_digest(candidate, expected)
    # Legacy record: plain SHA-256 hex digest. Compare as bytes in constant time.
    legacy = hashlib.sha256(secret).hexdigest()
    return hmac.compare_digest(legacy.encode("utf-8"), stored.encode("utf-8"))


class AuthenticationManager:
    '''Register and authenticate users stored through a database manager.

    The *db* object must provide ``get_user_by_email(email)`` and
    ``create_user(email, password_hash, role=...)``.
    '''

    # PBKDF2 work factor for newly created password records.
    hash_iterations = PBKDF2_ITERATIONS

    def __init__(self, db: "DatabaseManager"):
        self.db = db
        self._bootstrap_admin()

    def _bootstrap_admin(self):
        '''Create the admin account on first use if it does not exist yet.'''
        if not self.db.get_user_by_email(ADMIN_EMAIL):
            self.db.create_user(
                ADMIN_EMAIL, _hash(ADMIN_PASS, self.hash_iterations), role="admin"
            )

    def register_user(self, email: str, password: str, role="client") -> bool:
        '''Create a user; return False when *email* is already registered.'''
        if self.db.get_user_by_email(email):
            return False
        self.db.create_user(email, _hash(password, self.hash_iterations), role=role)
        return True

    def login_user(self, email: str, password: str) -> Optional[Dict]:
        '''Return the user record when the credentials match, else None.'''
        u = self.db.get_user_by_email(email)
        if not u:
            return None
        if _verify(password, u["password_hash"]):
            return u
        return None
""")

# --- database/db_manager.py ---
w(ROOT/"database/db_manager.py", """
import sqlite3, json, os
from pathlib import Path

DB_PATH = os.environ.get("LOANIQ_DB", "loaniq.db")


class DatabaseManager:
    '''SQLite-backed store for users, model versions and backup notes.

    NOTE: docstrings and multi-line SQL in this module use triple single
    quotes. The module text is emitted from inside a triple-double-quoted
    literal, so a nested triple double quote would end that literal early.
    '''

    def __init__(self, path: str = DB_PATH):
        # check_same_thread=False allows use from a thread other than the one
        # that opened the connection.
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self._init_schema()

    def close(self):
        '''Close the underlying SQLite connection.'''
        self.conn.close()

    def _init_schema(self):
        '''Create the users, model_versions and backups tables if missing.'''
        with self.conn:
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS users(
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    email TEXT UNIQUE,
                    password_hash TEXT,
                    role TEXT DEFAULT 'client'
                )
            ''')
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS model_versions(
                    id TEXT PRIMARY KEY,
                    family TEXT,
                    metrics_json TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    active INTEGER DEFAULT 0
                )
            ''')
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS backups(
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    note TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

    # users
    def create_user(self, email, password_hash, role="client"):
        '''Insert a user row; sqlite3.IntegrityError is raised on a duplicate email.'''
        with self.conn:
            self.conn.execute(
                "INSERT INTO users(email,password_hash,role) VALUES(?,?,?)",
                (email, password_hash, role)
            )

    def get_user_by_email(self, email):
        '''Return the user row for *email* as a dict, or None if absent.'''
        cur = self.conn.execute("SELECT * FROM users WHERE email=?", (email,))
        r = cur.fetchone()
        return dict(r) if r else None

    # versions
    def add_version(self, version_id, family, metrics_dict, active=0):
        '''Insert or replace a model version; metrics are stored as JSON.

        When *active* is truthy every other version is deactivated in the
        same transaction, so at most one version is active afterwards.
        '''
        with self.conn:
            if active:
                self.conn.execute("UPDATE model_versions SET active=0")
            self.conn.execute(
                "INSERT OR REPLACE INTO model_versions(id,family,metrics_json,active) VALUES(?,?,?,?)",
                (version_id, family, json.dumps(metrics_dict), int(bool(active)))
            )

    def list_versions(self):
        '''Return all model versions as dicts, newest first.'''
        # created_at has one-second resolution; rowid breaks ties so versions
        # saved within the same second still sort by insertion order.
        cur = self.conn.execute(
            "SELECT * FROM model_versions ORDER BY created_at DESC, rowid DESC"
        )
        return [dict(r) for r in cur.fetchall()]

    def set_active_version(self, version_id: str):
        '''Mark *version_id* as the only active version.'''
        with self.conn:
            self.conn.execute("UPDATE model_versions SET active=0")
            self.conn.execute("UPDATE model_versions SET active=1 WHERE id=?", (version_id,))

    def get_latest_version(self):
        '''Return the active version, else the newest version, else None.'''
        for query in (
            "SELECT * FROM model_versions WHERE active=1 ORDER BY created_at DESC, rowid DESC LIMIT 1",
            "SELECT * FROM model_versions ORDER BY created_at DESC, rowid DESC LIMIT 1",
        ):
            r = self.conn.execute(query).fetchone()
            if r:
                return dict(r)
        return None

    def get_schema(self):
        '''Return a mapping of table name to its CREATE statement.'''
        cur = self.conn.execute("SELECT name, sql FROM sqlite_master WHERE type='table'")
        return {r["name"]: r["sql"] for r in cur.fetchall()}

    def execute_raw(self, sql: str):
        '''Run one arbitrary SQL statement inside a transaction.

        Returns a list of row dicts when the statement produces a result set
        (for example a SELECT) and None otherwise. The statement is executed
        verbatim, so this must only be reachable by trusted administrators.
        '''
        with self.conn:
            cur = self.conn.execute(sql)
            # description is None for statements that return no columns.
            if cur.description is None:
                return None
            return [dict(r) for r in cur.fetchall()]

# --- models/model_versions.py ---
w(ROOT/"models/model_versions.py", """
import joblib, uuid, os, json
from database.db_manager import DatabaseManager

ART_DIR = "artifacts"

class ModelVersionManager:
    '''Persist trained models as joblib artifacts and track them in the DB.

    Artifacts are written to ART_DIR as ``model_<version id>.pkl`` and hold a
    dict with the keys "model" and "preprocessor".
    '''

    def __init__(self, db: DatabaseManager):
        self.db = db
        os.makedirs(ART_DIR, exist_ok=True)

    @staticmethod
    def _artifact_path(version_id: str) -> str:
        '''Return the artifact file path for *version_id*.'''
        return os.path.join(ART_DIR, f"model_{version_id}.pkl")

    def save_model(self, model_family: str, model_obj, preprocessor, metrics: dict) -> str:
        '''Store a model and its preprocessor, activate it, and return its id.'''
        # Ids are 8 hex characters; retry on the rare collision so an existing
        # artifact is never silently overwritten.
        vid = uuid.uuid4().hex[:8]
        while os.path.exists(self._artifact_path(vid)):
            vid = uuid.uuid4().hex[:8]
        joblib.dump({"model": model_obj, "preprocessor": preprocessor}, self._artifact_path(vid))
        self.db.add_version(vid, model_family, metrics, active=1)  # activate new by default
        # Make the new version the only active one, whatever add_version did
        # with previously active rows.
        self.db.set_active_version(vid)
        return vid

    def load_model(self, version_id: str):
        '''Load the artifact dict for *version_id*.

        Raises FileNotFoundError when no artifact exists for that id.
        '''
        path = self._artifact_path(version_id)
        if not os.path.exists(path):
            raise FileNotFoundError(f"No model artifact for version {version_id}: {path}")
        # SECURITY: joblib.load unpickles the file, which can execute code.
        # Only load artifacts written by this application.
        return joblib.load(path)

    def set_active_version(self, version_id: str):
        '''Mark *version_id* as the active version.'''
        self.db.set_active_version(version_id)

    def get_latest_version(self):
        '''Return the version record chosen by the database manager.'''
        return self.db.get_latest_version()

    def list_versions(self):
        '''Return all version records from the database manager.'''
        return self.db.list_versions()
""")

# --- utils/data_processing.py (flexible) ---
w(ROOT/"utils/data_processing.py", """
import pandas as pd
from typing import Tuple, Optional
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

NUMERIC_CANDIDATES = [
    "age","household_income","dependents","mobile_money_txns","past_loans","past_defaults",
    "loan_amount","loan_term_weeks"
]
CATEGORICAL_CANDIDATES = [
    "gender","marital_status","education_level","occupation","group_membership","guarantor",
    "region","branch","product","loan_purpose","loan_type","status","loan_health"
]

# Columns that restate the outcome and must never be used as features. The
# synthetic generator derives loan_health directly from default_flag, so
# training on it would leak the label into the model.
_TARGET_PROXY_COLUMNS = frozenset({"loan_health"})


def flexible_prepare_Xy(df: pd.DataFrame, target_col: Optional[str]="default_flag") -> Tuple:
    '''Fit a preprocessor on *df* and return ``(X_proc, y, preprocessor)``.

    Only columns named in NUMERIC_CANDIDATES / CATEGORICAL_CANDIDATES that are
    present in *df* are used; everything else is dropped. Numeric columns are
    median-imputed; categorical columns are mode-imputed and one-hot encoded,
    with unseen categories ignored at transform time.

    *y* is the *target_col* values when that column exists, otherwise None.

    Raises:
        ValueError: if *df* contains none of the known feature columns.
    '''
    excluded = set(_TARGET_PROXY_COLUMNS)
    if target_col:
        excluded.add(target_col)
    num_cols = [c for c in NUMERIC_CANDIDATES if c in df.columns and c not in excluded]
    cat_cols = [c for c in CATEGORICAL_CANDIDATES if c in df.columns and c not in excluded]
    if not num_cols and not cat_cols:
        raise ValueError("No known feature columns found in the dataset.")

    X = df
    y = None
    if target_col and target_col in df.columns:
        y = df[target_col].values
        X = df.drop(columns=[target_col])

    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median"))])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    pre = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ], remainder="drop")

    X_proc = pre.fit_transform(X)
    return X_proc, y, pre

def train_test_split_like(X, y, test_size=0.2, random_state=42, stratify=None):
    '''Split *X* and *y* into train and test parts.

    Thin wrapper over sklearn's train_test_split with project defaults.
    Pass ``stratify=y`` to keep the class balance equal in both parts; the
    default (None) performs a plain random split.
    '''
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify
    )
""")

# --- models/ml_pipeline.py ---
w(ROOT/"models/ml_pipeline.py", """
from typing import Optional, Tuple
import numpy as np
from sklearn.metrics import roc_auc_score
from utils.data_processing import flexible_prepare_Xy, train_test_split_like

class HybridModel:
    '''Average of logistic-regression and random-forest class probabilities.

    Defined at module level rather than inside MLPipeline._init_model so that
    instances can be pickled: pickle cannot serialise a class that is local
    to a function, which would make saving a Hybrid model fail.
    '''

    def __init__(self):
        # Imported lazily, the same way MLPipeline defers estimator imports.
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier
        self.logreg = LogisticRegression(max_iter=1500)
        self.rf = RandomForestClassifier(n_estimators=250)

    def fit(self, X, y):
        '''Fit both estimators on the same data and return self.'''
        self.logreg.fit(X, y)
        self.rf.fit(X, y)
        return self

    def predict_proba(self, X):
        '''Return the element-wise mean of both estimators' probabilities.'''
        return (self.logreg.predict_proba(X) + self.rf.predict_proba(X)) / 2.0


class MLPipeline:
    '''Train and apply one model family on loan data.

    Supported families: "LogReg", "RandomForest", "GradientBoosting",
    "XGBoost", "LightGBM" and "Hybrid". The estimator is created eagerly;
    its library is imported only when that family is selected.
    '''

    def __init__(self, model_family: str="LogReg"):
        self.model_family = model_family
        self.model = self._init_model()
        self.preprocessor_ = None  # set by prepare_data()

    def _init_model(self):
        '''Build the estimator for self.model_family.

        Raises:
            ValueError: if the family name is not recognised.
        '''
        fam = self.model_family
        if fam == "LogReg":
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(max_iter=2000)
        if fam == "RandomForest":
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier(n_estimators=300)
        if fam == "GradientBoosting":
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier()
        if fam == "XGBoost":
            from xgboost import XGBClassifier
            return XGBClassifier(
                n_estimators=300, max_depth=4, learning_rate=0.08,
                subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", n_jobs=2
            )
        if fam == "LightGBM":
            from lightgbm import LGBMClassifier
            return LGBMClassifier(n_estimators=400, learning_rate=0.05, num_leaves=31)
        if fam == "Hybrid":
            return HybridModel()
        raise ValueError(f"Unknown model family: {fam}")

    def prepare_data(self, df, target_col: Optional[str]="default_flag"):
        '''Fit the preprocessor on *df*, remember it, and return ``(X, y)``.'''
        X, y, pre = flexible_prepare_Xy(df, target_col=target_col)
        self.preprocessor_ = pre
        return X, y

    def train(self, df) -> Tuple[float, dict, np.ndarray, np.ndarray]:
        '''Fit on an 80/20 split of *df* and evaluate on the held-out part.

        Returns ``(auc, metrics, y_test, test_probabilities)`` where metrics
        is ``{"AUC": auc}``.

        Raises:
            ValueError: if *df* has no target column to learn from.
        '''
        X, y = self.prepare_data(df)
        if y is None:
            raise ValueError("Training data must contain a 'default_flag' column.")
        Xtr, Xte, ytr, yte = train_test_split_like(X, y, test_size=0.2, random_state=42)
        self.model.fit(Xtr, ytr)
        proba = self.model.predict_proba(Xte)[:,1]
        auc = roc_auc_score(yte, proba)
        metrics = {"AUC": float(auc)}
        return auc, metrics, yte, proba

    def predict_df(self, df):
        '''Return positive-class probabilities for the raw rows in *df*.

        Raises:
            RuntimeError: if called before train() / prepare_data().
        '''
        if self.preprocessor_ is None:
            raise RuntimeError("Model is not trained yet; call train() first.")
        # df must be raw; we transform with preprocessor fitted during training
        X = self.preprocessor_.transform(df)
        return self.model.predict_proba(X)[:,1]
""")

# --- models/synthetic_data.py (Kenya-aware) ---
w(ROOT/"models/synthetic_data.py", """
import numpy as np, pandas as pd
from dataclasses import dataclass, field
from typing import Dict, List, Optional

FIRST_NAMES_FEMALE = ["Mary","Jane","Grace","Ann","Ruth","Lydia","Alice","Mercy","Brenda","Elizabeth",
                      "Sarah","Esther","Rachel","Irene","Janet","Agnes","Joy","Naomi","Diana","Cynthia"]
FIRST_NAMES_MALE   = ["John","Peter","Michael","David","James","Paul","Samuel","Daniel","Joseph","Stephen",
                      "Brian","George","Alex","Allan","Felix","Eric","Kevin","Martin","Francis","Robert"]
KENYAN_SURNAMES = ["Njuguna","Onyango","Odhiambo","Wambui","Mutua","Kiprono","Kiptoo","Mwangi","Otieno","Koech",
                   "Ndirangu","Kilonzo","Wekesa","Muthoni","Omondi","Chebet","Cheruiyot","Mutuku","Wanjiru","Achieng",
                   "Njeri","Wanyama","Barasa","Gathoni","Kariuki","Kamau","Waweru","Maina","Ouma","Were"]

REGIONS = {
    "Nairobi":["CBD","Westlands","Kayole","Kawangware","Kibera","Embakasi","Githurai","Donholm"],
    "Central":["Thika","Nyeri","Murang'a","Kiambu","Embu","Kerugoya","Karatina","Chuka"],
    "Eastern":["Meru","Isiolo","Marsabit","Maua","Mwingi","Kitui","Machakos","Makueni","Wote"],
    "Rift":["Eldoret","Nakuru","Naivasha","Kericho","Bomet","Nandi Hills","Kapsabet","Kitale"],
    "Coast":["Mombasa","Likoni","Changamwe","Kilifi","Malindi","Lamu","Voi","Mtwapa"],
    "Western":["Kakamega","Bungoma","Busia","Vihiga"],
    "Nyanza":["Kisumu","Homa Bay","Migori","Siaya","Kisii","Nyamira"],
    "North Eastern":["Garissa","Wajir","Mandera"],
}
DEFAULT_REGION_WEIGHTS = {"Nairobi":0.22,"Central":0.18,"Eastern":0.18,"Rift":0.20,"Coast":0.15,"Western":0.03,"Nyanza":0.03,"North Eastern":0.01}
DEFAULT_OCCUPATIONS = {
    "mama_mboga":0.25,"shop_owner":0.15,"boda_boda":0.15,"salon_spa":0.08,"tailor":0.07,"casual_worker":0.07,
    "farmer_smallscale":0.07,"teacher":0.05,"civil_servant":0.04,"security_guard":0.03,"student":0.02,"unemployed":0.02
}
DEFAULT_PURPOSES = {"working_capital":0.45,"stock_purchase":0.25,"school_fees":0.10,"medical":0.06,"asset_purchase":0.08,"household":0.06}

@dataclass
class GeneratorConfig:
    '''Tunable parameters for the synthetic loan-book generator.'''

    # Size and reproducibility
    n_samples: int = 5000
    random_state: int = 42

    # Borrower mix
    female_share: float = 0.62

    # Loan amount bounds and the sigma of the log-normal amount draw
    loan_min: int = 5_000
    loan_max: int = 100_000
    loan_skew_sigma: float = 0.55

    # Loan terms in weeks and the probability of each term
    term_weeks_choices: List[int] = field(
        default_factory=lambda: [4, 8, 12, 16, 20, 24, 36, 52]
    )
    term_probs: List[float] = field(
        default_factory=lambda: [0.18, 0.20, 0.18, 0.14, 0.10, 0.10, 0.06, 0.04]
    )

    # Categorical weight tables; each instance gets its own copy of the defaults
    region_weights: Dict[str, float] = field(
        default_factory=lambda: DEFAULT_REGION_WEIGHTS.copy()
    )
    occupations: Dict[str, float] = field(
        default_factory=lambda: DEFAULT_OCCUPATIONS.copy()
    )
    purposes: Dict[str, float] = field(
        default_factory=lambda: DEFAULT_PURPOSES.copy()
    )

    # Base default probability before risk adjustments
    default_rate_target: float = 0.12

    # Whether to generate borrower names
    use_names: bool = True

class SyntheticDataGenerator:
    '''Generate a synthetic Kenyan micro-loan book with a default label.

    All randomness comes from a private RandomState seeded with
    cfg.random_state, so building a generator does not reseed NumPy's global
    random state and other code using np.random cannot disturb the draws.
    The order of the draws is unchanged from the previous global-seed
    implementation, so a given seed produces the same stream.
    '''

    def __init__(self, config: Optional[GeneratorConfig]=None, **kwargs):
        self.cfg = config or GeneratorConfig(**kwargs)
        self._rng = np.random.RandomState(self.cfg.random_state)
        # Weight tables are normalised in place so they can be used as
        # probability vectors.
        for weights in (self.cfg.region_weights, self.cfg.occupations, self.cfg.purposes):
            self._norm(weights)
        self.branches = self._make_branches(70)

    def _norm(self, d):
        '''Scale the values of *d* in place so they sum to 1.'''
        s = float(sum(d.values())) or 1.0  # avoid dividing by zero on an all-zero table
        for k in list(d.keys()):
            d[k] = d[k]/s

    def _make_branches(self, n):
        '''Return *n* shuffled ``(region, town)`` pairs allocated by region weight.'''
        regs = list(self.cfg.region_weights.keys())
        weights = np.array([self.cfg.region_weights[r] for r in regs])
        weights = weights / weights.sum()
        alloc = self._rng.multinomial(n, weights)
        out = []
        for r, k in zip(regs, alloc):
            pool = REGIONS[r]
            # Towns are reused cyclically when a region is allocated more
            # branches than it has towns.
            out.extend((r, pool[i % len(pool)]) for i in range(k))
        self._rng.shuffle(out)
        return out

    def to_dataframe(self):
        '''Draw cfg.n_samples synthetic loans and return them as a DataFrame.

        Each call continues the generator's random stream, so repeated calls
        on one instance return different rows.
        '''
        rng = self._rng
        n = self.cfg.n_samples
        idx = rng.choice(len(self.branches), size=n)
        region = [self.branches[i][0] for i in idx]
        branch = [self.branches[i][1] for i in idx]

        is_f = rng.rand(n) < self.cfg.female_share
        if self.cfg.use_names:
            # Drawn one row at a time to keep the random stream identical to
            # earlier versions.
            first = [rng.choice(FIRST_NAMES_FEMALE if f else FIRST_NAMES_MALE) for f in is_f]
            last = [rng.choice(KENYAN_SURNAMES) for _ in range(n)]
        else:
            first = ["" for _ in range(n)]
            last = ["" for _ in range(n)]
        # strip() leaves real names untouched and yields "" (not " ") when
        # names are disabled.
        name = [f"{a} {b}".strip() for a, b in zip(first, last)]
        gender = np.where(is_f, "female", "male")

        age = np.clip(rng.normal(34,9,n),21,65).astype(int)
        dependents = rng.choice([0,1,2,3,4,5,6], size=n, p=[0.20,0.27,0.24,0.16,0.08,0.04,0.01])
        education = rng.choice(["none","primary","secondary","college","university"], size=n, p=[0.05,0.30,0.38,0.20,0.07])
        occ_keys = list(self.cfg.occupations.keys())
        occ_probs = [self.cfg.occupations[k] for k in occ_keys]
        occupation = rng.choice(occ_keys, size=n, p=occ_probs)
        group_membership = rng.rand(n) < 0.55
        guarantor = rng.rand(n) < 0.62

        # Income: log-normal base scaled by an occupation multiplier.
        base_income = rng.lognormal(mean=10.4, sigma=0.45, size=n)
        adj = {"mama_mboga":0.9,"shop_owner":1.15,"boda_boda":0.95,"salon_spa":0.95,"tailor":0.9,"casual_worker":0.75,
               "farmer_smallscale":0.85,"teacher":1.2,"civil_servant":1.35,"security_guard":0.85,"student":0.6,"unemployed":0.5}
        mult = np.array([adj.get(o,1.0) for o in occupation])
        income = (base_income * mult).clip(8000,180000).astype(int)

        mobile_money_txns = np.clip(rng.normal(55,25,n) * (income/35000) ** 0.2, 2, 250).astype(int)
        past_loans = rng.choice([0,1,2,3,4,5], size=n, p=[0.28,0.26,0.20,0.14,0.08,0.04])
        past_defaults = np.clip((rng.rand(n)<0.18).astype(int) + (past_loans>3).astype(int)*(rng.rand(n)<0.25), 0, 3)

        raw = rng.lognormal(mean=np.log(20000), sigma=self.cfg.loan_skew_sigma, size=n)
        loan_amount = np.clip(raw, self.cfg.loan_min, self.cfg.loan_max).astype(int)
        term = rng.choice(self.cfg.term_weeks_choices, size=n, p=self.cfg.term_probs)
        product = rng.choice(["Inuka 4 Weeks","Kuza 4 Weeks","Fadhili 4 Weeks","Biashara 12 Weeks","Jijenge 24 Weeks"], size=n, p=[0.28,0.26,0.18,0.18,0.10])
        purpose_keys = list(self.cfg.purposes.keys())
        purpose_probs = [self.cfg.purposes[k] for k in purpose_keys]
        loan_purpose = rng.choice(purpose_keys, size=n, p=purpose_probs)
        loan_type = rng.choice(["Normal","SME","Emergency"], size=n, p=[0.82,0.12,0.06])
        status = rng.choice(["Active","Pending Branch Approval","Rejected"], size=n, p=[0.78,0.17,0.05])
        created_date = pd.to_datetime("today").normalize()

        # default label: start from the target rate, add risk factors,
        # subtract protective factors, add noise, then clip to [0.01, 0.65].
        p = np.full(n, self.cfg.default_rate_target, float)
        p += 0.12 * (loan_amount/100000) ** 0.8
        p += 0.06 * (term/52) ** 0.9
        p += 0.10 * (past_defaults>0)
        p += 0.06 * (dependents>=3)
        p += 0.04 * (occupation=="unemployed")
        p += 0.03 * (occupation=="casual_worker")
        p += 0.02 * (occupation=="boda_boda")
        p -= 0.08 * (income>=60000)
        p -= 0.05 * guarantor
        p -= 0.04 * group_membership
        p -= 0.03 * (mobile_money_txns>=80)
        p = np.clip(p + rng.normal(0,0.015,n), 0.01, 0.65)
        default_flag = (rng.rand(n) < p).astype(int)
        # loan_health restates default_flag, so it is a label proxy rather
        # than an independent feature.
        loan_health = np.where(default_flag==1, "Defaulted", "Performing")

        # Drawn last, matching its position in the original random stream.
        marital_status = rng.choice(["single","married","divorced","widowed"], size=n, p=[0.46,0.44,0.06,0.04])

        return pd.DataFrame({
            "customer_id": np.arange(1, n+1),
            "name": name, "gender": gender, "age": age,
            "marital_status": marital_status,
            "education_level": education, "occupation": occupation,
            "household_income": income, "dependents": dependents,
            "group_membership": np.where(group_membership,"yes","no"),
            "guarantor": np.where(guarantor,"yes","no"),
            "mobile_money_txns": mobile_money_txns,
            "past_loans": past_loans, "past_defaults": past_defaults,
            "region": region, "branch": branch, "product": product,
            "loan_purpose": loan_purpose, "loan_amount": loan_amount,
            "loan_term_weeks": term, "loan_type": loan_type,
            "status": status, "loan_health": loan_health,
            "created_date": created_date, "default_flag": default_flag
        })
""")

print("✅ Cell 2 done — core modules written.")

In [None]:
# =========================
# LoanIQ — Cell 3/4
# Streamlit pages
# =========================
import pathlib, textwrap
ROOT = pathlib.Path(".")

def w(path, content):
    """Create *path* (and any missing parents) holding the normalised text.

    Normalisation removes the common indentation and any leading
    whitespace; the file is encoded as UTF-8.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    normalised = textwrap.dedent(content).lstrip()
    path.write_text(normalised, encoding="utf-8")

# --- pages/login.py ---
w(ROOT/"pages/login.py", """
import streamlit as st
from auth import AuthenticationManager
from database.db_manager import DatabaseManager

def show_login_page():
    '''Render the Login and Register tabs and route users after sign-in.

    A successful login stores the user record and role in st.session_state,
    then switches to the admin sandbox for admins or the client dashboard
    for everyone else.
    '''
    st.title("🔐 LoanIQ Login")

    db = DatabaseManager()
    auth = AuthenticationManager(db)

    tabs = st.tabs(["Login","Register"])

    with tabs[0]:
        email = st.text_input("Email / Username")
        pw = st.text_input("Password", type="password")
        if st.button("Login"):
            u = auth.login_user(email, pw)
            if u:
                st.session_state["user"]=u
                st.session_state["role"]=u["role"]
                if u["role"]=="admin":
                    st.switch_page("pages/admin_sandbox.py")
                else:
                    st.switch_page("pages/client_dashboard.py")
            else:
                st.error("Invalid credentials")

    with tabs[1]:
        email = st.text_input("Email", key="reg_email")
        pw = st.text_input("Password", type="password", key="reg_pw")
        if st.button("Register"):
            # Reject blank input so no account with an empty email or
            # password can be created.
            if not email.strip() or not pw:
                st.error("Email and password are required.")
            elif auth.register_user(email, pw, role="client"):
                st.success("Registered. Redirecting…")
                u = auth.login_user(email, pw)
                if u:
                    st.session_state["user"]=u
                    st.session_state["role"]=u["role"]
                    st.switch_page("pages/client_dashboard.py")
                else:
                    # Registration succeeded but the follow-up login did not;
                    # send the user to the Login tab instead of crashing.
                    st.error("Registered, but automatic login failed. Please log in.")
            else:
                st.error("Email already exists.")

show_login_page()
""")

# --- pages/client_dashboard.py ---
w(ROOT/"pages/client_dashboard.py", """
import streamlit as st
import pandas as pd
from models.model_versions import ModelVersionManager
from database.db_manager import DatabaseManager
from models.synthetic_data import SyntheticDataGenerator, GeneratorConfig

def guard():
    '''Send visitors who are not signed in as a client to the right page.'''
    if not st.session_state.get("user"):
        st.switch_page("pages/login.py")
    if st.session_state.get("role") != "client":
        st.switch_page("pages/admin_sandbox.py")

guard()
st.title("🏠 Client Dashboard")

db = DatabaseManager()
vm = ModelVersionManager(db)

st.markdown("Upload CSV **or** generate a sample dataset and view predictions.")

uploaded = st.file_uploader("Upload CSV", type=["csv"])
gen_cols = st.columns(3)
with gen_cols[0]:
    n_rows = st.number_input("Rows", 200, 50_000, 1000, 100)
with gen_cols[1]:
    female_share = st.slider("Female share", 0.3, 0.85, 0.62, 0.01)
with gen_cols[2]:
    default_rate = st.slider("Default rate target", 0.03, 0.30, 0.12, 0.01)

# Parse an upload only when it changes. The uploader keeps returning the same
# file on every rerun; re-reading it each time would be wasted work and would
# overwrite a dataset generated afterwards.
if uploaded is None:
    st.session_state.pop("client_upload_key", None)
else:
    upload_key = (uploaded.name, uploaded.size)
    if st.session_state.get("client_upload_key") != upload_key:
        try:
            st.session_state["client_df"] = pd.read_csv(uploaded)
            st.session_state["client_upload_key"] = upload_key
        except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
            st.error(f"Could not read the uploaded CSV: {exc}")

if st.button("Generate Sample Data"):
    cfg = GeneratorConfig(n_samples=int(n_rows), female_share=float(female_share), default_rate_target=float(default_rate))
    st.session_state["client_df"] = SyntheticDataGenerator(cfg).to_dataframe()

df = st.session_state.get("client_df")
if df is None:
    st.info("Upload a CSV or click **Generate Sample Data**.")
    st.stop()

st.subheader("📄 Data preview")
st.dataframe(df.head(20), use_container_width=True)

active = vm.get_latest_version()
if not active:
    st.warning("No active model deployed yet. Ask Admin to train & deploy a model.")
    st.stop()

# Scoring can fail for reasons outside this page's control (missing artifact,
# uploaded file lacking columns the preprocessor was fitted on), so report the
# problem instead of showing a stack trace.
try:
    art = vm.load_model(active["id"])
    features = df.drop(columns=["default_flag"], errors="ignore")
    preds = art["model"].predict_proba(art["preprocessor"].transform(features))[:,1]
except Exception as exc:
    st.error(f"Could not score this dataset with model {active['id']}: {exc}")
    st.stop()

out = df.copy()
out["default_risk"] = preds
st.subheader("🔮 Predictions")
st.dataframe(out.head(50), use_container_width=True)
st.divider()
st.subheader("📊 Insights")
tab1, tab2, tab3 = st.tabs(["Distribution","Loan vs Risk","Demographics"])
with tab1:
    # Histogram of predicted risk in 0.05-wide bins (label = bin lower edge)
    # rather than one bar per row.
    bin_start = (out["default_risk"].clip(0, 0.9999) * 20).astype(int) / 20
    st.bar_chart(bin_start.value_counts().sort_index())
with tab2:
    if "loan_amount" in out.columns:
        st.scatter_chart(out[["loan_amount","default_risk"]])
with tab3:
    if "age" in out.columns:
        st.metric("Average Age", f"{out['age'].mean():.1f} years")
    if "household_income" in out.columns:
        st.metric("Average Income", f"KES {out['household_income'].mean():,.0f}")

# Export
st.divider(); st.subheader("📤 Export")
csv = out.to_csv(index=False).encode("utf-8")
st.download_button("⬇️ Download CSV", data=csv, file_name="predictions.csv", mime="text/csv")

# PDF: title plus a table of the first 30 scored rows.
import io
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, Spacer
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
buf = io.BytesIO()
doc = SimpleDocTemplate(buf, pagesize=A4)
styles = getSampleStyleSheet()
elems = [Paragraph("LoanIQ Risk Report", styles["Heading1"]), Spacer(1,12)]
data = [list(out.columns)] + out.head(30).values.tolist()
elems.append(Table(data))
doc.build(elems)
st.download_button("⬇️ Download PDF", data=buf.getvalue(), file_name="predictions.pdf", mime="application/pdf")
""")

# --- pages/admin_sandbox.py ---
w(ROOT/"pages/admin_sandbox.py", """
import streamlit as st
import pandas as pd
import numpy as np
from database.db_manager import DatabaseManager
from models.model_versions import ModelVersionManager
from models.ml_pipeline import MLPipeline
from models.synthetic_data import SyntheticDataGenerator, GeneratorConfig

def guard():
    '''Send visitors who are not signed in as an admin to the right page.'''
    if not st.session_state.get("user"):
        st.switch_page("pages/login.py")
    if st.session_state.get("role") != "admin":
        st.switch_page("pages/client_dashboard.py")
guard()

st.title("🛠️ LoanIQ Admin Sandbox")
st.info(f"Welcome, {st.session_state['user']['email']} (Admin)")

db = DatabaseManager()
vm = ModelVersionManager(db)

tabs = st.tabs(["Synthetic Generator","Train/Deploy","Versions","DB Tools"])

# --- Synthetic Generator ---
with tabs[0]:
    st.markdown("### 🧪 Controls")
    c1,c2,c3,c4 = st.columns(4)
    with c1: n = st.number_input("Rows", 500, 200_000, 5000, 500)
    with c2: female = st.slider("Female share", 0.30, 0.85, 0.62, 0.01)
    with c3: dflt = st.slider("Default rate", 0.03, 0.30, 0.12, 0.01)
    with c4: seed = st.number_input("Seed", 0, 9999, 42, 1)
    if st.button("Generate Data"):
        cfg = GeneratorConfig(n_samples=int(n), female_share=float(female), default_rate_target=float(dflt), random_state=int(seed))
        df = SyntheticDataGenerator(cfg).to_dataframe()
        st.session_state["synthetic_df"] = df
        st.success(f"Generated {len(df):,} rows. Default rate {df['default_flag'].mean():.2%}")
        st.dataframe(df.head(25), use_container_width=True)

# --- Train/Deploy ---
with tabs[1]:
    st.markdown("### 🤖 Train & Evaluate")
    fam = st.selectbox("Model family", ["LogReg","RandomForest","GradientBoosting","XGBoost","LightGBM","Hybrid"])
    use_session = st.checkbox("Use generated dataset from tab 1 (if available)", True)
    if st.button("Train Model"):
        if use_session and "synthetic_df" in st.session_state:
            df = st.session_state["synthetic_df"]
        else:
            df = SyntheticDataGenerator().to_dataframe()
        pipe = MLPipeline(fam)
        auc, metrics, y_true, y_pred = pipe.train(df)
        st.session_state["trained_model"] = pipe
        # Keep the evaluation metrics with the model so Deploy records the
        # real held-out score.
        st.session_state["trained_metrics"] = metrics
        st.session_state["X_test"] = None
        st.session_state["y_test"] = y_true
        st.session_state["y_pred"] = y_pred
        st.success(f"Trained {fam} — AUC: {auc:.3f}")

        # Performance charts
        from sklearn.metrics import roc_curve, auc as _auc
        import matplotlib.pyplot as plt
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = _auc(fpr, tpr)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label=f"AUC={roc_auc:.2f}")
        ax.plot([0,1],[0,1],"k--")
        ax.set_xlabel("FPR"); ax.set_ylabel("TPR"); ax.set_title("ROC Curve"); ax.legend(loc="lower right")
        st.pyplot(fig)
        # Release the figure; pyplot otherwise keeps every figure alive
        # across reruns.
        plt.close(fig)

    if st.button("Deploy Active (save version)"):
        if "trained_model" not in st.session_state:
            st.warning("Train a model first.")
        else:
            pipe = st.session_state["trained_model"]
            metrics = st.session_state.get("trained_metrics")
            if metrics is None:
                # Fall back to recomputing the AUC from the stored test
                # labels and predictions.
                from sklearn.metrics import roc_auc_score
                metrics = {"AUC": float(roc_auc_score(st.session_state["y_test"], st.session_state["y_pred"]))}
            # Record the family of the model that was actually trained; the
            # selectbox may have changed since training.
            vid = vm.save_model(pipe.model_family, pipe.model, pipe.preprocessor_, metrics)
            st.success(f"Saved & activated version {vid}")

# --- Versions ---
with tabs[2]:
    st.markdown("### 📦 Versions")
    versions = vm.list_versions()
    if not versions:
        st.info("No versions yet.")
    else:
        st.dataframe(pd.DataFrame(versions), use_container_width=True)
        choices = [v["id"] for v in versions]
        sel = st.selectbox("Activate version", choices)
        if st.button("Activate"):
            vm.set_active_version(sel)
            st.success(f"Activated {sel}")

# --- DB Tools ---
with tabs[3]:
    st.markdown("### 🧱 Schema & SQL")
    st.json(db.get_schema())
    sql = st.text_area("Run SQL (dangerous!)", height=120, placeholder="SELECT * FROM users;")
    if st.button("Execute SQL"):
        if not sql.strip():
            st.warning("Enter a SQL statement first.")
        else:
            try:
                result = db.execute_raw(sql)
                st.success("Executed.")
                # Show rows when the database layer returns them.
                if result is not None:
                    st.dataframe(pd.DataFrame(result), use_container_width=True)
            except Exception as e:
                st.error(str(e))
""")

print("✅ Cell 3 done — Streamlit pages written.")

In [None]:
# =========================
# LoanIQ — Cell 4/4
# Helpers & finishing touches
# =========================
import pathlib, textwrap, pandas as pd
ROOT = pathlib.Path(".")

def w(path, content):
    """Write a dedented, left-stripped copy of *content* to *path* (UTF-8).

    Parent directories are created first if they do not exist.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    dedented = textwrap.dedent(content)
    path.write_text(dedented.lstrip(), encoding="utf-8")

# .gitignore to keep repo clean
# Excludes bytecode, the SQLite database file, saved model artifacts, backups
# and local environment/Streamlit settings from version control.
w(ROOT/".gitignore", """
__pycache__/
*.pyc
loaniq.db
artifacts/
backups/
.env
.streamlit/
""")

# Optional: create a tiny sample CSV for quick testing
# This imports models/synthetic_data.py, which Cell 2 writes, so Cell 2 must
# have run first. The first 200 rows of a default-config dataset are saved.
from models.synthetic_data import SyntheticDataGenerator
sample = SyntheticDataGenerator().to_dataframe().head(200)
sample.to_csv("sample_loans.csv", index=False)

print("✅ Cell 4 done — repo tidy, sample data saved.")
print("All files written. Commit to GitHub or push to Replit and run: streamlit run app.py")