In [1]:
!pip install -q numpy==1.26.4 pandas scikit-learn tqdm \
sentence-transformers transformers langchain chromadb matplotlib seaborn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.7/20.7 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m6.3 MB/s[0m eta [36

In [1]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [2]:
np.random.seed(42)

OUT_DIR = "/mnt/data/risk_prototype_full"
os.makedirs(OUT_DIR, exist_ok=True)

# Borrowers
n_borrowers = 50
borrowers = pd.DataFrame({
    "borrower_id": np.arange(1, n_borrowers+1),
    "name": [f"Borrower_{i}" for i in range(1, n_borrowers+1)],
})
borrowers.to_csv(os.path.join(OUT_DIR, "borrowers.csv"), index=False)

In [4]:
n_events = 2000
events = pd.DataFrame({
    "event_id": np.arange(1, n_events+1),
    "borrower_id": np.random.choice(borrowers["borrower_id"], n_events),
    "amount": np.random.uniform(50, 1000, n_events),
    "ts": pd.date_range("2025-01-01", periods=n_events, freq="T"),
    # label for testing
    "label": np.random.choice([0,1], n_events, p=[0.85,0.15])
})
events.to_csv(os.path.join(OUT_DIR, "events.csv"), index=False)

print("borrowers.csv and events.csv gen")

borrowers.csv and events.csv gen


  "ts": pd.date_range("2025-01-01", periods=n_events, freq="T"),


In [5]:
borrowers = pd.read_csv(os.path.join(OUT_DIR, "borrowers.csv"))


np.random.seed(42)
fs = pd.DataFrame({
    "borrower_id": borrowers["borrower_id"],
    "count": np.random.randint(20, 50, len(borrowers)),
    "sum_amount": np.random.uniform(1000, 5000, len(borrowers)),
})
fs["avg_amount"] = fs["sum_amount"] / fs["count"]
fs["max_amount"] = fs["avg_amount"] + np.random.uniform(10, 50, len(borrowers))
fs["min_amount"] = fs["avg_amount"] - np.random.uniform(5, 30, len(borrowers))
fs["prior_open_default"] = np.random.choice([0,1], len(borrowers), p=[0.85,0.15])
fs.set_index("borrower_id", inplace=True)
fs.to_csv(os.path.join(OUT_DIR, "feature_store.csv"))

In [6]:
scaler = StandardScaler()
feature_cols = ["avg_amount","max_amount","min_amount","count","prior_open_default"]
scaler.fit(fs[feature_cols].values)
with open(os.path.join(OUT_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)


X = scaler.transform(fs[feature_cols].values)
y = np.random.choice([0,1], len(fs), p=[0.85,0.15])
model = LogisticRegression()
model.fit(X, y)
with open(os.path.join(OUT_DIR, "model.pkl"), "wb") as f:
    pickle.dump(model, f)

print("Feature store, scaler, and model saved")

Feature store, scaler, and model saved


In [7]:
fs_path = os.path.join(OUT_DIR, "feature_store.csv")
scaler_path = os.path.join(OUT_DIR, "scaler.pkl")
model_path = os.path.join(OUT_DIR, "model.pkl")
events_path = os.path.join(OUT_DIR, "events.csv")
decision_log_path = os.path.join(OUT_DIR, "decision_log.csv")


fs = pd.read_csv(fs_path).set_index("borrower_id")
with open(scaler_path, "rb") as f: scaler = pickle.load(f)
with open(model_path, "rb") as f: model = pickle.load(f)
events = pd.read_csv(events_path).head(500)

# Policy docs
policy_docs = [
    {"id":"p1","text":"Prior default + high risk → HUMAN_REVIEW"},
    {"id":"p2","text":"Risk>0.8 → FREEZE"},
    {"id":"p3","text":"Amount>500 → VERIFY"},
    {"id":"p4","text":"Risk<0.3 → ALLOW"}
]


In [8]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = embedder.encode([d["text"] for d in policy_docs], normalize_embeddings=True)

# Local agent LLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model_lm = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
llm_pipe = pipeline("text2text-generation", model=model_lm, tokenizer=tokenizer, device=-1, max_length=128, do_sample=False)

# Retrieval helper
def retrieve_policy(risk_score, borrower_id):
    query = f"risk_score={risk_score:.3f} borrower={borrower_id}"
    q_emb = embedder.encode([query], normalize_embeddings=True)
    sims = np.dot(q_emb, doc_embeddings.T)[0]
    top_idx = sims.argmax()
    return policy_docs[top_idx]["text"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [9]:
decision_log = []

for _, row in tqdm(events.iterrows(), total=len(events)):
    bid = row.borrower_id
    fs_row = fs.loc[bid]
    x_vec = np.array([[fs_row["avg_amount"], fs_row["max_amount"], fs_row["min_amount"],
                       fs_row["count"], fs_row["prior_open_default"]]])
    risk_score = model.predict_proba(scaler.transform(x_vec))[0,1]

    retrieved_doc = retrieve_policy(risk_score, bid)

    prompt = f"""
You are a risk agent. Based on the information, recommend one ACTION from {{FREEZE, VERIFY, ALLOW, HUMAN_REVIEW}}.
Borrower features: {fs_row.to_dict()}
Predicted risk_score: {risk_score:.3f}
Relevant policy document: {retrieved_doc}
Provide ACTION and a one-line REASON grounded in the features or policy.
Format: ACTION: <action>; REASON: <one-line reason>
"""
    output = llm_pipe(prompt)[0]["generated_text"]

    action = "HUMAN_REVIEW"
    if "FREEZE" in output.upper(): action="FREEZE"
    elif "VERIFY" in output.upper(): action="VERIFY"
    elif "ALLOW" in output.upper(): action="ALLOW"
    elif "HUMAN_REVIEW" in output.upper(): action="HUMAN_REVIEW"

    decision_log.append({
        "event_id": row.event_id,
        "borrower_id": bid,
        "amount": row.amount,
        "risk_score": risk_score,
        "retrieved_doc": retrieved_doc,
        "action": action,
        "reason": output
    })

# Save decision log
pd.DataFrame(decision_log).to_csv(decision_log_path, index=False)
print("✅ Step 3 completed: decision log saved")


100%|██████████| 500/500 [35:18<00:00,  4.24s/it]

✅ Step 3 completed: decision log saved





In [10]:
 decision_log

[{'event_id': 1,
  'borrower_id': 39,
  'amount': 168.00896839165637,
  'risk_score': 0.08288878427458314,
  'retrieved_doc': 'Risk<0.3 → ALLOW',
  'action': 'FREEZE',
  'reason': 'None of the above choices FREEZE, VERIFY, ALLOW, HUMAN_REVIEW'},
 {'event_id': 2,
  'borrower_id': 29,
  'amount': 901.4102375507864,
  'risk_score': 0.07791374846605766,
  'retrieved_doc': 'Risk<0.3 → ALLOW',
  'action': 'FREEZE',
  'reason': 'None of the above choices FREEZE, VERIFY, ALLOW, HUMAN_REVIEW'},
 {'event_id': 3,
  'borrower_id': 15,
  'amount': 615.0378055080893,
  'risk_score': 0.16315945151743974,
  'retrieved_doc': 'Risk<0.3 → ALLOW',
  'action': 'FREEZE',
  'reason': 'None of the above choices FREEZE, VERIFY, ALLOW, HUMAN_REVIEW'},
 {'event_id': 4,
  'borrower_id': 43,
  'amount': 635.7743527693503,
  'risk_score': 0.07371093251209905,
  'retrieved_doc': 'Risk<0.3 → ALLOW',
  'action': 'HUMAN_REVIEW',
  'reason': 'None of the above choices                                                     