In [1]:
"""
PRA Risk Summaries & Evasiveness Detector (2023–2025, JPM & HSBC)
Run as a script or copy cells into Jupyter. Designed for Apple Silicon (M3) with MPS.
"""

import os, platform, sys, re, json, warnings
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
# NOTE: this silences *every* warning category for the rest of the session,
# including deprecation and numerical warnings raised by later cells.
warnings.filterwarnings("ignore")

# Record the interpreter, OS and working directory alongside the results.
env_report = {
    "Python:": sys.version,
    "Platform:": platform.platform(),
    "CWD:": os.getcwd(),
}
for heading, detail in env_report.items():
    print(heading, detail)

Python: 3.12.6 (main, Sep  6 2024, 19:03:47) [Clang 15.0.0 (clang-1500.3.9.4)]
Platform: macOS-15.5-arm64-arm-64bit
CWD: /Users/jerome.ahye/Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight/notebooks


In [3]:
# Accelerator selection: prefer Apple Metal (MPS), then CUDA, then plain CPU.
# Any failure (including torch not being importable) drops to the string "cpu".
try:
    import torch

    mps_backend = torch.backends.mps
    # MPS must be both usable on this machine and compiled into this torch build.
    mps_ok = mps_backend.is_available() and mps_backend.is_built()
    if mps_ok:
        backend_name = "mps"
    elif torch.cuda.is_available():
        backend_name = "cuda"
    else:
        backend_name = "cpu"
    device = torch.device(backend_name)

    print(f"Torch version: {torch.__version__}")
    print("MPS built:", mps_backend.is_built())
    print("MPS available (usable):", mps_backend.is_available())
    print("Using device:", device)
except Exception as err:
    print("Torch not available, falling back to CPU-only:", err)
    # Note: on this path `device` is a plain str, not a torch.device.
    device = "cpu"

Torch version: 2.8.0
MPS built: True
MPS available (usable): True
Using device: mps


In [None]:
# Third-party imports that are not part of a bare Python install.
# They are grouped in one guarded block so that a missing package produces a
# single install hint before the original exception is re-raised.
try:
    import matplotlib.pyplot as plt
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    from sentence_transformers import SentenceTransformer, util
    import textstat
except Exception as e:
    # Report which import failed, suggest the install command, then re-raise so
    # the cell stops here instead of failing later with a NameError.
    print("Missing libs. Install first:", e)
    print("Try: pip install pandas numpy matplotlib transformers sentence-transformers textstat")
    raise


## Config

In [None]:
# ---------- Config ----------
# Input locations (relative paths, so they resolve against the current working directory).
DATA_DIR = Path("../data/processed")
JPM_PATH = DATA_DIR.joinpath("jpm", "all_jpm_2023_2025.csv")
HSBC_PATH = DATA_DIR.joinpath("hsbc", "all_hsbc_2023_2025.csv")

# The PRA categories file may be saved under either of these names; both are listed.
PRA_PATHS = [
    DATA_DIR / filename
    for filename in ("PRA Risk Categories.csv", "PRA Risk Categories - Sheet1.csv")
]

# Model identifiers. Swap the summarizer for higher quality if you have the time/VRAM.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
SUMM_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"  # alternative: "facebook/bart-large-cnn"
USE_NLI = False
NLI_MODEL_NAME = "typeform/distilroberta-base-uncased-mnli"

# Summary length settings.
SUMMARY_MAX_TOKENS = 200  # sized for distilbart cnn
SUMMARY_TARGET_WORDS = 120

# Evasion thresholds -- tune these to taste.
SIMILARITY_LOW = 0.38  # cosine similarity under this suggests low alignment to the question
HEDGE_MIN_COUNT = 2  # fewest hedge/deflection cues that should matter
VERBOSITY_RATIO_HIGH = 6.0  # answer-to-question character ratio
READABILITY_SIMPLE = 8.0  # Flesch-Kincaid grade; higher can indicate complexity
EVASION_SCORE_FLAG = 0.65  # composite score threshold used to flag

## Text preprocessing

In [None]:
def normalize_text(s: str) -> str:
    """Collapse every run of whitespace to a single space and trim both ends.

    Args:
        s: Text to clean. ``None``, the empty string and float NaN are all
            treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # pandas represents empty CSV cells as float NaN, which is truthy, so the
    # `s or ""` guard alone would hand it to re.sub and raise TypeError.
    # NaN is the only float that is not equal to itself.
    if isinstance(s, float) and s != s:
        return ""
    return re.sub(r"\s+", " ", (s or "")).strip()

# Hedging / deflection cues. Entries are lowercase and use straight apostrophes;
# count_hedges() lowercases the text and normalises apostrophes before matching.
HEDGE_PHRASES = [
    "i think","we think","i believe","we believe","we feel","i feel",
    "sort of","kind of","a bit","a little","roughly","approximately",
    "around","more or less","to some extent","somewhat",
    "we don't break out","we do not break out","we don't disclose","we do not disclose",
    "we won't comment","we will not comment","not going to comment",
    "too early to say","too soon to say","too soon to tell",
    "we'll have to see","we will have to see",
    "we'll come back","we will come back",
    "as we've said before","as we said before",
    "as previously mentioned","as mentioned",
    "let me step back","take a step back",
    "the way i would frame","i would frame it",
    "i'm not sure","we're not sure",
    "it's complicated","it's complex",
    "moving parts",
    "as you know","as you can appreciate",
    "that's a great question","good question",
    "let me answer a different","let me start somewhere else",
]

# Compiled pattern per phrase, filled lazily so edits to HEDGE_PHRASES are picked up.
_HEDGE_PATTERN_CACHE = {}


def _hedge_pattern(phrase: str):
    """Return a compiled regex matching `phrase` as whole words."""
    pattern = _HEDGE_PATTERN_CACHE.get(phrase)
    if pattern is None:
        # Words may be separated by any whitespace (double spaces, line breaks).
        body = r"\s+".join(re.escape(word) for word in phrase.split())
        # Look-arounds instead of literal spaces: the phrase must not touch another
        # word character, but may be followed by punctuation ("i think," / "good question.").
        pattern = re.compile(r"(?<!\w)" + body + r"(?!\w)")
        _HEDGE_PATTERN_CACHE[phrase] = pattern
    return pattern


def count_hedges(text: str) -> int:
    """Count how many distinct hedge/deflection cues appear in `text`.

    Each entry of HEDGE_PHRASES contributes at most 1, however many times it
    occurs. Matching is case-insensitive and on whole words, so "around" does
    not fire inside "turnaround", while cues next to punctuation still count.

    Args:
        text: Answer text to scan. ``None``, non-string values and the empty
            string yield 0.

    Returns:
        Number of distinct cues found.
    """
    if not isinstance(text, str) or not text:
        return 0
    # Transcripts often use the typographic apostrophe; the phrase list uses "'".
    haystack = text.lower().replace("\u2019", "'")
    return sum(1 for phrase in HEDGE_PHRASES if _hedge_pattern(phrase).search(haystack))

def fk_grade(text: str) -> float:
    """Return textstat's Flesch-Kincaid grade for `text`.

    Best-effort: if the textstat call raises any Exception, NaN is returned
    instead of propagating the error.
    """
    try:
        grade = textstat.flesch_kincaid_grade(text)
    except Exception:
        grade = np.nan
    return grade