# **Summarisation & Evasion Notebook**

# **1. Objective**

- Summarise banker answers into short, PRA relevant insights.
- Generate an evasion score to tag summaries.
- RAG pipeline to bring in relevant external documents (e.g. PRA risk definitions, regulatory news).
- Optional extension: validate flagged risks with external/regulatory news.

# **2. Set up Workspace**

In [1]:
# Import libraries
# Core python
import os
import numpy as np
import pandas as pd
import re
import json
import pathlib
from pathlib import Path
from typing import List, Dict, Any 
import csv

# NLP & Summarisation
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import spacy
from llama_cpp import Llama 

# Evaluation
from rouge_score import rouge_scorer
import evaluate
from bert_score import score as bertscore 

# Retrieval
from sentence_transformers import SentenceTransformer 
import faiss
import chromadb
import langchain
import llama_index

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Visualisations
import matplotlib.pyplot as plt
import seaborn as sns 

  from .autonotebook import tqdm as notebook_tqdm


# **3. Load the dataset**

In [2]:
# Load the dataset.
jpm_2025_df = pd.read_csv('../data/processed/jpm/all_jpm_2025.csv')

# View the data.
jpm_2025_df.head()

Unnamed: 0,question_number,answer_number,speaker_name,role,company,content,year,quarter,source_pdf
0,1,,Ken Usdin,analyst,Autonomous Research,"Good morning, Jeremy. Wondering if you could s...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...
1,1,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Sure, Ken. So I mean, at a high level, I would...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...
2,2,,Ken Usdin,analyst,Autonomous Research,Yeah. And just one question on the NII ex. Mar...,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...
3,2,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Yeah, that's a good question, Ken. You're righ...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...
4,2,2.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorganChase,In the curve basically.,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...


# **4. Preprocessing**

In [3]:
# View speaker roles.
jpm_2025_df['role'].unique()

array(['analyst', 'Chief Financial Officer',
       'Chairman & Chief Executive Officer',
       'And then some. Theres a lot of value added.', 'Okay'],
      dtype=object)

In [4]:
# View rows with invalid roles.
valid_roles = 'analyst', 'Chief Financial Officer', 'Chairman & Chief Executive Officer'
invalid_roles_df = jpm_2025_df[~jpm_2025_df['role'].isin(valid_roles)]

# Number of rows with invalid roles.
print('Number of rows:', invalid_roles_df.shape[0])

# View the rows.
invalid_roles_df.head()

Number of rows: 2


Unnamed: 0,question_number,answer_number,speaker_name,role,company,content,year,quarter,source_pdf
201,35,5.0,"Chief Financial Officer, JPMorganChase",And then some. Theres a lot of value added.,JPMorganChase,"Yeah. And obviously, I mean, we're not going t...",2025,Q2,data/raw/jpm/jpm-2q25-earnings-call-transcript...
205,36,3.0,"Chief Financial Officer, JPMorganChase",Okay,there you have it.,"But it's not like I thought it would do badly,...",2025,Q2,data/raw/jpm/jpm-2q25-earnings-call-transcript...


In [5]:
# Input the correct role information.
jpm_2025_df.at[205, 'role'] = 'Chief Financial Officer'
jpm_2025_df.at[209, 'role'] = 'Chief Financial Officer'

# Verify the roles have been updated.
jpm_2025_df['role'].unique()

array(['analyst', 'Chief Financial Officer',
       'Chairman & Chief Executive Officer',
       'And then some. Theres a lot of value added.'], dtype=object)

In [6]:
# Define role mapping.
role_map = {
    'analyst': 'analyst',
    'Chief Financial Officer': 'banker',
    'Chairman & Chief Executive Officer': 'banker'
}

# Apply to dataset.
jpm_2025_df['role_normalised'] = jpm_2025_df['role'].map(role_map)

In [7]:
# View the dataset.
jpm_2025_df.head()

Unnamed: 0,question_number,answer_number,speaker_name,role,company,content,year,quarter,source_pdf,role_normalised
0,1,,Ken Usdin,analyst,Autonomous Research,"Good morning, Jeremy. Wondering if you could s...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,analyst
1,1,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Sure, Ken. So I mean, at a high level, I would...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker
2,2,,Ken Usdin,analyst,Autonomous Research,Yeah. And just one question on the NII ex. Mar...,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,analyst
3,2,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Yeah, that's a good question, Ken. You're righ...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker
4,2,2.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorganChase,In the curve basically.,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker


# **5. Summarisation**

## **5.1 Baseline**

### **5.1.1 BART**

In [8]:
# Filter data to banker answers only.
banker_answers = jpm_2025_df[jpm_2025_df['role_normalised'] == 'banker']['content'].tolist()
print(banker_answers[0][:200])

Sure, Ken. So I mean, at a high level, I would say that obviously, some of the salient news flow is quite recent. So, we've done some soundings and some checking both on the consumer side and on the w


In [9]:
# Summarisation baseline (BART)
summariser = pipeline('summarization', model='facebook/bart-large-cnn')

sample_text = banker_answers[0]
summary = summariser(sample_text, max_length=80, min_length=30, do_sample=False)
print('Original:', sample_text[:400])
print('Summary:', summary[0]['summary_text'])

Device set to use mps:0


Original: Sure, Ken. So I mean, at a high level, I would say that obviously, some of the salient news flow is quite recent. So, we've done some soundings and some checking both on the consumer side and on the wholesale side. I think on the consumer side, the thing to check is the spending data. And to be honest, the main thing that we see there, what would appear to be a certain amount of frontloading of sp
Summary: The main thing that we see there, what would appear to be a certain amount of frontloading of spending ahead of people expecting price increases from tariffs. So ironically, that's actually somewhat supportive, all else equal. In terms of our corporate clients, obviously, they've been reacting to the changes in tariff policy.


In [10]:
# Prompt conditioning to make PRA relevant.
prompt = "Summarise this answer, focusing on risk, capital and evasion of detail: " + sample_text
summary = summariser(prompt, max_length=80, min_length=30)
print('Original:', sample_text[:400])
print('Summary:', summary[0]['summary_text'])

Original: Sure, Ken. So I mean, at a high level, I would say that obviously, some of the salient news flow is quite recent. So, we've done some soundings and some checking both on the consumer side and on the wholesale side. I think on the consumer side, the thing to check is the spending data. And to be honest, the main thing that we see there, what would appear to be a certain amount of frontloading of sp
Summary: Corporates are taking a wait-and-see approach to tariff policy. Some sectors are going to be much more exposed than others. Small business and smaller corporates are probably a little more challenged.


### **5.1.2 Mistral-7B-Instruct**

- add baseline for mistral-7B-instruct to see performance improvement with the added context

## **5.2 Adding Context**

Retrieve PRA risk categories to give greater PRA focus to summaries (local RAG loop).
- measure cosine similarity between transcript chunks and PRA risk categories (vectors)
- retrieve the top 2-3 most relevant risk categories 
- prepend them to the summarisation prompt to make summaries PRA-aligned instead of just summarised answers

- Attempting to use BART resulted in prompt echoing.
- New attempt using Mistral-7B-Instruct.
- Using sentence-BERT vs TF-IDF for vectorisation.

### **5.2.1 Mistral-7B-Instruct**

- Needed to use a lot of fine tuning for the prompt and set strict rules for the model
- Need to be very clear about the output expected or else the model deviates a lot, especially as it processes more data.
- Include lines about lack of evidence if not the model may hallucinate

In [11]:
# Function to remove whitespace in text.
def clean_text(text: str):
    return re.sub(r'\s+', ' ', text).strip()

In [12]:
# Function to split the transcript into smaller chunks.
def chunk_text(text: str, max_chars: int = 6000):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip()) # split into sentences 
    chunks, current_chunk, current_len = [], [], 0 # list of chunks, sentences collecting for current chunk, character count for current chunk

    for s in sentences:
        if current_len + len(s) + 1 <= max_chars: # if the characters of current chunk + new sentence is below the limit:
            current_chunk.append(s) # add sentence to current chunk 
            current_len += len(s) + 1 # update running character count 
        
        else: # if the characters is above the limit:
            chunks.append(' '.join(current_chunk)) # add the current chunk to the final chunk list
            current_chunk, current_len = [s], len(s) # start a new chunk containing the sentence and update current len

    if current_chunk:
        chunks.append(' '.join(current_chunk)) # add any sentences in current chunk after loop ends 

    return chunks 

In [13]:
# Function to load PRA categories and definitions from CSV.
def load_pra_categories(path: Path):
    with open(path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        return [
            (row.get('category', '').strip(), [row.get('definition', '').strip()])
            for row in reader if row.get('category')
        ]

In [14]:
# Build a Sentence-BERT embedding index for PRA categories.
def build_embedding_index(pra_categories):
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    docs = [f"{name} {' '.join(defs)}" for name, defs in pra_categories]
    pra_risk_embeddings = embedder.encode(docs, batch_size=32, normalize_embeddings=True)

    return embedder, np.asarray(pra_risk_embeddings)

In [15]:
# Function to find the relevant PRA categories to the transcript chunks.
def find_rel_categories(chunk, pra_categories, embedder, pra_risk_embeddings, top_k=2):
    query_vec = embedder.encode([chunk], normalize_embeddings=True) # turns chunk into embedding
    sims = cosine_similarity(query_vec, pra_risk_embeddings).ravel() # compares the chunk to each category doc 
    top_indices = np.argsort(-sims)[:top_k] # sorts scores descending and selected top k cateogories 

    return [pra_categories[i] for i in top_indices]

In [None]:
def parse_tagged_json(raw):
    m = re.search(r"<json>\s*(\{[\s\S]*?\})\s*</json>", raw, flags=re.IGNORECASE)
    if not m:
        return None
    try:
        return json.loads(m.group(1))
    except json.JSONDecodeError:
        return None

In [None]:
# Function to summarise the text chunks.
def summarise_chunk(model, chunk, relevant_categories, chunk_idx, max_evidence=5):

    # Build PRA notes (limit to 2 bullets per category)
    lines = []
    for name, definition in relevant_categories:
        lines.append(f"- {name}:")
        for d in list(definition)[:2]:
            lines.append(f"- {d}")
    notes_block = "\n".join(lines)

    system_prompt = (
        "You are a careful data extraction model. "
        "Return ONLY valid JSON wrapped in <json>...</json> tags."
    )

    user_prompt = f"""
TRANSCRIPT:
{chunk}

PRA NOTES:
{notes_block}

TASK:
Return JSON ONLY, wrapped exactly like this:
<json>{{"summary": "...", "evidence": ["..."], "pra_categories": [{{"category":"...","why":"..."}}]}}</json>

RULES:
- 4-6 sentence neutral summary.
- Up to {max_evidence} evidence bullets (quotes/facts).
- 1-3 pra_categories objects.
- If evidence is lacking, use a single bullet "Insufficient evidence".
- Only choose categories supported by the evidence.
""".strip()

    response = model.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        top_p=0.9,
        max_tokens=700,
        repeat_penalty=1.1,
    )

    raw = (response["choices"][0]["message"]["content"] or "").strip()

    # Parse the tagged JSON
    parsed = parse_tagged_json(raw)

    # Fallback if model didn’t follow instructions
    if not parsed:
        return (
            {"summary": "", "evidence": ["Insufficient evidence"], "pra_categories": []},
            raw,
        )

    # Light coercion to guarantee keys exist
    result = {
        "summary": parsed.get("summary", "") or "",
        "evidence": parsed.get("evidence", []) or [],
        "pra_categories": parsed.get("pra_categories", []) or []
    }
    return result, raw

In [None]:
# Define variables.
MODEL_PATH = '/Users/laurenbrixey/Documents/Data Science Career Accelerator/Project Submissions/Course 3/topic_project_4.1/mistral-7b-instruct-v0.1.Q4_K_M.gguf'
PRA_NOTES_PATH = '../data/RAG-resources/PRA_risk_categories.csv'
TRANSCRIPT_PATH = '../data/processed/jpm/all_jpm_2025.csv'
OUTPUT_PATH = pathlib.Path("jpm_mistral_pra_summary_raw.json")
TOP_K = 2

In [None]:
# Runner code.
pra_categories = load_pra_categories(Path(PRA_NOTES_PATH))
embedder, category_embeddings = build_embedding_index(pra_categories)

# Load and chunk transcript
transcript_text = Path(TRANSCRIPT_PATH).read_text(encoding="utf-8")
transcript_chunks = chunk_text(transcript_text)

n_threads = max(4, (os.cpu_count() or 8) - 2)

# Define the model.
model = Llama(
    model_path=str(MODEL_PATH),
    n_ctx=4096,
    n_gpu_layers=20,
    chat_format="mistral-instruct",
    n_threads=n_threads,
)

raw_outputs = []

for i, chunk in enumerate(transcript_chunks, 1):
    try:
        top_categories = find_rel_categories(
            chunk, pra_categories, embedder, category_embeddings, top_k=TOP_K
        )
        _, raw = summarise_chunk(
            model, chunk, top_categories, i, max_evidence=5
        )
        raw_outputs.append({"chunk": i, "raw": raw})

    except Exception:
        raw_outputs.append({"chunk": i, "raw": ""})

final_output = {"raw_outputs": raw_outputs}

OUTPUT_PATH.write_text(json.dumps(final_output, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"Wrote final JSON to: {OUTPUT_PATH.resolve()}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
llama_model_load_from_file_impl: using device Metal (Apple M3) - 8456 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/laurenbrixey/Documents/Data Science Career Accelerator/Project Submissions/Course 3/topic_project_4.1/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_len

Wrote final JSON to: /Users/laurenbrixey/Documents/GitHub Repositories/cam_ds_ep_FinSight/notebooks/jpm_mistral_pra_summary.json


- Need to preprocess the output (summary, evidence, PRA categories (name & why the model chose this))
- Can this information be fed to the model again and can it detect any early PRA risk indicators?

# **6. Evasion Scoring**

- Detect evasiveness of bankers in relation to analyst questions and give an evasiveness score.

## **6.1 Preprocessing**

In [22]:
# View data.
jpm_2025_df.head()

Unnamed: 0,question_number,answer_number,speaker_name,role,company,content,year,quarter,source_pdf,role_normalised
0,1,,Ken Usdin,analyst,Autonomous Research,"Good morning, Jeremy. Wondering if you could s...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,analyst
1,1,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Sure, Ken. So I mean, at a high level, I would...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker
2,2,,Ken Usdin,analyst,Autonomous Research,Yeah. And just one question on the NII ex. Mar...,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,analyst
3,2,1.0,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Yeah, that's a good question, Ken. You're righ...",2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker
4,2,2.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorganChase,In the curve basically.,2025,Q1,data/raw/jpm/jpm-1q25-earnings-call-transcript...,banker


In [23]:
# Split into questions = analyst and answers = banker.
questions = jpm_2025_df[jpm_2025_df['role_normalised'] == 'analyst']
answers = jpm_2025_df[jpm_2025_df['role_normalised'] == 'banker']

In [27]:
# Pair each analyst question with all the banker's answers.
qa_pairs = []

for q_num, q_row in questions.groupby('question_number'):
    q_text = ' '.join(q_row['content'].astype(str))
    a_rows = answers[answers['question_number'] == q_num]
    if not a_rows.empty:
        a_text = ' '.join(a_rows['content'].astype(str))
        qa_pairs.append({
            'question_number': q_num, 
            'question': q_text,
            'answer': a_text
        })

jpm_qa_df = pd.DataFrame(qa_pairs)

# View the results.
jpm_qa_df.head()

Unnamed: 0,question_number,question,answer
0,1,"Good morning, Jeremy. Wondering if you could s...","Sure, Ken. So I mean, at a high level, I would..."
1,2,Yeah. And just one question on the NII ex. Mar...,"Yeah, that's a good question, Ken. You're righ..."
2,3,Yes. Good morning. This question is for Jamie....,"I just – before Jamie answers that, Erika, I j..."
3,4,Got it. And a second follow-up question. And I...,"Yeah, Erika, it's a good question. But the tru..."
4,5,Thank you. Operator: Thank you. Our next quest...,"Thanks, Erika. Operator: I apologize. Our next..."


## **6.2 Evasion Detection**

- Evasion score is made up of three components:
1. **Cosine similarity**
- similarity of the question and answer
2. **Numeric specificity check**
- does the question require a number, if so does the answer contain a number?, e.g. requests for financial data
3. **Evasive phrases**
- does the answer contain evasive phrases?

In [None]:
# List of evasive phrases used to compute evasive signals.
EVASIVE_PHRASES = [
    r"\btoo early\b",
    r"\bcan't (?:comment|share|discuss)\b",
    r"\bwon't (?:comment|share|provide)\b",
    r"\bno (?:update|comment)\b",
    r"\bwe (?:don't|do not) (?:break out|provide guidance)\b",
    r"\bnot (?:going to|able to) (?:comment|share|provide)\b",
    r"\bwe'll (?:come back|circle back)\b",
    r"\bnot something we disclose\b",
    r"\bas (?:we|I) (?:said|mentioned)\b",
    r"\bgenerally speaking\b",
    r"\bit's premature\b",
    r"\bit's difficult to say\b",
    r"\bI (?:wouldn't|won't) want to (?:speculate|get into)\b",
    r"\bI (?:think|guess|suppose)\b",
    r"\bkind of\b",
    r"\bsort of\b",
    r"\baround\b",
    r"\broughly\b",
    r"\bwe (?:prefer|plan) not to\b",
    r"\bwe're not prepared to\b",
]

In [None]:
# List of words that suggest the answer needs specific financial numbers to properly answer the question.
SPECIFICITY_TRIGGERS = [
    "how much","how many","what is","what are","when","which","where","who","why",
    "range","guidance","margin","capex","opex","revenue","sales","eps","ebitda",
    "timeline","date","target","growth","update","split","dividend","cost","price",
    "units","volumes","gross","net","tax","percentage","utilization","order book"
]

In [None]:
# Function to calculate cosine similarity between question and answers.
def cosine_sim(q, a):
    vec = TfidfVectorizer(stop_words='english').fit_transform([q, a]) # converts text to vectors 
    sim = float(cosine_similarity(vec[0], vec[1])) # calculate the cosine similarity between the two vectors

    return sim

In [None]:
# Function to determine evasion score.
def evasion_score(q, a):
    # 1. Cosine similarity
    sim = cosine_sim(q, a) # calculates cosine similarity using previous function
    sim_component = (1 - sim) * 45 # less similar the answer is, the bigger the contribution to the evasion score, scaled by 45

    # 2. Numerical specificity- Does the question require and answer with financial figures/ a specific answer?
    needs_num = any(t in q.lower() for t in SPECIFICITY_TRIGGERS) # true if the question requires a numeric/ specific answer
    has_num = bool(re.search(NUMERIC_PATTERN, a)) # true if the answer includes a number 
    numeric_component = 25 if needs_num and not has_num else 0 # score of 25 if the question needs a number but the answer doesn't give one

    # 3. Evasive phrases- does the answer contain evasive phrases?
    phrase_hits = sum(len(re.findall(p, a.lower())) for p in EVASIVE_PHRASES) # counts how many times an evasive phrase appears in the answer
    phrase_component = min(3, phrase_hits) * 8 # max of 3 hits counted, each hit = 8 points 

    # Final evasion score.
    score = min(100, sim_component + numeric_component + phrase_component) # adds components together and caps score at 100
    
    return score, sim, phrase_hits, needs_num, has_num

In [None]:
# Apply scoring to data & catergories based on evasiveness.
records = []

for _, row in qa_df.iterrows():
    q, a = row['question'], row['answer']
    score, sim, phrase_hits, needs_num, has_num = evasion_score(q, a)
    
    # Generate summary based on evasiveness score.
    if score >= 70:
        summary = f'Likely evasive (score {score:.0f}).'
    elif score >= 50:
        summary = f'Possibly evasive (score {score:.0f}).'
    else:
        summary = f'Likely responsive (score {score:.0f}).'
    
    # Include reason for score.
    if sim < 0.25: summary += 'Answer has low relevance to the question'
    if needs_num and not has_num: summary += 'Asked for numbers but none given'
    if phrase_hits > 0: summary += 'Evasive phrasing detected'

    records.append({
        'question_number': row['question_number'],
        'question': q,
        'answer': a,
        'evasion_score': score,
        'similarity': sim,
        'evasive_phrase_count': phrase_hits,
        'answer needs_numeric': needs_num,
        'answer has numeric': has_num, 
        'summary': summary
    })

jpm_evasion_results = pd.DataFrame(records)

# Save outputs.
results.to_csv('../notebooks/jpm_evasion_results.csv', index=False)
with open('../notebooks/jpm_evasion_results.csv',"w",encoding="utf-8") as f:
    json.dump(results.to_dict(orient="records"), f, indent=2, ensure_ascii=False)

In [None]:
# View results.
display(jpm_evasion_results)