# **Evasion Detection Notebook**

# **1. Objectives**

The purpose of this notebook is to contain the evasion detection pipeline. 
1. **Baseline Evasion score** (rule-based) is made up of three components:
- **Cosine similarity**- similarity of the question and answer, lower similarity = more evasive
- **Numeric specificity check**- does the question require a number, if so does the answer contain a number?, e.g. requests for financial data
- **Evasive phrases**- does the answer contain evasive phrases?, presence = more evasive

2. **LLM evasion score** (RoBERTa-MNLI) uses entailment/neutral/contradiction between the question and answer
- Lower entailment (and higher neutral + contradiction) = more evasive
  
3. **Blended evasion score** combines both scores including a weight for the LLM component
- Rationale is that baseline enforces precision while the LLM will capture semantics

# **1. Set up Workspace**

In [1]:
# Import libraries
# Core python
import os
import numpy as np
import pandas as pd
import re
import json
import pathlib
from pathlib import Path
from typing import List, Dict, Any 
import csv
import math
from collections import Counter

# NLP & Summarisation
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import transformers, datasets, inspect
from llama_cpp import Llama 
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset

# Retrieval
from sentence_transformers import SentenceTransformer 

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score 
from sklearn.model_selection import GroupShuffleSplit 

# Visualisations
import matplotlib.pyplot as plt
import seaborn as sns 

# Set SEED.
SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


# **2. Data Preprocessing**

In [2]:
# Load dataset
all_jpm_2023_2025 = pd.read_csv('../data/processed/jpm/all_jpm_2023_2025.csv')

# View dataset.
display(all_jpm_2023_2025.head())

# Number of rows.
print('Number of rows:', all_jpm_2023_2025.shape[0])

Unnamed: 0,section,question_number,answer_number,speaker_name,role,company,content,year,quarter,is_pleasantry,source_pdf
0,presentation,,,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Thanks, and good morning, everyone. The presen...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...
1,qa,,,Steven Chubak,analyst,Wolfe Research LLC,"Hey, good morning.",2023,Q1,True,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...
2,qa,,,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Good morning, Steve.",2023,Q1,True,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...
3,qa,1.0,,Steven Chubak,analyst,Wolfe Research LLC,"So, Jamie, I was actually hoping to get your p...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...
4,qa,1.0,1.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorgan Chase & Co.,"Well, I think you were already kind of complet...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...


Number of rows: 1411


In [3]:
# Remove pleasantries.
all_jpm_2023_2025_cleaned = all_jpm_2023_2025[all_jpm_2023_2025['is_pleasantry'] == False]
print('Number of rows:', all_jpm_2023_2025_cleaned.shape[0])

Number of rows: 1241


In [4]:
# Check content column.
print('Number of rows with no content:', all_jpm_2023_2025_cleaned['content'].isna().sum())

Number of rows with no content: 23


In [5]:
# Drop rows with no content.
all_jpm_2023_2025_cleaned = all_jpm_2023_2025_cleaned.dropna(subset=['content'])

In [6]:
# Check content column.
print('Number of rows with no content:', all_jpm_2023_2025_cleaned['content'].isna().sum())

Number of rows with no content: 0


In [7]:
# View roles.
all_jpm_2023_2025_cleaned['role'].unique()

array(['Chief Financial Officer', 'analyst',
       'Chairman & Chief Executive Officer',
       'And then some. Theres a lot of value added.', 'Okay',
       "We're fundamentally", 'Thanks', 'Almost no chance.'], dtype=object)

- Some text has leaked into role column.

In [8]:
# View rows with invalid roles. 
valid_roles = 'analyst', 'Chief Financial Officer', 'Chairman & Chief Executive Officer'
invalid_roles_df = all_jpm_2023_2025_cleaned[~all_jpm_2023_2025_cleaned['role'].isin(valid_roles)]
invalid_roles_df.head(10)

Unnamed: 0,section,question_number,answer_number,speaker_name,role,company,content,year,quarter,is_pleasantry,source_pdf
305,qa,22.0,4.0,"Chief Financial Officer, JPMorganChase",And then some. Theres a lot of value added.,JPMorganChase,"Yeah. And obviously, I mean, we're not going t...",2025,Q2,False,data/raw/jpm/.ipynb_checkpoints/jpm-2q25-earni...
309,qa,23.0,3.0,"Chief Financial Officer, JPMorganChase",Okay,there you have it.,"But it's not like I thought it would do badly,...",2025,Q2,False,data/raw/jpm/.ipynb_checkpoints/jpm-2q25-earni...
650,qa,10.0,3.0,Who knows how important politics are in all th...,We're fundamentally,"as I said, I think on the press call, happy to...",little bit cautious about the pull-forward dyn...,2024,Q1,False,data/raw/jpm/jpm-1q24-earnings-call-transcript...
924,qa,8.0,2.0,"Chief Financial Officer, JPMorgan Chase & Co.",Thanks,Glenn.,"Operator: Next, we'll go to the line of Matt O...",2024,Q2,False,data/raw/jpm/jpm-2q24-earnings-call-transcript...
1059,qa,22.0,4.0,"Chief Financial Officer, JPMorganChase",And then some. Theres a lot of value added.,JPMorganChase,"Yeah. And obviously, I mean, we're not going t...",2025,Q2,False,data/raw/jpm/jpm-2q25-earnings-call-transcript...
1063,qa,23.0,3.0,"Chief Financial Officer, JPMorganChase",Okay,there you have it.,"But it's not like I thought it would do badly,...",2025,Q2,False,data/raw/jpm/jpm-2q25-earnings-call-transcript...
1274,qa,23.0,1.0,"Chairman & Chief Executive Officer, JPMorgan C...",Almost no chance.,JPMorganChase,"Well, but having – it's very important. While ...",2024,Q3,False,data/raw/jpm/jpm-3q24-earnings-conference-call...


In [9]:
# Input the correct role information.
all_jpm_2023_2025_cleaned.loc[[305, 309, 924, 1059, 1063], 'role'] = 'Chief Financial Officer'
all_jpm_2023_2025_cleaned.loc[[1274], 'role'] = 'Chairman & Chief Executive Officer'

# Drop nonsence row.
all_jpm_2023_2025_cleaned = all_jpm_2023_2025_cleaned.drop(index=650)

In [10]:
# Check the roles have been updated.
all_jpm_2023_2025_cleaned['role'].unique()

array(['Chief Financial Officer', 'analyst',
       'Chairman & Chief Executive Officer'], dtype=object)

In [11]:
# Normalise role names.
role_map = {
    'analyst': 'analyst',
    'Chief Financial Officer': 'banker',
    'Chairman & Chief Executive Officer': 'banker'
}

# Map roles.
all_jpm_2023_2025_cleaned['role_normalised'] = all_jpm_2023_2025_cleaned['role'].map(role_map)

In [12]:
# View dataset.
display(all_jpm_2023_2025_cleaned.head())
print('Number of rows:', all_jpm_2023_2025_cleaned.shape[0])

Unnamed: 0,section,question_number,answer_number,speaker_name,role,company,content,year,quarter,is_pleasantry,source_pdf,role_normalised
0,presentation,,,Jeremy Barnum,Chief Financial Officer,JPMorganChase,"Thanks, and good morning, everyone. The presen...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...,banker
3,qa,1.0,,Steven Chubak,analyst,Wolfe Research LLC,"So, Jamie, I was actually hoping to get your p...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...,analyst
4,qa,1.0,1.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorgan Chase & Co.,"Well, I think you were already kind of complet...",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...,banker
5,qa,1.0,1.0,Steven Chubak,analyst,Wolfe Research LLC,Got it. And just in terms of appetite for the ...,2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...,analyst
6,qa,1.0,2.0,Jamie Dimon,Chairman & Chief Executive Officer,JPMorgan Chase & Co.,"Oh, yeah.",2023,Q1,False,data/raw/jpm/.ipynb_checkpoints/jpm-1q23-earni...,banker


Number of rows: 1217


In [13]:
# Save the cleaned dataset.
all_jpm_2023_2025_cleaned.to_csv('../data/processed/jpm/cleaned/all_jpm_2023_2025_cleaned') 

In [14]:
# Helper function to remove duplicates within questions and answers. 
def clean_repeats(text):
    if not isinstance(text, str):
        return text

    # 1) Normalize whitespace
    t = ' '.join(text.split()).strip()
    if not t:
        return t

    # 2) If the whole-string is a back-to-back duplicate (A+A) = keep first half
    mid = len(t) // 2
    if len(t) % 2 == 0 and t[:mid] == t[mid:]:
        t = t[:mid]

    # 3) Collapse immediate repeated token spans (n-grams)
    toks = t.split()
    out = []
    i = 0
    while i < len(toks):
        matched = False
        max_span = min(50, len(toks) - i)  # cap span to remaining length
        for n in range(max_span, 4, -1):  # try longer spans first: 50..5
            if i + 2*n <= len(toks) and toks[i:i+n] == toks[i+n:i+2*n]:
                out.extend(toks[i:i+n])  # keep one copy
                i += 2*n                # skip the duplicate block
                matched = True
                break
        if not matched:
            out.append(toks[i])
            i += 1
    t = ' '.join(out)

    # 4) Remove duplicate sentences globally (order-preserving)
    sents = re.split(r'(?<=[.!?])\s+', t)
    seen = set()
    uniq = []
    for s in sents:
        s_norm = s.strip()
        if not s_norm:
            continue
        key = ' '.join(s_norm.lower().split())
        if key not in seen:
            seen.add(key)
            uniq.append(s_norm)
    return ' '.join(uniq)

In [15]:
# Function to convert datasets into question and answer pairs.
def create_qa_pairs(df, min_answer_words=30):
    # Keep only the Q&A section.
    qa_df = df[df['section'].astype(str).str.lower() == 'qa'].copy()

    # Split into roles.
    analyst_rows = qa_df[qa_df['role_normalised'] == 'analyst'].copy()
    banker_rows  = qa_df[qa_df['role_normalised'] == 'banker' ].copy()

    # Keys to keep quarters separated
    key_q = ['year', 'quarter', 'question_number']

    # Build full question text per (year, quarter, question_number)
    question_text_map = (
        analyst_rows
        .groupby(key_q, dropna=False)['content']
        .apply(lambda parts: clean_repeats(' '.join(parts.astype(str))))
        .rename('question')
        .reset_index()
    )

    # Ensure bankers have an answer_number — sequential per (year, quarter, question_number) if missing
    if 'answer_number' not in banker_rows.columns or banker_rows['answer_number'].isna().any():
        banker_rows = banker_rows.sort_index().copy()
        banker_rows['answer_number'] = (
            banker_rows
            .groupby(key_q, dropna=False)
            .cumcount() + 1
        )

    # Combine multiple banker utterances belonging to the same answer
    banker_answers = (
        banker_rows
        .groupby(key_q + ['answer_number'], dropna=False)
        .agg({
            'content':        lambda parts: clean_repeats(' '.join(parts.astype(str))),
            'speaker_name':   'first',
            'role':           'first',
            'role_normalised':'first',
            'source_pdf':     'first'
        })
        .rename(columns={'content': 'answer'})
        .reset_index()
    )

    # Merge question text back onto each answer row
    qa_pairs = banker_answers.merge(
        question_text_map,
        on=key_q,
        how='left',
        validate='many_to_one'
    )

    # Order columns for readability
    column_order = [
        'year', 'quarter', 'question_number', 'answer_number',
        'question', 'answer',
        'speaker_name', 'role', 'role_normalised',
        'source_pdf'
    ]
    qa_pairs = qa_pairs.reindex(columns=[c for c in column_order if c in qa_pairs.columns])

    # Sort and reset index.
    qa_pairs = qa_pairs.sort_values(['year', 'quarter', 'question_number', 'answer_number']).reset_index(drop=True)

    # Drop duplicate answers.
    qa_pairs = qa_pairs.drop_duplicates(subset=['answer'])

    # Drop short answers below threshold to ensure quality answers.
    qa_pairs = qa_pairs[qa_pairs['answer'].astype(str).str.split().str.len() >= int(min_answer_words)]

    return qa_pairs

In [16]:
# Create q&A pairs.
all_jpm_2023_2025_qa = create_qa_pairs(all_jpm_2023_2025_cleaned)

In [17]:
# View number of examples.
print('Number of examples:', all_jpm_2023_2025_qa.shape[0])

Number of examples: 309


In [18]:
# Split into prediction set and validation/training/test set.
jpm_2025_predict_qa = all_jpm_2023_2025_qa[all_jpm_2023_2025_qa['year'] == 2025]
jpm_2023_2024_qa = all_jpm_2023_2025_qa[all_jpm_2023_2025_qa['year'].isin([2023, 2024])]

# Save the datasets.
jpm_2025_predict_qa.to_csv('../data/processed/jpm/cleaned/jpm_2025_predict_qa.csv') 
jpm_2023_2024_qa.to_csv('../data/processed/jpm/cleaned/jpm_2023_2024_qa.csv')  

The jpm_2023_2024_qa dataset was then manually labelled according to whether the banker's answer was deemed 'Direct' or 'Evasive'. The label was appended by a new column 'label'.

In [19]:
# Load the labelled dataset.
jpm_2023_2024_qa_labelled = pd.read_csv('../data/processed/jpm/cleaned/jpm_2023_2024_qa_labelled.csv')

# View the dataset.
jpm_2023_2024_qa_labelled = jpm_2023_2024_qa_labelled.drop('Unnamed: 0', axis=1)
jpm_2023_2024_qa_labelled.head()

Unnamed: 0,year,quarter,question_number,answer_number,question,answer,speaker_name,role,role_normalised,source_pdf,label
0,2023,Q4,1.0,1.0,Good morning. Thanks for all the comments on t...,"Yeah. Matt, not particularly updating. I think...",Jeremy Barnum,Chief Financial Officer,banker,data/raw/jpm/jpm-4q23-earnings-call-transcript...,Direct
1,2023,Q4,2.0,1.0,"Okay. And then just separately, you bought bac...",Yeah. Good question. And I think you framed it...,Jeremy Barnum,Chief Financial Officer,banker,data/raw/jpm/jpm-4q23-earnings-call-transcript...,Direct
2,2023,Q4,3.0,1.0,"Thanks. Jeremy, could you give a little more c...","Yeah. Actually, John, this quarter, that's all...",Jeremy Barnum,Chief Financial Officer,banker,data/raw/jpm/jpm-4q23-earnings-call-transcript...,Direct
3,2023,Q4,4.0,1.0,"Okay. And then, just to follow up on the NII, ...","Sure. Yeah, happy to do that, John. So, I thin...",Jeremy Barnum,Chief Financial Officer,banker,data/raw/jpm/jpm-4q23-earnings-call-transcript...,Direct
4,2023,Q4,5.0,1.0,Hey. Good morning. Maybe just to follow up in ...,Yeah. Both good questions. So let's do reprice...,Jeremy Barnum,Chief Financial Officer,banker,data/raw/jpm/jpm-4q23-earnings-call-transcript...,Direct


In [20]:
# Function to split into test, training and validation datasets, preserve number of evasive cases per set.
def train_val_test(df, group_key, test_fraction, val_fraction, random_state):

    # Split test from full data.
    gss1 = GroupShuffleSplit(n_splits=1, test_size=test_fraction, random_state=random_state)
    idx_trainval, idx_test = next(gss1.split(df, groups=df[group_key]))
    train_and_val = df.iloc[idx_trainval].reset_index(drop=True)
    test_set = df.iloc[idx_test].reset_index(drop=True)

    # Split VAL from the remaining data (val is relative to full size)
    val_fraction_of_remaining = val_fraction / (1.0 - test_fraction)
    gss2 = GroupShuffleSplit(n_splits=1, test_size=val_fraction_of_remaining, random_state=random_state + 1)
    idx_train, idx_val = next(gss2.split(train_and_val, groups=train_and_val[group_key]))
    train_set = train_and_val.iloc[idx_train].reset_index(drop=True)
    val_set = train_and_val.iloc[idx_val].reset_index(drop=True)

    return train_set, val_set, test_set


In [21]:
# Make a group key so answers for the same question are not split between datasets.
jpm_2023_2024_qa_labelled['group_key'] = (
    jpm_2023_2024_qa_labelled["year"].astype(str) + "_" +
    jpm_2023_2024_qa_labelled["quarter"].astype(str) + "_" +
    jpm_2023_2024_qa_labelled["question_number"].astype(str)
)

In [22]:
# Split into test, training and validation datasets.
jpm_train, jpm_val, jpm_test = train_val_test(
    jpm_2023_2024_qa_labelled,
    group_key='group_key',
    test_fraction=0.30,
    val_fraction=0.20,
    random_state=SEED
)

In [23]:
# View the split. 
print(f'Number of training examples: {jpm_train.shape[0]} (evasive: {jpm_train[jpm_train["label"] == "Evasive"].shape[0]})')
print(f'Number of validation examples: {jpm_val.shape[0]} (evasive: {jpm_val[jpm_val["label"] == "Evasive"].shape[0]})')
print(f'Number of test examples: {jpm_test.shape[0]} (evasive: {jpm_test[jpm_test["label"] == "Evasive"].shape[0]})')

Number of training examples: 107 (evasive: 22)
Number of validation examples: 43 (evasive: 11)
Number of test examples: 65 (evasive: 9)


In [24]:
# Save the datasets.
jpm_train.to_csv('../data/processed/jpm/cleaned/jpm_train.csv') 
jpm_val.to_csv('../data/processed/jpm/cleaned/jpm_val.csv') 
jpm_test.to_csv('../data/processed/jpm/cleaned/jpm_test.csv') 

# **3. Rule-based Baseline**

## **3.1 Set-up**

In [25]:
# List of evasive phrases
EVASIVE_PHRASES = [
    r"\btoo early\b",
    r"\bcan't (?:comment|share|discuss)\b",
    r"\bwon't (?:comment|share|provide)\b",
    r"\bno (?:update|comment)\b",
    r"\bwe (?:don't|do not) (?:break out|provide guidance)\b",
    r"\bnot (?:going to|able to) (?:comment|share|provide)\b",
    r"\bwe'll (?:come back|circle back)\b",
    r"\bnot something we disclose\b",
    r"\bas (?:we|I) (?:said|mentioned)\b",
    r"\bgenerally speaking\b",
    r"\bit's premature\b",
    r"\bit's difficult to say\b",
    r"\bI (?:wouldn't|won't) want to (?:speculate|get into)\b",
    r"\bI (?:think|guess|suppose)\b",
    r"\bkind of\b",
    r"\bsort of\b",
    r"\baround\b",
    r"\broughly\b",
    r"\bwe (?:prefer|plan) not to\b",
    r"\bwe're not prepared to\b",
]

# List of words that suggest the answer needs specific financial numbers to properly answer the question.
SPECIFICITY_TRIGGERS = [
    "how much","how many","what is","what are","when","which","where","who","why",
    "range","guidance","margin","capex","opex","revenue","sales","eps","ebitda",
    "timeline","date","target","growth","update","split","dividend","cost","price",
    "units","volumes","gross","net","tax","percentage","utilization","order book"
]

NUMERIC_PATTERN = r"(?:\d+(?:\.\d+)?%|\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|£|\$|€)"

## **3.2 Functions**

In [26]:
# Function to calculate cosine similarity between question and answers.
def cosine_sim(q, a):
    vec = TfidfVectorizer(stop_words='english').fit_transform([q, a]) # converts text to vectors 
    sim = float(cosine_similarity(vec[0], vec[1])[0, 0]) # calculate the cosine similarity between the two vectors

    return sim

In [27]:
# Function to compute baseline evasion score.
def baseline_evasion_score(q, a):
    # 1. Cosine similarity
    sim = cosine_sim(q, a) # calculates cosine similarity using previous function
    sim_component = (1 - sim) * 45 # less similar the answer is, the bigger the contribution to the evasion score, scaled by 45

    # 2. Numerical specificity- Does the question require and answer with financial figures/ a specific answer?
    needs_num = any(t in q.lower() for t in SPECIFICITY_TRIGGERS) # true if the question requires a numeric/ specific answer
    has_num = bool(re.search(NUMERIC_PATTERN, a)) # true if the answer includes a number 
    numeric_component = 25 if needs_num and not has_num else 0 # score of 25 if the question needs a number but the answer doesn't give one

    # 3. Evasive phrases- does the answer contain evasive phrases?
    phrase_hits = sum(len(re.findall(p, a.lower())) for p in EVASIVE_PHRASES) # counts how many times an evasive phrase appears in the answer
    phrase_component = min(3, phrase_hits) * 8 # max of 3 hits counted, each hit = 8 points 

    # Final evasion score.
    score = min(100, sim_component + numeric_component + phrase_component) # adds components together and caps score at 100
    
    return score, sim, phrase_hits, needs_num, has_num

# **4. LLM**

## **4.1 Training**

Small, lightweight models were selected for this to prevent memory overload and long training times.

In [28]:
# Define base save directory.
BASE_SAVE_DIR = "/Users/laurenbrixey/Documents/Data Science Career Accelerator/EP Model Training"
os.makedirs(BASE_SAVE_DIR, exist_ok=True)

In [29]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Model names.
distil_roberta_name = 'distilroberta-base'
deberta_name = 'microsoft/deberta-v3-small'

# Tokenizers.
distil_roberta_tok = AutoTokenizer.from_pretrained(distil_roberta_name)
deberta_tok = AutoTokenizer.from_pretrained (deberta_name)

cfg_dr = AutoConfig.from_pretrained(distil_roberta_name, num_labels=1, problem_type="single_label_classification")
cfg_db = AutoConfig.from_pretrained(deberta_name,          num_labels=1, problem_type="single_label_classification")

distil_roberta_model = AutoModelForSequenceClassification.from_pretrained(
    distil_roberta_name, config=cfg_dr
)
deberta_model = AutoModelForSequenceClassification.from_pretrained(
    deberta_name, config=cfg_db
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Device set-up
import gc
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

use_mps = torch.backends.mps.is_available()
device = torch.device('mps' if use_mps else ('cuda' if torch.cuda.is_available() else 'cpu'))

distil_roberta_model.to(device)
deberta_model.to(device)

def mps_gc():
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

In [31]:
# Function to build the premise from the Q&A transcripts 
def build_premise(q, a):
    return f'[QUESTION] {q} [ANSWER] {a}'

In [32]:
# Dataset wrapper.
class EvasionDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        
        # columns = question, answer, label (direct/evasive)
        texts = [build_premise(q, a) for q, a in zip(df['question'].astype(str), df['answer'].astype(str))]
        self.encodings = tokenizer(texts, truncation=True, padding=False, max_length=max_length)

        # map labels (evasion=1, direct=0)
        self.labels = (df['label'].astype(str).str.strip().str.lower() == 'evasive').astype(int).tolist()

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {k: self.encodings[k][idx] for k in self.encodings}
        item["labels"] = self.labels[idx]  # <-- key must be 'labels'
        return item

In [33]:
# Build datasets.
# distil_roberta_train = EvasionDataset(jpm_train, distil_roberta_tok, max_length=256)
distil_roberta_val = EvasionDataset(jpm_val, distil_roberta_tok, max_length=286)

# deberta_train = EvasionDataset(jpm_train, deberta_tok, max_length=256)
deberta_val = EvasionDataset(jpm_val, deberta_tok, max_length=286)

In [34]:
def upsample_positives(df, label_col="label", pos_name="evasive", factor=3):
    pos = df[df[label_col].str.lower().str.strip() == pos_name]
    neg = df[df[label_col].str.lower().str.strip() != pos_name]
    df_up = pd.concat([neg, pd.concat([pos]*factor, ignore_index=True)], ignore_index=True).sample(frac=1.0, random_state=42)
    return df_up

jpm_train_up = upsample_positives(jpm_train, label_col="label", pos_name="evasive", factor=3)
# rebuild datasets with jpm_train_up
distil_roberta_train = EvasionDataset(jpm_train_up, distil_roberta_tok, max_length=286)
deberta_train        = EvasionDataset(jpm_train_up, deberta_tok,        max_length=286)

In [35]:
# Dynamic padding.
from transformers import DataCollatorWithPadding
distil_roberta_collator = DataCollatorWithPadding(tokenizer=distil_roberta_tok)
deberta_collator = DataCollatorWithPadding(tokenizer=deberta_tok)

In [36]:
# Model save paths.
DISTIL_SAVE_DIR  = os.path.join(BASE_SAVE_DIR, "distil_roberta_tuned")
DEBERTA_SAVE_DIR = os.path.join(BASE_SAVE_DIR, "deberta_small_tuned")
os.makedirs(DISTIL_SAVE_DIR, exist_ok=True)
os.makedirs(DEBERTA_SAVE_DIR, exist_ok=True)

# Define early stopping.
from transformers import EarlyStoppingCallback
# early_stop = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

# Distil-roberta training parameters.
distil_roberta_args = TrainingArguments(
    output_dir=DISTIL_SAVE_DIR,
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.0,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=10,
    warmup_ratio=0.0,
    report_to=['none'],
    dataloader_pin_memory=False,
    gradient_accumulation_steps=1,
    fp16=False,
    bf16=False,
    dataloader_num_workers=0,
    greater_is_better=True
)

# Deberta training parameters.
deberta_args = TrainingArguments(
    output_dir=DEBERTA_SAVE_DIR,
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=4e-5,
    weight_decay=0.0,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=10,
    warmup_ratio=0.0,
    report_to=['none'],
    dataloader_pin_memory=False,
    gradient_accumulation_steps=1,
    fp16=False,
    bf16=False,
    dataloader_num_workers=0,
    greater_is_better=True
)

In [37]:
# Function to compute metrics during training.
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits.reshape(-1)))   # sigmoid
    preds = (probs >= 0.5).astype(int) # threshold is 50% just to get a feel.
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    p_c, r_c, f1_c, _ = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)
    return {
        "accuracy": acc,
        "f1_macro": f1,
        "precision_macro": precision,
        "recall_macro": recall,
        "recall_direct": r_c[0],
        "recall_evasive": r_c[1],
        "f1_direct": f1_c[0],
        "f1_evasive": f1_c[1],
    }

In [38]:
# Class weighted trainer.
from sklearn.utils.class_weight import compute_class_weight

# class WeightedCELossTrainer(Trainer):
#     def __init__(self, *args, class_weights=None, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.class_weights = class_weights

#     # Accept extra kwargs from HF (e.g., num_items_in_batch)
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         # don't mutate the caller's dict
#         inputs = inputs.copy()
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits

#         weight = None
#         if self.class_weights is not None:
#             weight = self.class_weights.to(logits.device)

#         loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
#         loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

#         return (loss, outputs) if return_outputs else loss

# class BCE1LogitTrainer(Trainer):
#     def __init__(self, *args, pos_weight=None, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.pos_weight = pos_weight

#     # 👇 accept extra kwargs from HF (e.g., num_items_in_batch)
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         inputs = inputs.copy()
#         labels = inputs.pop("labels").float()          # (bs,)
#         outputs = model(**inputs)
#         logit = outputs.logits.view(-1)                # (bs,)
#         pw = self.pos_weight.to(logit.device) if self.pos_weight is not None else None
#         loss = F.binary_cross_entropy_with_logits(logit, labels, pos_weight=pw, reduction="mean")
#         return (loss, outputs) if return_outputs else loss

class FocalBCETrainer(Trainer):
    def __init__(self, *args, pos_weight=None, gamma=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        inputs = inputs.copy()
        labels = inputs.pop("labels").float()         # shape: (B,)
        outputs = model(**inputs)
        logits = outputs.logits.view(-1)              # shape: (B,)
        pw = self.pos_weight.to(logits.device) if self.pos_weight is not None else None

        # BCE per example (no reduction)
        bce = F.binary_cross_entropy_with_logits(logits, labels, pos_weight=pw, reduction="none")

        # Focal modulation
        probs = torch.sigmoid(logits)
        pt = probs * labels + (1 - probs) * (1 - labels)   # p_t
        loss = ((1 - pt).pow(self.gamma) * bce).mean()

        return (loss, outputs) if return_outputs else loss

# compute class weights from TRAIN (0=Direct, 1=Evasive)
train_y = np.array(distil_roberta_train.labels)
n_pos = (train_y == 1).sum()
n_neg = (train_y == 0).sum()
scale = 3.0   # try 2.0 first; if still low recall, try 3.0
pos_weight = torch.tensor([ (n_neg / max(1, n_pos)) * scale ], dtype=torch.float)

In [39]:
# Distil-roberta model trainer.
distil_roberta_trainer = FocalBCETrainer(
    model=distil_roberta_model,
    args=distil_roberta_args,
    train_dataset=distil_roberta_train,
    eval_dataset=distil_roberta_val,
    processing_class=distil_roberta_tok,
    compute_metrics=compute_metrics,
    data_collator=distil_roberta_collator,
    pos_weight=pos_weight,
    gamma=2.0
    # callbacks=[early_stop]
)

# Train distil_roberta.
distil_roberta_trainer.train()

# Save the tuned model.
distil_roberta_trainer.save_model(DISTIL_SAVE_DIR)

# Free up memory.
mps_gc()

# Deberta model trainer.
deberta_trainer = FocalBCETrainer(
    model=deberta_model,
    args=deberta_args,
    train_dataset=deberta_train,
    eval_dataset=deberta_val,
    processing_class=deberta_tok,
    compute_metrics=compute_metrics,
    data_collator=deberta_collator,
    pos_weight=pos_weight,
    gamma=2.0
    # callbacks=[early_stop]
)

# Train deberta.
deberta_trainer.train()

# Save the tuned models.
deberta_trainer.save_model(DEBERTA_SAVE_DIR)

# Reload the best models.
distil_roberta_model  = AutoModelForSequenceClassification.from_pretrained(DISTIL_SAVE_DIR).to(device).eval()
deberta_model = AutoModelForSequenceClassification.from_pretrained(DEBERTA_SAVE_DIR).to(device).eval()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,Recall Direct,Recall Evasive,F1 Direct,F1 Evasive
1,0.3854,0.313457,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
2,0.331,0.341154,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
3,0.308,0.333214,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
4,0.2979,0.351983,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
5,0.2654,0.323538,0.395349,0.387061,0.454861,0.444602,0.34375,0.545455,0.458333,0.315789
6,0.149,0.566251,0.674419,0.572443,0.572443,0.572443,0.78125,0.363636,0.78125,0.363636
7,0.1519,0.623155,0.372093,0.372093,0.488636,0.488636,0.25,0.727273,0.372093,0.372093
8,0.0557,0.793054,0.651163,0.554865,0.553763,0.556818,0.75,0.363636,0.761905,0.347826


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,Recall Direct,Recall Evasive,F1 Direct,F1 Evasive
1,0.4206,0.321911,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
2,0.3234,0.353468,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
3,0.3059,0.418461,0.255814,0.203704,0.127907,0.5,0.0,1.0,0.0,0.407407
4,0.2109,0.382208,0.627907,0.437908,0.43254,0.451705,0.8125,0.090909,0.764706,0.111111
5,0.132,1.040521,0.72093,0.488095,0.541667,0.514205,0.9375,0.090909,0.833333,0.142857
6,0.0736,1.714527,0.581395,0.450284,0.450284,0.450284,0.71875,0.181818,0.71875,0.181818
7,0.0085,2.07401,0.651163,0.493323,0.496429,0.497159,0.8125,0.181818,0.776119,0.210526
8,0.0019,2.160646,0.627907,0.437908,0.43254,0.451705,0.8125,0.090909,0.764706,0.111111


In [40]:
from torch.utils.data import DataLoader
@torch.no_grad()
def predict_probs(model, dataset, tokenizer, batch_size=32):
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
    all_probs = []
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        logits = model(**batch).logits.view(-1)     # (bs,)
        probs = torch.sigmoid(logits).cpu().numpy() # P(evasive)
        all_probs.append(probs)
    return np.concatenate(all_probs)

In [41]:
val_probs = predict_probs(distil_roberta_model, distil_roberta_val, distil_roberta_tok)
print("VAL prob stats -> min:", val_probs.min(), "median:", np.median(val_probs), "max:", val_probs.max())

VAL prob stats -> min: 0.11766578 median: 0.31766963 max: 0.8094069


In [42]:
from dataclasses import dataclass

@dataclass
class ThrResult:
    thr: float
    f1_macro: float
    precision_macro: float
    recall_macro: float
    f1_evasive: float
    recall_evasive: float
    precision_evasive: float

def eval_at_threshold(y_true, p, thr):
    pred = (p >= thr).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, pred, average='macro', zero_division=0)
    p_c, r_c, f1_c, _ = precision_recall_fscore_support(y_true, pred, average=None, zero_division=0)
    return ThrResult(
        thr=thr,
        f1_macro=f1,
        precision_macro=prec,
        recall_macro=rec,
        f1_evasive=f1_c[1],
        recall_evasive=r_c[1],
        precision_evasive=p_c[1]
    )

def sweep_thresholds(y_true, probs, metric="f1_macro", recall_floor=None):
    best = None
    for thr in np.linspace(0.05, 0.95, 181):  # step 0.005
        r = eval_at_threshold(y_true, probs, thr)
        if recall_floor is not None and r.recall_evasive < recall_floor:
            continue
        if best is None or getattr(r, metric) > getattr(best, metric):
            best = r
    return best

In [43]:
# CHANGE: tune for both models; use whichever you deploy
val_labels = np.array(distil_roberta_val.labels)

val_probs_distil  = predict_probs(distil_roberta_model, distil_roberta_val, distil_roberta_tok)
best_distil       = sweep_thresholds(val_labels, val_probs_distil, metric="f1_macro")
with open(os.path.join(DISTIL_SAVE_DIR, "threshold.json"), "w") as f:
    json.dump({"evasive_threshold": float(best_distil.thr)}, f)

val_probs_deberta = predict_probs(deberta_model, deberta_val, deberta_tok)
best_deberta      = sweep_thresholds(val_labels, val_probs_deberta, metric="f1_macro")
with open(os.path.join(DEBERTA_SAVE_DIR, "threshold.json"), "w") as f:
    json.dump({"evasive_threshold": float(best_deberta.thr)}, f)

print("Best DISTIL thr:", best_distil.thr, "F1_macro:", best_distil.f1_macro, "Recall_evasive:", best_distil.recall_evasive)
print("Best DEBERTA thr:", best_deberta.thr, "F1_macro:", best_deberta.f1_macro, "Recall_evasive:", best_deberta.recall_evasive)

Best DISTIL thr: 0.695 F1_macro: 0.6160714285714286 Recall_evasive: 0.2727272727272727
Best DEBERTA thr: 0.09 F1_macro: 0.520655737704918 Recall_evasive: 0.36363636363636365


In [45]:
# CHANGE: example with DeBERTa; swap in Distil if preferred
test_probs_deberta = predict_probs(deberta_model, deberta_test, deberta_tok)
test_labels        = np.array(deberta_test.labels)

with open(os.path.join(DEBERTA_SAVE_DIR, "threshold.json")) as f:
    thr = json.load(f)["evasive_threshold"]

test_res = eval_at_threshold(test_labels, test_probs_deberta, thr)
print("Chosen thr:", thr, "| F1_macro:", test_res.f1_macro, "| Recall_evasive:", test_res.recall_evasive)

NameError: name 'deberta_test' is not defined

## **4.2 Functions for pipeline**

In [None]:
# ---------- blend utility ----------
def blend_probs(p_base, p_llm, llm_weight=0.70):
    # p_blend = w * llm + (1-w) * base
    return llm_weight * p_llm + (1.0 - llm_weight) * p_base

# ---------- compact report ----------
def _short_report(tag, res):
    print(f"[{tag}] thr={res.thr:.4f} | F1_macro={res.f1_macro:.3f} | "
          f"Recall_evasive={res.recall_evasive:.3f} | Precision_evasive={res.precision_evasive:.3f}")


In [None]:
def evasion_pipeline_v1(
    # labels
    val_labels: np.ndarray,
    test_labels: np.ndarray,
    # probabilities
    val_probs_base: np.ndarray,
    test_probs_base: np.ndarray,
    val_probs_llm: np.ndarray,
    test_probs_llm: np.ndarray,
    # knobs
    llm_weight: float = 0.70,
    metric: str = "f1_macro",
    recall_floor: float | None = None,
    save_dirs: dict | None = None,  # e.g., {"base": "...", "llm": DEBERTA_SAVE_DIR, "blend": "..."}
):
    """
    Tunes thresholds on VAL for baseline, LLM, and blended probs, then applies to TEST.
    Returns a dict of threshold + test metrics for each system.
    """
    # --- blend on VAL/TEST
    val_probs_blend  = blend_probs(val_probs_base,  val_probs_llm,  llm_weight)
    test_probs_blend = blend_probs(test_probs_base, test_probs_llm, llm_weight)

    # --- tune thresholds on VAL
    best_base  = sweep_thresholds(val_labels,  val_probs_base,  metric=metric, recall_floor=recall_floor)
    best_llm   = sweep_thresholds(val_labels,  val_probs_llm,   metric=metric, recall_floor=recall_floor)
    best_blend = sweep_thresholds(val_labels,  val_probs_blend, metric=metric, recall_floor=recall_floor)

    _short_report("VAL/Base ",  best_base)
    _short_report("VAL/LLM  ",  best_llm)
    _short_report("VAL/Blend",  best_blend)

    # --- evaluate on TEST using tuned thresholds
    test_base  = eval_at_threshold(test_labels,  test_probs_base,  best_base.thr)
    test_llm   = eval_at_threshold(test_labels,  test_probs_llm,   best_llm.thr)
    test_blend = eval_at_threshold(test_labels,  test_probs_blend, best_blend.thr)

    print("\n=== TEST ===")
    _short_report("Base ",  test_base)
    _short_report("LLM  ",  test_llm)
    _short_report("Blend",  test_blend)

    # --- optionally save thresholds
    if save_dirs is not None:
        import json, os
        os.makedirs(save_dirs.get("base", ""), exist_ok=True)  if "base"  in save_dirs else None
        os.makedirs(save_dirs.get("llm", ""), exist_ok=True)   if "llm"   in save_dirs else None
        os.makedirs(save_dirs.get("blend", ""), exist_ok=True) if "blend" in save_dirs else None

        if "base" in save_dirs:
            with open(os.path.join(save_dirs["base"], "threshold.json"), "w") as f:
                json.dump({"evasive_threshold": float(best_base.thr)}, f)
        if "llm" in save_dirs:
            with open(os.path.join(save_dirs["llm"], "threshold.json"), "w") as f:
                json.dump({"evasive_threshold": float(best_llm.thr)}, f)
        if "blend" in save_dirs:
            with open(os.path.join(save_dirs["blend"], "threshold.json"), "w") as f:
                json.dump({
                    "evasive_threshold": float(best_blend.thr),
                    "llm_weight": float(llm_weight)
                }, f)

    return {
        "val":   {"base": best_base, "llm": best_llm, "blend": best_blend},
        "test":  {"base": test_base, "llm": test_llm, "blend": test_blend},
        "thrs":  {"base": best_base.thr, "llm": best_llm.thr, "blend": best_blend.thr},
        "weight": llm_weight,
    }


In [None]:
# 1) LLM probabilities (pick your selected model; here: deberta)
val_probs_llm  = predict_probs(deberta_model, deberta_val,  deberta_tok)
test_probs_llm = predict_probs(deberta_model, deberta_test, deberta_tok)

# 2) Baseline probabilities:
#    If you already have them as probs, just assign:
#    val_probs_base  = <np.array shape [n_val]>
#    test_probs_base = <np.array shape [n_test]>
#
#    If you have *logits*, convert first:
#    val_probs_base  = 1 / (1 + np.exp(-val_logits_base))
#    test_probs_base = 1 / (1 + np.exp(-test_logits_base))
#
#    If you only have hard labels from baseline, consider fitting a quick
#    probability model for it (e.g., calibrate or use its raw score if available).

# 3) Labels
val_labels  = np.array(deberta_val.labels)   # or your dataset’s labels
test_labels = np.array(deberta_test.labels)

# 4) Run the pipeline
results = evasion_pipeline_v1(
    val_labels=val_labels,
    test_labels=test_labels,
    val_probs_base=val_probs_base,
    test_probs_base=test_probs_base,
    val_probs_llm=val_probs_llm,
    test_probs_llm=test_probs_llm,
    llm_weight=0.70,
    metric="f1_macro",        # or "recall_macro" / "precision_macro"
    recall_floor=None,        # or e.g. 0.70 to enforce min evasive recall
    save_dirs={"llm": DEBERTA_SAVE_DIR}  # add "base"/"blend" dirs if you want files
)

# 5) Access chosen thresholds
EVASION_THRESHOLD_BASE  = results["thrs"]["base"]
EVASION_THRESHOLD_LLM   = results["thrs"]["llm"]
EVASION_THRESHOLD_BLEND = results["thrs"]["blend"]
LLM_WEIGHT              = results["weight"]


In [None]:
def decisions_from_probs(probs, thr):
    return (probs >= thr).astype(int)

test_pred_base  = decisions_from_probs(test_probs_base,  EVASION_THRESHOLD_BASE)
test_pred_llm   = decisions_from_probs(test_probs_llm,   EVASION_THRESHOLD_LLM)
test_pred_blend = decisions_from_probs(
    blend_probs(test_probs_base, test_probs_llm, LLM_WEIGHT),
    EVASION_THRESHOLD_BLEND
)

# **5. Evasion Detection Pipeline**

## **5.1 Functions**

In [None]:
# Function to label 'Direct' or 'Evasive' based on the score.
def label_from_score(score, threshold):
    return 'Evasive' if score >= threshold else 'Direct'

In [None]:
# Function to extract ground truth (1 = Evasive, 0 = Direct)
def extract_y_true(df):
    return (df['label'].astype(str).str.strip().str.lower() == 'evasive').astype(int).values

In [None]:
# Computes the logits margin.
def compute_logits_margin(model, tokenizer, df, batch_size=32, max_length=512):
    model.eval()
    margins = []
    with torch.no_grad():
        for i in range(0, len(df), batch_size):
            b = df.iloc[i:i+batch_size]
            texts = [build_premise(q, a) for q, a in zip(b['question'].astype(str), b['answer'].astype(str))]
            enc = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc).logits  # [B,2]
            margins.extend((out[:,1] - out[:,0]).detach().cpu().numpy().tolist())
    return np.array(margins)

In [None]:
def llm_evasion_score(question, answer, model, tokenizer, platt_model, max_length=512):
    text = build_premise(question, answer)
    enc = tokenizer(text, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits.squeeze(0)  # [2]
    margin = float((logits[1] - logits[0]).detach().cpu().numpy())
    p_ev = float(platt_model.predict_proba([[margin]])[0,1])
    return {'p_evasive': p_ev, 'p_direct': 1.0 - p_ev}

In [None]:
# Function to compute blended evasion score and return all scores.
def compute_all_evasion_scores(q, a, *, models_and_tokenizers, device, LLM_WEIGHT=0.30):
    
    # Compute baseline evasion score.
    base_score, _, _, _, _ = baseline_evasion_score(q, a)

    # Individual LLM scores.
    llm_scores = {}
    for key, (_, m, t, platt) in models_and_tokenizers.items():
        scores = llm_evasion_score(q, a, m, t, platt)
        llm_scores[key] = float(100.0 * scores['p_evasive'])

    # Ensemble LLM score.
    llm_avg = float(np.mean(list(llm_scores.values()))) if llm_scores else 0.0

    # Compute blended score.
    blended_score = float(np.clip((1.0 - LLM_WEIGHT) * base_score + LLM_WEIGHT * llm_avg, 0.0, 100.0))

    return {
        'baseline': base_score,
        'llm_individual': llm_scores,
        'llm_avg': llm_avg,
        'blended': blended_score
        }

In [None]:
# Main evasion Pipeline
def evasion_pipeline(df, models_and_tokenizers, device, LLM_WEIGHT, EVASION_THRESHOLD_BASE, EVASION_THRESHOLD_LLM, EVASION_THRESHOLD_BLENDED):
    records = []
    for _, row in df.iterrows():
        q, a = str(row['question']), str(row['answer'])
        out = compute_all_evasion_scores(q=q, a=a, LLM_WEIGHT=LLM_WEIGHT, models_and_tokenizers=models_and_tokenizers, device=device)

        rec = {
            'question_number': row.get('question_number'),
            'question': q, 'answer': a,
            'evasion_score_baseline': int(out['baseline']),
            'evasion_score_llm_avg': int(out['llm_avg']),
            'evasion_score_blended': int(out['blended']),
            'prediction_baseline': label_from_score(out['baseline'], EVASION_THRESHOLD_BASE),
            'prediction_llm_avg': label_from_score(out['llm_avg'], EVASION_THRESHOLD_LLM),
            'prediction_blended': label_from_score(out['blended'], EVASION_THRESHOLD_BLENDED),
        }
        # add individual models dynamically
        for model_name, score in out['llm_individual'].items():
            rec[f'evasion_score_{model_name}'] = int(score)
            rec[f'prediction_{model_name}'] = label_from_score(score, EVASION_THRESHOLD_LLM)

        records.append(rec)
    return pd.DataFrame(records)

## **5.2 Threshold Tuning & Model Selection**

In [None]:
# Calibrate LLM labelling threshold on validation set.
y_val = extract_y_true(jpm_val)

distil_roberta_val_margins = compute_logits_margin(distil_roberta_model, distil_roberta_tok, jpm_val)
deberta_val_margins = compute_logits_margin(deberta_model, deberta_tok, jpm_val)

distil_roberta_platt = LogisticRegression(solver='lbfgs').fit(distil_roberta_val_margins.reshape(1, -1), y_val)
derberta_platt = LogisticRegression(solver='lbfgs').fit(deberta_val_margins.reshape(1, -1), y_val)

In [None]:
# Define models.
models_and_tokenizers = {
    'distil_roberta': ('distil_roberta', distil_roberta_model, distil_roberta_tok, distil_roberta_platt)
    'deberta': ('deberta', deberta_model, deberta_tok, deberta_platt)
}

In [None]:
# Function to fine tune the threshold 
def tune_threshold(df, score_col, thr_grid):
    y_true = extract_y_true(df)
    scores = df[score_col].astype(float).values
    rows = []
    for thr in thr_grid:
        y_pred = (scores >= thr).astype(int)
        rows.append({
            'threshold': float(thr),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall':    recall_score(y_true, y_pred, zero_division=0),
            'f1':        f1_score(y_true, y_pred, zero_division=0),
            'accuracy':  accuracy_score(y_true, y_pred)
        })
    return pd.DataFrame(rows).sort_values(by=['f1','recall'], ascending=[False, False]).reset_index(drop=True)

In [None]:
# Perform an initial run with preliminary threshold values.
LLM_WEIGHT = 0.30
EVASION_THRESHOLD_BASE = 30.0
EVASION_THRESHOLD_LLM = 30.0
EVASION_THRESHOLD_BLENDED = 30.0

jpm_val_scores = evasion_pipeline(
    jpm_val, 
    models_and_tokenizers, 
    device, 
    LLM_WEIGHT, 
    EVASION_THRESHOLD_BASE, 
    EVASION_THRESHOLD_LLM, 
    EVASION_THRESHOLD_BLENDED
    )

In [None]:
# View the results and reappend the label.
jpm_val_scores['label'] = jpm_val_qa_labelled['label'].values
jpm_val_scores.head()

In [None]:
# Define threshold grid.
thr_grid = np.arange(30, 85, 5)

# Tune all thresholds.
base_results = tune_threshold(jpm_val_scores, 'evasion_score_baseline', thr_grid)
llm_avg_results = tune_threshold(jpm_val_scores, 'evasion_score_llm_avg', thr_grid)
blend_results = tune_threshold(jpm_val_scores, 'evasion_score_blended', thr_grid)

best_base_thr    = float(base_results.loc[0, 'threshold'])
best_llm_avg_thr = float(llm_avg_results.loc[0, 'threshold'])
best_blend_thr   = float(blend_results.loc[0, 'threshold'])
best_model_thrs  = {k: float(v.loc[0, 'threshold']) for k, v in per_model_results.items()}

print("=== Best thresholds (VAL) ===")
print("Baseline:", best_base_thr)
print("LLM Avg:", best_llm_avg_thr)
print("Blended:", best_blend_thr)
for k, thr in best_model_thrs.items():
    print(f"{k}: {thr}")

# View top configs. 
print("\nTop 5 baseline:\n", base_results.head())
print("\nTop 5 llm_avg:\n", llm_avg_results.head())
print("\nTop 5 blended:\n", blend_results.head())
for k, dfres in per_model_results.items():
    print(f"\nTop 5 {k}:\n", dfres.head())

## **5.2 Optimised Evaluation**

## **5.3 2025 Predictions**

In [None]:
# ==============================================
# 7) Pick ONE LLM and run TEST with baseline
# ==============================================
# Choose which LLM you want to carry forward
SELECTED_LLM = 'deberta_small'   # or 'distilroberta'

# Use its tuned threshold + the tuned baseline and blended thresholds
LLM_WEIGHT = 0.70  # keep your preferred blend weight

EVASION_THRESHOLD_BASE    = best_base_thr
EVASION_THRESHOLD_LLM_SEL = best_model_thrs[SELECTED_LLM]
EVASION_THRESHOLD_BLEND   = best_blend_thr  # or retune blend after fixing SELECTED_LLM if you like

# Run v1 on TEST (still computes all models, but we'll read only baseline + selected)
test_scores_v1 = evasion_pipeline(
    jpm_test_qa_labelled,
    models_and_tokenizers,
    device,
    LLM_WEIGHT,
    EVASION_THRESHOLD_BASE,
    EVASION_THRESHOLD_LLM_SEL,   # used for individual preds
    EVASION_THRESHOLD_BLEND
)
test_scores_v1['label'] = jpm_test_qa_labelled['label'].values

# Compact evaluation for baseline + SELECTED_LLM (+ blended if desired)
def _to_bin(pred_series): return (pred_series.astype(str).str.lower() == 'evasive').astype(int).values
y_true = (test_scores_v1['label'].astype(str).str.lower() == 'evasive').astype(int).values

y_pred_base = _to_bin(test_scores_v1['prediction_baseline'])
y_pred_sel  = _to_bin(test_scores_v1[f'prediction_{SELECTED_LLM}'])
y_pred_blnd = _to_bin(test_scores_v1['prediction_blended'])

print("\n=== TEST: BASELINE ===")
print(classification_report(y_true, y_pred_base, target_names=["Direct","Evasive"], digits=3, zero_division=0))
print(confusion_matrix(y_true, y_pred_base))

print(f"\n=== TEST: {SELECTED_LLM.upper()} ===")
print(classification_report(y_true, y_pred_sel, target_names=["Direct","Evasive"], digits=3, zero_division=0))
print(confusion_matrix(y_true, y_pred_sel))

print("\n=== TEST: BLENDED ===")
print(classification_report(y_true, y_pred_blnd, target_names=["Direct","Evasive"], digits=3, zero_division=0))
print(confusion_matrix(y_true, y_pred_blnd))