# Initial bits

## HuggingFace token

In [None]:
from google.colab import userdata
hf_token = userdata.get('HFtoken')


## Notebook formating and time tracking

In [None]:
import datetime
import time
from IPython.core.events import EventManager
from IPython.display import display, HTML

last_cell_execution = None

# Generator for keeping track of time elapsed since first and most recent cell run
def print_time():
    global last_cell_execution
    start_time = time.time()
    init = True
    while True:
        if init:
            init = False
            yield ' >>> execution timer initialized at ' + time.strftime("%H:%M:%S", time.localtime(start_time))
        else:
            elapsed_tot = time.time() - start_time
            elapsed_m = (elapsed_tot) // 60 # minutes
            elapsed_s = (elapsed_tot) % 60 # seconds

            ret = f'  --> so far: {elapsed_m:02,.0f} min :{elapsed_s:02.0f} s'
            if last_cell_execution is not None:
                elapsed_cell = time.time() - last_cell_execution
                elapsed_cell_m = (elapsed_cell) // 60 # minutes
                elapsed_cell_s = (elapsed_cell) % 60 # seconds
                ret += f'  --> this cell: {elapsed_cell_m:02,.0f} min :{elapsed_cell_s:02,.3f} s'

            yield ret

# Initialize the generator and run its first cycle
execution_time_tracker = print_time()

# The pre callback noting cell execution start time
def cell_startrun_callback():
    global last_cell_execution
    last_cell_execution = time.time()

# The post callback iterating the generator
def print_time_callback():
    print('\n' + str(next(execution_time_tracker)), end = '')

# Attach the callback to colab cell hook
ip = get_ipython()
ip.events.register('pre_run_cell', cell_startrun_callback)
ip.events.register('post_run_cell', print_time_callback)



 >>> execution timer initialized at 15:10:16

In [None]:
### getting printed outputs to wrap
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)


  --> so far: 00 min :00 s  --> this cell: 00 min :0.001 s

# Mount google drive and install basic packages

In [None]:
### access to drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
    drive_mounted = True
except Exception as e:
    drive_mounted = False
    !pip install gdown

### import basic libraries
import os
import pandas as pd
import numpy as np
import ast
import string
import re

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

  --> so far: 00 min :03 s  --> this cell: 00 min :2.914 s

# Import pdf

In [None]:
!pip install pymupdf pdfplumber


  --> so far: 00 min :09 s  --> this cell: 00 min :6.347 s

In [None]:
import fitz  # PyMuPDF

if drive_mounted:
    pdf_path = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/0. Master/1.0 Quarterly Announcements/JPMorganChase - 4q24-earnings-transcript.pdf"

    folder_path = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/0. Master/5.0 Inputs for RAG"

    pdf_paths = []
    file_names = []
    for filename in os.listdir(folder_path):
        if filename.startswith('JPMorganChase') and filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):  # Ensure it's a file, not a subfolder
                print(file_path)
                pdf_paths += [file_path]
                file_names.append(filename)

else:
    ### hasn't been updated, this is just for 4Q24
    !pip install requests
    import requests

    pdf_folder_url = "https://drive.google.com/drive/folders/1dMcc2rv0Vossre6Ab0vZwJSsOuuKG33c?usp=sharing"
    print(f"\n##########\nThe drive folder with the pdf was set to public! If the code to import the pdfs doesn't work, the folder can be accessed with the link to the folder given in this cell:\npdf_folder_url = {pdf_folder_url}\n##########\n")

    url = r"https://drive.google.com/file/d/1w49m7hyXSoy-pV65QRkRt3ibu8HBiFTG/view?usp=sharing"
    url = "https://drive.google.com/uc?export=download&id=1w49m7hyXSoy-pV65QRkRt3ibu8HBiFTG"
    pdf_path = "/content/PDF.pdf"

    !pip install gdown
    import gdown
    file_id_dict = {"4q24":"1jTe8W6HLO3zyw1N8CAUglgWjZOdL5M1H",
                    "3q24":"13Xs5GgFaetigYfOEZAYSzHCn5_1iTwSZ",
                    "2q24":"11gXE8bkp7ASH-d2KTV9dbUh5J_Ub83id"}
    # file_id = "1w49m7hyXSoy-pV65QRkRt3ibu8HBiFTG"
    pdf_paths = []
    for q, id in file_id_dict.items():
        url = f"https://drive.google.com/uc?export=download&id={id}"
        gdown.download(url, f"/content/{q}.pdf", quiet=False)
        pdf_paths += [f"/content/{q}.pdf"]

    # pdf_path = "/content/PDF.pdf"
    # pdf_paths = [pdf_path] # to make it consistent with drive_mounted option
    print(f"PDFs downloaded and saved as {pdf_paths}")

    # Check if the files exists and has content
    for pdf_path in pdf_paths:
        print(f"\nC")
        if os.path.exists(pdf_path):
            print(f"File exists, size: {os.path.getsize(pdf_path)} bytes")
            !file /content/PDF.pdf
        else:
            print("File not found!")

/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/0. Master/5.0 Inputs for RAG/JPMorganChase - 2q24-earnings-transcript.pdf
/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/0. Master/5.0 Inputs for RAG/JPMorganChase - 3q24-earnings-transcript.pdf
/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/0. Master/5.0 Inputs for RAG/JPMorganChase - 4q24-earnings-transcript.pdf

  --> so far: 00 min :10 s  --> this cell: 00 min :0.204 s

In [None]:
def extract_text_pymupdf(pdf_path):
    # text = ""
    # with fitz.open(pdf_path) as doc:
    #     for page in doc:
    #         text += page.get_text() + "\n"
    # return text
    try:
        with fitz.open(pdf_path) as doc:
            text = "\n".join([page.get_text() for page in doc])
        return text
    except Exception as e:
        return f"Error opening PDF: {e}"

pdf_texts = []
Quarters = []
for i, pdf_path in enumerate(pdf_paths):
    pdf_text = extract_text_pymupdf(pdf_path)
    # print(pdf_text)
    pdf_texts.append(pdf_text)
    quarter = pdf_text.strip()
    quarter = quarter[:quarter.find("\n")]
    print(f"\n\nPDF {i+1} extracted:\n{quarter}")
    Quarters.append(quarter)



PDF 1 extracted:
2Q24 FINANCIAL RESULTS  


PDF 2 extracted:
3Q24 FINANCIAL RESULTS  


PDF 3 extracted:
4Q24 FINANCIAL RESULTS 

  --> so far: 00 min :10 s  --> this cell: 00 min :0.748 s

## Clean up text

In [None]:
def clean_pdf_text(pdf_text):
    pdf_text_cleaner = pdf_text

    pdf_text_cleaner = pdf_text_cleaner.replace("\n \n", "\n\n")

    while "\n\n\n" in pdf_text_cleaner:
        pdf_text_cleaner = pdf_text_cleaner.replace('\n\n\n', '\n\n')
    while "...." in pdf_text_cleaner:
        pdf_text_cleaner = pdf_text_cleaner.replace('....', '...')

    pdf_text_cleaner = pdf_text_cleaner[:pdf_text_cleaner.find("\nDisclaimer")] ### cutting out the disclaimer bit at the bootom
    return pdf_text_cleaner

### test with 4Q24
pdf_text = pdf_texts[0]
pdf_text_cleaner = clean_pdf_text(pdf_text)

print(f"{len(pdf_text)} --> {len(pdf_text_cleaner)}")

75325 --> 56734

  --> so far: 00 min :10 s  --> this cell: 00 min :0.008 s

## Find speakers

In [None]:
# <.*?>

def extract_speakers_regex(transcript, pattern = r"\n.*?\n.*?\nQ\n"):
    """
    Extracts speaker names from an earnings call transcript using regex.
    """
    speaker_pattern = re.compile(pattern)  # Captures text before ":"
    speakers = list(set(re.findall(speaker_pattern, transcript)))  # Find all matches

    return speakers

Speakers = []
for i, pdf_text in enumerate(pdf_texts):
    print(f"\n\nSpeakers for {Quarters[i]}")
    pdf_text_cleaner = clean_pdf_text(pdf_text)
    speakers_raw = extract_speakers_regex(pdf_text_cleaner)
    speakers_raw += extract_speakers_regex(pdf_text_cleaner, r"\n.*?\n.*?\nA\n")
    speakers_raw += extract_speakers_regex(pdf_text_cleaner, r"\n....*?\n.*?\nA \n")
    speakers_raw += extract_speakers_regex(pdf_text_cleaner, r"\n....*?\n.*?\nQ \n")
    print(f"speakers raw:\n{speakers_raw}\n")

    speakers = {}
    for speaker in speakers_raw:
        t = speaker.strip("\n")
        name = t[:t.find("\n")]
        t = t[len(name)+1:]
        org = t[:t.find("\n")]
        name = name.strip(" ")
        org = org.strip(" ")
        if name not in speakers.keys():
            speakers[name] = org

    for k, v in speakers.items():
        print(f"{k}: {v}")

    Speakers.append(speakers)




Speakers for 2Q24 FINANCIAL RESULTS  
speakers raw:
['\nJeremy Barnum \nChief Financial Officer, JPMorgan Chase & Co.  \nA \n', '\nJeremy Barnum \nChief Financial Officer, JPMorgan Chase & Co. \nA \n', '\nKen Usdin \nAnalyst, Jefferies LLC  \nQ \n', '\nKen Usdin \nAnalyst, Jefferies LLC \nQ \n', '\nJim Mitchell \nAnalyst, Seaport Global Securities LLC \nQ \n', '\nSaul Martinez \nAnalyst, HSBC Securities (USA), Inc \nQ \n', '\nMike Mayo \nAnalyst, Wells Fargo Securities LLC \nQ \n', '\nSteven Chubak \nAnalyst, Wolfe Research LLC \nQ \n', '\nSteven Chubak \nAnalyst, Wolfe Research LLC  \nQ \n', "\nMatt O'Connor \nAnalyst, Deutsche Bank Securities, Inc. \nQ \n", '\nErika Najarian \nAnalyst, UBS Securities LLC \nQ \n', '\nGerard Cassidy \nAnalyst, RBC Capital Markets LLC \nQ \n', '\nBetsy L. Graseck \nAnalyst, Morgan Stanley & Co. LLC \nQ \n', '\nGlenn Schorr \nAnalyst, Evercore ISI \nQ \n']

Jeremy Barnum: Chief Financial Officer, JPMorgan Chase & Co.
Ken Usdin: Analyst, Jefferies LLC
J

## Split into sections

In [None]:
def break_at_keyword(long_text, keyword, include_line_before_keyword = True, break_char = "\n"):
    matches = [match.start() for match in re.finditer(keyword, long_text)]
    # print("matches:", matches)
    chunks = []
    for i, j in enumerate(matches):
        if i == 0:
            text = long_text[:j]
            if include_line_before_keyword:
                text = text[:text.rfind(break_char)]
        else:
            j_prev = matches[i-1]
            text = long_text[j_prev:j]
            if include_line_before_keyword:
                pre = long_text[:j_prev] ### name of section, comes before the keyword
                text = pre[pre.rfind(break_char):] + text[:text.rfind(break_char)] # cut out name of next section
        chunks += [text]
    if len(matches) > 1:
        if include_line_before_keyword:
            j_prev = matches[-1]
            pre = long_text[:j_prev] ### name of section, comes before the keyword
            text = pre[pre.rfind(break_char):] + long_text[matches[-1]:]
        else:
            text = long_text[matches[-1]:]
    chunks += [text]
    if len(matches) == 0:
        return [long_text]
    return chunks

# testing
Sections = []
for i, pdf_text in enumerate(pdf_texts):
    print(f"\n\nQuarter {Quarters[i]}, section lengths:")

    pdf_text_cleaner = clean_pdf_text(pdf_text)
    sections = break_at_keyword(pdf_text_cleaner, 'SECTION')
    for sec in sections:
        print(len(sec))

    # initial_bit, management_discussion, QA = sections
    print(f"Text#{i+1}: number of sections = {len(sections)}")
    Sections.append(sections)
# matches: [148, 13530]



Quarter 2Q24 FINANCIAL RESULTS  , section lengths:
105
11121
45508
Text#1: number of sections = 3


Quarter 3Q24 FINANCIAL RESULTS  , section lengths:
108
9303
57005
Text#2: number of sections = 3


Quarter 4Q24 FINANCIAL RESULTS , section lengths:
112
13367
37381
Text#3: number of sections = 3

  --> so far: 00 min :10 s  --> this cell: 00 min :0.019 s

In [None]:
N = 0
N1 = 500
for i, sections in enumerate(Sections):
    print(f"\n#####################\nQuarter {Quarters[i]}:\n#####################{sections[2][N:N+N1]}")


#####################
Quarter 2Q24 FINANCIAL RESULTS  :
#####################
QUESTION AND ANSWER SECTION 
  
Operator: Please standby. For our first question, we'll go to the line of Steven Chubak from Wolfe Research. Please go ahead. 

 
 ...  

4 

Steven Chubak 
Analyst, Wolfe Research LLC  
Q 
Hi. Good morning, Jeremy. 
 ...  
Jeremy Barnum 
Chief Financial Officer, JPMorgan Chase & Co.  
A 
Good morning, Steve. 
 ...  
Steven Chubak 
Analyst, Wolfe Research LLC 
Q 
So, wanted to start off with a question on capital just given some indications that the Fed is consid

#####################
Quarter 3Q24 FINANCIAL RESULTS  :
#####################
QUESTION AND ANSWER SECTION 
  
Operator: Thank you. Please stand by. Our first question will come from the line of Jim Mitchell from Seaport Global Securities. You may 
proceed.  

 ...  

4 

Jim Mitchell 
Analyst, Seaport Global Securities LLC 
Q 
Hey, good morning. So, Jeremy, as you highlighted, full year NII guidance implies a sizable

## Get Q&A df

### code to "shape" the Q&As

In [None]:
def break_at_keyword(long_text, keyword, include_line_before_keyword = True, break_char = "\n"):
    matches = [match.start() for match in re.finditer(keyword, long_text)]
    # print("matches:", matches)
    chunks = []
    for i, j in enumerate(matches):
        if i == 0:
            text = long_text[:j]
            if include_line_before_keyword:
                text = text[:text.rfind(break_char)]
        else:
            j_prev = matches[i-1]
            text = long_text[j_prev:j]
            if include_line_before_keyword:
                pre = long_text[:j_prev] ### name of section, comes before the keyword
                text = pre[pre.rfind(break_char):] + text[:text.rfind(break_char)] # cut out name of next section
        chunks += [text]
    if len(matches) > 1:
        if include_line_before_keyword:
            j_prev = matches[-1]
            pre = long_text[:j_prev] ### name of section, comes before the keyword
            text = pre[pre.rfind(break_char):] + long_text[matches[-1]:]
        else:
            text = long_text[matches[-1]:]
    chunks += [text]
    if len(matches) == 0:
        return [long_text]
    return chunks

# # testing
# sections = break_at_keyword(pdf_text_cleaner, 'SECTION')

# initial_bit, management_discussion, QA = sections
# # print(len(sections))
# # matches: [148, 13530]



  --> so far: 00 min :10 s  --> this cell: 00 min :0.004 s

In [None]:
def lookup_speaker(text):
    text = text.strip("\n")
    L = text.split("\n")
    P = f"{L[-2]}, {L[-1]}"
    P = P.strip(" ")
    if " ," in P:
        P = P.replace(" ,", ",")
    if P.endswith("."):
        P = P[:-1]
    return P



  --> so far: 00 min :10 s  --> this cell: 00 min :0.003 s

In [None]:
def remove_thank_you(text, extra_words = []):
    punct = string.punctuation
    Thanks = ['thanks', 'thank you'] #'Thanks', 'Thank you',
    text = text.lower()
    for _ in Thanks + list(punct) + extra_words:
        text = text.replace(_.lower(), "")
    text = text.strip(" ")
    return text


  --> so far: 00 min :10 s  --> this cell: 00 min :0.003 s

In [None]:
speaker_words = []
for speakers in Speakers:
    L = []
    for k, v in speakers.items():
        L += k.split()
        L += v.split()
    # L = list(set(L))
    for l in L:
        if l not in speaker_words:
            speaker_words.append(l)
# print(L)
# speaker_words = L.copy()
speaker_words += ['gentlemen', 'operator', ]

Hellos = ['Hi', 'Hello', 'Good morning', 'Good day', 'Good evening', 'Hey']
speaker_words += Hellos




  --> so far: 00 min :10 s  --> this cell: 00 min :0.003 s

In [None]:
Quarters = []
QAs = []
Discussions = []
for sections in Sections:
    initial_bit, management_discussion, QA = sections
    QAs.append(QA)
    Discussions.append(management_discussion)
    quarter = initial_bit.strip()
    quarter = quarter[:quarter.find("\n")]
    Quarters.append(quarter)



  --> so far: 00 min :10 s  --> this cell: 00 min :0.002 s

In [None]:
Quarters

['2Q24 FINANCIAL RESULTS  ',
 '3Q24 FINANCIAL RESULTS  ',
 '4Q24 FINANCIAL RESULTS ']


  --> so far: 00 min :10 s  --> this cell: 00 min :0.006 s

In [None]:
QA_df = pd.DataFrame()
for enum, QA in enumerate(QAs):
    ### create QA_list
    Res = {
        'Q_num':[],
        'Q_who':[],
        'Q_what':[],
        'A_who':[],
        'A_what':[],
    }

    Qs = [match.start() for match in re.finditer("\nQ\n", QA)]
    Qs += [match.start() for match in re.finditer("\nQ \n", QA)]
    ### Qs = [209, ..., 36696]
    Qs += [len(QA)] ### 37363

    Q_before = ""
    Q_who_before = ""

    for i, q_to in enumerate(Qs[1:]):
        q_from = Qs[i] #+ len("\nQ\n")
        Q_person = lookup_speaker(QA[:q_from])
        q_from += len("\nQ\n")
        As = [match.end() for match in re.finditer("\nA\n", QA[q_from:q_to])]
        As += [match.end() for match in re.finditer("\nA \n", QA[q_from:q_to])]
        if len(As) == 0:
            a = q_to
        else:
            a = As[0]
        Q = QA[q_from:a+q_from]
        Q = Q[:Q.rfind("...")].strip("\n")
        # print(f"Q#{i+1}: {len(As)} matches") ### looks correct
        Q_test = remove_thank_you(Q, speaker_words).split()
        ### Qer_test:
        Qer1, Qer2 = Q_who_before.lower(), Q_person.lower()
        for char in string.punctuation:
            Qer1 = Qer1.replace(char, "")
            Qer2 = Qer2.replace(char, "")
        Qer_test = Qer1 == Qer2
        if len(Q_test) < 3 and Qer_test:
            print(f"\nThis Q:\n{Q}\n-->\nprevious Q:\n{Q_before}")
            Q = Q_before
        elif len(Q_test) < 3 and not Qer_test:
            print(f"Q is just thanks but different speaker:\n{Q_who_before} --> {Q_person}\n{Qer1} --> {Qer2}")

        # if "Sure. The truth is" in QA[q_from:q_to]:
        #     print(QA[q_from:q_to])
        #     print(f"\n{len(As)} answers identified: {As}")
        for j, a in enumerate(As):
            A_person = lookup_speaker(QA[q_from:a+q_from-len("\nA\n")])
            a_from = q_from + a
            if j < len(As)-1:
                a_to = As[j+1] + q_from
            else:
                a_to = q_to
            A = QA[a_from:a_to].strip("\n")
            if len(As) > 0:
                A_before = A
                A = A[:A.rfind("..")].strip("\n")
                A = A[:A.rfind("\n\n")]
            if "Sure. The truth is" in A:
                test_A = A
                # print(f"\n\n{A_before}\n-->{A}")

            # print(A_person)
            Res['Q_num'] += [i+1]
            Res['Q_who'] += [Q_person]
            Res['Q_what'] += [Q]
            Res['A_who'] += [A_person]
            Res['A_what'] += [A]
        Q_before = Q
        Q_who_before = Q_person

    df = pd.DataFrame(Res)
    df['doc_num'] = enum
    df['quarter'] = Quarters[enum]
    print(f"\nQuarter = {quarter}\nlen(df) = {len(df)}")

    QA_df = pd.concat([QA_df, df])
# display(QA_df)

# save as csv or excel
save_folder = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/Giwa"
QA_df.to_excel(os.path.join(save_folder, 'QA_df_JPMC_all.xlsx'), index = False)

Q is just thanks but different speaker:
 --> Steven Chubak, Analyst, Wolfe Research LLC
 --> steven chubak analyst wolfe research llc

This Q:
Thank you. 
-->
previous Q:
Great. Thank you for all that, Jeremy. And just one on the Consumer side, just anything you're noticing in terms of people just have been 
waiting for this delinquency stabilization on the Credit Card side. Obviously, your loss rates are coming in as you expected, and we did see 30 
days pretty flat and 90 days come down a little bit. Is that seasonal? Is it just a good rate of change trend? Any thoughts there? Thanks. 
Q is just thanks but different speaker:
Mike Mayo, Analyst, Wells Fargo Securities LLC --> Betsy L. Graseck, Analyst, Morgan Stanley & Co. LLC
mike mayo analyst wells fargo securities llc --> betsy l graseck analyst morgan stanley  co llc

This Q:
Great. Thank you. 
-->
previous Q:
Hi, good morning. I just had one clean-up question, Jeremy. The consensus provision for 2024 is $10.7 billion. Could you m

In [None]:
print(QA_df.columns)
print(QA_df.quarter.unique())
QA_df.groupby('quarter').Q_what.nunique()

Index(['Q_num', 'Q_who', 'Q_what', 'A_who', 'A_what', 'doc_num', 'quarter'], dtype='object')
['2Q24 FINANCIAL RESULTS  ' '3Q24 FINANCIAL RESULTS  '
 '4Q24 FINANCIAL RESULTS ']


Unnamed: 0_level_0,Q_what
quarter,Unnamed: 1_level_1
2Q24 FINANCIAL RESULTS,26
3Q24 FINANCIAL RESULTS,29
4Q24 FINANCIAL RESULTS,18



  --> so far: 00 min :11 s  --> this cell: 00 min :0.030 s

In [None]:
### grouping Qs

gQA_df = QA_df[['doc_num', 'quarter', 'Q_what', 'Q_who']].copy()

for col in ["A_who", "A_what"]:
    gQA_df[col] = QA_df.groupby(['doc_num', 'Q_what', 'Q_who'])[col].transform(lambda x: '\n\n'.join(x)).to_frame()#.reset_index()

    # print(gQA_df.head())
# gQA_df = gQA_df.reset_index()

gQA_df['Q_num'] = QA_df.groupby(['doc_num', 'Q_what', 'Q_who'])['Q_num'].transform("min")#.reset_index()
# gQA_df = gQA_df.reset_index()
gQA_df.drop_duplicates(inplace = True)
gQA_df = gQA_df[QA_df.columns]

print(gQA_df)
print(gQA_df.shape, QA_df.shape)

gQA_df.to_excel(os.path.join(save_folder, 'gQA_df_JPMC_all.xlsx'), index = False)

    Q_num                                              Q_who  \
0       1         Steven Chubak, Analyst, Wolfe Research LLC   
1       2         Steven Chubak, Analyst, Wolfe Research LLC   
2       5  Saul Martinez, Analyst, HSBC Securities (USA),...   
3       6  Saul Martinez, Analyst, HSBC Securities (USA),...   
4       7  Saul Martinez, Analyst, HSBC Securities (USA),...   
..    ...                                                ...   
25     21  Ebrahim H. Poonawala, Analyst, BofA Securities...   
27     22  Ebrahim H. Poonawala, Analyst, BofA Securities...   
28     24   Gerard Cassidy, Analyst, RBC Capital Markets LLC   
29     25   Gerard Cassidy, Analyst, RBC Capital Markets LLC   
30     26   Gerard Cassidy, Analyst, RBC Capital Markets LLC   

                                               Q_what  \
0                       Hi. Good morning, Jeremy. \n    
1   So, wanted to start off with a question on cap...   
2   Hi, good morning. Thanks for taking my questio...   
3  


  --> so far: 00 min :12 s  --> this cell: 00 min :0.444 s

# Hybrid RAG + agent

In [None]:
!pip install langchain
!pip install -U langchain-community


  --> so far: 00 min :26 s  --> this cell: 00 min :13.959 s

In [None]:
!pip install faiss-cpu


  --> so far: 00 min :33 s  --> this cell: 00 min :6.961 s

### log in to HF

In [None]:
!pip install --upgrade huggingface_hub
# huggingface-cli login
from huggingface_hub import login
login(token=hf_token)
### if the above isn't enough to access the model, see here:
### https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/discussions/93


  --> so far: 00 min :40 s  --> this cell: 00 min :6.945 s

### import libraries

In [None]:
!pip install datasets


  --> so far: 00 min :47 s  --> this cell: 00 min :7.692 s

In [None]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset  # Hugging Face Dataset for batch processing
import torch # to clear cache



  --> so far: 01 min :09 s  --> this cell: 00 min :21.396 s

### define things, helper functions

### define batch size
* L4 can handle batch_size = 2
* testing T4 with batch_size = 8

In [None]:
batch_size = 2 ### trying 8 with L4, ran out of memory; trying 4


  --> so far: 01 min :09 s  --> this cell: 00 min :0.002 s

In [None]:
# Step 4: Define Answer Evaluation Prompt
eval_prompt = PromptTemplate(
    input_variables=["question", "answer", "retrieved_info"],
    template="""
You are an expert at analyzing earnings call Q&A sessions. Compare the given answer to the retrieved reference information.

- Does the answer directly address the question?
- Is the answer specific and informative?
- Does it avoid the question, change the topic, or use vague language?

Question: {question}
Given Answer: {answer}
Reference Info: {retrieved_info}

Classify the answer as:
Well-answered
Partially-answered
Dodged

Also, provide an explanation.
"""
)


  --> so far: 01 min :09 s  --> this cell: 00 min :0.003 s

In [None]:
# Run Evaluation for Each Q&A Pair
def evaluate_qa(qa_df):
    results = []
    for _, row in qa_df.iterrows():
        print(f"\nEvaluating Q #{_+1} of {len(qa_df)}")
        question = row["question"]
        answer = row["answer"]

        # Retrieve relevant transcript context
        # retrieved_docs = retriever.get_relevant_documents(question) ### replacing because of deprecation warning
        retrieved_docs = retriever.invoke(question)
        retrieved_text = "\n".join([doc.page_content for doc in retrieved_docs])

        # Run LLM evaluation
        response = llm(eval_prompt.format(question=question,
                                          answer=answer, retrieved_info=retrieved_text))

        results.append({
            "question": question,
            "answer": answer,
            "retrieved_text": retrieved_text,
            "evaluation": response
        })

    return pd.DataFrame(results)


  --> so far: 01 min :09 s  --> this cell: 00 min :0.004 s

In [None]:
### Define Function to Process in Parallel
def process_batch(batch):
    torch.cuda.empty_cache()  # Clears unused VRAM

    questions = batch["question"]
    answers = batch["answer"]

    # Retrieve relevant transcript context in batch
    retrieved_texts = [
        "\n".join([doc.page_content for doc in retriever.invoke(q)]) for q in questions
    ]

    # Format prompt for each question-answer pair
    formatted_prompts = [
        eval_prompt.format(question=q, answer=a, retrieved_info=r)
        for q, a, r in zip(questions, answers, retrieved_texts)
    ]

    # # Run LLM in batch
    # responses = [llm.invoke(prompt) for prompt in formatted_prompts]

    # 🚀 TRUE BATCH PROCESSING: Call pipeline on the full list at once
    responses = text_gen_pipeline(formatted_prompts, batch_size=batch_size)

    # Extract generated text from the pipeline output
    generated_texts = [resp[0]["generated_text"] for resp in responses]


    return {
        "retrieved_text": retrieved_texts,
        "evaluation": responses,
    }




  --> so far: 01 min :09 s  --> this cell: 00 min :0.003 s

### run analysis

In [None]:
already_have_df = False
if drive_mounted:
    try:
        save_folder = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/Giwa"
        qa_analysis_df = pd.read_excel(os.path.join(save_folder, 'QA_analysis_HybrRAG_Mistral_JPMC_all_original.xlsx'))
        already_have_df = True
    except Exception as e:
        print(f"\n\nCouldn't load df from g-drive.\n{e}")

if not already_have_df:
    qa_analysis_df = pd.DataFrame()

    for doc_num in QA_df.doc_num.unique():
        # Load the earnings call transcript (main discussion before Q&A)
        # main_call_text = management_discussion
        main_call_text = Discussions[doc_num]


        # Step 1: Split Main Call Text into Chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = text_splitter.split_text(main_call_text)

        # Step 2: Store Chunks in a Vector Database
        embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = FAISS.from_texts(chunks, embedding_model)
        retriever = vectorstore.as_retriever()

        # Step 3: Load an Open-Source LLM (e.g., Mistral-7B)
        model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad token explicitly
        model = AutoModelForCausalLM.from_pretrained(model_name,
                                                    device_map="auto",
                                                    #  torch_dtype="auto",
                                                    torch_dtype=torch.float16,  # Use FP16 to save memory
                                                    )

        text_gen_pipeline = pipeline("text-generation",
                                    model=model, tokenizer=tokenizer, max_new_tokens=200,  # Reduce output length
                                    do_sample=True,
                                    temperature=0.2,
                                    pad_token_id = model.config.eos_token_id,
                                    batch_size=batch_size,  # Enables parallel execution on GPU
                                    )
        llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

        # pipe.tokenizer.pad_token_id = model.config.eos_token_id

        # Load the Q&A section from a DataFrame
        qa_df = gQA_df[gQA_df.doc_num == doc_num].rename(columns = {'Q_what':'question', 'A_what':'answer'})

        # Convert Pandas DataFrame to Hugging Face Dataset for Batch Processing
        hf_dataset = Dataset.from_pandas(qa_df).with_format("torch",
                                                            device="cuda",
                                                            # streaming=True,
                                                            )

        # Apply Function to Dataset Efficiently
        ### Tried batches of 8 with A100 but crashed consistently bc ran out of system RAM
        ### trying batch_size = 2 with T4: about 15 min
        result_dataset = hf_dataset.map(process_batch,
                                        batched=True,
                                        batch_size=batch_size,
                                        # disable_fingerprint=True,
                                        )

        # Convert Back to Pandas DataFrame
        df = result_dataset.to_pandas()
        df["doc_num"] = doc_num
        # df['quarter'] = Quarters[doc_num]
        qa_analysis_df = pd.concat([qa_analysis_df, df])

    # Save results
    if drive_mounted:
        save_folder = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/Giwa"
        qa_analysis_df.to_csv(os.path.join(save_folder, "qa_evaluation_results_JPMC_all.csv"), index=False)
        qa_analysis_df.to_excel(os.path.join(save_folder, "QA_analysis_HybrRAG_Mistral_JPMC_all_original.xlsx"), index=False)

    else:
        qa_analysis_df.to_csv("qa_evaluation_results_JPMC_all.csv", index=False)

# Display results
print(qa_analysis_df.head())


   Q_num                                              Q_who  \
0      1         Steven Chubak, Analyst, Wolfe Research LLC   
1      2         Steven Chubak, Analyst, Wolfe Research LLC   
2      5  Saul Martinez, Analyst, HSBC Securities (USA),...   
3      6  Saul Martinez, Analyst, HSBC Securities (USA),...   
4      7  Saul Martinez, Analyst, HSBC Securities (USA),...   

                                            question  \
0                      Hi. Good morning, Jeremy. \n    
1  So, wanted to start off with a question on cap...   
2  Hi, good morning. Thanks for taking my questio...   
3  Okay, got it. That's helpful. Just following u...   
4                     Okay, great. Thanks a lot. \n    

                                               A_who  \
0  Jeremy Barnum, Chief Financial Officer, JPMorg...   
1  Jeremy Barnum, Chief Financial Officer, JPMorg...   
2  Jeremy Barnum, Chief Financial Officer, JPMorg...   
3  Jeremy Barnum, Chief Financial Officer, JPMorg...   
4  J

In [None]:
qa_analysis_df.evaluation.iloc[0][0]

'['


  --> so far: 01 min :09 s  --> this cell: 00 min :0.004 s

In [None]:
# print(qa_analysis_df.evaluation.iloc[0])

print("\n")
print(type(qa_analysis_df.evaluation.iloc[0]), len(qa_analysis_df.evaluation.iloc[0]))

print(type(qa_analysis_df.evaluation.iloc[0][0]), len(qa_analysis_df.evaluation.iloc[0][0]))

if type(qa_analysis_df.evaluation.iloc[0][0]) == dict:
    for k, v in qa_analysis_df.evaluation.iloc[0][0].items():
        print(f"\n{k}:\n{v}")
else:
    print(qa_analysis_df.evaluation.iloc[0])



<class 'str'> 2681
<class 'str'> 1
[{'generated_text': "\nYou are an expert at analyzing earnings call Q&A sessions. Compare the given answer to the retrieved reference information.\n\n- Does the answer directly address the question?\n- Is the answer specific and informative?\n- Does it avoid the question, change the topic, or use vague language?\n\nQuestion: Hi. Good morning, Jeremy. \n \nGiven Answer: Good morning, Steve. \n \nReference Info: MANAGEMENT DISCUSSION SECTION \n ...  \n\nOperator: Good morning, ladies and gentlemen. Welcome to JPMorgan Chase's Second Quarter 2024 Earnings Call. This call is being \nrecorded. Your line will be muted for the duration of the call. We will now go live to the presentation. The presentation is available on \nJPMorgan Chase's website, and please refer to the disclaimer in the back concerning forward-looking statements. Please standby.\nAt this time I would like to turn the call over to JPMorgan Chase's Chief Financial Officer, Jeremy Barnum. 

In [None]:
def clean_answers(ans):
    punct = string.punctuation.replace("-", "")
    punct = punct.replace("\\", "")
    assessments = ["Well-answered", "Partially-answered", "Dodged"]
    if type(ans) != str:
        a = ans[0]['generated_text']
    else:
        a = ans

    found = False
    for _ in [ "\nAnswer: ", "The answer is ", "Answer Classification: "]:
        if _ in a:
            # found = True
            a = a[a.find(_) + len(_):]
            a = a.strip(string.punctuation + " ")
            for asm in assessments:
                if a.lower().startswith(asm.lower()):
                    a = asm
                    found = True
            if "\n" in a:
                a = a[:a.find("\n")]
            if " The speaker " in a:
                a = a[:a.find(" The speaker ")]
    if found:
        for p in punct:
            a = a.strip(p)
        return a
    else: # not found:
        if "Also, provide an explanation." in a:
            a = a[a.find("Also, provide an explanation.") +  len("Also, provide an explanation."):]
            L = []
            for i, _ in enumerate(assessments):
                if _ in a:
                    # L.append(i)
                    L.append(_)
            if len(L) == 1:
                # a = assessments[L[0]]
                print("Diff format # 1")
                return _ #assessments[L[0]] # + "\n\n" + a
            elif len(L) == 0:
                # a = "indeterminate"
                print("Diff format # 2")
                return "Indeterminate!" #\n\n" + a
            else:
                print("\n\nDifferent format #3!")
                return "U/A" + ", ".join(L) #"Wrong#1\n\n" + a
        else:
            print("\n\nVery different format #4!")
            return "U/A" #\n\n" + a


  --> so far: 01 min :09 s  --> this cell: 00 min :0.005 s

In [None]:
# print(qa_analysis_df.answer.iloc[0])

qa_analysis_df["evaluation_clean"] = qa_analysis_df.evaluation.apply(clean_answers)

display(qa_analysis_df["evaluation_clean"].head())

for doc_num in qa_analysis_df.doc_num.unique():
    print(f"\n\nDocument number {doc_num+1}")
    print(qa_analysis_df[qa_analysis_df.doc_num == doc_num]["evaluation_clean"].value_counts())

Diff format # 1
Diff format # 1
Diff format # 1
Diff format # 1
Diff format # 1


Very different format #4!
Diff format # 1


Very different format #4!
Diff format # 1


Very different format #4!
Diff format # 1


Very different format #4!
Diff format # 1
Diff format # 1


Very different format #4!
Diff format # 1


Very different format #4!
Diff format # 1
Diff format # 1
Diff format # 1
Diff format # 1


Very different format #4!


Very different format #4!


Very different format #4!
Diff format # 1
Diff format # 1
Diff format # 1


Very different format #4!
Diff format # 1


Very different format #4!


Very different format #4!
Diff format # 1


Very different format #4!


Very different format #4!


Very different format #4!
Diff format # 1
Diff format # 1
Diff format # 1
Diff format # 1


Very different format #4!
Diff format # 1
Diff format # 1
Diff format # 1


Very different format #4!


Very different format #4!
Diff format # 1


Very different format #4!


Very different for

Unnamed: 0,evaluation_clean
0,Dodged
1,Dodged
2,Partially-answered
3,Dodged
4,Dodged




Document number 1
evaluation_clean
Dodged                15
U/A                    6
Partially-answered     3
Well-answered          2
Name: count, dtype: int64


Document number 2
evaluation_clean
Dodged                13
U/A                   12
Partially-answered     4
Name: count, dtype: int64


Document number 3
evaluation_clean
U/A              8
Dodged           8
Well-answered    2
Name: count, dtype: int64

  --> so far: 01 min :09 s  --> this cell: 00 min :0.031 s

In [None]:
print(qa_analysis_df.columns)
qa_analysis_df = qa_analysis_df[['doc_num', 'quarter',
                                 'Q_num', 'Q_who', 'question',
                                 'A_who', 'answer',
                                 'retrieved_text', 'evaluation',
                                 'evaluation_clean']]

ok_answers = ['Well-answered', 'Partially-answered', 'Dodged', 'U/A']
clean_answers_dict = {}
for eval0 in qa_analysis_df.evaluation_clean.unique():
    if eval0 not in ok_answers:
        eval = eval0[:eval0.find("\n")]
        if eval in ok_answers:
            clean_answers_dict[eval0] = eval

# qa_analysis_df.evaluation_clean = qa_analysis_df.evaluation_clean.map(clean_answers_dict)
print(clean_answers_dict)
qa_analysis_df.value_counts('evaluation_clean')

Index(['Q_num', 'Q_who', 'question', 'A_who', 'answer', 'doc_num', 'quarter',
       '__index_level_0__', 'retrieved_text', 'evaluation',
       'evaluation_clean'],
      dtype='object')
{}


Unnamed: 0_level_0,count
evaluation_clean,Unnamed: 1_level_1
Dodged,36
U/A,26
Partially-answered,7
Well-answered,4



  --> so far: 01 min :09 s  --> this cell: 00 min :0.027 s

In [None]:
df = qa_analysis_df.copy()

print(clean_answers_dict)

df.evaluation_clean = df.evaluation_clean.apply(lambda x: clean_answers_dict if x in clean_answers_dict.keys() else x)

df.evaluation_clean.value_counts()

{}


Unnamed: 0_level_0,count
evaluation_clean,Unnamed: 1_level_1
Dodged,36
U/A,26
Partially-answered,7
Well-answered,4



  --> so far: 01 min :09 s  --> this cell: 00 min :0.016 s

### dealing with U/A

In [None]:
df = df[df.evaluation_clean == 'U/A']
display(df[['evaluation_clean', 'evaluation']].head())

Unnamed: 0,evaluation_clean,evaluation
6,U/A,"[{'generated_text': ""\nYou are an expert at an..."
8,U/A,"[{'generated_text': ""\nYou are an expert at an..."
10,U/A,"[{'generated_text': ""\nYou are an expert at an..."
13,U/A,"[{'generated_text': ""\nYou are an expert at an..."
18,U/A,"[{'generated_text': ""\nYou are an expert at an..."



  --> so far: 01 min :09 s  --> this cell: 00 min :0.039 s

In [None]:
df.evaluation.iloc[0]

'[{\'generated_text\': "\\nYou are an expert at analyzing earnings call Q&A sessions. Compare the given answer to the retrieved reference information.\\n\\n- Does the answer directly address the question?\\n- Is the answer specific and informative?\\n- Does it avoid the question, change the topic, or use vague language?\\n\\nQuestion: Great. Thank you for all that, Jeremy. And just one on the Consumer side, just anything you\'re noticing in terms of people just have been \\nwaiting for this delinquency stabilization on the Credit Card side. Obviously, your loss rates are coming in as you expected, and we did see 30 \\ndays pretty flat and 90 days come down a little bit. Is that seasonal? Is it just a good rate of change trend? Any thoughts there? Thanks. \\nGiven Answer: Yeah. I still feel like when it comes to Card charge-offs and delinquencies, there\'s just not much to see there. It\'s still – it\'s normalization, not \\ndeterioration. It\'s in line with expectations. As I say, we a


  --> so far: 01 min :09 s  --> this cell: 00 min :0.007 s

### cleaning things up a little more


In [None]:
def clean_eval(eval):
    try:
    # if type(eval) == list or type(eval) == np.ndarray:
        text = eval[0]['generated_text']
    except:
        if type(eval) == str:
            text = eval
        else:
            print(type(eval))
            print(eval)
    if "Also, provide an explanation." in text:
        text = text[text.find("Also, provide an explanation.") + len("Also, provide an explanation."):]
    text = text.strip()
    text = text.strip("\n")
    return text

qa_analysis_df.evaluation = qa_analysis_df.evaluation.apply(clean_eval)

qa_analysis_df.evaluation.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_analysis_df.evaluation = qa_analysis_df.evaluation.apply(clean_eval)


Unnamed: 0,evaluation
0,\n\nAnswer: Partially-answered\n\nExplanation:...



  --> so far: 01 min :09 s  --> this cell: 00 min :0.015 s

In [None]:
f"\n\nAnswer: blablah\n\n".strip()

'Answer: blablah'


  --> so far: 01 min :09 s  --> this cell: 00 min :0.004 s

In [None]:
def clean_UA(eval):
    # print(".", end = "")
    ok_answers = ['Well-answered', 'Partially-answered', 'Dodged']
    ok_answers_lower = {x.lower():x for x in ok_answers}
    punct = string.punctuation.replace("-", "")
    punct = punct.replace("\\", "")

    eval = eval.strip("\n")

    e = eval.replace("\n", " ")
    e = e.strip()
    e = e.split(" ")[0]
    # e = e.split("\n")[0]
    for char in punct:
        e = e.replace(char, "")
    # print(f"First word: {eval[:10]} --> {e}")
    if e in ok_answers:
        return e

    if eval.startswith('Answer: '):
        e = eval[len('Answer: '):]
        if e[:e.find("\n")].strip().lower() in ok_answers_lower:
            return ok_answers_lower[e[:e.find("\n")].strip().lower()]
        if e[:e.find(" ")].strip().lower() in ok_answers_lower:
            return ok_answers_lower[e[:e.find(" ")].strip().lower()]

    for pre in ["Classification: ", "classified as ", "Classify the answer as: ", "answer provided by the speaker is ", "\nAnswer:"]:
        e = eval
        while e.count(pre) > 0:
            e = e[e.find(pre) + len(pre):]
            e0 =  e.split()[0].strip()
            # print(e0)
            for char in punct:
                e0 = e0.replace(char, "")
            if e0.lower() in ok_answers_lower:
                return ok_answers_lower[e0.lower()]
            else:
                # print(e0)
                e = e[e.find(pre) + len(pre):]
    S = []
    found = "U/A"
    for res in ok_answers_lower.keys():
        e = eval.lower()
        S.append(e.count(res))
        if e.count(res) > 0:
            if len(S) == 1 or e.count(res) > max(S): #e.count(res) > 0 and
                found = ok_answers_lower[res]
            elif len(S) > 1 and found != 'U/A':
                found = found + ", " + ok_answers_lower[res]
            elif found == 'U/A':
                found = ok_answers_lower[res]
        # elif e.count(res) > 0 and e.count(res) == max(S):
        #     found = found + ", " + ok_answers_lower[res]
    if sum(S) > 0:
        # print(S)
        return found

    return 'U/A'

df = qa_analysis_df.copy()
print(df.value_counts('evaluation_clean'))

df.evaluation_clean = df.evaluation.apply(clean_UA)

print(df.value_counts('evaluation_clean'))

evaluation_clean
Dodged                36
U/A                   26
Partially-answered     7
Well-answered          4
Name: count, dtype: int64
evaluation_clean
Well-answered         36
Partially-answered    19
Dodged                18
Name: count, dtype: int64

  --> so far: 01 min :09 s  --> this cell: 00 min :0.021 s

In [None]:
# df = qa_analysis_df.copy()
print(qa_analysis_df.value_counts('evaluation_clean'))
print(len(df))
qa_analysis_df.loc[qa_analysis_df['evaluation_clean'] == 'U/A', 'evaluation_clean'] = qa_analysis_df.loc[qa_analysis_df['evaluation_clean'] == 'U/A'].evaluation.apply(clean_UA)
display(qa_analysis_df.value_counts('evaluation_clean'))
print(len(qa_analysis_df))

evaluation_clean
Dodged                36
U/A                   26
Partially-answered     7
Well-answered          4
Name: count, dtype: int64
73


Unnamed: 0_level_0,count
evaluation_clean,Unnamed: 1_level_1
Dodged,38
Well-answered,27
Partially-answered,8


73

  --> so far: 01 min :09 s  --> this cell: 00 min :0.019 s

In [None]:
for quarter in qa_analysis_df.quarter.unique():
    print(f"\n\nDocument {quarter}")
    g = qa_analysis_df[qa_analysis_df.quarter == quarter]["evaluation_clean"].value_counts().to_frame()
    g['percent'] = qa_analysis_df[qa_analysis_df.quarter == quarter]["evaluation_clean"].value_counts(normalize = True).round(3)*100
    print(g)



Document 2Q24 FINANCIAL RESULTS  
                    count  percent
evaluation_clean                  
Dodged                 16     61.5
Well-answered           7     26.9
Partially-answered      3     11.5


Document 3Q24 FINANCIAL RESULTS  
                    count  percent
evaluation_clean                  
Dodged                 13     44.8
Well-answered          11     37.9
Partially-answered      5     17.2


Document 4Q24 FINANCIAL RESULTS 
                  count  percent
evaluation_clean                
Well-answered         9     50.0
Dodged                9     50.0

  --> so far: 01 min :09 s  --> this cell: 00 min :0.025 s

In [None]:
qa_analysis_df[qa_analysis_df.evaluation_clean.isin(["U/A"])][["evaluation", "evaluation_clean"]]

Unnamed: 0,evaluation,evaluation_clean



  --> so far: 01 min :09 s  --> this cell: 00 min :0.017 s

### saving things

In [None]:
# num = 11
# print(qa_analysis_df.evaluation_clean.iloc[num])
# print("\n")
# print(qa_analysis_df.evaluation.iloc[num][0]['generated_text'])

# Set display options to show full text
pd.set_option('display.max_colwidth', None)

qa_analysis_df[qa_analysis_df.evaluation_clean.isin(["Dodged", "U/A"])][["question", "answer", "evaluation_clean"]]

Unnamed: 0,question,answer,evaluation_clean
0,"Hi. Good morning, Jeremy. \n","Good morning, Steve. \n",Dodged
1,"So, wanted to start off with a question on capital just given some indications that the Fed is considering favorable revisions to both Basel III \nEndgame and the GSIB surcharge calculations, which I know you've been pushing for some time. As you evaluate just different capital \nscenarios, are these revisions material enough where they could support a higher normalized ROTCE at the Firm versus the 17% target? And \nif so, just how that might impact or inform your appetite for buybacks going forward? \n","Right. Okay. Thanks, Steve. And actually, before answering the question, I just want to remind everyone that Jamie is not able to join because \nhe has a travel conflict overseas, so it's just going to be me today. \n\nOkay. Good question on the capital and the ROTCE. So, let me start with the ROTCE point first. In short, my answer to that question would be \nno. It's hard to imagine a scenario coming out of the whole potential range of outcomes on capital that involves an upward revision on ROTCE. \nIf you think about the way we've been talking about this, we've said that before the Basel III Endgame proposal, we had a 17% through the \ncycle target, and that while you can imagine a range of different outcomes, the vast majority of them involve expansions of the denominator. \nAnd while we had ideas about changing the perimeter and repricing, all of which are still sort of in effect, most of those would be thought of as \nmitigants rather than things that would actually, like increase the ROTCE. And I don't really think that answer has particularly changed. \n\nSo, as of now, that's what I would say, which is a good pivot to the next point, which is, yeah, we've been reading the same press coverage \nyou've been reading and it's fun and interesting to speculate about the potential outcomes here. But in reality, we don't know anything you \ndon't know. We don't know how reliable the press coverage is. And so, in that sense, I feel like on the overall capital return and buyback \ntrajectory, not much has actually changed relative to what I laid out at Investor Day, the comments that I made then, the comments that Jamie \nmade then, as well as the comments that Jamie made the subsequent week at an industry conference. \n\nSo, maybe I'll just briefly summarize for everyone's benefit what we think that is, which is, one, we do recognize that our current practice on \ncapital return and buybacks does lead to an ever-expanding CET1 ratio. But obviously, we're going to run the company over the cycle over \ntime at a reasonable CET1 ratio with reasonable buffers relative to our requirement. So, after all the uncertainty is sorted out, the question of \nthe deployment of the capital, one way or another, is a matter of when, not if. \n\nOn the capital hierarchy, it's also worth noting that's another thing that remains unchanged, so I'll review it quickly. Growing the business \norganically and inorganically, sustainable dividend, and in that context it's worth noting that the board's announced intention to increase it to \n$1.25 is a 19% increase prior to last year. So, that's a testament to our performance and that is a return of capital.",Dodged
3,"Okay, got it. That's helpful. Just following up on capital returns on Steve's question, I think you highlighted in response it's a matter of when, \nnot if, and obviously, Jamie’s not there, you can’t speak for Jamie, but seems to have shown limited enthusiasm for a special dividend or \nbuybacks at current valuations. Can you just give us a sense of how you’re thinking about the various options? Any updated thoughts on a \nspecial dividend? And can you do other things like, for example, have a material increase in your dividend payout, sort of a step function \nincrease, where keep that flat and grow into that, grow your earnings into that over time? Can you just maybe give us a sense of how you're \nthinking about what options you have available to deploy that capital?","Sure. Yeah. I mean, I would direct you to read, I'm sure you have, Jamie's comments at the industry conference, where he participated the \nweek after Investor Day because he went into just good amount of detail on this stuff, addressing some of these points. And I think this \ncomment there about the special dividend was that it's not really our preference. We hear from people that many of our investors wouldn't find \nthat particularly appealing, and he said as much that it wouldn't be sort of our first choice.",Dodged
4,"Okay, great. Thanks a lot. \n","Thanks, Saul. \n...",Dodged
5,"Thanks a lot. Good morning, Jeremy. Jeremy, great to see the progress on Investment Banking fees up sequentially and 50% year-over-year, \nand I saw you on the tape earlier just talking about still regulatory concerns a little bit in the advisory space and we clearly didn't see the debt \npull-forward play through because your DCM was great again. I'm just wondering just where you feel the environment is relative to the \npotential and just where the dialogue is across the three main bucket areas in terms of, like how does this feel in terms of a current \nenvironment versus a potential environment that we could still see ahead. Thanks.","Yeah. Thanks, Ken. It's progress, right? I mean, we're happy to see the progress. People have been talking about depressed banking fee \nwallet for some time and it's nice to see not only the year-on-year pop from a low base, but also a nice sequential improvement. So, that's the \nfirst thing to say. \n\nIn terms of dialog and engagement, it's definitely elevated. So, the dialog on ECM is elevated, and the dialog on M&A is quite robust as well. \nSo, all of those are good things that encourage us and make us hopeful that we could be seeing sort of a better trend in this space. But there \nare some important caveats. So, on the DCM side, yeah, we made pull-forward comments in the first quarter, but we still feel that this second \nquarter still reflects a bunch of pull-forward and therefore, we're reasonably cautious about the second half of the year. Importantly, a lot of the \nactivity is refinancing activity as opposed to, for example, acquisition finance. So, the fact that M&A remains still relatively muted in terms of \n\n7 \n\nactual deals has knock-on effects on DCM as well. And when a higher percentage of the wallet is refied, then the pull-forward risk becomes a \nlittle bit higher. \n\nOn ECM, if you look at it kind of at a remove, you might ask the question, given the performance of the overall indices, you would think it \nwould be a really booming environment for IPOs for example. And while it's improving, it's not quite as good as you would otherwise expect, \nand that's driven by a variety of factors, including the fact that, as has been widely discussed, the extent to which the performance of the large \nindices is driven by, like a few stocks, the sort of mid-cap tech growth space and other spaces that would typically be driving IPOs have had \nmuch more muted performance.",Dodged
7,"you see the same trend happening on the asset-backed finance side because that's a bigger part of the world and it's a bigger part of your \nbusiness? So, I'd appreciate your thoughts there. Thanks.","Yeah. Thanks, Glenn. So, on private credit, so nothing really new to say there. I think, I guess, one way the environment's evolving a little bit \nis, as you know, a lot of money has been raised in private credit funds looking for deals. And sort of a little bit to my prior comment, in a \nrelatively muted acquisition finance environment, at this point you've got a lot of money chasing, like not that many deals. So, the space is a \nlittle bit quieter than it was at the margin. \n\nAnother interesting thing to note is some of this discussion about kind of lender protections that were typical in the syndicated leveraged \nfinance market making their way into the private market as well as sort of people realize that even in the private market you probably need \nsome of those protections in some cases, which is sort of supportive of the theme that we've been talking about, about convergence between \nthe direct lending space and the syndicated lending space, which is kind of our core thesis here, which is that we can offer best-in-class \nservice across the entire continuum, including secondary market trading and so on. So, we feel optimistic about our offering there. I think the \ncurrent environment is maybe a little bit quieter than it was. So, it's maybe not a great moment to, like kind of test whether we're doing a lot \nmore or less in the space, so to speak.",Dodged
9,Super helpful. Thanks so much for that.,"Thanks, Glenn. \n\n9 \n\n...",Dodged
12,Got it. Okay. Thank you for the details. \n\n10,"Thanks, Matt. \n...",Dodged
14,"Your 17% through the cycle ROTCE expectation, what is the CET1 ratio that you assume for that?","I mean, we would generally assume requirements plus a reasonable buffer, which, depending on the shape of rules, could be a little bit \nsmaller or a little bit bigger and no small part is a function of the volatility of those rules, which goes back to my prior comments on SCB and \nCCAR. But obviously, as you well know, what actually matters is less the ratio and more the dollars and at this point, the dollars are very much \na function of where rules land and where the RWA lands and obviously, things like GSIB recalibration and so on. So, we've done a bunch of \nscenario analysis along the lines of what I did at Investor Day that informs those numbers. But that is obviously one big element of uncertainty \nbehind that 17%, which is why at Investor Day when we talked about it, both Daniel and I were quite specific about saying that we thought \n17% was still achievable, assuming a reasonable outcome on the Basel III Endgame. \n",Dodged
16,"Hi, Jeremy. Hi, Jeremy.","Hi, Betsy. \n",Dodged



  --> so far: 01 min :09 s  --> this cell: 00 min :0.038 s

In [None]:
qa_analysis_df[qa_analysis_df.evaluation_clean.isin(["Partially-answered"])][["question", "answer", "evaluation_clean"]]

Unnamed: 0,question,answer,evaluation_clean
2,"Hi, good morning. Thanks for taking my question. Jeremy, can you give an update on the stress capital buffer? You noted obviously that you \nthink there is an error in the Fed's calculation due to OCI. Can you just give us a sense of what the dialogue with the Fed looks like? Is there a \nprocess to modify the SCB higher and if you could give us a sense of what that process looks like? \n","Yeah. So, I'm not going to comment about any conversations with the Fed – not to confirm or deny that they even exist. That stuff is private. \nAnd so, then if you talk about, like the timing here, right, so you know that the stress capital buffer that's been released at 3.3% is a preliminary \nnumber. By rule, the Fed has to release that by August 31. It may come sooner. You talked about an error in the calculation. We haven't used \nthat word. What we know, what we believe rather is that the amount of OCI gain that came through the Fed's disclosed results looked non-\nintuitively high to us. And if you adjust that in ways that we think are reasonable, you would get a slightly higher stress capital buffer. Whether \nthe Fed agrees and whether they decide to make that change or not is up to them, and we'll see what happens.",Partially-answered
11,"Yeah, that was a really helpful chart. Just the one follow-up, on the yield curve effects, I guess, what do you mean by that because right now \nthe yield curve is inverted, maybe you're still breathing in the impact of that. But kind of longer term, you'd expect little bit of steepness of the \ncurve, which I would think would help. But what did you mean by that? Thank you.","Yeah. I mean, you and I have talked about this before, I guess. I sort of – I guess, I don't really agree fundamentally with the notion that the \nway to think about things is that sort of yield curve steepness above and beyond what's priced in by the forewords is a source of structural NII \nor NIM for banks, if you know what I mean. Like, I mean, people have different views about the so-called term premium and obviously, in a \nmoment of inverted curve and different types of treasury supply dynamics, people's thinking on that may be changing.",Partially-answered
15,"Let me just zoom out for one more question on the return target. I mean, when I asked Jamie at the 2013 Investor Day, would it make sense to \nhave 13.5% capital, he was basically telling me to take a hike, right? And now, you have 15.3% capital and you're saying, well, we might want \nto have a lot more capital here. I mean, at some point, if you're spending $17 billion a year to improve the company, if you're gaining share \nwith digital banking, if you're automating the back office, if you're moving ahead with AI, if you're doing all these things that I think you say \nothers aren't doing, why wouldn't those returns go higher over time or do you just assume you'll be competing those benefits away? Thanks.","Yeah. I mean, I think in short, Mike, and we've talked about this a lot and Jamie's talked about this a lot, it's a very, very, very competitive \nmarket, and we're very happy with our performance. We're very happy with the share we've taken, and 17% is like an amazing number \nactually. And like, to be able to do that, given how robust the competition is from banks, from non-banks, from U.S. banks, from foreign banks, \nand all of the different businesses that we compete in, is something that we're really proud of. So, the number has a range around it, \nobviously. So, it's not a promise, it's not a guarantee, and it can fluctuate. But we're very proud to be in the ballpark of being able to think that \n\n11 \n\nwe can deliver it, again, assuming a reasonable outcome on Basel III Endgame. But it's a very, very, very competitive market across all of our \nproducts and services and regions and client segments. \n",Partially-answered
27,All right. Thanks a lot. \n,Thanks. \n ...,Partially-answered
29,had some aversion to that in the past. But do you anticipate redeploying additional excess liquidity just amid the expectation for deeper rate \ncuts?,"Yeah. Sure. So on extending duration, Steve, you know this obviously, but I just think it's important to say that all else equal, extending \nduration doesn't change expected NII if you assume that the policy rate follows the forward, right? So, point one. \n\n... people have mortgages at 6%, so, right. \n\n\nYeah. Now so when we think of the question of extending duration and really managing duration right now, a couple things to say. So, \nobviously, a lot of different versions of duration, but one number that we disclose is the EaR. When the 10-Q comes out, you'll see that that \nnumber is a little bit lower. It'll come down from $2.8 billion to about $2.1 billion if our current estimates are correct. That's for a number of \nreasons, some of which are passive, but some of those are active choices to extend duration a little bit. \n\nAnd the one thing I can assure you is the forward curve will not be the same forward curve in six months. \n",Partially-answered
36,"Good morning. So lower rates was supposed to drive a pick-up in loan growth and conversion of some of these investment banking pipelines. \nObviously, we just had one cut and it's early. But any beginning signs of this in terms of the interest in borrowing more, and again, conversion \nof the banking pipelines?","I would say, Matt, generally no, frankly, with a couple of minor exceptions. So I think it's probably fair to say that the outperformance late in the \nquarter in investment banking fees was to a meaningful degree, as I mentioned, driven by DCM as well as to some degree driven by the \nacceleration of the closing of some M&A transactions. And I do think that some of that DCM outperformance is in the sort of types of deals that \nare opportunistic deals that aren't in our pipeline, and those are often driven by treasurers and CFOs sort of seeing improvement in market \nlevels and jumping on those. So it's possible that that's a little bit of a consequence of the cuts. \n\nYeah. And also in the debt markets, rates came down, spreads are quite low, and markets are wide open. So it kind of makes sense that \npeople are taking advantage of that today. Those conditions may not prevail, be the ongoing conditions late next year. \n",Partially-answered
38,"Hey. Jamie, I think I've seen you comment on government this year more than any other time in your career. And the August 2 op-ed, \nWashington Post, Davos you're talking about government. I think it was this week or last week on Bloomberg, you're saying bank mergers \nshould be allowed. Your bus tour in August, you were asked, which is my question now, under what circumstances would you leave for \ngovernment service and your answer then was “I love what I do.” We get it you love what you do. But under what circumstances would you \nconsider government service? It seems like you'd be more likely to go now than in the past just based on the numerous comments that you've \nmade. Is that right, wrong, what's your thinking?","I think it's wrong. I've always been an American patriot. My country is more important to me than my company, and I think this government is \nvery important to get this – and if you look at the world today, Mike, it is so important that we get things right for the whole geopolitical world. \nI'm not just talking about the American economy, and we try to participate in policy at the local level, at the state level, at the federal level, at \nthe international level to try to help. That's our job. We try to grow economies and things like that. So nothing has changed in my view, my \nopinion, or my interest. I just think it's very, very important that we try to help the government do a good job. \n",Partially-answered
49,What about loans? The Apple card…,"But creating those assets to help clients. That's a whole different matter because when it comes to clients, we earn credit asset spread and we \nusually have other stuff. If our bankers can deploy capital that way, of course be that we want to do more. And our CIO could deploy capital in \nmultiple ways, we would probably do more. And we ask all the time can we do more in affordable housing, can we do more in things we're \nactually quite comfortable. And yeah, if we could find ways to deploy capital, we would be happy to do that. But put us in your closest \ncategory... \n",Partially-answered



  --> so far: 01 min :09 s  --> this cell: 00 min :0.039 s

### Cleaning greetings and pleasantries off the Q&A

In [None]:
# Q_test = remove_thank_you(Q, speaker_words).split()

def is_greeting(text, threshold = 1):
    L = remove_thank_you(text, speaker_words).split()
    return len(L) <= threshold

qa_analysis_df['is_greeting'] = (qa_analysis_df.question.apply(is_greeting) | qa_analysis_df.answer.apply(is_greeting))

# Set display options to show full text
pd.set_option('display.max_colwidth', None)

print(qa_analysis_df.value_counts('is_greeting'))

qa_analysis_df.loc[qa_analysis_df.is_greeting == True, ['question', 'answer', 'evaluation_clean', 'is_greeting']]

is_greeting
False    57
True     16
Name: count, dtype: int64


Unnamed: 0,question,answer,evaluation_clean,is_greeting
0,"Hi. Good morning, Jeremy. \n","Good morning, Steve. \n",Dodged,True
4,"Okay, great. Thanks a lot. \n","Thanks, Saul. \n...",Dodged,True
9,Super helpful. Thanks so much for that.,"Thanks, Glenn. \n\n9 \n\n...",Dodged,True
12,Got it. Okay. Thank you for the details. \n\n10,"Thanks, Matt. \n...",Dodged,True
16,"Hi, Jeremy. Hi, Jeremy.","Hi, Betsy. \n",Dodged,True
18,"All right. Thank you, Jeremy. Appreciate it.","Thanks, Betsy. \n...",Dodged,True
19,"Hi, Jeremy. How are you?","Hi, Gerard. \n",Dodged,True
22,Great. Appreciate the insights as always. Thank you.,"Thanks, Gerard. \n...",Dodged,True
27,All right. Thanks a lot. \n,Thanks. \n ...,Partially-answered,True
30,"Well said. Well, thank you so much for taking my questions.","Thanks, Steve. \n...",Dodged,True



  --> so far: 01 min :09 s  --> this cell: 00 min :0.063 s

In [None]:
df = qa_analysis_df[qa_analysis_df.is_greeting == False]
for quarter in df.quarter.unique():
    print(f"\n\nDocument {quarter}")
    g = df[df.quarter == quarter]["evaluation_clean"].value_counts().to_frame()
    g['percent'] = df[df.quarter == quarter]["evaluation_clean"].value_counts(normalize = True).round(3)*100
    print(g)



Document 2Q24 FINANCIAL RESULTS  
                    count  percent
evaluation_clean                  
Dodged                  8     44.4
Well-answered           7     38.9
Partially-answered      3     16.7


Document 3Q24 FINANCIAL RESULTS  
                    count  percent
evaluation_clean                  
Well-answered          11     47.8
Dodged                  8     34.8
Partially-answered      4     17.4


Document 4Q24 FINANCIAL RESULTS 
                  count  percent
evaluation_clean                
Well-answered         9     56.2
Dodged                7     43.8

  --> so far: 03 min :21 s  --> this cell: 00 min :0.043 s


  --> so far: 23 min :39 s  --> this cell: 20 min :17.695 s

# Save results

In [None]:
if drive_mounted:
    save_folder = "/content/drive/My Drive/Colab Notebooks/DSCA/CAM_DS Employer Project/Giwa"
    qa_analysis_df.to_excel(os.path.join(save_folder, 'QA_analysis_HybrRAG_Mistral_JPMC_all_cleaned.xlsx'), index = False)


  --> so far: 23 min :41 s  --> this cell: 00 min :0.061 s


  --> so far: 01 min :09 s  --> this cell: 00 min :0.097 s