# **Po Taggle III - May 2024**

# What is the problem to solve
In order to summarize an earnings call transcript, we want to extract all the participants and identify, for each of them, the relevant sentences from which financial information can be extracted. That information is then listed in a financial table and in a bullet-point list, differentiating sentences from the PRESENTATION session and the Q&A session.
# How it has been solved and implemented
In order to extract relevant financial sentences from the transcript, I have implemented a pre-trained NER (Named Entity Recognition) model (distilled BERT), fine-tuned with custom entities: ***Reported Value***, ***Multiplier***, ***Currency*** and ***Granular Concept***.

To train and fine-tune it I have used the spaCy library.
I have used a separate notebook (<a href="train_model.ipynb" target="_blank">train_model.ipynb</a>) to train the model with the custom entities.

---



# Install and import necessary packages

In [16]:
# install any necessary packages
#python -m pip install fitz
#python -m pip install PyMuPDF
#python -m pip install -U spacy
#python -m pip install spacy_transformers
#python -m pip install python-docx

import spacy
import spacy_transformers
import docx
import sys
import fitz
import re

# Load the trained model
**NOTE:** to fine-tune it I have only used the financial info extracted from the Novartis, Carlsberg and .... summarization files provided, annotating them manually using https://tecoholic.github.io/ner-annotator/ . <br/>
The ***model score is 0.62*** due to limited annotated data, time and resources; its results already seem good, but they can definitely be improved with more data.

In [17]:
# Load the fine-tuned spaCy NER pipeline from disk (path is relative to the
# working directory); `nlp` is the callable used below to run inference.
nlp = spacy.load('trained_models/output/model-last')

# Run inference with the model if you want to test it!

In [18]:
# Smoke test: run the loaded spaCy NER model on a hard-coded sample sentence
doc = nlp("Net new Digital Media ARR $432 million")

# Iterate through the named entities recognized by the model
for ent in doc.ents:
  # Print the recognized text span and its corresponding entity label
  print(ent.text, "  ->>>>  ", ent.label_)

ARR   ->>>>   GRANULAR CONCEPT
$   ->>>>   CURRENCY
432   ->>>>   REPORTED VALUE
million   ->>>>   MULTIPLIER


# Extract Paragraphs, Conference and Corporate participants from the transcript given in input

In [19]:
# Path to the transcript PDF file that is opened and parsed below
fname = 'test_dataset/Adobe.pdf'

In [20]:
def extract_partecipants(partecipantsList):
  """Split the participants page into corporate and conference-call names.

  The first 6 lines of the page are skipped. Rows following the
  "CORPORATE PARTICIPANTS" marker are collected as corporate participants,
  rows following "CONFERENCE CALL PARTICIPANTS" as conference participants.
  Rows equal to a single space and rows before any marker are ignored.
  Scanning stops at the "PRESENTATION" marker.

  Args:
    partecipantsList: list of text lines of the participants page.

  Returns:
    A tuple ``(corp_participants, conf_participants, idx)`` where ``idx`` is
    the 0-based position, counted after the 6 skipped lines, of the
    "PRESENTATION" row. If that marker is missing, ``idx`` is the position of
    the last scanned row (``-1`` when there were no rows), so callers can
    always unpack three values instead of failing on ``None``.
  """
  corp_participants = []
  conf_participants = []

  # List currently being filled; None until the first section marker is seen.
  target = None
  idx = -1

  for idx, row in enumerate(partecipantsList[6:]):
    if row == "CORPORATE PARTICIPANTS":
      target = corp_participants
    elif row == "CONFERENCE CALL PARTICIPANTS":
      target = conf_participants
    elif row == "PRESENTATION":
      # End of the participants section; idx now points at this marker.
      break
    elif row != " " and target is not None:
      target.append(row)

  return corp_participants, conf_participants, idx

def extract_parag(txt):
  """Group transcript lines into speaker/paragraph records.

  A line equal to a single space closes the current record. The first line
  after such a separator (or at the very start) is taken as the speaker;
  every following line is appended to the paragraph text, each prefixed with
  a space. An open record is also flushed when the very last line is a
  paragraph line.

  Args:
    txt: sequence of text lines.

  Returns:
    A list of dicts ``{'speaker': <stripped speaker>, 'paragraph': [<text>]}``.
  """
  records = []
  current_speaker = 'Unknown'
  body = ''
  expecting_speaker = True
  last_position = len(txt)

  for position, line in enumerate(txt, start=1):
    if line == ' ':
      # Separator line: close the record and reset the state.
      records.append({'speaker': current_speaker.strip(), 'paragraph': [body]})
      current_speaker, body = 'Unknown', ''
      expecting_speaker = True
    elif expecting_speaker:
      current_speaker = line
      expecting_speaker = False
    else:
      body = body + " " + line
      if position == last_position:
        # No trailing separator: flush the final record here.
        records.append({'speaker': current_speaker.strip(), 'paragraph': [body]})

  return records

# Open the transcript PDF with PyMuPDF (fitz)
doc = fitz.open(fname)

# Accumulator for the transcript body text (starts as a single space)
text = " "

# Transcript title, read from the first page
title = ''

for page in doc:
  spl = str(page.get_text()).splitlines()

  if page.number == 0:
    # First page: only the title line is used
    # (index 7 is layout-specific; presumably the title row, verify per source)
    title = spl[7]
  elif page.number == 1:
    # Second page: participant lists, then the start of the transcript body
    corp_participants, conf_participants, end_index = extract_partecipants(spl)
    text += '\n'.join(spl[end_index+8:]) + '\n'
  elif page.number == len(doc)-1:
    # Last page: skip the first 6 lines and drop the trailing 11 lines (disclaimer)
    text += '\n'.join(spl[6:len(spl)-11]) + '\n'
  else:
    # Any other page: skip the first 6 lines
    text += '\n'.join(spl[6:]) + '\n'

# Split the accumulated text into speaker/paragraph records
parags = extract_parag(text.splitlines())

# Functions for inferring the sentences and creating the document

In [118]:
def add_title(doc):
  """Append the bold Arial 12pt title line to the document.

  The text is "Summarization " followed by the module-level ``title``.

  Args:
    doc: document object the title paragraph is added to.
  """
  title_paragraph = doc.add_paragraph()

  # Single line spacing, no extra space after the title
  layout = title_paragraph.paragraph_format
  layout.line_spacing = 1
  layout.space_after = 0

  title_run = title_paragraph.add_run("Summarization " + title)
  title_run.bold = True

  title_font = title_run.font
  title_font.name = 'Arial'
  title_font.size = docx.shared.Pt(12)

def _add_participant_list(doc, heading, participants):
  """Write a section heading followed by one participant per line.

  All names go into a single Arial 8pt run of one paragraph with double line
  spacing; each name is preceded by a line break.

  Args:
    doc: document object the list paragraph is added to.
    heading: section heading text, written through ``add_heading``.
    participants: iterable of participant strings.
  """
  # NOTE: add_heading takes no document argument, so the heading is written
  # by that helper rather than through the `doc` parameter used below.
  add_heading(heading)

  paragraph = doc.add_paragraph()
  paragraph.paragraph_format.line_spacing = 2

  names_run = paragraph.add_run("")
  names_run.font.name = 'Arial'
  names_run.font.size = docx.shared.Pt(8)

  for participant in participants:
    names_run.add_break()
    names_run.add_text(participant)


def add_corp_partecipants(doc):
  """Append the "CORPORATE PARTICIPANTS" section.

  Names are read from the module-level ``corp_participants`` list.

  Args:
    doc: document object the section is added to.
  """
  _add_participant_list(doc, "CORPORATE PARTICIPANTS", corp_participants)


def add_conf_partecipants(doc):
  """Append the "CONFERENCE CALL PARTICIPANTS" section.

  Names are read from the module-level ``conf_participants`` list.

  Args:
    doc: document object the section is added to.
  """
  _add_participant_list(doc, "CONFERENCE CALL PARTICIPANTS", conf_participants)

def clean_data_from_infer_errors(list_data):
  """Keep only the sentences that carry a numeric reported value.

  A sentence is kept when at least one of its entities is labelled
  "REPORTED VALUE" and its text, after removing ',' and '.', consists of
  digits only. Sentences without such an entity are treated as likely
  inference errors and dropped.

  Args:
    list_data: list of dicts whose 'entity' value exposes ``.ents``, each
      entity having ``.text`` and ``.label_``.

  Returns:
    A new list with the retained dicts, in their original order.
  """
  def has_numeric_reported_value(analyzed):
    return any(
      entity.label_ == "REPORTED VALUE"
      and entity.text.replace(',', '').replace('.', '').isdigit()
      for entity in analyzed.ents
    )

  return [
    record for record in list_data
    if has_numeric_reported_value(record['entity'])
  ]

# Umbrella concept -> keywords searched (as substrings) in the granular concept.
# Order matters: the first umbrella with a matching keyword wins, so the more
# specific 'cash flow' entry must come before the generic 'cash' entry;
# otherwise "free cash flow" would be classified as 'cash & cash equivalents'.
_UMBRELLA_CONCEPT_KEYWORDS = (
  ('sales', ('sales', 'arr', 'revenue')),
  ('cash flow', ('free cash flow', 'cash flow from operations')),
  ('cash & cash equivalents', ('cash',)),
  ('margin', ('margin',)),
  ('debt', ('debt',)),
  ('dps', ('dividend',)),
  ('eps', ('reported earnings per share',)),
  ('profit', ('net profit', 'operating income')),
)


def detect_umbrella_concept(granular_concept):
  """Map a granular concept to its umbrella concept via keyword lookup.

  The match is a case-insensitive substring search, so short keywords such
  as 'arr' also match inside longer words.

  Args:
    granular_concept: granular concept text (a string).

  Returns:
    The umbrella concept name of the first entry with a matching keyword,
    or an empty string when nothing matches.
  """
  lowered = granular_concept.lower()

  for umbrella, keywords in _UMBRELLA_CONCEPT_KEYWORDS:
    if any(keyword in lowered for keyword in keywords):
      return umbrella

  return ""


def add_fin_table(doc,list_presentation):
  """Append the "FINANCIAL TABLE" section: one table row per sentence.

  Columns are Reported Value, Multiplier, Currency, Granular Concept,
  Umbrella Concept and Sentence. Every recognised entity is appended to the
  cell of its label on a new line. For each granular concept the detected
  umbrella concept is appended as well, so the two columns stay aligned
  line by line. Sentences spoken by "Operator" are skipped.

  Args:
    doc: document object the table is added to.
    list_presentation: list of dicts with a 'speaker' string and an 'entity'
      object exposing ``.text`` and ``.ents`` (entities with ``.text`` and
      ``.label_``).
  """
  add_heading("FINANCIAL TABLE")

  table = doc.add_table(rows=1, cols=6)
  table.style = 'Table Grid'

  # Header row
  header = table.rows[0].cells
  header[0].text = 'Reported Value'
  header[1].text = 'Multiplier'
  header[2].text = 'Currency'
  header[3].text = 'Granular Concept'
  header[4].text = 'Umbrella Concept'
  header[5].text = 'Sentence'

  # Entity label -> index of the column it is written to
  column_for_label = {
    "REPORTED VALUE": 0,
    "MULTIPLIER": 1,
    "CURRENCY": 2,
    "GRANULAR CONCEPT": 3,
  }
  granular_column = column_for_label["GRANULAR CONCEPT"]
  umbrella_column = 4

  for item in list_presentation:

    # The Operator's sentences are not needed in the table
    if item['speaker'] == "Operator":
      continue

    labels = item['entity']

    cells = table.add_row().cells
    cells[5].text = labels.text

    for ent in labels.ents:
      column = column_for_label.get(ent.label_)
      if column is None:
        # Entity label without a dedicated column
        continue

      cells[column].text = cells[column].text + "\r" + ent.text

      if column == granular_column:
        # Append (not overwrite) so that every granular concept keeps its
        # own umbrella concept on the matching line.
        cells[umbrella_column].text = (
          cells[umbrella_column].text + "\r" + detect_umbrella_concept(ent.text)
        )


def infer_data():
  """Run the NER model on every sentence of the module-level ``parags``.

  Each paragraph is split into sentences on ';' and '. '. Sentences go to
  the presentation list until one containing "QUESTIONS AND ANSWERS" is met;
  that sentence is skipped and all later ones go to the Q&A list. Only
  sentences in which the model found at least one entity are kept.

  Returns:
    A tuple ``(list_presentation, list_qanda)`` of dicts
    ``{'speaker': <speaker>, 'entity': <analyzed sentence>}``.
  """
  presentation_hits = []
  qanda_hits = []
  target = presentation_hits

  for record in parags:
    speaker = record['speaker']
    full_text = ''.join(record['paragraph'])

    # Normalise both sentence delimiters to one marker, then split on it
    marked = full_text.replace(';', '$split$').replace('. ', '$split$')

    for sentence in marked.split('$split$'):
      if "QUESTIONS AND ANSWERS" in sentence:
        # Section switch: everything from here on belongs to the Q&A part
        target = qanda_hits
        continue

      analyzed = nlp(sentence)
      if len(analyzed.ents) > 0:
        target.append({'speaker': speaker, 'entity': analyzed})

  return presentation_hits, qanda_hits

def add_bullets(doc,list_item):
  """Append the sentences as bullet points grouped under speaker headings.

  Items are sorted by speaker (stable, so each speaker's sentences keep
  their relative order). A level-3 heading is written whenever the speaker
  changes, followed by one 'List Bullet' paragraph per sentence.

  Args:
    doc: document object exposing ``add_heading`` and ``add_paragraph``.
    list_item: list of dicts with a 'speaker' string and an 'entity' object
      exposing ``.text``.
  """
  list_sorted = sorted(list_item, key=lambda d: d['speaker'])
  previous_speaker = ""

  for item in list_sorted:
    current_speaker = item['speaker']

    # Compare by value: equal speaker names coming from different paragraphs
    # are distinct string objects, so an identity check would repeat headings.
    if current_speaker != previous_speaker:
      doc.add_heading(current_speaker, 3)

    doc.add_paragraph(item['entity'].text, style='List Bullet')

    previous_speaker = current_speaker

def add_heading(title):
  """Append a blank spacer paragraph and a bold Arial 10pt heading line.

  Writes to the module-level ``doc`` (no document parameter is taken), so a
  document must be bound to that name before this is called.

  Args:
    title: heading text to write.
  """
  # Empty paragraph used as vertical spacing before the heading
  doc.add_paragraph()

  heading_run = doc.add_paragraph().add_run(title)
  heading_run.bold = True

  heading_font = heading_run.font
  heading_font.name = 'Arial'
  heading_font.size = docx.shared.Pt(10)

# Infer presentation sentences.

In [115]:
# Run the NER model over all paragraphs, split into presentation and Q&A sentences
list_presentation, list_qanda = infer_data()
# Keep only sentences that contain a numeric REPORTED VALUE entity
list_presentation_cleaned = clean_data_from_infer_errors(list_presentation)
list_qanda_cleaned = clean_data_from_infer_errors(list_qanda)

# Create summarization and document

In [119]:
# Create a document
doc = docx.Document()

add_title(doc)
add_corp_partecipants(doc)
add_conf_partecipants(doc)
add_fin_table(doc, list_presentation_cleaned)
add_heading("PRESENTATION")
add_bullets(doc,list_presentation_cleaned)
add_heading("QUESTIONS AND ANSWERS")
add_bullets(doc,list_qanda_cleaned)

# Save the document
doc.save("Summarization - "+title+".docx")

# Conclusion
This application extracts participants and paragraphs from the transcript, and splits each paragraph into sentences on `.` and `;` for each call participant, excluding the Operator.
Each sentence is fed to the model for inference, and if the model recognizes any entity (<b>Reported Value, Multiplier, Currency and Granular Concept</b>) with a valid numeric Reported Value, the sentence is added to the financial table as well as to the bullet-point list of the participant who said it.


# Caveats
The only caveat is that I haven't found a good way to determine the Umbrella Concept, as it's not present in the transcript explicitly.
The way it works now is with a dictionary that maps the known <b>Granular Concepts</b> to the known <b>Umbrella Concepts</b>.
One idea that I had but didn't have time to implement is to create another (classification) model that is trained to classify Granular Concepts, with the Umbrella Concept as its output.
