In [1]:
# Download the earnings call transcripts
# Earnings call transcripts are available at seekingalpha.com
# The origin of the transcripts is Yahoo Finance
# However, the ECTSum benchmark is a consolidated version of the transcripts with summaries


# Clone the ECTSum repository into ../data (resolved against the kernel's working directory).
# NOTE(review): git refuses to clone into an existing non-empty directory, so re-running
# this cell after a successful clone is expected to fail — confirm, and guard if needed.
!git clone https://github.com/rajdeep345/ECTSum.git ../data

Cloning into '../data'...
remote: Enumerating objects: 16499, done.[K
remote: Counting objects: 100% (1265/1265), done.[K
remote: Compressing objects: 100% (822/822), done.[K
remote: Total 16499 (delta 507), reused 978 (delta 425), pack-reused 15234[K
Receiving objects: 100% (16499/16499), 26.45 MiB | 22.59 MiB/s, done.
Resolving deltas: 100% (6924/6924), done.
Updating files: 100% (16592/16592), done.


In [41]:
# prepare the data
import glob
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm


# Dataset location: the `data` directory inside the parent of the current
# working directory.
main_dir = Path.cwd().parent / "data"
# Names of the dataset splits that get processed.
version = ["train", "val", "test"]
# Bare expression so the notebook displays the resolved path.
main_dir

PosixPath('/Users/johnjoy/Documents/Projects - Code/ECT_Sum_Finetune_Llama3/ectsum_finetune_llama/data')

In [56]:
def _read_text_files(paths:list)->list:
    """Read every file in ``paths`` as UTF-8 text, preserving the given order."""
    texts = []
    for file in tqdm(paths):
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file, 'r', encoding='utf-8') as f:
            texts.append(f.read())
    return texts


def create_dataframe(data_path:Path,version:str)->pd.DataFrame:
    """Create a dataframe from the transcript and summary data

    Transcripts are read from ``<data_path>/data/final/<version>/ects`` and
    summaries from ``<data_path>/data/final/<version>/gt_summaries``.

    ``glob`` returns files in arbitrary order, so both file lists are sorted
    before reading. Row ``i`` of ``input`` and ``output`` therefore comes from
    the files at the same sorted position in each directory.
    NOTE(review): this pairs a transcript with its summary only if both use
    the same file name in the two directories — confirm against the dataset.

    Args:
        data_path (Path): path to the data directory
        version (str): version of the data

    Returns:
        pd.DataFrame: dataframe with an ``input`` column (transcript text) and
            an ``output`` column (summary text)

    Raises:
        ValueError: if the two directories contain a different number of
            ``.txt`` files
    """
    split_dir = Path(data_path) / "data" / "final" / version
    transcript_files = sorted(glob.glob(str(split_dir / "ects" / "*.txt")))
    summary_files = sorted(glob.glob(str(split_dir / "gt_summaries" / "*.txt")))

    # Fail early with a clear message instead of pandas' generic length error.
    if len(transcript_files) != len(summary_files):
        raise ValueError(
            f"Found {len(transcript_files)} transcripts but "
            f"{len(summary_files)} summaries under {split_dir}"
        )

    input_text = _read_text_files(transcript_files)
    output_text = _read_text_files(summary_files)
    data = {'input': input_text, 'output': output_text}
    return pd.DataFrame(data)

In [75]:
def add_instructions(instructions:list, df:pd.DataFrame):
    """Attach an ``instruction`` column to ``df``.

    The instruction list is repeated as many whole times as fit into the
    number of rows; any remaining rows all receive the last instruction.
    The column is assigned on the passed-in dataframe, which is also the
    object returned.

    Args:
        instructions (list): list of instructions
        df (pd.DataFrame): dataframe

    Returns:
         pd.DataFrame: dataframe with instructions
    """
    # Split the row count into complete passes over the list plus a remainder.
    full_cycles, leftover = divmod(len(df), len(instructions))
    repeated = instructions * full_cycles
    # Rows beyond the last complete pass are filled with the final instruction.
    padding = leftover * [instructions[-1]]
    df['instruction'] = repeated + padding
    return df

In [77]:
# Instruction prompt variants that get attached to the rows of each split.
# Feel free to generate more prompts using bigger, more accurate LLMs.
# NOTE(review): `instuctions` is a misspelling of `instructions`; the name is left
# unchanged because other cells may reference it — confirm before renaming.
instuctions = [
    "Please summarize the key points from the recent earnings call transcript. Focus on financial performance, strategic initiatives, and management's outlook.",
    "Summarize the key points from the earnings call transcript. Focus on financial performance, strategic initiatives, and management's outlook.",
    "Summarize call transcript. Focus on financial performance, strategic initiatives, and management's outlook.",
    "From the earnings call transcript, summarize the key points. Focus on financial performance, strategic initiatives, and management's outlook.",
    "Summarize precisely the key points from the earnings call transcript. Focus on financial performance, strategic initiatives, and management's outlook."
]
# Build and export one CSV per entry in `version`.
for ver in version:
    # Load the transcript/summary texts for this split into a dataframe.
    df = create_dataframe(main_dir, ver)
    # Add the `instruction` column.
    df = add_instructions(instructions = instuctions, df = df)
    # Write the result to <main_dir>/ectsum_<ver>.csv without the index column.
    df.to_csv(main_dir/f'ectsum_{ver}.csv', index=False)

100%|██████████| 1681/1681 [00:00<00:00, 6322.02it/s]
100%|██████████| 1681/1681 [00:00<00:00, 9918.51it/s]
100%|██████████| 249/249 [00:00<00:00, 7054.77it/s]
100%|██████████| 249/249 [00:00<00:00, 20143.92it/s]
100%|██████████| 495/495 [00:00<00:00, 7264.25it/s]
100%|██████████| 495/495 [00:00<00:00, 16728.01it/s]
