# Earnings Call Project: Sentence Transformers
<br>
CIS 831 Deep Learning – Term Project<br>
Kansas State University
<br><br>
James Chapman<br>
John Woods<br>
Nathan Diehl<br>
<br>

This notebook featurizes the text data from the earnings calls. [SentenceTransformer](https://www.sbert.net/) is used with the following three models from Hugging Face.
- [finance-embeddings-investopedia](https://huggingface.co/FinLang/finance-embeddings-investopedia)
- [bge-m3-financial-matryoshka](https://huggingface.co/haophancs/bge-m3-financial-matryoshka)
- [bge-base-financial-matryoshka](https://huggingface.co/philschmid/bge-base-financial-matryoshka)

The data from this notebook is stored in the "data/data_prep" directory as the following CSVs.
- investopedia_features: 768 features
- bge_features: 1024 features
- bge_base_features: 768 features

In [1]:
import pandas as pd
import os
from datetime import datetime
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
# Select the CUDA device when one is available, otherwise fall back to the CPU.
# NOTE(review): no cell visible in this notebook passes `device` to a model — confirm it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Relative path to the MAEC dataset; the source repository is linked at the end of the line.
MAEC_dir = 'data/MAEC/MAEC_Dataset' # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction

############# too big for GitHub ########################
############# stored on local disk ######################
original_data_dir = r"D:\original_dataset" # https://github.com/GeminiLn/EarningsCall_Dataset
MAEC_audio_dir = r"D:\MAEC_audio"
# The MAEC GitHub page links to the audio data, but that link does not work.
# The authors were emailed and sent the replacement link below.
# The download holds roughly half a million files but is only about 19 GB.
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link

In [3]:
# Original dataset: each directory entry is one earnings conference call named "CompanyName_Date".
# Split on the last underscore, drop anything after a '.', and reformat the date as YYYY-MM-DD.
filename_data = pd.DataFrame(
    [
        [company, datetime.strptime(stamp.split('.')[0], "%Y%m%d").strftime("%Y-%m-%d")]
        for company, stamp in (entry.rsplit('_', 1) for entry in os.listdir(original_data_dir))
    ],
    columns=["Company", "Date"],
)
# Attach the ticker lookup; the left join keeps calls that have no match in the lookup file.
company_ticker = pd.read_csv('data/data_prep/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, how="left", on="Company")

# MAEC dataset: each directory entry is one earnings conference call named "Date_Ticker".
MAEC_filename_data = pd.DataFrame(
    [
        [symbol, datetime.strptime(stamp.split('.')[0], "%Y%m%d").strftime("%Y-%m-%d")]
        for stamp, symbol in (entry.rsplit('_', 1) for entry in os.listdir(MAEC_dir))
    ],
    columns=["Ticker", "Date"],
)

In [12]:
# Embed every sentence of the original dataset with the Investopedia finance model.
# Output: `investopedia_features` (768 embedding columns + Company, Date, Sentence_num)
# and `investopedia_error` (one (Company, Date, message) tuple per call that failed).
model = SentenceTransformer("FinLang/finance-embeddings-investopedia") # [, 768]

feature_columns = [f'investopedia_{j}' for j in range(768)]  # loop-invariant, so built once
investopedia_call_frames = []  # one frame per earnings call
investopedia_error = []
first_iteration = True
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured dataset root instead of repeating the drive literal.
    text_path = os.path.join(original_data_dir, f"{Company}_{Date}", "TextSequence.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        investopedia_error.append((Company, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        investopedia_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
investopedia_features = (
    pd.concat(investopedia_call_frames, ignore_index=True)
    if investopedia_call_frames else pd.DataFrame()
)

print(len(investopedia_error))
print(investopedia_error)

  0%|          | 0/572 [00:00<?, ?it/s]

(768,)


100%|██████████| 572/572 [1:13:20<00:00,  7.69s/it]

0
[]





In [15]:
# Print the per-column summary of the Investopedia embeddings, then write them to CSV
# without the index column.
investopedia_features.info(verbose=True)
###############################################
investopedia_features.to_csv('data/data_prep/investopedia_features.csv', index=False)
###############################################

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89722 entries, 0 to 89721
Data columns (total 771 columns):
 #    Column            Dtype  
---   ------            -----  
 0    investopedia_0    float32
 1    investopedia_1    float32
 2    investopedia_2    float32
 3    investopedia_3    float32
 4    investopedia_4    float32
 5    investopedia_5    float32
 6    investopedia_6    float32
 7    investopedia_7    float32
 8    investopedia_8    float32
 9    investopedia_9    float32
 10   investopedia_10   float32
 11   investopedia_11   float32
 12   investopedia_12   float32
 13   investopedia_13   float32
 14   investopedia_14   float32
 15   investopedia_15   float32
 16   investopedia_16   float32
 17   investopedia_17   float32
 18   investopedia_18   float32
 19   investopedia_19   float32
 20   investopedia_20   float32
 21   investopedia_21   float32
 22   investopedia_22   float32
 23   investopedia_23   float32
 24   investopedia_24   float32
 25   investopedia_25   fl

In [16]:
# Embed every sentence of the original dataset with the bge-m3 financial model.
# Output: `bge_features` (1024 embedding columns + Company, Date, Sentence_num)
# and `bge_error` (one (Company, Date, message) tuple per call that failed).
model = SentenceTransformer("haophancs/bge-m3-financial-matryoshka") # [, 1024]

feature_columns = [f'bge_{j}' for j in range(1024)]  # loop-invariant, so built once
bge_call_frames = []  # one frame per earnings call
bge_error = []
first_iteration = True
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured dataset root instead of repeating the drive literal.
    text_path = os.path.join(original_data_dir, f"{Company}_{Date}", "TextSequence.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        bge_error.append((Company, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        bge_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
bge_features = (
    pd.concat(bge_call_frames, ignore_index=True)
    if bge_call_frames else pd.DataFrame()
)

print(len(bge_error))
print(bge_error)

  0%|          | 0/572 [00:00<?, ?it/s]

(1024,)


100%|██████████| 572/572 [1:37:40<00:00, 10.25s/it]

0
[]





In [17]:
# Print the per-column summary of the bge-m3 embeddings, then write them to CSV
# without the index column.
bge_features.info(verbose=True)
###############################################
bge_features.to_csv('data/data_prep/bge_features.csv', index=False)
###############################################

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89722 entries, 0 to 89721
Data columns (total 1027 columns):
 #     Column        Dtype  
---    ------        -----  
 0     bge_0         float32
 1     bge_1         float32
 2     bge_2         float32
 3     bge_3         float32
 4     bge_4         float32
 5     bge_5         float32
 6     bge_6         float32
 7     bge_7         float32
 8     bge_8         float32
 9     bge_9         float32
 10    bge_10        float32
 11    bge_11        float32
 12    bge_12        float32
 13    bge_13        float32
 14    bge_14        float32
 15    bge_15        float32
 16    bge_16        float32
 17    bge_17        float32
 18    bge_18        float32
 19    bge_19        float32
 20    bge_20        float32
 21    bge_21        float32
 22    bge_22        float32
 23    bge_23        float32
 24    bge_24        float32
 25    bge_25        float32
 26    bge_26        float32
 27    bge_27        float32
 28    bge_28      

In [None]:
# Embed every sentence of the original dataset with the bge-base financial model.
# Output: `bge_base_features` (768 embedding columns + Company, Date, Sentence_num)
# and `bge_base_error` (one (Company, Date, message) tuple per call that failed).
model = SentenceTransformer("philschmid/bge-base-financial-matryoshka") # [, 768]

# NOTE(review): the 'bge_' prefix is the same one the bge-m3 cell uses for its columns;
# the two feature sets will collide if they are ever joined — confirm whether a distinct prefix is wanted.
feature_columns = [f'bge_{j}' for j in range(768)]  # loop-invariant, so built once
bge_base_call_frames = []  # one frame per earnings call
bge_base_error = []
first_iteration = True
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured dataset root instead of repeating the drive literal.
    text_path = os.path.join(original_data_dir, f"{Company}_{Date}", "TextSequence.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        bge_base_error.append((Company, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        bge_base_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
bge_base_features = (
    pd.concat(bge_base_call_frames, ignore_index=True)
    if bge_base_call_frames else pd.DataFrame()
)

print(len(bge_base_error))
print(bge_base_error)

In [None]:
# Print the per-column summary of the bge-base embeddings, then write them to CSV
# without the index column.
bge_base_features.info(verbose=True)
###############################################
# Write to bge_base_features.csv so the bge-m3 output stored in bge_features.csv is not overwritten.
bge_base_features.to_csv('data/data_prep/bge_base_features.csv', index=False)
###############################################

# repeat with MAEC

In [None]:
# Embed every sentence of the MAEC transcripts with RoBERTa.
# Output: `MACE_RoBERTa_features` (1024 embedding columns + Ticker, Date, Sentence_num)
# and `MACE_RoBERTa_error` (one (Ticker, Date, message) tuple per call that failed).
# NOTE(review): `get_RoBERTa` is not defined in any cell visible in this notebook;
# it has to be defined or imported before this cell can run.
feature_columns = [f'RoBERTa_{j}' for j in range(1024)]  # loop-invariant, so built once
MACE_RoBERTa_call_frames = []  # one frame per earnings call
MACE_RoBERTa_error = []
for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured MAEC root instead of repeating the drive literal.
    text_path = os.path.join(MAEC_audio_dir, f"{Date}_{Ticker}", "text.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                # Passed to DataFrame as-is, so presumably already 2-D (1 x 1024) — TODO confirm.
                sentence_embedding = get_RoBERTa(line.strip())
                features_df = pd.DataFrame(sentence_embedding, columns=feature_columns)
                features_df['Ticker'] = Ticker
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        # Record the ticker of the call being processed; this loop has no `Company` variable.
        MACE_RoBERTa_error.append((Ticker, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        MACE_RoBERTa_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
MACE_RoBERTa_features = (
    pd.concat(MACE_RoBERTa_call_frames, ignore_index=True)
    if MACE_RoBERTa_call_frames else pd.DataFrame()
)

print(len(MACE_RoBERTa_error))
print(MACE_RoBERTa_error)

In [None]:
# Embed every sentence of the MAEC transcripts with the Investopedia finance model.
# Output: `MAEC_investopedia_features` (768 embedding columns + Ticker, Date, Sentence_num)
# and `MAEC_investopedia_error` (one (Ticker, Date, message) tuple per call that failed).
model = SentenceTransformer("FinLang/finance-embeddings-investopedia") # [, 768]

feature_columns = [f'investopedia_{j}' for j in range(768)]  # loop-invariant, so built once
MAEC_investopedia_call_frames = []  # one frame per earnings call
MAEC_investopedia_error = []
first_iteration = True
# Unpack into `Ticker` so the path below uses the current row, not a leftover variable.
for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured MAEC root instead of repeating the drive literal.
    text_path = os.path.join(MAEC_audio_dir, f"{Date}_{Ticker}", "text.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                # Same key column name as the MAEC RoBERTa features.
                features_df['Ticker'] = Ticker
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        MAEC_investopedia_error.append((Ticker, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        MAEC_investopedia_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Collect into the MAEC-specific frame so the original-dataset `investopedia_features`
# is left untouched. Concatenating once avoids re-copying all rows per sentence.
MAEC_investopedia_features = (
    pd.concat(MAEC_investopedia_call_frames, ignore_index=True)
    if MAEC_investopedia_call_frames else pd.DataFrame()
)

print(len(MAEC_investopedia_error))
print(MAEC_investopedia_error)

In [None]:
# Print the per-column summary of `investopedia_features`, then write it to CSV without the index column.
# NOTE(review): this writes to the same path as the earlier Investopedia save cell, so running it
# overwrites that file. Because this cell sits in the MAEC section, a MAEC-specific frame and
# filename were presumably intended — confirm before running.
investopedia_features.info(verbose=True)
###############################################
investopedia_features.to_csv('data/data_prep/investopedia_features.csv', index=False)
###############################################

In [None]:
# Embed every sentence with the bge-m3 financial model.
# Output: `bge_features` (1024 embedding columns + Company, Date, Sentence_num)
# and `bge_error` (one (Company, Date, message) tuple per call that failed).
# NOTE(review): this cell sits in the MAEC section but still iterates `filename_data`
# (the original dataset) and rebuilds `bge_features`, like the earlier bge-m3 cell.
# Presumably it was meant to read MAEC_filename_data — confirm.
model = SentenceTransformer("haophancs/bge-m3-financial-matryoshka") # [, 1024]

feature_columns = [f'bge_{j}' for j in range(1024)]  # loop-invariant, so built once
bge_call_frames = []  # one frame per earnings call
bge_error = []
first_iteration = True
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured dataset root instead of repeating the drive literal.
    text_path = os.path.join(original_data_dir, f"{Company}_{Date}", "TextSequence.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        bge_error.append((Company, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        bge_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
bge_features = (
    pd.concat(bge_call_frames, ignore_index=True)
    if bge_call_frames else pd.DataFrame()
)

print(len(bge_error))
print(bge_error)

In [None]:
# Print the per-column summary of `bge_features`, then write it to CSV without the index column.
# NOTE(review): this writes to the same path as the earlier bge-m3 save cell; because it sits in
# the MAEC section, a MAEC-specific frame and filename were presumably intended — confirm.
bge_features.info(verbose=True)
###############################################
bge_features.to_csv('data/data_prep/bge_features.csv', index=False)
###############################################

In [None]:
# Embed every sentence with the bge-base financial model.
# Output: `bge_base_features` (768 embedding columns + Company, Date, Sentence_num)
# and `bge_base_error` (one (Company, Date, message) tuple per call that failed).
# NOTE(review): this cell sits in the MAEC section but still iterates `filename_data`
# (the original dataset) and rebuilds `bge_base_features`, like the earlier bge-base cell.
# Presumably it was meant to read MAEC_filename_data — confirm.
model = SentenceTransformer("philschmid/bge-base-financial-matryoshka") # [, 768]

# NOTE(review): the 'bge_' prefix is the same one the bge-m3 cells use for their columns — confirm.
feature_columns = [f'bge_{j}' for j in range(768)]  # loop-invariant, so built once
bge_base_call_frames = []  # one frame per earnings call
bge_base_error = []
first_iteration = True
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')  # folder names carry the date as YYYYMMDD
    # Build the path from the configured dataset root instead of repeating the drive literal.
    text_path = os.path.join(original_data_dir, f"{Company}_{Date}", "TextSequence.txt")
    sentence_frames = []
    interrupted = False
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = model.encode(line.strip())
                if first_iteration:
                    # Show the embedding shape once as a sanity check.
                    print(sentence_embedding.shape)
                    first_iteration = False
                features_df = pd.DataFrame([sentence_embedding.flatten()], columns=feature_columns)
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                sentence_frames.append(features_df)
    except KeyboardInterrupt:
        interrupted = True
    except Exception as e:
        bge_base_error.append((Company, Date, str(e)))
    # Keep whatever was embedded for this call, even if reading stopped part-way.
    if sentence_frames:
        bge_base_call_frames.append(pd.concat(sentence_frames, ignore_index=True))
    if interrupted:
        break

# Concatenate once at the end: growing the full frame inside the loop re-copies
# every previous row for each new sentence.
bge_base_features = (
    pd.concat(bge_base_call_frames, ignore_index=True)
    if bge_base_call_frames else pd.DataFrame()
)

print(len(bge_base_error))
print(bge_base_error)

In [None]:
# Print the per-column summary of the bge-base embeddings, then write them to CSV
# without the index column.
bge_base_features.info(verbose=True)
###############################################
# Write to bge_base_features.csv so the bge-m3 output stored in bge_features.csv is not overwritten.
bge_base_features.to_csv('data/data_prep/bge_base_features.csv', index=False)
###############################################