# Earnings Call Project: RoBERTa
<br>
CIS 831 Deep Learning – Term Project<br>
Kansas State University
<br><br>
James Chapman<br>
John Woods<br>
Nathan Diehl<br>
<br>

This notebook featurizes the text and audio data from the earnings calls. Each earnings calls data comes pre-processed such that each sentence of the call corresponds to 1 line-of-text and 1 MP3 audio file. The transcript text is processed with the Glove–300 pre-trained word embedding, and the audio files are processed with Praat using parselmouth.
- Text (Glove–300) [Glove Download](https://nlp.stanford.edu/projects/glove/)
- Audio (Praat) [Parselmouth](https://parselmouth.readthedocs.io/en/stable/)

The data from this notebook is stored in the "data/data_prep" directory as the following CSVs.
- glove_features
- MAEC_glove_features
- praat_features
- MAEC_praat_features

In [3]:
import sys
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    !pip install transformers
    from google.colab import drive
    drive.mount('/content/gdrive')
    %cd gdrive/My Drive/831

In [4]:
import pandas as pd
import numpy as np
import time
import json
import csv
import re
import os
from datetime import datetime
from tqdm import tqdm
from transformers import RobertaModel, RobertaTokenizer
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
MAEC_dir = 'data/MAEC/MAEC_Dataset' # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction

############# too big for GitHub ########################
############# stored on local disk ######################
original_data_dir = r"D:\original_dataset" # https://github.com/GeminiLn/EarningsCall_Dataset 
MAEC_audio_dir = r"D:\MAEC_audio" 
# there is a link for the audio data in the MAEC GitHub, but it does not work
# I emailed the authors, and they send another link.
# There is like a half-million files, but only 19 GB
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link 

In [6]:
# Loop through the directory, each folder represents an earnings conference call; the folders are named as "CompanyName_Date".
filename_data = []
for filename in os.listdir(original_data_dir):
    company_name, date_str = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    filename_data.append([company_name, date])
filename_data = pd.DataFrame(filename_data, columns=["Company", "Date"])
company_ticker = pd.read_csv('data/data_prep/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, on="Company", how="left")

# Loop through the directory, each folder represents an earnings conference call; the folders are named as "Date_CompanyName".
MAEC_filename_data = []
for filename in os.listdir(MAEC_dir):
    date_str, ticker = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    MAEC_filename_data.append([ticker, date])
MAEC_filename_data = pd.DataFrame(MAEC_filename_data, columns=["Ticker", "Date"])

# RoBERTa features from meeting transcript text files

RoBERTa documentation can be found at https://huggingface.co/FacebookAI/roberta-large

### Following code is adapted FROM
[GitHub HTML Encoder](https://github.com/YangLinyi/HTML-Hierarchical-Transformer-based-Multi-task-Learning-for-Volatility-Prediction/blob/master/Model/Token-Level%20Encoder/HuggingFace-Roberta-Token-Encoder.py)

In [8]:
# This is adapted from
# https://github.com/YangLinyi/HTML-Hierarchical-Transformer-based-Multi-task-Learning-for-Volatility-Prediction/blob/master/Model/Token-Level%20Encoder/HuggingFace-Roberta-Token-Encoder.py
model = RobertaModel.from_pretrained('roberta-large').to(device)
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

def get_RoBERTa(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # [CLS] embedding for sentence-level representation
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    #print(cls_embedding.shape)
    # 1024 features 
    return cls_embedding
####################################################
####################################################
# # How to implement average pooling instead of CLS
#         # average pooling over token embeddings
#         token_embeddings = outputs.last_hidden_state
#         sentence_embedding = torch.mean(token_embeddings, dim=1).cpu().numpy()  
#     return sentence_embedding

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
RoBERTa_features = pd.DataFrame()
RoBERTa_error = []
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/original_dataset/{Company}_{Date}/TextSequence.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = get_RoBERTa(line.strip())
                # 1024 features 
                features_df = pd.DataFrame(sentence_embedding, columns=[f'RoBERTa_{j}' for j in range(1024)])
                features_df['Company'] = Company
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                RoBERTa_features = pd.concat([RoBERTa_features, features_df], ignore_index=True)
    except KeyboardInterrupt: break
    except Exception as e:
        RoBERTa_error.append((Company, Date, str(e)))

print(len(RoBERTa_error))
print(RoBERTa_error)

100%|██████████| 572/572 [1:38:39<00:00, 10.35s/it]

0
[]





In [10]:
RoBERTa_features.info(verbose=True)
###############################################
RoBERTa_features.to_csv('data/data_prep/RoBERTa_features.csv', index=False)
###############################################

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89722 entries, 0 to 89721
Data columns (total 1027 columns):
 #     Column        Dtype  
---    ------        -----  
 0     RoBERTa_0     float32
 1     RoBERTa_1     float32
 2     RoBERTa_2     float32
 3     RoBERTa_3     float32
 4     RoBERTa_4     float32
 5     RoBERTa_5     float32
 6     RoBERTa_6     float32
 7     RoBERTa_7     float32
 8     RoBERTa_8     float32
 9     RoBERTa_9     float32
 10    RoBERTa_10    float32
 11    RoBERTa_11    float32
 12    RoBERTa_12    float32
 13    RoBERTa_13    float32
 14    RoBERTa_14    float32
 15    RoBERTa_15    float32
 16    RoBERTa_16    float32
 17    RoBERTa_17    float32
 18    RoBERTa_18    float32
 19    RoBERTa_19    float32
 20    RoBERTa_20    float32
 21    RoBERTa_21    float32
 22    RoBERTa_22    float32
 23    RoBERTa_23    float32
 24    RoBERTa_24    float32
 25    RoBERTa_25    float32
 26    RoBERTa_26    float32
 27    RoBERTa_27    float32
 28    RoBERTa_28  

# Repeat with MACE

In [None]:
MACE_RoBERTa_features = pd.DataFrame()
MACE_RoBERTa_error = []
for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/MAEC_audio/{Date}_{Ticker}/text.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                sentence_embedding = get_RoBERTa(line.strip())
                # 1024 features 
                features_df = pd.DataFrame(sentence_embedding, columns=[f'RoBERTa_{j}' for j in range(1024)])
                features_df['Ticker'] = Ticker
                features_df['Date'] = Date
                features_df['Sentence_num'] = i
                MACE_RoBERTa_features = pd.concat([MACE_RoBERTa_features, features_df], ignore_index=True)
    except KeyboardInterrupt: break
    except Exception as e:
        MACE_RoBERTa_error.append((Company, Date, str(e)))

print(len(MACE_RoBERTa_error))
print(MACE_RoBERTa_error)

 28%|██▊       | 977/3443 [1:45:21<5:22:24,  7.84s/it] 

In [None]:
MACE_RoBERTa_features.info(verbose=True)
###############################################
MACE_RoBERTa_features.to_csv('data/data_prep/MACE_RoBERTa_features.csv', index=False)
###############################################