# Earnings Call Project: emotion2vec
<br>
CIS 831 Deep Learning – Term Project<br>
Kansas State University
<br><br>
James Chapman<br>
John Woods<br>
Nathan Diehl<br>
<br>

### This notebook featurizes the AUDIO data from the earnings calls with emotion2vec.

emotion2vec documentation can be found at https://github.com/ddlBoJack/emotion2vec
and
https://huggingface.co/emotion2vec

The data from this notebook is stored in the "data/data_prep" directory as the following CSVs.

* emotion2vec
* MAEC_emotion2vec


In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from tqdm import tqdm
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
MAEC_dir = 'data/MAEC/MAEC_Dataset' # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction

############# too big for GitHub ########################
############# stored on local disk ######################
original_data_dir = r"D:\original_dataset" # https://github.com/GeminiLn/EarningsCall_Dataset 
MAEC_audio_dir = r"D:\MAEC_audio" 
# there is a link for the audio data in the MAEC GitHub, but it does not work
# I emailed the authors, and they send another link.
# There is like a half-million files, but only 19 GB
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link 

In [3]:
# Loop through the directory, each folder represents an earnings conference call; the folders are named as "CompanyName_Date".
filename_data = []
for filename in os.listdir(original_data_dir):
    company_name, date_str = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    filename_data.append([company_name, date])
filename_data = pd.DataFrame(filename_data, columns=["Company", "Date"])
company_ticker = pd.read_csv('data/data_prep/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, on="Company", how="left")

# Loop through the directory, each folder represents an earnings conference call; the folders are named as "Date_CompanyName".
MAEC_filename_data = []
for filename in os.listdir(MAEC_dir):
    date_str, ticker = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    MAEC_filename_data.append([ticker, date])
MAEC_filename_data = pd.DataFrame(MAEC_filename_data, columns=["Ticker", "Date"])

In [4]:
from funasr import AutoModel

model = AutoModel(model="iic/emotion2vec_plus_large")

Downloading Model to directory: C:\Users\James\.cache\modelscope\hub\iic/emotion2vec_plus_large




Detect model requirements, begin to install it: C:\Users\James\.cache\modelscope\hub\iic\emotion2vec_plus_large\requirements.txt
install model requirements successfully
ckpt: C:\Users\James\.cache\modelscope\hub\iic\emotion2vec_plus_large\model.pt


  src_state = torch.load(path, map_location=map_location)


init param, map: modality_encoders.AUDIO.extra_tokens from d2v_model.modality_encoders.AUDIO.extra_tokens in ckpt
init param, map: modality_encoders.AUDIO.alibi_scale from d2v_model.modality_encoders.AUDIO.alibi_scale in ckpt
init param, map: modality_encoders.AUDIO.local_encoder.conv_layers.0.0.weight from d2v_model.modality_encoders.AUDIO.local_encoder.conv_layers.0.0.weight in ckpt
init param, map: modality_encoders.AUDIO.local_encoder.conv_layers.0.2.1.weight from d2v_model.modality_encoders.AUDIO.local_encoder.conv_layers.0.2.1.weight in ckpt
init param, map: modality_encoders.AUDIO.local_encoder.conv_layers.0.2.1.bias from d2v_model.modality_encoders.AUDIO.local_encoder.conv_layers.0.2.1.bias in ckpt
init param, map: modality_encoders.AUDIO.local_encoder.conv_layers.1.0.weight from d2v_model.modality_encoders.AUDIO.local_encoder.conv_layers.1.0.weight in ckpt
init param, map: modality_encoders.AUDIO.local_encoder.conv_layers.1.2.1.weight from d2v_model.modality_encoders.AUDIO.loc

In [5]:
# example of using the model, and the output
# I included the features in this one
MP3_file = "D:/original_dataset/3M Company_20170425/CEO/Nicholas C. Gangestad_1_1.mp3"
results = model.generate(MP3_file, output_dir="./outputs", granularity="utterance", extract_embedding=True)
labels = results[0]['labels']
scores = results[0]['scores']
feats = results[0]['feats']

print(results)
print(labels)
print(scores)
print(len(feats))

rtf_avg: 0.039: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  4.12it/s]                                                                                      

[{'key': 'Nicholas C. Gangestad_1_1', 'labels': ['生气/angry', '厌恶/disgusted', '恐惧/fearful', '开心/happy', '中立/neutral', '其他/other', '难过/sad', '吃惊/surprised', '<unk>'], 'scores': [0.0008483638521283865, 0.003417916828766465, 0.011887827888131142, 0.37373086810112, 0.11250405758619308, 0.008385236375033855, 0.005126012023538351, 0.0004774830595124513, 0.4836222529411316], 'feats': array([-0.04042191, -0.20115899, -0.25954983, ..., -0.46292517,
        0.8943518 ,  0.65777045], dtype=float32)}]
['生气/angry', '厌恶/disgusted', '恐惧/fearful', '开心/happy', '中立/neutral', '其他/other', '难过/sad', '吃惊/surprised', '<unk>']
[0.0008483638521283865, 0.003417916828766465, 0.011887827888131142, 0.37373086810112, 0.11250405758619308, 0.008385236375033855, 0.005126012023538351, 0.0004774830595124513, 0.4836222529411316]
1024





In [6]:

import sys
import os

# the output of the model kept freezing my computer
# each audio file seems to print out a progress bar

# Suppress output temporarily
class SuppressOutput:
    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr




In [7]:
# I kept having a memory error
# this splits up each audio file into sections at most 2 minutes
# then averages the output factors
from pydub import AudioSegment
segment_length = 120000 # 2 minutes

MP3_file = "D:/MAEC_audio/20160802_CORT/CORT_20160802_f000023106.mp3"
audio = AudioSegment.from_file(MP3_file, format="mp3")
segments = [audio[i:i + segment_length] for i in range(0, len(audio), segment_length)]
print(len(segments))

all_scores = []
for i, segment in enumerate(segments):
    temp_file = f"temp_segment_{i}.mp3"
    segment.export(temp_file, format="mp3")  
    results = model.generate(temp_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
    print(results[0]['scores'])
    all_scores.append(results[0]['scores'])

scores = [sum(x) / len(all_scores) for x in zip(*all_scores)]
print(scores)
    


4


rtf_avg: 0.011: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.31s/it]                                                                                      


[0.0031992837321013212, 0.12995511293411255, 0.023297548294067383, 0.18970733880996704, 0.06439411640167236, 0.0025770298670977354, 0.23691841959953308, 0.03711989149451256, 0.3128312826156616]


rtf_avg: 0.005: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.64it/s]                                                                                      


[0.0016599190421402454, 0.05526705086231232, 0.012034779414534569, 0.6429855823516846, 0.04890550300478935, 0.0012000900460407138, 0.08021904528141022, 0.024633927270770073, 0.13309405744075775]


rtf_avg: 0.005: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.65it/s]                                                                                      


[0.0009355830843560398, 0.05223856493830681, 0.01718318834900856, 0.6403264999389648, 0.03516412898898125, 0.0018440389540046453, 0.11748670786619186, 0.015552787110209465, 0.11926846951246262]


rtf_avg: 0.004: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 40.70it/s]                                                                                      

[0.0016990875592455268, 0.0038299134466797113, 0.008568341843783855, 0.12420917302370071, 0.24684593081474304, 0.002062635961920023, 0.016790788620710373, 0.0017812704900279641, 0.5942128896713257]
[0.0018734683544607833, 0.060322660545352846, 0.015270964475348592, 0.3993071485310793, 0.0988274198025465, 0.0019209487072657794, 0.11285374034196138, 0.019771969091380015, 0.2898516748100519]





In [8]:
# Returns 9 floating-point values 
def get_emotion2vec(audio_dir,audio_file):
    MP3_file = os.path.join(audio_dir,audio_file)
    audio = AudioSegment.from_file(MP3_file, format="mp3")
    # split MP3 files into sections of 2 minutes
    segments = [audio[i:i + segment_length] for i in range(0, len(audio), segment_length)]

    all_scores = []
    for i, segment in enumerate(segments):
        temp_file = f"temp_segment_{i}.mp3"
        segment.export(temp_file, format="mp3")  
        with SuppressOutput():
            results = model.generate(temp_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
        labels = results[0]['labels']
        all_scores.append(results[0]['scores'])
        torch.cuda.empty_cache() # I don't know if this helps
    # average all of the sections into one factor of 9
    scores = [sum(x) / len(all_scores) for x in zip(*all_scores)]
    return labels, scores

In [None]:

bad_emotion2vec = [] # Company ,Date, i, audio_dir, audio_file, labels, scores, e

emotion2vec = []
for Company,Date in filename_data[['Company','Date']].values: # each audio file of the original data set
    Date = Date.replace('-', '') 
    audio_dir = f"D:/original_dataset/{Company}_{Date}/CEO"
    if os.path.exists(audio_dir):
        for i, audio_file in enumerate(os.listdir(audio_dir), start= 1):
            try:
                # skip files that are not MP3 audio
                if audio_file.lower().endswith('.mp3'):
                    labels, scores = get_emotion2vec(audio_dir,audio_file)
                    features_row = np.concatenate([labels, scores, [Company, Date, i, audio_file]])
                    emotion2vec.append(features_row)
            except KeyboardInterrupt: break
            except Exception as e: 
                print(Company ,Date, i, audio_dir, audio_file, e)
                bad_emotion2vec.append([Company ,Date, i, audio_dir, audio_file, labels, scores, e])
                # set all features to  0.0 if there was an error
                scores = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
                features_row = np.concatenate([labels, scores, [Company, Date, i, audio_file]])
                emotion2vec.append(features_row)
                   
emotion2vec = pd.DataFrame(emotion2vec)
emotion2vec.info(verbose=True)
emotion2vec.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89722 entries, 0 to 89721
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       89722 non-null  object
 1   1       89722 non-null  object
 2   2       89722 non-null  object
 3   3       89722 non-null  object
 4   4       89722 non-null  object
 5   5       89722 non-null  object
 6   6       89722 non-null  object
 7   7       89722 non-null  object
 8   8       89722 non-null  object
 9   9       89722 non-null  object
 10  10      89722 non-null  object
 11  11      89722 non-null  object
 12  12      89722 non-null  object
 13  13      89722 non-null  object
 14  14      89722 non-null  object
 15  15      89722 non-null  object
 16  16      89722 non-null  object
 17  17      89722 non-null  object
 18  18      89722 non-null  object
 19  19      89722 non-null  object
 20  20      89722 non-null  object
 21  21      89722 non-null  object
dtypes: object(22)
memory u

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
count,89722,89722,89722,89722,89722,89722,89722,89722,89722,89722.0,...,89722.0,89722.0,89722.0,89722.0,89722.0,89722.0,89722,89722,89722,89722
unique,1,1,1,1,1,1,1,1,1,89701.0,...,86671.0,73635.0,89682.0,89693.0,89691.0,89684.0,280,127,522,70765
top,生气/angry,厌恶/disgusted,恐惧/fearful,开心/happy,中立/neutral,其他/other,难过/sad,吃惊/surprised,<unk>,0.000116934053949,...,1.0,1.0,1.4592916386391153e-06,0.0447551906108856,0.0005968250916339,0.5674771070480347,Martin Marietta Materials,20170727,2,Craig W. Safian_1_50.mp3
freq,89722,89722,89722,89722,89722,89722,89722,89722,89722,2.0,...,839.0,2824.0,2.0,2.0,2.0,2.0,1736,4123,571,4


In [None]:
# check to make sure all the labels (the first 9 columns) are the same
# then drop them
for column in range(9):
    print('Should be 1 ---', pd.unique(emotion2vec[column]).size)
emotion2vec = emotion2vec.iloc[:, 9:]

# rename columns
emotion2vec.columns = ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'other', 'sad', 'surprised', 'unk', 
                       'Company', 'Date', 'Sentence_num', 'audio_file']
emotion2vec = emotion2vec.drop(['audio_file'], axis=1)
emotion2vec.info(verbose=True)
### save ############################################
emotion2vec.to_csv('data/data_prep/emotion2vec.csv', index=False)
#####################################################
emotion2vec.describe()


Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89722 entries, 0 to 89721
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   angry         89722 non-null  object
 1   disgusted     89722 non-null  object
 2   fearful       89722 non-null  object
 3   happy         89722 non-null  object
 4   neutral       89722 non-null  object
 5   other         89722 non-null  object
 6   sad           89722 non-null  object
 7   surprised     89722 non-null  object
 8   unk           89722 non-null  object
 9   Company       89722 non-null  object
 10  Date          89722 non-null  object
 11  Sentence_num  89722 non-null  object
 12  audio_file    89722 non-null  object
dtypes: object(13)
memory usage: 8.9+ MB


Unnamed: 0,angry,disgusted,fearful,happy,neutral,other,sad,surprised,unk,Company,Date,Sentence_num,audio_file
count,89722.0,89722.0,89722.0,89722.0,89722.0,89722.0,89722.0,89722.0,89722.0,89722,89722,89722,89722
unique,89701.0,89694.0,89683.0,86671.0,73635.0,89682.0,89693.0,89691.0,89684.0,280,127,522,70765
top,0.000116934053949,1.0,2.3212887754198164e-05,1.0,1.0,1.4592916386391153e-06,0.0447551906108856,0.0005968250916339,0.5674771070480347,Martin Marietta Materials,20170727,2,Craig W. Safian_1_50.mp3
freq,2.0,5.0,2.0,839.0,2824.0,2.0,2.0,2.0,2.0,1736,4123,571,4


In [9]:
bad_emotion2vec

[['Ventas Inc',
  '20171027',
  113,
  'D:/original_dataset/Ventas Inc_20171027/CEO',
  'Robert Probst_1_8.mp3',
  ['生气/angry',
   '厌恶/disgusted',
   '恐惧/fearful',
   '开心/happy',
   '中立/neutral',
   '其他/other',
   '难过/sad',
   '吃惊/surprised',
   '<unk>'],
  [2.0749192117364146e-05,
   0.0004140714299865067,
   0.00028590569854713976,
   0.0006683205137960613,
   0.988251268863678,
   3.425650220378884e-06,
   0.009793120436370373,
   0.0005107651231810451,
   5.2451694500632584e-05],
  RuntimeError('Failed to load audio: ffmpeg version 7.1-essentials_build-www.gyan.dev Copyright (c) 2000-2024 the FFmpeg developers\r\n  built with gcc 14.2.0 (Rev1, Built by MSYS2 project)\r\n  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enabl

# MAEC

In [10]:

# Collect whenever there is an error
MAEC_bad_emotion2vec = [] # Ticker , Date, i, audio_dir, audio_file, labels, scores, e

MAEC_emotion2vec = []
for Ticker,Date in tqdm(MAEC_filename_data[['Ticker','Date']].values): # each audio file of the original data set
    Date = Date.replace('-', '') 
    audio_dir = f"D:/MAEC_audio/{Date}_{Ticker}"
    if os.path.exists(audio_dir):
        for i, audio_file in enumerate(os.listdir(audio_dir), start= 1):
            try:
                # skip files that are not MP3 audio
                if audio_file.lower().endswith('.mp3'):
                    labels, scores = get_emotion2vec(audio_dir,audio_file)
                    features_row = np.concatenate([labels, scores, [Ticker, Date, i, audio_file]])
                    MAEC_emotion2vec.append(features_row)
            except KeyboardInterrupt: break
            except Exception as e: 
                print(Ticker, Date, i, e)
                MAEC_bad_emotion2vec.append([Ticker ,Date, i, audio_dir, audio_file, labels, scores, e])
                # set all features to  0.0 if there was an error
                scores = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
                features_row = np.concatenate([labels, scores, [Ticker, Date, i, audio_file]])
                MAEC_emotion2vec.append(features_row)

MAEC_emotion2vec = pd.DataFrame(MAEC_emotion2vec)
MAEC_emotion2vec.info(verbose=True)
MAEC_emotion2vec.describe()

100%|██████████| 1743/1743 [50:48:42<00:00, 104.95s/it]   


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394204 entries, 0 to 394203
Data columns (total 22 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       394204 non-null  object
 1   1       394204 non-null  object
 2   2       394204 non-null  object
 3   3       394204 non-null  object
 4   4       394204 non-null  object
 5   5       394204 non-null  object
 6   6       394204 non-null  object
 7   7       394204 non-null  object
 8   8       394204 non-null  object
 9   9       394204 non-null  object
 10  10      394204 non-null  object
 11  11      394204 non-null  object
 12  12      394204 non-null  object
 13  13      394204 non-null  object
 14  14      394204 non-null  object
 15  15      394204 non-null  object
 16  16      394204 non-null  object
 17  17      394204 non-null  object
 18  18      394204 non-null  object
 19  19      394204 non-null  object
 20  20      394204 non-null  object
 21  21      394204 non-null  object
d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
count,394204,394204,394204,394204,394204,394204,394204,394204,394204,394204.0,...,394204.0,394204.0,394204.0,394204.0,394204.0,394204.0,394204,394204,394204,394204
unique,1,1,1,1,1,1,1,1,1,393785.0,...,386876.0,357989.0,393801.0,393403.0,393884.0,393601.0,1213,488,983,394204
top,生气/angry,厌恶/disgusted,恐惧/fearful,开心/happy,中立/neutral,其他/other,难过/sad,吃惊/surprised,<unk>,1.0,...,1.0,1.0,0.0001925268443301,1.0,1.0,0.75323,INCY,20160728,3,LMAT_20150225_f000002100.mp3
freq,394204,394204,394204,394204,394204,394204,394204,394204,394204,9.0,...,1094.0,3072.0,2.0,16.0,8.0,3.0,1772,8181,1738,1


In [11]:
# check to make sure all the labels (the first 9 columns) are the same
# then drop them
for column in range(9):
    print('Should be 1 ---', pd.unique(MAEC_emotion2vec[column]).size)
MAEC_emotion2vec = MAEC_emotion2vec.iloc[:, 9:]

# rename columns
MAEC_emotion2vec.columns = ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'other', 'sad', 'surprised', 'unk', 
                       'Company', 'Date', 'Sentence_num', 'audio_file']
MAEC_emotion2vec = MAEC_emotion2vec.drop(['audio_file'], axis=1)
MAEC_emotion2vec.info(verbose=True)
### save ############################################
MAEC_emotion2vec.to_csv('data/data_prep/MAEC_emotion2vec.csv', index=False)
#####################################################
MAEC_emotion2vec.describe()

Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
Should be 1 --- 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394204 entries, 0 to 394203
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   angry         394204 non-null  object
 1   disgusted     394204 non-null  object
 2   fearful       394204 non-null  object
 3   happy         394204 non-null  object
 4   neutral       394204 non-null  object
 5   other         394204 non-null  object
 6   sad           394204 non-null  object
 7   surprised     394204 non-null  object
 8   unk           394204 non-null  object
 9   Company       394204 non-null  object
 10  Date          394204 non-null  object
 11  Sentence_num  394204 non-null  object
dtypes: object(12)
memory usage: 36.1+ MB


Unnamed: 0,angry,disgusted,fearful,happy,neutral,other,sad,surprised,unk,Company,Date,Sentence_num
count,394204.0,394204.0,394204.0,394204.0,394204.0,394204.0,394204.0,394204.0,394204.0,394204,394204,394204
unique,393785.0,393734.0,393745.0,386876.0,357989.0,393801.0,393403.0,393884.0,393601.0,1213,488,983
top,1.0,1.0,0.0031425696797668,1.0,1.0,0.0001925268443301,1.0,1.0,0.75323,INCY,20160728,3
freq,9.0,11.0,2.0,1094.0,3072.0,2.0,16.0,8.0,3.0,1772,8181,1738


In [12]:
MAEC_bad_emotion2vec

[]

In [13]:
print(len(MAEC_bad_emotion2vec))


0
