### Pretrained FinBERT model on earnings call transcripts (Q&A ensemble)

Load data:

In [None]:
# Colab-only step: upload the transcript .json files from the local machine into the runtime.
# NOTE(review): the preprocessing cell reads from '/content'; presumably that is where
# Colab stores the uploads (its default working directory) — confirm.
from google.colab import files
upload = files.upload()

Saving accolade-inc-accd-q3-2021-earnings-call-transcript.json to accolade-inc-accd-q3-2021-earnings-call-transcript.json
Saving acuity-brands-inc-ayi-q1-2021-earnings-call-transc.json to acuity-brands-inc-ayi-q1-2021-earnings-call-transc.json
Saving albertsons-companies-inc-aci-q3-2020-earnings-call.json to albertsons-companies-inc-aci-q3-2020-earnings-call.json
Saving angiodynamics-inc-ango-q2-2021-earnings-call-trans.json to angiodynamics-inc-ango-q2-2021-earnings-call-trans.json
Saving aphria-inc-apha-q2-2021-earnings-call-transcript.json to aphria-inc-apha-q2-2021-earnings-call-transcript.json
Saving audiovox-voxx-q3-2021-earnings-call-transcript.json to audiovox-voxx-q3-2021-earnings-call-transcript.json
Saving azz-inc-azz-q3-2021-earnings-call-transcript.json to azz-inc-azz-q3-2021-earnings-call-transcript.json
Saving bed-bath-beyond-bbby-q3-2020-earnings-call-transcr.json to bed-bath-beyond-bbby-q3-2020-earnings-call-transcr.json
Saving blackrock-blk-q4-2020-earnings-call-trans

Imports:

In [None]:
import json
import os
from collections import Counter

import nltk.data
import numpy as np
import pandas
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification

Preprocess data:

In [None]:
directory = '/content'
N_TRAIN = 40           # the first N_TRAIN transcripts (sorted by filename) fit the classifier; the rest are held out
MAX_CHUNK_CHARS = 512  # character budget per text chunk (the model's own limit is applied later by the tokenizer)
transcript_data = []
input_data = []
test_data = []
input_data_labels = []
test_data_labels = []
label_map = {0:'neutral', 1:'positive', 2:'negative'}

nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')


def split_answer(answer, max_chars=MAX_CHUNK_CHARS):
  """Split one answer into chunks of whole sentences.

  Sentences are joined with a single space and packed greedily so that each
  chunk stays within `max_chars` characters. A single sentence longer than
  `max_chars` becomes its own (over-long) chunk. Empty chunks are never
  emitted, so an answer with no sentences returns an empty list.

  Args:
    answer: text of one answer.
    max_chars: maximum number of characters per chunk.

  Returns:
    List of non-empty chunk strings, in original order.
  """
  pieces = []
  current = ""
  for sent in sent_tokenizer.tokenize(answer):
    if not current:
      current = sent
    elif len(current) + 1 + len(sent) <= max_chars:
      # keep a space between sentences so words do not fuse across the boundary
      current += " " + sent
    else:
      pieces.append(current)
      current = sent
  if current:
    pieces.append(current)
  return pieces


def price_direction_label(transcript):
  """Return 0 (no change), 1 (increase) or 2 (decrease) for one transcript.

  The move from the day-before close to the day-after close is compared with
  a band of `closing_price_day_of * daily_volatility`; moves inside the band
  count as "no change".
  NOTE(review): each closing-price field is indexed with [-1], i.e. its last
  element is used; presumably `daily_volatility` is a fraction of price —
  confirm against the transcript schema.
  """
  price_before = transcript['closing_price_day_before'][-1]
  price_day_of = transcript['closing_price_day_of'][-1]
  price_after = transcript['closing_price_day_after'][-1]
  price_volatility = transcript['daily_volatility']

  price_difference = price_after - price_before
  volatility_difference = price_day_of * price_volatility
  if abs(price_difference) - volatility_difference <= 0:
    # No price change (within volatility range)
    return 0
  if price_difference > 0:
    # Price increase
    return 1
  # Price decrease
  return 2


# os.listdir returns entries in arbitrary order; sort so the train/test split is reproducible
for filename in sorted(os.listdir(directory)):
  f = os.path.join(directory, filename)
  if not (os.path.isfile(f) and f.endswith('.json')):
    continue

  # Read the transcript and release the file handle before processing it
  with open(f, encoding='utf-8') as file:
    transcript = json.load(file)
  transcript_data.append(transcript)

  # Extract q&a answers from transcript (skip the operator and analysts)
  answers = [x['text'] for x in transcript['text_blocks'] if x['section'] == "Questions and Answers" and x['speaker'] != "Operator" and x['speaker'][-7:] != "Analyst"]

  # Split each answer into chunks of complete sentences, without combining adjacent answers
  transcript_chunks = [piece for a in answers for piece in split_answer(a)]

  # Compute the label before touching any list so data and labels cannot get out of step
  label = price_direction_label(transcript)

  if len(input_data) < N_TRAIN:
    input_data.append(transcript_chunks)
    input_data_labels.append(label)
  else:
    test_data.append(transcript_chunks)
    test_data_labels.append(label)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Show the price-direction labels: training split first, then the held-out split
for split_labels in (input_data_labels, test_data_labels):
  print(split_labels)

[1, 0, 2, 2, 2, 1, 2, 0, 0, 2, 0, 1, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 2, 0, 1, 1, 0, 1, 2, 2, 1, 1, 1, 1, 1]
[0, 2, 0, 1, 2, 2, 0, 0, 2, 2, 0]


In [None]:
# Load the pretrained FinBERT checkpoint: its tokenizer and its 3-label sequence classifier
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Feed the model a few items at a time to save RAM
def chunks(lst, n):
  """Lazily yield consecutive slices of `lst`, each holding at most `n` items.

  The final slice is shorter when len(lst) is not a multiple of n; an empty
  `lst` yields nothing.
  """
  yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Classify every text chunk of each training transcript with FinBERT and
# count, per transcript, how many chunks fell into each predicted class.
batch_size = 1
val_counts = []

# Inference only: without no_grad PyTorch records an autograd graph for every
# forward pass, which costs memory and time for gradients that are never used.
with torch.no_grad():
  for data in input_data:
    ensemble_labels = []
    for batch in chunks(data, batch_size):
      # Tokenize input data
      inputs = tokenizer(batch, padding = True, truncation = True, max_length = 512, return_tensors='pt')

      # Run model and get outputs
      outputs = finbert(**inputs)

      # Get output labels: index of the largest logit for each chunk in the batch
      ensemble_labels.extend(outputs['logits'].argmax(dim=-1).tolist())

    # Add to count
    val_counts.append(Counter(ensemble_labels))

In [None]:
# Fit the second-stage classifier.
# Features: for each training transcript, the number of chunks predicted as class 0, 1 and 2
# (a Counter yields 0 for a class that never occurred).
# Targets: the price-direction labels collected during preprocessing.
lr_input_data = [[counts[cls] for cls in (0, 1, 2)] for counts in val_counts]
lr = LogisticRegression()
lr.fit(lr_input_data, input_data_labels)

In [None]:
# Classify every text chunk of each held-out transcript with FinBERT and
# count, per transcript, how many chunks fell into each predicted class.
test_counts = []
batch_size = 1

# Inference only: without no_grad PyTorch records an autograd graph for every
# forward pass, which costs memory and time for gradients that are never used.
with torch.no_grad():
  for data in test_data:
    ensemble_labels = []
    for batch in chunks(data, batch_size):
      # Tokenize input data
      inputs = tokenizer(batch, padding = True, truncation = True, max_length = 512, return_tensors='pt')

      # Run model and get outputs
      outputs = finbert(**inputs)

      # Get output labels: index of the largest logit for each chunk in the batch
      ensemble_labels.extend(outputs['logits'].argmax(dim=-1).tolist())

    # Add to count
    test_counts.append(Counter(ensemble_labels))

In [None]:
# Predict price direction for the held-out transcripts.
# Features mirror the training layout: chunk counts for predicted class 0, 1 and 2.
lr_test_data = [[counts[cls] for cls in (0, 1, 2)] for counts in test_counts]
output_labels = lr.predict(lr_test_data)

In [None]:
# Predicted price-direction class for each held-out transcript, in test_data order
print(output_labels)

[0 1 1 0 0 1 1 1 0 1 2]


In [None]:
# Generate and print performance metrics.
# The class ids are passed explicitly so the three target names always line up with
# labels 0/1/2: with a test set this small a class can be missing from both the true
# and predicted labels, and the report would then reject the 3-entry target_names.
# zero_division=0 reports 0 for classes that were never predicted, without the warning.
target_names = ['no change', 'increase', 'decrease']
print(classification_report(test_data_labels, output_labels, labels=[0, 1, 2], target_names=target_names, digits=3, zero_division=0))

              precision    recall  f1-score   support

   no change      0.250     0.200     0.222         5
    increase      0.000     0.000     0.000         1
    decrease      0.000     0.000     0.000         5

    accuracy                          0.091        11
   macro avg      0.083     0.067     0.074        11
weighted avg      0.114     0.091     0.101        11

