In [1]:
import nltk
import vader
import glob
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments,Trainer, pipeline
import numpy as np
import torch
from datasets import load_dataset
import os

In [2]:
# We can use either a general model such as roberta or a model that has been fine tuned on financial linguo 
#model_name = "siebert/sentiment-roberta-large-english"

model_name = "ProsusAI/finbert"
training_args =  TrainingArguments(output_dir="sentiment_logs/", disable_tqdm=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)
trainer = Trainer(model=model, args=training_args)

In [3]:
# A simple dataset class to use for the sentiment analysis trainer
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

In [4]:
data = pd.DataFrame(columns=["transcript","speaker", "text", "tokens", "pred", "label", "score"])

In [5]:
sentiment_pipeline("This is a good day", return_all_scores=True)



[[{'label': 'positive', 'score': 0.6927558779716492},
  {'label': 'negative', 'score': 0.017901284620165825},
  {'label': 'neutral', 'score': 0.28934288024902344}]]

In [7]:
"/"  in "a/a"

False

In [8]:
for i,transcript_file in enumerate(glob.glob('transcripts/*.txt')):
    print(f"Processing file {i}/{len(glob.glob('transcripts/*.txt'))} : ({transcript_file})")
    with open(transcript_file, 'r') as f:
        transcript_name = transcript_file.split("/")[-1].split(".")[0]
        transcript = f.readlines()
        processed_transcript = {"OPERATOR": []}
        current_speaker = "OPERATOR"
        for line in transcript:
            
            if line[0]=="[" and len(line)<40 and "/" not in line:
                current_speaker = line.split("]")[0][1:]
                if current_speaker not in processed_transcript:
                    processed_transcript[current_speaker] = []
            else:
                for sentence in line.split("."):
                    processed_transcript[current_speaker].append(sentence)
        for speaker in processed_transcript:
            os.makedirs(f"dataset/{transcript_name}", exist_ok=True)
            with open(f"dataset/{transcript_name}/{speaker}.txt", 'w') as f:
                f.write("\n".join(processed_transcript[speaker]))
       

Processing file 0/4253 : (transcripts/organogenesis-holdings-inc-orgo-q4-2022-earnings-call-transcript.txt)
Processing file 1/4253 : (transcripts/perseus-mining-limited-pmnxf-q3-2023-earnings-call-transcript.txt)
Processing file 2/4253 : (transcripts/slang-worldwide-inc-slgwf-q4-2022-earnings-call-transcript.txt)
Processing file 3/4253 : (transcripts/marathon-oil-corporation-mro-q1-2023-earnings-call-transcript.txt)
Processing file 4/4253 : (transcripts/walmart-inc-wmt-raymond-james-44th-annual-institutional-investors-conference-transcript.txt)
Processing file 5/4253 : (transcripts/usa-compression-partners-lp-usac-q1-2023-earnings-call-transcript.txt)
Processing file 6/4253 : (transcripts/playtika-holding-corp-pltk-q1-2023-earnings-call-transcript.txt)
Processing file 7/4253 : (transcripts/natural-grocers-vitamin-cottage-inc-ngvc-q2-2023-earnings-call-transcript.txt)
Processing file 8/4253 : (transcripts/capstone-copper-corp-csccf-q1-2023-earnings-call-transcript.txt)
Processing file 9

In [9]:
sentiment_pipeline("This is a good day", return_all_scores=True)



[[{'label': 'positive', 'score': 0.6927558779716492},
  {'label': 'negative', 'score': 0.017901284620165825},
  {'label': 'neutral', 'score': 0.28934288024902344}]]

In [10]:
#dataset = load_dataset("text", data_dir="dataset")
#predictions = sentiment_pipeline(dataset["train"]["text"])

In [None]:
""" data = pd.DataFrame(columns=["transcript", "label", "score"])
acc = []
for transcript_file in glob.glob("transcripts/*.txt"):
    transcript= transcript_file.split("/")[-1].split(".")[0]
    try:
        dataset = load_dataset("text", data_files=transcript_file)
        predictions = sentiment_pipeline(dataset["train"]["text"])
        for text, pred in zip(dataset["train"]["text"], predictions):
            acc.append({"transcript": transcript, "label": pred["label"], "score": pred["score"]})
    except Exception as e:
        print(f"Error with {transcript_file} : {e}")
data = pd.DataFrame.from_records(acc)
data.to_csv("reduced_sentiment_analysis_18_may.csv", index=False) """

' data = pd.DataFrame(columns=["transcript", "label", "score"])\nacc = []\nfor transcript_file in glob.glob("transcripts/*.txt"):\n    transcript= transcript_file.split("/")[-1].split(".")[0]\n    try:\n        dataset = load_dataset("text", data_files=transcript_file)\n        predictions = sentiment_pipeline(dataset["train"]["text"])\n        for text, pred in zip(dataset["train"]["text"], predictions):\n            acc.append({"transcript": transcript, "label": pred["label"], "score": pred["score"]})\n    except Exception as e:\n        print(f"Error with {transcript_file} : {e}")\ndata = pd.DataFrame.from_records(acc)\ndata.to_csv("reduced_sentiment_analysis_18_may.csv", index=False) '

In [11]:
data = pd.DataFrame(columns=["transcript","speaker", "text", "label", "score"])
acc = []
for speaker_loc in glob.glob("dataset/*/*.txt"):
    speaker = speaker_loc.split("/")[-1].split(".")[0]
    transcript= speaker_loc.split("/")[-2]
    try:
        dataset = load_dataset("text", data_files=speaker_loc)
        predictions = sentiment_pipeline(dataset["train"]["text"])
        for text, pred in zip(dataset["train"]["text"], predictions):
            acc.append({"transcript": transcript, "speaker": speaker, "text": text, "label": pred["label"], "score": pred["score"]})
    except Exception as e:
        print(f"Error with {speaker_loc} : {e}")
data = pd.DataFrame.from_records(acc)


Downloading and preparing dataset text/default to /home/lucastrg/.cache/huggingface/datasets/text/default-18a3a0a25c8d0651/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/lucastrg/.cache/huggingface/datasets/text/default-18a3a0a25c8d0651/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset text/default to /home/lucastrg/.cache/huggingface/datasets/text/default-75dcf7b1ac45e180/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/lucastrg/.cache/huggingface/datasets/text/default-75dcf7b1ac45e180/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

data.to_csv("sentiment_analysis_18_may.csv", index=False)

In [None]:
data

Unnamed: 0,transcript,speaker,text,label,score
0,ashford-inc-ainc-q1-2023-earnings-call-transcript,OPERATOR,Ashford Inc,neutral,0.925325
1,ashford-inc-ainc-q1-2023-earnings-call-transcript,OPERATOR,(NYSE:AINC) Q1 2023 Earnings Conference Call ...,neutral,0.591237
2,ashford-inc-ainc-q1-2023-earnings-call-transcript,OPERATOR,Welcome to the Ashford Incorported First Quar...,neutral,0.873380
3,ashford-inc-ainc-q1-2023-earnings-call-transcript,OPERATOR,At this time all participants are in a listen...,neutral,0.953002
4,ashford-inc-ainc-q1-2023-earnings-call-transcript,OPERATOR,A question-and-answer session will follow the...,neutral,0.952218
...,...,...,...,...,...
358475,ring-energy-rei-q1-2023-earnings-call-transcript,Jeff Grampp,Got it,neutral,0.913204
358476,ring-energy-rei-q1-2023-earnings-call-transcript,Jeff Grampp,Okay,neutral,0.896818
358477,ring-energy-rei-q1-2023-earnings-call-transcript,Jeff Grampp,That’s helpful,neutral,0.815940
358478,ring-energy-rei-q1-2023-earnings-call-transcript,Jeff Grampp,Thanks guys,neutral,0.725234


In [None]:
for text in data[(data.label == "NEGATIVE")].text:
    print(text)

Playtika Holding Corp
 [Operator Instructions] Please be advised that today's conference is being recorded
 I'd like to remind you that today's discussion may contain forward-looking statements including, but not limited to, the company's anticipated future revenue and operating performance
 These statements and other comments are not a guarantee of future performance, but rather are subject to risks and uncertainties, some of which are beyond our control
 These forward-looking statements apply as of today, and you should not rely on them as representing our views in the future
 We undertake no obligation to update these statements after this call
 For a more complete discussion of the risks and uncertainties, please see our filings with the SEC
 With that, I will now turn the call over to Robert
7 million
 I will now turn it over to Craig
1% year-over-year
 We made the strategic decision to shift more of our user acquisition spend to our casual growth titles
9% compared to 32
1% in Q4

In [None]:
tokenizer(["So with recent bank failures, I'm curious where your franchisees predominantly get their financing from and what impact, if any, the recent significant tightening of lending standards is having on your franchisees and whether that was also contributing to slower store growth this year outside of construction and permitting delays?\n"]
, padding=True, truncation=True)

{'input_ids': [[0, 2847, 19, 485, 827, 12055, 6, 38, 437, 10691, 147, 110, 3468, 293, 15351, 120, 49, 5200, 31, 8, 99, 913, 6, 114, 143, 6, 5, 485, 1233, 12872, 9, 6946, 2820, 16, 519, 15, 110, 3468, 293, 8, 549, 14, 21, 67, 8216, 7, 9992, 1400, 434, 42, 76, 751, 9, 1663, 8, 19289, 6091, 116, 50118, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
trainer.predict(SimpleDataset(tokenizer(["So with recent bank failures, I'm curious where your franchisees predominantly get their financing from and what impact, if any, the recent significant tightening of lending standards is having on your franchisees and whether that was also contributing to slower store growth this year outside of construction and permitting delays?\n"]
, padding=True, truncation=True)))

PredictionOutput(predictions=array([[ 3.0047798, -2.1745396]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.0255, 'test_samples_per_second': 39.285, 'test_steps_per_second': 39.285})

In [None]:
len(processed_transcript["OPERATOR"])

537

In [None]:
len(processed_transcript["OPERATOR"][0])

14

In [None]:
len(processed_transcript["OPERATOR"][0][1])

1

In [None]:
processed_transcript[speaker]=np.array(processed_transcript[speaker])


In [None]:
processed_transcript["OPERATOR"][:,1].tolist()

TypeError: list indices must be integers or slices, not tuple