In [None]:
import os 
import sys
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import torch 
import nltk
import spacy
import tensorflow as tf

# Fetch the NLTK 'punkt' resource.
# NOTE(review): presumably needed by the sent_tokenize import above; no visible
# cell calls sent_tokenize, so confirm this download is still required.
nltk.download('punkt')

In [None]:
import platform

# Query the platform string and whether this torch build was compiled with MPS
# support. Neither result is stored or printed; the calls are kept as-is.
platform.platform()
torch.backends.mps.is_built()

# Smoke test: when MPS is usable, allocate a one-element tensor on it and show it.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

In [None]:

# Importing configs
# config.py lives at the root of the Finance-ML-Modeling checkout. Try each known
# checkout location in order (Mac first, then Linux) so the cell runs on either
# machine without commenting lines in and out.
REPO_CANDIDATES = (
    '/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling',  # Mac
    '/home/kwnabors/Documents/GitHub/Finance-ML-Modeling',     # Linux
)
repo_root = next((path for path in REPO_CANDIDATES if os.path.isdir(path)), None)
if repo_root is None:
    # Same exception type os.chdir would raise for a missing directory,
    # but with a message that lists every location that was tried.
    raise FileNotFoundError(
        "Finance-ML-Modeling checkout not found; looked in: " + ", ".join(REPO_CANDIDATES)
    )

os.chdir(repo_root)
config_file_path = os.getcwd()
print(config_file_path)

# Make config.py importable; skip the append when the cell is re-run so
# sys.path does not accumulate duplicate entries.
if config_file_path not in sys.path:
    sys.path.append(config_file_path)

# Must come after the sys.path update above, so it cannot live in the top import cell.
import config

# Variables, parameters, and pathnames needed for this script
database_file = config.database
database_folder = config.database_folder
bert_models = config.bert_models
bert_models_local = config.bert_models_local
keywords = config.keywords
finbert_models = config.finbert_models

# Folder holding the text blocks for the configured body/model pair.
Body = config.Body
Model = config.Model
Model_Subfolder = f'/{Body} Texts/{Model}'
Model_Folder = config.texts + Model_Subfolder

# Load the text blocks and keep only rows whose 'language' column is 'en'.
df = pd.read_csv(f"{Model_Folder}/{Model}_texts_blocks.csv")
df = df[df['language'] == 'en']


In [None]:
# FinBERT
# Load both checkpoints directly by their Hugging Face model names.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Use the MPS backend when it is available and fall back to CPU otherwise,
# instead of calling .to('mps') unconditionally and failing on machines without it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Model 1: ProsusAI/finbert
tokenizer_1 = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model_1 = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)

# Model 2: yiyanghkust/finbert-tone (three output classes)
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',
                                                        num_labels=3).to(device)

tokenizer_2 = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer_2)

# Class-index -> label lookup in the order documented on the
# yiyanghkust/finbert-tone model card (0 neutral, 1 positive, 2 negative).
labels = {0:'neutral', 1:'positive',2:'negative'}

# Result accumulators and counters filled in by the scoring loop.
out_1= []
out_2 = []
sent_val = list()
tone_val = list()
long = 0
errors = 0
total = 0

In [None]:

# Score every segment with both FinBERT checkpoints.
#
# Start from empty accumulators so that re-running this cell cannot duplicate
# rows in the outputs or double-count totals.
out_1, out_2 = [], []
errors = 0
total = 0

# Send the tokenized inputs to wherever each model's weights actually live,
# rather than assuming an 'mps' device exists.
device_1 = model_1.device
device_2 = finbert.device

# ProsusAI/finbert orders its classes differently from finbert-tone, so its
# predictions must be decoded with the model's own id2label mapping and not
# with the `labels` dict (which follows the finbert-tone order).
# The indices are resolved once here so a missing label fails loudly up front
# instead of being swallowed as a per-row error.
prosus_id2label = {int(idx): str(name).lower() for idx, name in model_1.config.id2label.items()}
prosus_label2id = {name: idx for idx, name in prosus_id2label.items()}
pos_idx = prosus_label2id['positive']
neg_idx = prosus_label2id['negative']
neu_idx = prosus_label2id['neutral']

for _, row in df.iterrows():
    docs = str(row["segment"])
    timestamps = row['date']
    title = row['title']
    doc_num = row['doc_num']

    total += 1
    try:
        # Inference only: skip building autograd graphs.
        with torch.no_grad():
            # No truncation is requested, so segments longer than the model
            # accepts presumably fail here and are counted as errors below.
            inputs_1 = tokenizer_1(docs, return_tensors="pt", padding='max_length', max_length=511).to(device_1)
            logits_1 = model_1(**inputs_1).logits
            probs = torch.nn.functional.softmax(logits_1, dim=-1).cpu().numpy()[0]

            positive = probs[pos_idx]
            negative = probs[neg_idx]
            neutral = probs[neu_idx]
            net = prosus_id2label[int(np.argmax(probs))]

            out_1.append([doc_num, timestamps, title, docs, positive, negative, neutral, net])

            inputs_2 = tokenizer_2(docs, return_tensors="pt", padding='max_length', max_length=511).to(device_2)
            logits_2 = finbert(**inputs_2)[0]
            tone = labels[int(np.argmax(logits_2.cpu().numpy()))]

            # Exactly five fields, matching the five columns the tone frame is built with.
            out_2.append([doc_num, timestamps, title, docs, tone])

    except Exception:
        # Best-effort: count the failed row and move on. Catching Exception
        # (not a bare except) keeps Ctrl-C / kernel interrupts working.
        errors += 1

# Guard against an empty frame, where total stays 0.
percent = (errors / total) * 100 if total else 0.0
print(f'Errors Long: {errors}')
print(f'Errors Long %: {percent}')

In [None]:
# Column layouts for the two result tables.
sentiment_columns = ["doc_num", "date", "title", "segment", "positive", "negative", "neutral", "sentiment"]
tone_columns = ["doc_num", "date", "title", "segment", "tone"]

# Numeric code substituted for each sentiment label.
sentiment_codes = {'positive': 1, 'neutral' : 0, 'negative' : -1}

# Build the sentiment table and swap its label column for the numeric codes.
df_out_1 = pd.DataFrame(out_1, columns=sentiment_columns)
df_out_1["sentiment"] = df_out_1["sentiment"].replace(sentiment_codes)

# Build the tone table.
df_out_2 = pd.DataFrame(out_2, columns=tone_columns)

# Write both tables to the FinBERT output folder, sentiment first, then tone.
for frame, suffix in ((df_out_1, "sent"), (df_out_2, "tone")):
    frame.to_csv(f"{finbert_models}/{Body}_{Model}_finbert model_{suffix}.csv")

print('done')