In [1]:
import os
import sys
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import torch
import nltk
import spacy
import tensorflow as tf
from tqdm import tqdm

from transformers import (
    BertTokenizer,
    Trainer,
    BertForSequenceClassification,
    TrainingArguments,
)
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, pipeline

In [2]:
import platform

# Report the OS/architecture string and whether this PyTorch build was compiled
# with MPS support. These were bare expressions in the middle of the cell, and
# Jupyter only echoes a cell's *last* expression, so their values were discarded.
print(platform.platform())
print("MPS built:", torch.backends.mps.is_built())

# `mps_device` is only defined when the MPS backend is usable on this machine.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    # Optional smoke test:
    # x = torch.ones(1, device=mps_device)
    # print(x)
else:
    print("MPS device not found.")

In [3]:
# Load spaCy's "en_core_web_lg" pipeline once at module level; remove_comma and
# sentiment_focus below both call this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")


def remove_comma(sentence):
    """Drop punctuation tokens that sit directly before a ROOT or conj token.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. Despite
    the function name, any token whose dependency label is ``"punct"`` qualifies
    (not only commas), provided the token right after it has the dependency
    label ``"ROOT"`` or ``"conj"``.

    Args:
        sentence: Text to process.

    Returns:
        ``sentence`` itself, unchanged, when no token qualifies. Otherwise the
        text between the removed tokens, each piece stripped of surrounding
        whitespace and the pieces joined with single spaces.
    """
    doc = nlp(sentence)
    token_count = len(doc)

    # A trailing punctuation token has no successor and therefore never qualifies.
    cut_points = [
        position
        for position, tok in enumerate(doc)
        if tok.dep_ == "punct"
        and position + 1 < token_count
        and doc[position + 1].dep_ in ("ROOT", "conj")
    ]
    if not cut_points:
        return sentence

    # Each kept piece starts right after one removed token (or at the start of
    # the doc) and ends right before the next removed token (or at the end).
    starts = [0] + [cut + 1 for cut in cut_points]
    stops = cut_points + [token_count]
    pieces = [doc[begin:end].text.strip() for begin, end in zip(starts, stops)]
    return " ".join(pieces)


def sentiment_focus(sentence):
    """Reduce a sentence to the clause that follows/excludes a contrast marker.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and the
    rules below are tried in order; the first one that matches decides the result.

    1. "but" (any token except the last): keep everything after the first "but".
    2. "although" / "though" (first occurrence in sentence order). Let *back* be
       the first comma at or after the marker and *front* the last comma before it:
         - neither comma exists: keep the whole text;
         - only *front* exists:  keep the text before *front*;
         - only *back* exists:   keep the text after *back*;
         - both exist:           keep the text before *front* concatenated with
                                 the text from *back* (comma included) onward.
    3. First token is "while": keep everything after the first comma, or the
       whole text when there is no comma.
    4. Otherwise: keep the whole text.

    Args:
        sentence: Text to process.

    Returns:
        A ``(text, focus_changed)`` tuple. ``text`` is the selected text with
        surrounding whitespace stripped. ``focus_changed`` is 1 whenever rule
        1, 2 or 3 matched and 0 otherwise.

        NOTE: the flag is 1 even in the sub-cases of rules 2 and 3 where no
        comma is found and the whole text is returned unchanged.
    """
    doc = nlp(sentence)

    # An empty input produces a Doc with no tokens; `doc[0]` in the "while"
    # check below would raise IndexError, so report "nothing changed" up front.
    if len(doc) == 0:
        return str(doc).strip(), 0

    focus = ""
    focus_changed = 1

    # Rule 1: "but". The last token is skipped, so a sentence-final "but"
    # does not trigger this rule.
    for token in doc[:-1]:
        if token.lower_ == "but":
            focus = doc[token.i + 1 :]
            return str(focus).strip(), focus_changed

    # Rule 2: "although" / "though". Commas are searched across the whole doc
    # (token.i is a doc-level index), not only within the current sentence.
    for sent in doc.sents:
        for token in sent:
            if token.lower_ == "although" or token.lower_ == "though":
                commas_after = [tok.i for tok in doc[token.i :] if tok.text == ","]
                commas_before = [tok.i for tok in doc[: token.i] if tok.text == ","]

                if not commas_after:
                    if not commas_before:
                        return str(doc).strip(), focus_changed
                    focus = doc[: commas_before[-1]].text
                    return str(focus).strip(), focus_changed

                if not commas_before:
                    focus = doc[commas_after[0] + 1 :].text
                    return str(focus).strip(), focus_changed

                # The second slice starts at the comma itself, so the comma
                # separates the two kept parts in the result.
                focus = doc[: commas_before[-1]].text + doc[commas_after[0] :].text
                return str(focus).strip(), focus_changed

    # Rule 3: sentence-initial "while".
    if doc[0].lower_ == "while":
        comma_positions = [tok.i for tok in doc if tok.text == ","]
        if not comma_positions:
            return str(doc).strip(), focus_changed
        focus = doc[comma_positions[0] + 1 :].text
        return str(focus).strip(), focus_changed

    # Rule 4: no contrast marker found.
    focus_changed = 0
    return str(doc).strip(), focus_changed

In [4]:
# Importing Configs
# Directory that contains config.py. Set the FINANCE_ML_ROOT environment
# variable to run from another checkout; the default is the original Mac path.
# Linux checkout used previously:
#   /home/kwnabors/Documents/GitHub/Finance-ML-Modeling
project_root = os.environ.get(
    "FINANCE_ML_ROOT", "/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling"
)
os.chdir(project_root)
config_file_path = os.getcwd()
print(config_file_path)

# Make config.py importable. Guarded so that re-running this cell does not keep
# appending the same directory to sys.path.
if config_file_path not in sys.path:
    sys.path.append(config_file_path)

# Must come after the sys.path change above, which is why it is not in the
# top import cell.
import config

# Variables, parameters, and pathnames needed for this script
database_file = config.database
database_folder = config.database_folder
bert_models = config.bert_models
bert_models_local = config.bert_models_local
keywords = config.keywords
finbert_models = config.finbert_models

Body = config.Body
Model = config.Model
Model_Subfolder = f"/{Body} Texts/{Model}"
Model_Folder = config.texts + Model_Subfolder

# Initial load of the configured text set. The per-body loops further down
# reload `df` for each document type they process.
df = pd.read_csv(f"{Model_Folder}/{Model}_texts.csv")
if Model == "Beige Book":
    # The language filter is not applied to the Beige Book set.
    print("skip")
else:
    df = df[df["language"] == "en"]

/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling
/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling
/Users/kylenabors/Documents
/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling


In [5]:
# Alternative model (ProsusAI/finbert), currently disabled:
# tokenizer_1 = AutoTokenizer.from_pretrained("ProsusAI/finbert", force_download=True)
# model_1 = AutoModelForSequenceClassification.from_pretrained(
#     "ProsusAI/finbert", force_download=True
# )
# model_1 = model_1.to("mps")

# Choose the device at run time instead of assuming Apple MPS: a hard-coded
# "mps" fails on machines without that backend. MPS stays the first choice, so
# behaviour on the original Mac setup is unchanged.
if torch.backends.mps.is_available():
    inference_device = "mps"
elif torch.cuda.is_available():
    inference_device = "cuda"
else:
    inference_device = "cpu"

# FinBERT-FOMC sequence classifier and its tokenizer, wrapped in a
# text-classification pipeline that the scoring loops call once per sentence.
model_2 = BertForSequenceClassification.from_pretrained(
    "ZiweiChen/FinBERT-FOMC", num_labels=3
).to(inference_device)
tokenizer_2 = BertTokenizer.from_pretrained("ZiweiChen/FinBERT-FOMC")
finbert_fomc = pipeline(
    "text-classification",
    model=model_2,
    tokenizer=tokenizer_2,
    device=inference_device,
)


# Index-to-label lookups and scratch accumulators. None of these are referenced
# by the other cells visible in this notebook; kept so the notebook's global
# state is unchanged.
labels = {0: "positive", 1: "negative", 2: "neutral"}
labels2 = {0: "neutral", 1: "positive", 2: "negative"}
out_1 = []
out_2 = []
sent_val = list()
tone_val = list()
long = 0
errors = 0
total = 0

In [6]:
# Fed document types to score in the loop that iterates over Fed_list.
# Uncomment an entry to include it; with every entry commented out the list is
# empty and that loop body never runs.
Fed_list = [
    # "Speeches",
    # "Minutes",
    # "Beige Book",
    # "Statements",
]

In [7]:
from tqdm.auto import tqdm

# Registers Series.progress_apply. It does not depend on the loop variable, so
# it is done once here rather than on every iteration.
tqdm.pandas()

# Pipeline labels -> numeric sentiment score.
sentiment_scores = {"Positive": 1, "Neutral": 0, "Negative": -1}

# Score every selected Fed document type with the FinBERT-FOMC pipeline and
# write one CSV per type.
for model_name in Fed_list:
    Body = "Fed"
    Model = model_name
    Model_Subfolder = f"/{Body} Texts/{Model}"
    Model_Folder = config.texts + Model_Subfolder

    df = pd.read_csv(f"{Model_Folder}/{Model}_texts.csv")
    if Model == "Beige Book":
        # The language filter is not applied to the Beige Book set.
        print("skip")
    else:
        df = df[df["language"] == "en"]

    # Dates are compared as strings, which assumes ISO "YYYY-MM-DD" values
    # -- TODO confirm against the CSV. `.copy()` makes the filtered frame
    # independent so the column assignments below do not write to a slice
    # (SettingWithCopyWarning).
    df = df[(df["date"] >= "1999-01-01") & (df["date"] <= "2024-06-30")].copy()

    # Simplify each segment: strip clause-joining punctuation, then keep only
    # the clause selected by sentiment_focus. The focus_changed flag it returns
    # was dropped immediately in the original, so only element 0 is kept; this
    # also avoids the slow `.apply(pd.Series)` expansion, which fails on an
    # empty frame.
    df["sentence_simple"] = df["segment"].progress_apply(remove_comma)
    df["sentence_simple"] = df["sentence_simple"].progress_apply(
        lambda text: sentiment_focus(text)[0]
    )

    # Keep texts shorter than 512 *characters*; presumably a conservative proxy
    # for the model's input-length limit -- TODO confirm.
    df["len"] = df["sentence_simple"].apply(len)
    df = df[df["len"] < 512].copy()

    # One pipeline call per row; the pipeline returns a list whose first item
    # carries the predicted label.
    predicted_labels = df["sentence_simple"].progress_apply(
        lambda text: finbert_fomc(text)[0]["label"]
    )
    # Labels missing from the mapping pass through unchanged, matching what
    # `.replace` did, without relying on its deprecated downcasting.
    df["sentiment"] = predicted_labels.map(
        lambda label: sentiment_scores.get(label, label)
    )
    print(df["sentiment"].unique())

    df = df[["date", "segment", "sentiment", "type"]]
    # Create the target folder first so a long scoring run cannot fail at the
    # final write just because the directory is missing.
    output_dir = f"{finbert_models}/{Body}/{Model}"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/{Body}_{Model}_finbert_model_short.csv")

In [8]:
# ECB document types to score in the loop that iterates over ECB_list.
# Comment or uncomment entries to choose which text sets are processed.
ECB_list = [
    # "Speeches",
    # "Monetary policy decisions",
    "Economic Bulletin",
    # "Press Conferences",
]

In [9]:
# Registers Series.progress_apply. It does not depend on the loop variable, so
# it is done once here rather than on every iteration.
tqdm.pandas()

# Pipeline labels -> numeric sentiment score.
sentiment_scores = {"Positive": 1, "Neutral": 0, "Negative": -1}

# Score every selected ECB document type with the FinBERT-FOMC pipeline and
# write one CSV per type.
for model_name in ECB_list:
    Body = "ECB"
    Model = model_name
    Model_Subfolder = f"/{Body} Texts/{Model}"
    Model_Folder = config.texts + Model_Subfolder

    df = pd.read_csv(f"{Model_Folder}/{Model}_texts.csv")
    if Model == "Beige Book":
        # The language filter is not applied to the Beige Book set.
        print("skip")
    else:
        df = df[df["language"] == "en"]

    # Dates are compared as strings, which assumes ISO "YYYY-MM-DD" values
    # -- TODO confirm against the CSV. `.copy()` makes the filtered frame
    # independent so the column assignments below do not write to a slice
    # (SettingWithCopyWarning).
    df = df[(df["date"] >= "1999-01-01") & (df["date"] <= "2024-06-30")].copy()

    # Simplify each segment: strip clause-joining punctuation, then keep only
    # the clause selected by sentiment_focus. The focus_changed flag it returns
    # was dropped immediately in the original, so only element 0 is kept; this
    # also avoids the slow `.apply(pd.Series)` expansion, which fails on an
    # empty frame.
    df["sentence_simple"] = df["segment"].progress_apply(remove_comma)
    df["sentence_simple"] = df["sentence_simple"].progress_apply(
        lambda text: sentiment_focus(text)[0]
    )

    # Keep texts shorter than 512 *characters*; presumably a conservative proxy
    # for the model's input-length limit -- TODO confirm.
    df["len"] = df["sentence_simple"].apply(len)
    df = df[df["len"] < 512].copy()

    # One pipeline call per row; the pipeline returns a list whose first item
    # carries the predicted label.
    predicted_labels = df["sentence_simple"].progress_apply(
        lambda text: finbert_fomc(text)[0]["label"]
    )
    # Labels missing from the mapping pass through unchanged, matching what
    # `.replace` did, without relying on its deprecated downcasting.
    df["sentiment"] = predicted_labels.map(
        lambda label: sentiment_scores.get(label, label)
    )
    print(df["sentiment"].unique())

    df = df[["date", "segment", "sentiment", "type"]]
    # Create the target folder first so a long scoring run cannot fail at the
    # final write just because the directory is missing.
    output_dir = f"{finbert_models}/{Body}/{Model}"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/{Body}_{Model}_finbert_model_short.csv")

  0%|          | 0/110183 [00:00<?, ?it/s]

  0%|          | 0/110183 [00:00<?, ?it/s]

  0%|          | 0/102694 [00:00<?, ?it/s]

[ 0 -1  1]


In [10]:
# Final cell: prints a completion marker.
print("Totally Done")

Totally Done
