## Imports

In [3]:
# standard library imports

from io import StringIO
import os, re, json, time
from datetime import datetime

# third-party imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch
import requests
import numpy as np
import pandas as pd
import concurrent.futures
import scipy.stats as stats
from datasets import Dataset
import matplotlib.pyplot as plt
from IPython.display import display
import pandas_market_calendars as mcal
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# local imports
import executor as config
from assets import credential as cred

# downloads and settings
# fetch the NLTK resources used by word_tokenize, WordNetLemmatizer and the stop-word list
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
# raise the pandas display limits so long frames and long text cells stay visible
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 100)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Functions

### API

In [1]:
def leeway(function, ticker, time_from = "2021-01-01", time_to = "2022-12-31", limit = 1000, session=None):   # TODO: remove defaults from time_from and time_to
    """
    Fetches financial data from the Leeway API based on the specified function, ticker, and time range.

    Args:
    - function (str): The type of data to retrieve, case-insensitive. One of "NEWS", "VALUE",
      "FUNDAMENTALS", "MARKETCAP" or "DIVIDENDS".
    - ticker (str): Ticker symbol of the stock.
    - time_from (str, optional): Starting date for fetching data. Defaults to "2021-01-01".
    - time_to (str, optional): Ending date for fetching data. Defaults to "2022-12-31".
    - limit (int, optional): Maximum number of answers to retrieve (only used by "NEWS"). Defaults to 1000.
    - session (requests.Session, optional): Session to send the request with. If None, a one-off
      request is made through the requests module.

    Returns:
    - dict, list or None: The decoded JSON body, or None if the request fails or the body is not valid JSON.

    Raises:
    - ValueError: If function is not one of the supported names.
    """
    base_url = "https://api.leeway.tech/api/v1/public/"
    endpoint = function.upper()

    # build only the path that is actually requested
    if endpoint == "NEWS":
        path = f"news?apitoken={cred.apikey}&s={ticker}&limit={limit}&from={time_from}&to={time_to}"
    elif endpoint == "VALUE":
        path = f"historicalquotes/{ticker}?apitoken={cred.apikey}&from={time_from}&to={time_to}"
    elif endpoint == "FUNDAMENTALS":
        path = f"fundamentals/{ticker}?apitoken={cred.apikey}" # etf components
    elif endpoint == "MARKETCAP":
        path = f"historicalquotes/marketcap/{ticker}?apitoken={cred.apikey}&from={time_from}&to={time_to}"
    elif endpoint == "DIVIDENDS":
        path = f"dividends/{ticker}?apitoken={cred.apikey}&from={time_from}&to={time_to}&calculations=true"
    else:
        raise ValueError(f"Invalid function: {function}")

    url = base_url + path

    # same error contract with and without a session: any transport or decoding problem yields None
    send = requests.get if session is None else session.get
    try:
        with send(url) as r:
            return r.json()
    except (requests.RequestException, ValueError):
        return None

### Convert JSON to DF

In [None]:
def json_to_df(data_js, columns=None):
    """
    Converts JSON data into a Pandas DataFrame.

    Args:
    - data_js (iterable or None): JSON data to be converted, typically a list of dicts. None
      (e.g. the result of a failed request) is treated like an empty response.
    - columns (list, optional): List of column names to include in the DataFrame. Defaults to None.

    Returns:
    - DataFrame: A Pandas DataFrame created from the JSON data. If columns is given and the data is
      empty, an empty DataFrame carrying the requested columns is returned. If the data is not empty
      but lacks some of the requested columns, the unfiltered DataFrame is returned.
    """
    # materialize the iterable once; None stands for "no data"
    records = [] if data_js is None else list(data_js)
    df = pd.DataFrame(records)

    if columns is None:
        return df

    if df.empty:
        # keep the requested schema so callers can still address the columns
        print("Empty request")
        return pd.DataFrame(columns=[columns] if isinstance(columns, str) else columns)

    try:
        return df[columns]
    except KeyError:
        # requested columns are missing from the response; hand back what we got
        print("Empty request")
        return df

### ETF Components

In [None]:
def extract_from_etf(etfs):
    """
    Collects the holdings of several ETFs in one DataFrame.

    Args:
    - etfs (list): List of ETF symbols.

    Returns:
    - DataFrame: One row per holding with the columns "ticker" and "etf"; empty if etfs is empty.
    """
    # the leading empty frame mirrors the empty accumulator the rows are appended to
    frames = [pd.DataFrame()]

    for symbol in etfs:
        fundamentals = leeway("FUNDAMENTALS", symbol)
        holdings = fundamentals["ETF_Data"]["Holdings"]

        etf_frame = pd.DataFrame()
        etf_frame["ticker"] = json_to_df(holdings)
        etf_frame["etf"] = symbol
        frames.append(etf_frame)

    if len(frames) == 1:
        # nothing was fetched, return the empty frame itself
        return frames[0]

    return pd.concat(frames, ignore_index=True)

### Get News from all the stocks from the given etfs

In [2]:
def get_news_per_row(ticker, start_date, end_date):
    """
    Loads all news of one ticker for a date range, paging backwards past the API's 1000-answer limit.

    Args:
    - ticker (str): Ticker symbol for a stock.
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").

    Returns:
    - DataFrame: News for the ticker, either freshly fetched (and stored in the database in batches)
      or reassembled from the batches already stored under the same name.
    """
    name = f"{ticker}_{start_date}_{end_date}_news"

    # already stored: rebuild the frame from the saved batches
    if load_from_db(name) is not None:
        return combine_data_from_db(name)

    news_columns = ["title", "date", "content"]

    def fetch_oldest_first(date_to):
        # one API answer for [start_date, date_to], oldest article on top when there is any
        page = json_to_df(leeway("NEWS", ticker, start_date, date_to), news_columns)
        if page.empty:
            return page
        return page.sort_values(by="date", ascending=True)

    def oldest_day(frame):
        # day (YYYY-MM-DD) of the first row
        return datetime.fromisoformat(str(frame["date"].iloc[0])).strftime("%Y-%m-%d")

    news_df = fetch_oldest_first(end_date)

    if not news_df.empty:
        first_date = oldest_day(news_df)

        # an oldest day after start_date means the answer was cut off by the limit,
        # so request again with the oldest day as the new upper bound
        while first_date != start_date:
            older_news_df = fetch_oldest_first(first_date)
            news_df = pd.concat([older_news_df, news_df])

            # stop as soon as a request no longer moves the oldest day
            reached_date = oldest_day(news_df)
            if reached_date == first_date:
                break

            first_date = reached_date

    # overlapping requests return the same articles again
    news_df.drop_duplicates(subset=["title"], inplace=True)

    return split_data_to_db(news_df, name, 2000) #TODO: magic number for batch size, should be a parameter but then the name has to be changed as well

def retrieve_and_combine_news(row, start_date, end_date):
    """
    Fetches the news for the ticker of one row and tags every article with its ticker and etf.

    Args:
    - row (pd.Series or dict): A row providing "ticker" and optionally "etf".
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").

    Returns:
    - DataFrame: News of the ticker with the additional columns "ticker" and "etf".
    """
    ticker = row["ticker"]
    etf = row.get("etf", None)

    # get_news_per_row takes care of the 1000-answer limit of the API
    news_df = get_news_per_row(ticker, start_date, end_date)

    return news_df.assign(ticker=ticker, etf=etf)

def get_content(df, start_date, end_date):
    """
    Fetches the news for every row of a ticker DataFrame in parallel and merges the results.

    Args:
    - df (DataFrame): Pandas DataFrame containing relevant data with columns "ticker" and "etf".
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").

    Returns:
    - DataFrame: Combined news with "ticker" and "etf" columns and dates formatted as "YYYY-MM-DD".
    """
    rows = df.to_dict(orient="records")

    def fetch_for_row(row):
        return retrieve_and_combine_news(row, start_date, end_date)

    # one task per row, results come back in row order
    with concurrent.futures.ThreadPoolExecutor() as pool:
        news_per_row = list(pool.map(fetch_for_row, rows))

    combined_df = pd.concat(news_per_row, ignore_index=True)

    # TODO: remove this format and also the formats in the get news function. currenty we have to do this or it will not work
    return format_date(combined_df)  # format date to YYYY-MM-DD

### Format date

In [None]:
def format_date(df, columns="date"):
    """
    Returns a copy of a DataFrame whose date columns are rendered as "YYYY-MM-DD" strings.

    Args:
    - df (DataFrame): Pandas DataFrame containing relevant columns.
    - columns (str or list of str, optional): Column name or list of column names to format. Defaults to "date".

    Returns:
    - DataFrame: Copy of df with the given columns parsed as datetimes and formatted as strings.
    """
    # a single name is handled like a one-element list
    targets = [columns] if isinstance(columns, str) else columns

    formatted = df.copy()
    for name in targets:
        formatted[name] = pd.to_datetime(formatted[name]).dt.strftime("%Y-%m-%d")

    return formatted

### Create Session

In [None]:
def create_session():
    """
    Builds a fresh requests.Session.

    Returns:
    - requests.Session: A new session object for sending HTTP requests.
    """
    session = requests.Session()
    return session

### Database

In [1]:
def save_to_db(data, name, max_retries=3, delay=5):
    """
    Saves data to the database using the API, retrying on failure.

    Args:
    - data: The data to be saved. It is sent as JSON, wrapped in a "data" field.
    - name: A unique name for the data in the database. NOTE: the placeholder URL below does not
      use it yet; presumably the real endpoint URL is built from it.
    - max_retries (int, optional): Number of POST attempts before giving up. Defaults to 3.
    - delay (int or float, optional): Seconds to wait between two attempts. Defaults to 5.

    Returns:
    - int or None: The HTTP status code of the POST request, or None if every attempt failed.
    """
    # IMPORTANT: Replace the placeholder URL with the actual API endpoint
    url = f"This is a placeholder. To use this function, replace this string with the actual URL."

    # the context manager closes the session's connections on every exit path
    with create_session() as session:
        for attempt in range(1, max_retries + 1):
            try:
                r = session.post(url, json={"data": data}) # wrap dataobject in "data"
                return r.status_code
            except Exception as exc:
                # saving is best-effort: report the real cause and keep going
                if attempt == max_retries:
                    # no further attempt follows, so waiting would only waste time
                    print(f"Attempt {attempt} of {max_retries} failed: {exc}.")
                    break
                print(f"Attempt {attempt} of {max_retries} failed: {exc}. Retrying in {delay} seconds...")
                time.sleep(delay)

    print("All retry attempts failed. Could not connect to the server.")
    return None

def load_from_db(name):
    """
    Retrieves data from the database using the API.

    Args:
    - name: The unique name for the data in the database. NOTE: the placeholder URL below does not
      use it yet; presumably the real endpoint URL is built from it.

    Returns:
    - mixed: The content of the "data" field of the response if successful. None if the request
      fails, the body is not valid JSON, or it has no "data" field.
    """
    # IMPORTANT: Replace the placeholder URL with the actual API endpoint
    url = f"This is a placeholder. To use this function, replace this string with the actual URL."
    try:
        # the request itself belongs inside the try: a connection error also means "nothing loaded"
        r = requests.get(url)
        return r.json()["data"] # access the dataobject from data
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # ValueError: body is not JSON, KeyError/TypeError: body has no "data" field
        return None # TODO: This was false and is now None, code should be updated to reflect this

def split_content(df, split_size):
    """
    Cuts a DataFrame into consecutive batches of at most split_size rows.

    Args:
    - df: The DataFrame to be split.
    - split_size: The size of each split.

    Returns:
    - list: The batches in row order; the last one may be shorter. Empty list for an empty DataFrame.
    """
    # round the number of batches up so a partial last batch is included
    full_batches, leftover = divmod(len(df), split_size)
    batch_count = full_batches + 1 if leftover else full_batches

    batches = []
    for batch_no in range(batch_count):
        start = batch_no * split_size
        batches.append(df[start:start + split_size])

    return batches

def split_data_to_db(df, name, split_size):
    """
    Stores a DataFrame in the database as numbered batches.

    The number of batches is saved under name, the batches themselves under
    "<name>_1", "<name>_2", ... as JSON records.

    Args:
    - df: The DataFrame to be split and saved.
    - name: The unique name for the data in the database.
    - split_size: The size of each split.

    Returns:
    - DataFrame: The original DataFrame ("df").
    """
    batches = split_content(df, split_size)

    # the batch count is what a later load uses to find all parts
    save_to_db(len(batches), name)

    for part_no, batch in enumerate(batches, start=1):
        save_to_db(batch.to_json(orient="records"), f"{name}_{part_no}")

    return df

def combine_data_from_db(name):
    """
    Combines previously split data from the database into a single DataFrame.

    Args:
    - name: The unique name for the data in the database.

    Returns:
    - DataFrame: A concatenated DataFrame containing data from all splits that could be loaded.
      An empty DataFrame if the number of splits or every split is unavailable.
    """
    # load the number of splits from the db
    length = load_from_db(name)
    if length is None:
        # without the split count there is nothing we can look up
        return pd.DataFrame()

    # iterate over the number of splits and load them from the db
    dfs = []
    for x in range(int(length)):
        data = load_from_db(f"{name}_{x + 1}")
        if data is not None:
            # each split was stored as a JSON string of records
            dfs.append(pd.read_json(StringIO(data)))

    # combine the splits into one dataframe
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

### Split by Ticker

In [None]:
def split_df_by_ticker(df):
    """
    Builds training, testing and validation sets by splitting the rows of every ticker 70/15/15.

    Args:
    - df (DataFrame): Pandas DataFrame containing data with a "ticker" column.

    Returns:
    - tuple of DataFrames: Training, testing, and validation sets.
    """
    # every collection starts with an empty frame, so the result is an empty frame when there are no groups
    train_parts = [pd.DataFrame()]
    test_parts = [pd.DataFrame()]
    val_parts = [pd.DataFrame()]

    for _, ticker_rows in df.groupby("ticker"):
        # 70% training, the remaining 30% is halved into test and validation
        train, holdout = train_test_split(ticker_rows, test_size=0.3, random_state=42)
        test, val = train_test_split(holdout, test_size=0.5, random_state=42)

        train_parts.append(train)
        test_parts.append(test)
        val_parts.append(val)

    train_set = pd.concat(train_parts)
    test_set = pd.concat(test_parts)
    val_set = pd.concat(val_parts)

    print(f"Trainingsset: {len(train_set)} Zeilen")
    print(f"Testset: {len(test_set)} Zeilen")
    print(f"Validierungsset: {len(val_set)} Zeilen")

    return train_set, test_set, val_set

In [None]:
def split_df_by_marketcap(df):
    """
    Placeholder for a split based on market capitalization; not implemented yet.

    Args:
    - df (DataFrame): Currently unused.

    Returns:
    - None: The body does nothing.
    """
    pass
    # TODO: implement this function, based on lars' idea

### Adjust

In [None]:
def adjust(df, text, label, date=None):
    """
    Renames the text, label and (optionally) date column of a DataFrame to standard names.

    Args:
    - df (DataFrame): Pandas DataFrame to be adjusted.
    - text (str): Name of the column representing textual data.
    - label (str): Name of the column representing labels.
    - date (str, optional): Name of the column representing dates.

    Returns:
    - DataFrame: Without date, a frame holding only the columns "text" and "label". With date,
      a copy of df with all columns kept and the three given columns renamed to "text", "label"
      and "label_date".
    """
    if date is not None:
        # keep every column, only the names change
        new_names = {text: "text", label: "label", date: "label_date"}
        return df.copy().rename(columns=new_names)

    # no date column: reduce the frame to text and label
    text_and_label = df[[text, label]]
    return text_and_label.rename(columns={text: "text", label: "label"})

### Get stock, etf and marketcap values for given time frame

In [None]:
def get_marketcap_per_row(ticker, start_date, end_date, session): #TODO: we dont need the marketcap anymore, remove this function?
    """
    Fetches market capitalization for a given ticker from either the API or database.

    Args:
    - ticker (str): Ticker symbol for a stock or ETF.
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").
    - session (object): Session object for handling API requests.

    Returns:
    - dict: {"<ticker>_mc": {date: value}} on success, or an empty dict if the API data could not
      be fetched or parsed. A value found in the database is returned as stored.
    """
    name_db = f"{ticker}_{start_date}_{end_date}_mc"
    ticker_mc = f"{ticker}_mc"

    # check if the data is already in the db, if not fetch it from the api
    mc_dict = load_from_db(name_db)
    if mc_dict is not None:
        return mc_dict

    try:
        mc_json = leeway("MARKETCAP", ticker, start_date, end_date, session=session)
        mc_dict = { ticker_mc: { entry["date"]: entry["value"] for entry in mc_json } }
    except Exception as e:
        print(f"Error fetching marketcap values: {e}")
        # do not store the failure: a cached empty result would block every later retry
        return {}

    save_to_db(mc_dict, name_db)

    return mc_dict

def get_value_per_row(ticker, start_date, end_date, session):
    """
    Fetches stock or ETF values for a given ticker from either the API or database.

    Args:
    - ticker (str): Ticker symbol for a stock or ETF.
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").
    - session (object): Session object for handling API requests.

    Returns:
    - dict: {ticker: {date: adjusted close}} on success, or an empty dict if the API data could not
      be fetched or parsed. A value found in the database is returned as stored.
    """
    name_db = f"{ticker}_{start_date}_{end_date}_values" 

    # check if the data is already in the db, if not fetch it from the api
    value_dict = load_from_db(name_db)
    if value_dict is not None:
        return value_dict

    try:
        value_json = leeway("VALUE", ticker, start_date, end_date, session=session)
        value_dict = { ticker: { entry["date"]: entry["adjusted_close"] for entry in value_json } }
    except Exception as e:
        print(f"Error fetching stock values: {e}")
        # do not store the failure: a cached empty result would block every later retry
        return {}

    save_to_db(value_dict, name_db)

    return value_dict

def collect_valid_data(data):
    """
    Merges a list of dictionaries into one, skipping entries that carry no data.

    Args:
    - data (list): List of dictionaries. Entries that are False, None or empty are ignored.

    Returns:
    - dict: Combined dictionary; for a key that occurs more than once the last entry wins.
    """
    combined = {}
    for item in data:
        # a failed lookup may show up as False, None or an empty dict
        if not item:
            continue
        combined.update(item)
    return combined

def get_values(df, start_date, end_date):
    """
    Retrieves stock, ETF, and market capitalization values for given tickers and ETFs within a specified date range.

    The end date is extended by three months before fetching.

    Args:
    - df (DataFrame): Pandas DataFrame containing data with necessary columns "etf" and "ticker".
    - start_date (str): Start date of the data retrieval period (formatted as "YYYY-MM-DD").
    - end_date (str): End date of the data retrieval period (formatted as "YYYY-MM-DD").

    Returns:
    - dict: One entry per ticker and ETF (values by date) plus one "<ticker>_mc" entry per ticker
      (market capitalization by date). Symbols whose data could not be fetched are missing.
    """
    # add 3 months to the end date, so we get all values including these from the future date. #TODO: magic number
    adj_end_date = (pd.to_datetime(end_date) + pd.DateOffset(months=3)).strftime("%Y-%m-%d")

    # get unique values from the dataframe
    unique_etf_values = df["etf"].unique()
    unique_ticker_values = df["ticker"].unique()

    # both context managers clean up on exit: the pool is shut down first, then the session is closed
    with create_session() as session, concurrent.futures.ThreadPoolExecutor() as executor:

        def fetch_value(symbol):
            return get_value_per_row(symbol, start_date, adj_end_date, session)

        def fetch_marketcap(symbol):
            return get_marketcap_per_row(symbol, start_date, adj_end_date, session)

        # submit all three batches before collecting, so they run side by side
        stock_jobs = executor.map(fetch_value, unique_ticker_values)
        etf_jobs = executor.map(fetch_value, unique_etf_values)
        mc_jobs = executor.map(fetch_marketcap, unique_ticker_values)

        stock_data = list(stock_jobs)
        etf_data = list(etf_jobs)
        mc_data = list(mc_jobs)

    stock_prices = collect_valid_data(stock_data)
    stock_prices.update(collect_valid_data(etf_data))
    stock_prices.update(collect_valid_data(mc_data))

    return stock_prices

### Get trading days

In [None]:
def get_trading_days(start_date, end_date, calendar):
    """
    Returns the trading days of a market calendar between start_date and end_date plus three months.

    Args:
    - start_date (str): Start date in the format "YYYY-MM-DD".
    - end_date (str): End date in the format "YYYY-MM-DD".
    - calendar (str): The name of the calendar to retrieve trading days from.

    Returns:
    - str: JSON-formatted string containing a list of trading days when freshly computed;
      otherwise whatever is stored in the database under the same name.
    """
    # add 3 months to the end date to make sure we get all trading days , TODO: magic number
    extended_end = pd.to_datetime(end_date) + pd.DateOffset(months=3)
    adj_end_date = extended_end.strftime("%Y-%m-%d")

    name = f"{calendar}_{start_date}_{adj_end_date}_market_open"

    # reuse the stored list when there is one
    stored_dates = load_from_db(name)
    if stored_dates is not None:
        return stored_dates

    # otherwise build it from the calendar's schedule and store it for the next call
    schedule = mcal.get_calendar(calendar).schedule(start_date, adj_end_date)
    trading_days = schedule["market_open"].dt.date.unique()
    json_dates = json.dumps([day.isoformat() for day in trading_days])

    save_to_db(json_dates, name)

    return json_dates

### Update dates

In [None]:
def find_nearest_date_before(date, date_list):
    """
    Picks the latest valid date that lies strictly before a given date.

    Args:
    - date (str or timestamp): Date to check.
    - date_list (list): List of valid dates.

    Returns:
    - str or timestamp: The latest entry of date_list before date. If no entry is earlier,
      the entry closest to date.
    """
    date = pd.to_datetime(date)

    # track the latest entry that is still earlier than the given date
    latest_earlier = None
    for candidate in date_list:
        if candidate < date and (latest_earlier is None or candidate > latest_earlier):
            latest_earlier = candidate

    if latest_earlier is not None:
        return latest_earlier

    # nothing earlier exists, fall back to the closest date
    return min(date_list, key=lambda x: abs(date - x))

def update_dates(df, dates_js, target):
    """
    Adds the target date column and snaps both date columns to valid dates.

    The column "date_<target>" is "date" shifted by the target horizon. Every value of "date" and
    "date_<target>" that is not a valid date is replaced by the nearest valid date before it.

    Args:
    - df (DataFrame): Input DataFrame with a "date" column.
    - dates_js (str or list): JSON-formatted string containing a list of valid dates (an already
      decoded list is accepted as well).
    - target (str): Horizon of the target date, one of "3D", "1W", "2W", "1M", "3M".

    Returns:
    - DataFrame: Copy of df with "date" and "date_<target>" as "YYYY-MM-DD" strings.

    Raises:
    - ValueError: If target is not one of the supported horizons.
    """
    target_mapping = {"3D": 3, "1W": 1, "2W": 2, "1M": 1, "3M": 3}
    if target not in target_mapping:
        raise ValueError(f"Unknown target {target!r}, expected one of {list(target_mapping)}")
    target_value = target_mapping[target]

    # the unit is given by the letter of the target, the amount by the mapping
    if "M" in target:
        offset = pd.DateOffset(months=target_value)
    elif "W" in target:
        offset = pd.DateOffset(weeks=target_value)
    else:
        offset = pd.DateOffset(days=target_value)

    df_copy = df.copy() # copy so we dont change the original df
    df_copy["date"] = pd.to_datetime(df_copy["date"])
    relevant_columns = ["date", f"date_{target}"] # only keep the relevant columns, date and the target dates
    df_copy[f"date_{target}"] = df_copy["date"] + offset

    # decode with the JSON parser; the string may come from the database and must not be executed
    valid_dates = json.loads(dates_js) if isinstance(dates_js, (str, bytes)) else dates_js
    dates_series = pd.to_datetime(pd.Series(valid_dates))
    known_dates = set(dates_series) # constant-time membership test per row

    for col in relevant_columns: # only call the function if the date is not in the list of valid dates
        df_copy[col] = df_copy[col].apply(lambda x: x if x in known_dates else find_nearest_date_before(x, dates_series))

    # format the dates to YYYY-MM-DD
    return format_date(df_copy, relevant_columns)

### Calculate change

In [None]:
def calculate_adjusted_change(ticker, etf, date, target_date, values_js):
    """
    Calculate the percentage change of a stock between two dates, adjusted by the change of its ETF.

    Args:
    - ticker (str): Ticker symbol of the stock.
    - etf (str): Ticker symbol of the ETF.
    - date (str or timestamp): Original date.
    - target_date (str or timestamp): Target date.
    - values_js (dict): Values by symbol and date, i.e. {symbol: {date: value}}.

    Returns:
    - float or None: Stock change in percent minus ETF change in percent. None if the calculation
      is not possible: a symbol or date is missing, a value is None, or a starting value is zero.
    """
    stock_values = values_js.get(ticker)
    etf_values = values_js.get(etf)

    # a symbol without any values cannot be evaluated
    if not stock_values or not etf_values:
        return None

    stock_original_value = stock_values.get(date)
    stock_future_value = stock_values.get(target_date)
    etf_original_value = etf_values.get(date)
    etf_future_value = etf_values.get(target_date)

    # all four quotes are required
    quotes = (stock_original_value, stock_future_value, etf_original_value, etf_future_value)
    if any(quote is None for quote in quotes):
        return None

    # a relative change is undefined for a starting value of zero
    if stock_original_value == 0 or etf_original_value == 0:
        return None

    stock_change = ((stock_future_value - stock_original_value) / stock_original_value) * 100
    etf_change = ((etf_future_value - etf_original_value) / etf_original_value) * 100

    return stock_change - etf_change

def calculate_changes_per_row(row, values_js, target):
    """
    Calculate the adjusted change and look up the market capitalization for one row.

    Args:
    - row (Series): Row with the fields "etf", "date", "ticker" and "date_<target>".
    - values_js (dict): Values by symbol and date; market capitalization is expected under "<ticker>_mc".
    - target (str): Target for calculations, selects the "date_<target>" field.

    Returns:
    - Series: "change_<target>" (adjusted change) and "marketcap" (value at the row's date, or None
      if no market capitalization is available for the ticker or date).
    """
    etf = row["etf"]
    date = row["date"]
    ticker = row["ticker"]
    target_date = row[f"date_{target}"]
    ticker_mc = f"{ticker}_mc"

    # calculate the adjusted change for each row and add the marketcap column
    change = calculate_adjusted_change(ticker, etf, date, target_date, values_js)

    # the marketcap entry is missing when it could not be fetched, that must not abort the row
    marketcap_values = values_js.get(ticker_mc) or {}
    marketcap = marketcap_values.get(date)

    return pd.Series({f"change_{target}": change, "marketcap": marketcap})

def calc_changes(df, values_js, target):
    """
    Adds the adjusted change and the market capitalization to every row of a DataFrame.

    Args:
    - df (DataFrame): Input DataFrame with news and dates.
    - values_js (dict): Stock, ETF, and marketcap values.
    - target (str): target for future date calculations.

    Returns:
    - DataFrame: Copy of df with the additional columns "change_<target>" and "marketcap".
    """
    result = df.copy() # the input frame stays untouched

    def changes_for(row):
        return calculate_changes_per_row(row, values_js, target)

    computed = result.apply(changes_for, axis=1)
    result[[f"change_{target}", "marketcap"]] = computed

    return result

### Preprocessing the input

In [None]:
def preprocess(df):
    """
    Cleans the "content" column of a news DataFrame and brings the rows into their final order.

    Args:
    - df (DataFrame): Input DataFrame.

    Returns:
    - DataFrame: Copy of df without NaN rows, with cleaned and length-limited content,
      without duplicate content per ticker, sorted by ticker and date.
    """
    cleaned = df.copy()

    # rows with missing values are removed first
    cleaned.dropna(inplace=True)

    # text cleaning: symbols/punctuation first, then stop words
    for clean_step in (remove_symbols, remove_stops):
        cleaned["content"] = cleaned["content"].apply(clean_step)

    # currently disabled: lemmatize text using nltk
    #cleaned["content"] = cleaned["content"].apply(lemmatize_text)

    # currently disabled: stem text using nltk
    #cleaned["content"] = cleaned["content"].apply(stem_text)

    # long articles are cut at the maximum limit of bert
    cleaned = split_text(cleaned, column="content", limit=512)

    # articles with different titles can carry the same content, keep each content once per ticker
    #TODO: combine with the drop in get_news_per_row, does this make sense to drop
    cleaned = cleaned.drop_duplicates(subset=["content", "ticker"])

    # final order: ticker, then date
    return cleaned.sort_values(by=["ticker", "date"])

In [None]:
def count_words(text):
    """
    Returns how many whitespace-separated words a text contains.

    Args:
    - text (str): Input text.

    Returns:
    - int: Number of words in the text.
    """
    # counting words is more precise than counting characters
    return len(text.split())

def split_text(df, column, limit):
    """
    Cuts texts that exceed a word limit into several rows and drops very short texts.

    Args:
    - df (DataFrame): Input DataFrame.
    - column (str): Column containing text content.
    - limit (int): Maximum number of words allowed.

    Returns:
    - DataFrame: Rows within the limit followed by the segments of the longer rows (all other
      fields copied), reduced to the rows whose text has at least 50 characters.
    """
    working = df.copy()
    working["words"] = working[column].apply(count_words)

    # rows above the limit are replaced by one row per segment
    oversized = working[working["words"] > limit].copy()

    segments = []
    for _, row in oversized.iterrows():
        tokens = row[column].split()

        # number of rows this text is cut into
        segment_count = (len(tokens) // limit) + 1

        for start in range(0, segment_count * limit, limit):
            segment_row = row.copy()
            segment_row[column] = " ".join(tokens[start:start + limit])
            segments.append(segment_row)

    # rows within the limit stay as they are, the segments are appended behind them
    within_limit = working[working["words"] <= limit]
    merged = pd.concat([within_limit, pd.DataFrame(segments)], axis=0, ignore_index=True)

    # the helper column is not part of the result
    merged = merged.drop(columns="words")

    # some values are some type of null, but wont drop when using dropna, so this is the workaround
    long_enough = merged[column].apply(len) >= 50
    return merged[long_enough]

def remove_stops(text):
    """
    Drops English stop words from a text.

    Args:
    - text (str): Input text.

    Returns:
    - str: The remaining words joined by single spaces.
    """
    stop_words = set(stopwords.words("english"))

    kept_words = []
    for word in text.split():
        # stop words are matched case-insensitively
        if word.lower() in stop_words:
            continue
        kept_words.append(word)

    return " ".join(kept_words)

def lemmatize_text(text):
    """
    Replaces every token of a text by its WordNet lemma.

    Args:
    - text (str): Input text.

    Returns:
    - str: The lemmatized tokens joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = word_tokenize(text)
    return " ".join(lemmatize(token) for token in tokens)

def stem_text(text):
    """
    Replaces every token of a text by its Porter stem.

    Args:
    - text (str): Input text.

    Returns:
    - str: The stemmed tokens joined by single spaces.
    """
    stem = PorterStemmer().stem
    tokens = word_tokenize(text)
    return " ".join(stem(token) for token in tokens)

def remove_symbols(text):
    """
    Strips every character that is neither a word character, whitespace nor a relevant symbol.

    Args:
    - text (str): Input text.

    Returns:
    - str: Text with irrelevant symbols removed.
    """
    # everything except word characters, whitespace and %, $, €, mathematical symbols, TODO: symbols as args?
    irrelevant = re.compile(r'[^\w\s%$€+*.,!?-]')
    return irrelevant.sub("", text)

### Simulate

In [None]:
def predict_row(row, tokenizer, model, keyfigure=None):
    """
    Run the model on the text of one row and collect the prediction details.

    Args:
    - row (Series): Row providing "text", "ticker", "title", "date", "label_date" and "label".
    - tokenizer: Tokenizer that turns the text into model inputs.
    - model: Model whose output exposes a single-value `logits` tensor.
    - keyfigure (str, optional): Name of an extra field of the row to copy into the result. Defaults to None.

    Returns:
    - dict: Prediction details (ticker, title, date, predicted change, future date, actual change
      and, if requested, the key figure).
    """
    encoded = tokenizer(row["text"], return_tensors="pt", padding=True, truncation=True)

    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        model_output = model(**encoded)

    # the logits hold exactly one value, which is read as the predicted change
    change = model_output.logits.squeeze().item()

    prediction = {
        "ticker": row["ticker"],
        "title": row["title"],
        "date": row["date"],
        "predicted change": change,
        "future date": row["label_date"],
        "actual change": row["label"],
    }

    if keyfigure is None:
        return prediction

    prediction[keyfigure] = row[keyfigure]
    return prediction


def predict_df(model, df, keyfigure=None, tokenizer_name="distilbert-base-uncased"):
    """
    Predict stock price changes for a DataFrame by calling the prediction function for each row.

    Args:
    - model: Pre-trained language model for prediction.
    - df (DataFrame): DataFrame containing text data for prediction.
    - keyfigure (str, optional): Name of an extra column to carry over into the result. Defaults to None.
    - tokenizer_name (str, optional): Checkpoint the tokenizer is loaded from. It should match the
      checkpoint the model was fine-tuned from. Defaults to "distilbert-base-uncased".

    Returns:
    - DataFrame: One row of prediction details per input row (empty if `df` has no rows).
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # a plain loop (instead of df.apply) also works when df has no rows
    predictions = [predict_row(row, tokenizer, model, keyfigure) for _, row in df.iterrows()]

    return pd.DataFrame(predictions)

def categorize_prediction(prediction, pos_threshold, neg_threshold):
    """
    Turn a predicted change into a trading signal.

    Args:
    - prediction (float): Predicted stock price change.
    - pos_threshold (float): Predictions strictly above this value become "BUY".
    - neg_threshold (float): Predictions strictly below this value become "SELL".

    Returns:
    - str: "BUY", "SELL" or, for everything in between (thresholds included), "HOLD".
    """
    if prediction > pos_threshold:
        return "BUY"
    if prediction < neg_threshold:
        return "SELL"
    return "HOLD"

def calculate_auto_threshold(predicted_changes, mode, percentile):
    """
    Calculate automatic thresholds based on the specified mode.

    Args:
    - predicted_changes (Series): Series of predicted stock price changes.
    - mode (str): Mode for threshold calculation ("NORMAL_DISTRIBUTION" or "PERCENTAGE").
    - percentile (float): Desired percentile (0-100) cut off at each end of the distribution.

    Returns:
    - tuple: (top_threshold, bottom_threshold) for the specified mode.

    Raises:
    - ValueError: If the mode is neither "NORMAL_DISTRIBUTION" nor "PERCENTAGE".
    """
    if mode == "NORMAL_DISTRIBUTION":
        # thresholds are quantiles of a normal distribution fitted to the predictions
        mean, std_dev = predicted_changes.mean(), predicted_changes.std()
        adj_percentile = percentile / 100

        top_threshold = stats.norm.ppf(1 - adj_percentile, loc=mean, scale=std_dev)
        bottom_threshold = stats.norm.ppf(adj_percentile, loc=mean, scale=std_dev)

    elif mode == "PERCENTAGE":
        # thresholds are empirical percentiles of the predictions themselves
        top_threshold = np.percentile(predicted_changes, 100 - percentile)
        bottom_threshold = np.percentile(predicted_changes, percentile)

    else:
        # fail loudly instead of implicitly returning None, which callers cannot unpack
        raise ValueError("Please use one of the following modes: NORMAL_DISTRIBUTION or PERCENTAGE.")

    return top_threshold, bottom_threshold

def plot_distribution(df, pos_t, neg_t, predicted_col="predicted change", actual_col="actual change"):
    """
    Draw histograms of the predicted and the actual changes next to each other.

    The histogram of the predictions additionally shows both thresholds as dashed lines.
    Mean and standard deviation of each column are printed below its histogram.

    Args:
    - df (DataFrame): The input DataFrame.
    - pos_t (float): Positive threshold for categorization.
    - neg_t (float): Negative threshold for categorization.
    - predicted_col (str, optional): Name of the column containing predicted changes. Defaults to "predicted change".
    - actual_col (str, optional): Name of the column containing actual changes. Defaults to "actual change".

    Returns:
    - None
    """
    # one row with two panels: predictions on the left, actual values on the right
    _, (predicted_ax, actual_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # left panel: predicted changes plus the two threshold markers
    predicted = df[predicted_col]
    predicted_ax.hist(predicted, bins=50, alpha=0.5, color="blue", label="Predicted Changes")
    predicted_ax.text(
        0.5, -0.15,
        f"Mean: {predicted.mean():.2f}\nStd: {predicted.std():.2f}",
        size=10, ha="center", transform=predicted_ax.transAxes,
    )
    for side, value in (("Positive", pos_t), ("Negative", neg_t)):
        predicted_ax.axvline(x=value, color="red", linestyle="--", label=f"{side} threshold: {value:.2f}")
    predicted_ax.set_title("Predicted Changes")

    # right panel: actual changes
    actual = df[actual_col]
    actual_ax.hist(actual, bins=50, alpha=0.5, color="red", label="Actual Changes")
    actual_ax.text(
        0.5, -0.15,
        f"Mean: {actual.mean():.2f}\nStd: {actual.std():.2f}",
        size=10, ha="center", transform=actual_ax.transAxes,
    )
    actual_ax.set_title("Actual Changes")

    # legends for both panels
    for panel in (predicted_ax, actual_ax):
        panel.legend()

    # keep the panels from overlapping, then render
    plt.tight_layout()
    plt.show()

def simulate_predictions(model, df, mode, threshold, keyfigure=None):
    """
    Simulate stock signals based on calculated predicted changes.

    Args:
    - model: Pre-trained language model for prediction.
    - df (DataFrame): DataFrame containing news.
    - mode (str): Mode for threshold calculation ("STATIC", "NORMAL_DISTRIBUTION" or "PERCENTAGE",
      matched case-insensitively).
    - threshold (float): Threshold for categorization, either static value, percentile or percentage.
    - keyfigure (str, optional): Name of an extra column to carry over into the result. Defaults to None.

    Returns:
    - DataFrame: DataFrame containing simulated stock signals.

    Raises:
    - ValueError: If the mode is not one of the supported modes.
    """
    # validate the mode up front so an invalid value fails before the expensive model inference
    normalized_mode = mode.upper() if isinstance(mode, str) else mode
    if normalized_mode not in ("STATIC", "NORMAL_DISTRIBUTION", "PERCENTAGE"):
        raise ValueError("Please use one of the following modes: STATIC, NORMAL_DISTRIBUTION or PERCENTAGE.")

    p_df = predict_df(model, df, keyfigure)  # predict the changes for each row

    if normalized_mode == "STATIC":
        # the static threshold is applied symmetrically around zero
        pos_t, neg_t = threshold, -threshold
        summary = f"Mode: {normalized_mode} - Positive: {threshold}, Negative: {-threshold}"
    else:
        pos_t, neg_t = calculate_auto_threshold(p_df["predicted change"], normalized_mode, threshold)
        summary = f"Mode: {normalized_mode}, with thresholds for {threshold} percentile - Positive: {pos_t:.2f}, Negative: {neg_t:.2f}"

    p_df["signal"] = p_df["predicted change"].apply(categorize_prediction, pos_threshold=pos_t, neg_threshold=neg_t)
    print(summary)
    plot_distribution(p_df, pos_t, neg_t)

    return p_df

### Evaluate

In [None]:
def calculate_mm_change(signals_df, signal_type):
    """
    Calculate the mean and median change based on stock signals.

    Args:
    - signals_df (DataFrame): DataFrame containing stock signals and an "actual change" column.
    - signal_type (str): Type of signal, "BUY" or "SELL" (only used in the printed summary).

    Returns:
    - tuple: Mean and median of the actual change, or (0.0, 0.0) if there are no signals.
    """
    if signals_df.empty:
        # callers unpack two values, so the neutral result has to be a pair as well
        return 0.0, 0.0

    change_mean = signals_df["actual change"].mean()
    change_median = signals_df["actual change"].median()
    count_of_rows = signals_df.shape[0]

    print(f"Adjusted Change for {signal_type} | Mean: {change_mean:.4f}%, Median: {change_median:.4f}%, for a total amount of {count_of_rows} rows.")
    return change_mean, change_median

def find_best_worst_predictions(df, num_predictions):
    """
    Find the best, worst predictions and some other stats from a DataFrame.

    The input DataFrame is left untouched; the absolute prediction error is added as a
    "difference" column on a copy, which is what the returned frames are taken from.

    Args:
    - df (DataFrame): DataFrame containing "ticker", "predicted change" and "actual change".
    - num_predictions (int): Number of top and bottom predictions to return.

    Returns:
    - tuple: DataFrames for the best predictions, worst predictions, avg-, sum-errors, and count of tickers.
    """
    # work on a copy so the caller's DataFrame does not silently gain a column
    scored = df.assign(difference=(df["predicted change"] - df["actual change"]).abs())
    ranked = scored.sort_values("difference")

    best = ranked.head(num_predictions)

    # for the worst predictions only keep rows where prediction and actual change
    # do not share the same sign (their product is not positive)
    wrong_direction = ranked[~(ranked["predicted change"] * ranked["actual change"] > 0)]
    worst = wrong_direction.tail(num_predictions)

    # average and sum of errors as well as row count for each ticker
    errors_by_ticker = scored.groupby("ticker")["difference"]
    avg_errors = errors_by_ticker.mean()
    sum_errors = errors_by_ticker.sum()
    count_of_tickers = scored.groupby("ticker").size()

    return best, worst, avg_errors, sum_errors, count_of_tickers

def evaluate_performance(sim_df, target, num_predictions):
    """
    Summarize how well the simulated signals and predictions performed for a target horizon.

    Args:
    - sim_df (DataFrame): DataFrame containing simulated stock signals.
    - target (str): Target date identifier (e.g., "2W", "1M", "3M"); only used in the printed header.
    - num_predictions (int): Number of top and bottom predictions to return.

    Returns:
    - best (DataFrame): DataFrame containing the best predictions.
    - worst (DataFrame): DataFrame containing the worst predictions.
    - combined_df (DataFrame): Count, sum, and average errors for each ticker, sorted by average error.

    Prints:
    - Mean and median change for the BUY and SELL signals.
    - Win (+) or Loss (-) in mean and median percentage.
    """
    print(f"Calculations for a target in {target}")

    # actual change of everything that was flagged as BUY
    bought = sim_df[sim_df["signal"] == "BUY"]
    buy_mean, buy_median = calculate_mm_change(bought, "BUY")

    # actual change of everything that was flagged as SELL
    sold = sim_df[sim_df["signal"] == "SELL"]
    sell_mean, sell_median = calculate_mm_change(sold, "SELL")

    # a falling price after a SELL counts as a win, hence the sell change is subtracted
    overall_mean = buy_mean - sell_mean
    overall_median = buy_median - sell_median
    print(f"Win (+) or Loss (-) | Mean: {overall_mean:.4f}%, Median: {overall_median:.4f}%")

    # best/worst predictions and the per-ticker error statistics
    best, worst, avg_errors, sum_errors, count_of_tickers = find_best_worst_predictions(sim_df, num_predictions)

    per_ticker_frames = [
        stat.to_frame(name=title)
        for stat, title in (
            (count_of_tickers, "Count of Rows"),
            (sum_errors, "Sum Errors"),
            (avg_errors, "Average Errors"),
        )
    ]
    combined_df = pd.concat(per_ticker_frames, axis=1).sort_values(by="Average Errors")

    return best, worst, combined_df

### Get labels for target

In [None]:
def get_label_for_target(target):
    """
    Look up the column names that belong to a target horizon.

    Args:
    - target (str): Target date identifier ("3D", "1W", "2W", "1M" or "3M").

    Returns:
    - tuple: The content column name, the change column name and the date column name.

    Raises:
    - ValueError: If the provided target is not valid.
    """
    # every horizon shares the content column and has its own change/date columns
    columns_by_target = {
        horizon: ("content", f"change_{horizon}", f"date_{horizon}")
        for horizon in ("3D", "1W", "2W", "1M", "3M")
    }

    columns = columns_by_target.get(target)
    if columns is None:
        raise ValueError("Invalid target")

    return columns

### Get stats about the df

In [None]:
def show_stats(df, target):
    """
    Print the share of positive changes and the average change for a target horizon.

    Args:
    - df (DataFrame): The input DataFrame containing the column "change_<target>".
    - target (str): Time target identifier (e.g., "3D", "1W", "2W", "1M", "3M").

    Returns:
    - None
    """
    changes = df[f"change_{target}"]

    # percentage of rows with a strictly positive change
    positive_share = (changes > 0).sum() / len(df) * 100
    average = changes.mean()

    print(f"Change {target}: {positive_share:.2f}% with an AVG of: {average:.2f}")

### Reduce size of test-dataset but keep equal split

In [None]:
def sample_rows(df, total_rows, random_seed=None):
    """
    Draw a smaller DataFrame in which every ticker gets roughly the same number of rows.

    Each ticker contributes total_rows / number_of_tickers rows (rounded down), but at
    least one row and never more rows than it has.

    Args:
    - df (DataFrame): The input DataFrame with a "ticker" column.
    - total_rows (int): The targeted total number of rows; None returns the DataFrame unchanged.
    - random_seed (int, optional): Seed for reproducibility.

    Returns:
    - sampled_df (DataFrame): The sampled DataFrame.
    """
    if total_rows is None:  # no sample size specified, so all rows are used
        return df

    #TODO: instead of not choosing enough, we take the difference at the end on top
    np.random.seed(random_seed)  # seeds numpy's global generator for reproducibility
    quota_per_ticker = total_rows / len(df["ticker"].unique())

    def draw_for_ticker(ticker_rows):
        # never ask for more rows than the ticker has, and always take at least one
        quota = min(quota_per_ticker, ticker_rows.shape[0])
        return ticker_rows.sample(int(max(1, quota)), replace=False)

    return df.groupby("ticker").apply(draw_for_ticker).reset_index(drop=True)

### Print configs

In [None]:
def print_configuration(config):
    """
    Print an overview of the configuration settings.

    Args:
    - config: An object exposing the configuration settings as upper-case attributes.
    """
    # printed label -> attribute that is read from the configuration
    entries = (
        ("  - ETFs:           ", "ETFS"),
        ("  - Start Date:     ", "START_DATE"),
        ("  - End Date:       ", "END_DATE"),
        ("  - Calendar:       ", "CALENDAR"),
        ("  - Target:         ", "TARGET"),
        ("  - Model Type:     ", "MODEL_TYPE"),
        ("  - Model Name:     ", "MODEL_NAME"),
        ("  - Sample Size:    ", "SAMPLE_SIZE"),
        ("  - Random Seed:    ", "RANDOM_SEED"),
        ("  - Mode:           ", "THRESH_MODE"),
        ("  - Threshold:      ", "THRESHOLD"),
    )

    print("The following configuration will be used:")
    for label, attribute in entries:
        value = getattr(config, attribute)
        # a missing sample size means that every row is used
        if attribute == "SAMPLE_SIZE" and value is None:
            value = "All rows"
        print(f"{label}{value}")

### Stop execution (Clean)

In [None]:
class StopExecution(Exception):
    """
    Exception for ending the execution of a notebook cell cleanly.

    NOTE(review): presumably IPython calls `_render_traceback_` to obtain the traceback
    lines to show, so raising this exception stops the cell without output - confirm
    against the IPython version in use.
    """

    def _render_traceback_(self):
        """Return no traceback lines at all."""
        return list()

### BERT-Model

In [None]:
def bert(model_name, train_df, val_df, model_type, learning_rate, epochs, batch_size, weight_decay):
    """
    Fine-tunes a BERT-based regression model, or loads it if it has been trained before.

    The model is stored in "models/<model_name>". Training only happens when that
    directory does not exist yet; in both cases the model is loaded from the directory
    and returned.

    Args:
    - model_name (str): Name of the fine-tuned model, used as its directory name below "models".
    - train_df (pandas.DataFrame): The training dataset; its "text" column is tokenized.
    - val_df (pandas.DataFrame): The validation dataset; its "text" column is tokenized.
    - model_type (str): Pre-trained checkpoint to start from (DistilBERT, FinBERT, ...).
    - learning_rate (float): Learning rate for the model.
    - epochs (int): Number of epochs to train the model.
    - batch_size (int): Batch size per device for training and evaluation.
    - weight_decay (float): Weight decay parameter for the optimizer.

    Returns:
    - transformers.AutoModelForSequenceClassification: The fine-tuned BERT model.
    """
    # combining model_name to directory
    output_directory = "models"
    model_directory = os.path.join(output_directory, model_name)

    # make sure the parent folder exists (does nothing if it is already there)
    os.makedirs(output_directory, exist_ok=True)

    # if the model doesn't already exist, train it
    if not os.path.exists(model_directory):

        # load datasets from df
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)

        # load the tokenizer from the pretrained bert model
        tokenizer = AutoTokenizer.from_pretrained(model_type)

        # tokenizer function which "tokenizes" the input so the model can work with it
        def tokenize_function(examples):
            return tokenizer(examples["text"], padding="max_length", truncation=True)

        # apply the tokenize function to whole batches of our datasets
        train_tokenized = train_dataset.map(tokenize_function, batched=True)
        val_tokenized = val_dataset.map(tokenize_function, batched=True)

        # load the bert model for sequence classification; num_labels=1 gives a single output value (regression)
        model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=1)

        # rmse and standard deviation
        def compute_metrics_regr(eval_pred):
            predictions, labels = eval_pred
            # take the root explicitly: the squared=False argument is deprecated in newer scikit-learn releases
            rmse = np.sqrt(mean_squared_error(labels, predictions))
            labels_std = np.std(labels)
            # relative distance of the rmse to the spread of the labels (lower is better)
            rmse_std = ((rmse - labels_std) / labels_std)
            return {"rmse": rmse, "std": labels_std, "(rmse-std)/std": rmse_std}

        # trainer arguments, currently no output because our batch size is too small (min. 500 for first save?!)
        training_args = TrainingArguments(
            output_dir=model_directory,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            num_train_epochs=epochs,
            weight_decay=weight_decay,
            logging_strategy="epoch",
            logging_steps=1,
            load_best_model_at_end=True,
            metric_for_best_model="(rmse-std)/std",
            # the metric is an error measure, so the best checkpoint is the one with the LOWEST value;
            # without this flag a custom metric is treated as "greater is better"
            greater_is_better=False)

        # create trainer object with parameters
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_tokenized,
            eval_dataset=val_tokenized,
            compute_metrics=compute_metrics_regr,
        )

        optimizer = torch.optim.AdamW(
            trainer.model.parameters(),
            lr=training_args.learning_rate,
            weight_decay=training_args.weight_decay
        )

        # set the optimizer in the trainer
        trainer.optimizer = optimizer

        # finetune the model
        trainer.train()

        # save our finetuned model to output path
        trainer.save_model(model_directory)

    # load the model from its directory (freshly trained or saved by an earlier run)
    finetuned_model = AutoModelForSequenceClassification.from_pretrained(model_directory)
    return finetuned_model

## Execution Code Cells

### Load and display configuration

In [None]:
# data selection: which ETFs, which time frame, which calendar and which target horizon
etfs = config.ETFS
start_date = config.START_DATE
end_date = config.END_DATE
calendar = config.CALENDAR
target = config.TARGET

# hyperparameters and identity of the language model
learning_rate = config.LEARNING_RATE
epochs = config.EPOCHS
batch_size = config.BATCH_SIZE
weight_decay = config.WEIGHT_DECAY
model_type = config.MODEL_TYPE
model_name = config.MODEL_NAME

# settings for simulation and evaluation
sample_size = config.SAMPLE_SIZE
random_seed = config.RANDOM_SEED
mode = config.THRESH_MODE
threshold = config.THRESHOLD
num_pred = config.NUM_PREDICTIONS

# show the active configuration
print_configuration(config)

### Run the code

In [None]:
# get the stocks from the configured ETFs
stocks_df = extract_from_etf(etfs)

In [None]:
# all possible trading days in the given time frame for the given calendar
dates_js = get_trading_days(start_date, end_date, calendar)

In [None]:
# all stock, ETF and market cap values for the given stocks and time frame
values_js = get_values(stocks_df, start_date, end_date)

In [None]:
# all news for the given stocks and time frame
news_df = get_content(stocks_df, start_date, end_date)

In [None]:
# update the news dates to valid trading dates
up_news_df = update_dates(news_df, dates_js, target)

In [None]:
# add the calculated changes for the specified target to the DataFrame
changes_df = calc_changes(up_news_df, values_js, target)

In [None]:
# preprocess the data for the model
prep_df = preprocess(changes_df)
# print the share of positive changes and the average change for the target
show_stats(prep_df, target)

In [None]:
# split the data into train, test and validation set by grouped tickers
train, test, val = split_df_by_ticker(prep_df)

# column names for the text, the label and the label date of the chosen target
text, label, label_date = get_label_for_target(target)

# NOTE(review): only the test set is given label_date; presumably adjust() keeps it there
# because predict_row reads "label_date" for the reported future date - confirm against adjust()
test = adjust(test, text, label, label_date)
train = adjust(train, text, label)
val = adjust(val, text, label)

In [None]:
# fine-tune the model, or load it from "models/<model_name>" if that directory already exists
model = bert(model_name, train, val, model_type, learning_rate, epochs, batch_size, weight_decay)

In [None]:
# reduce the test set to roughly sample_size rows spread evenly over the tickers (all rows if sample_size is None)
small_test = sample_rows(test, sample_size, random_seed)

In [None]:
# predict the change for every sampled row and derive BUY/SELL/HOLD signals from the thresholds
sim = simulate_predictions(model, small_test, mode, threshold)

In [None]:
# evaluate the signals and show the best/worst predictions plus the per-ticker error statistics
best, worst, combined_df = evaluate_performance(sim, target, num_pred)
display(best)
display(worst)
display(combined_df)