In [1]:
# Python packages

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import pathlib
import openpyxl
from sklearn.model_selection import train_test_split
import torch 
from pathlib import Path
from typing import Sequence
import os


ModuleNotFoundError: No module named 'models'

In [None]:
# Notebook-level settings (not strictly needed; only used when testing the Preprocess module code here)

# Currency prefixes of the swap-rate workbooks; load_data() also uses them as folder names.
# NOTE(review): 'EU' and 'US' are upper-case here while load_dataset() defaults to 'eu' and 'us' --
# confirm which spelling matches the folders on disk.
prefix = [
    'ad', 'cd', 'nk', 'sw', 'dk',
    'EU', 'US', 'uk', 'jy',
]
# One workbook suffix per maturity, i.e. "<years>year.xlsx"
postfix = [f"{years}year.xlsx" for years in (1, 2, 3, 5, 10, 15, 20, 30)]

# Training data range
date_start = "2009-12-31"
date_end = "2023-01-01"

# Root folder of the Bloomberg workbooks; keep the trailing slash because load_data()
# builds file names by plain string concatenation.
data_path = 'data/BloombergData/'



# Loads the workbooks of a single currency into one DataFrame; the only cleaning done here is removing the metadata rows of the .xlsx files.

def load_data(data_path, prefix, postfix):
    """Load all workbooks of one currency into a single DataFrame.

    Parameters
    ----------
    data_path : str
        Root folder, concatenated as-is with ``prefix`` (so it must end with
        a path separator).
    prefix : str
        Currency code; used as sub-folder name, as file-name prefix and as the
        value of the ``Currency`` column.
    postfix : sequence of str
        File-name suffixes; the workbook read for each one is
        ``data_path + prefix + "/" + prefix + suffix``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Currency`` and one column per workbook, labelled
        with the workbook's position in ``postfix`` (0, 1, 2, ...). The first
        six rows (workbook metadata) are removed and the index is renumbered.
    """
    folder = data_path + prefix + "/"
    merged = pd.DataFrame()

    for position, suffix in enumerate(postfix):
        sheet = pd.read_excel(folder + prefix + suffix, index_col=None)
        # Only the first two columns are used: the date and the quoted value.
        pair = pd.DataFrame({"Date": sheet.iloc[:, 0], position: sheet.iloc[:, 1]})

        if position == 0:
            # The first workbook defines the rows every later one is joined onto.
            merged = pair
            continue

        # Left join on Date: rows of the first workbook are kept, values of
        # this workbook are attached where the Date matches.
        merged = (
            merged.set_index("Date")
            .join(pair.set_index("Date"), on="Date")
            .reset_index()
        )

    merged.insert(1, "Currency", prefix)
    # Rows 0-5 hold the workbook metadata, not quotes.
    return merged.iloc[6:].reset_index(drop=True)


# Initial cleaning of the data: collects all currencies into one DataFrame, removes dates outside the specified range and drops rows containing NaNs.
# Calls load_data().

def init_cleaning(prefix, postfix, data_path, date_start, date_end):
    """Load every currency, stack them and keep only complete rows in range.

    Parameters
    ----------
    prefix : sequence of str
        Currency codes; ``load_data`` is called once per entry.
    postfix : sequence of str
        Workbook suffixes forwarded to ``load_data``.
    data_path : str
        Root folder forwarded to ``load_data``.
    date_start, date_end
        Bounds of the date window (anything ``pd.Timestamp`` accepts). Both
        bounds are inclusive: only rows dated strictly before ``date_start``
        or strictly after ``date_end`` are removed.

    Returns
    -------
    pandas.DataFrame
        All currencies stacked on top of each other, restricted to the date
        window, without any row containing a missing value. The index is
        unique (positions in the stacked frame).
    """
    frames = [load_data(data_path, currency, postfix) for currency in prefix]

    # ignore_index gives every row its own label. Each per-currency frame is
    # numbered 0..n-1, so without it labels repeat across currencies and a
    # label-based drop of one out-of-range row would also delete the rows of
    # the other currencies that share the label, even when those are in range.
    combined = pd.concat(frames, ignore_index=True)

    dates = pd.to_datetime(combined["Date"])
    in_window = (dates >= pd.Timestamp(date_start)) & (dates <= pd.Timestamp(date_end))

    # Rows with a missing date fail the mask; they would be removed by
    # dropna() anyway, so the outcome is the same.
    return combined.loc[in_window].dropna()

# Final cleaning of the data: removes the Date column altogether from the DataFrame and splits the data into a training and a validation set.

def data_clean(df):
    """Drop the first column and split the remaining values into train/validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is discarded (the ``Date`` column for frames
        produced by ``init_cleaning``).

    Returns
    -------
    tuple
        ``(x_train, x_val)``: a shuffled 90 % / 10 % split of the remaining
        columns as NumPy arrays, reproducible through ``random_state=0``.
    """
    without_first_column = df.iloc[:, 1:]
    values = np.array(without_first_column)

    train_part, validation_part = train_test_split(
        values,
        test_size=0.1,
        random_state=0,
    )
    return train_part, validation_part

# Final cleaning of the data: removes the Date column altogether from the DataFrame. Use this function only for test data, OR when you want the
# full training data without the shuffling done by scikit-learn's train_test_split.

def data_clean_test(df):
    """Return every column except the first one as a NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is discarded (the ``Date`` column for frames
        produced by ``init_cleaning``).

    Returns
    -------
    numpy.ndarray
        The remaining columns in their original row order; no shuffling and
        no splitting is performed.
    """
    without_first_column = df.iloc[:, 1:]
    return np.array(without_first_column)


In [None]:

# Code for creating plots of the input data: the swap rates of all currencies on a single date, and the development of the swap rate of one maturity (e.g. 30 years) over the whole date range.


# Alternative way of loading the data set: returns one date-indexed DataFrame per currency.

def load_dataset(
        maturities: Sequence[int] = (1, 2, 3, 5, 10, 15, 20, 30),
        currencies: Sequence[str] = ('dk', 'eu', 'uk', 'sw', 'nk',
                                     'ad', 'cd', 'jy', 'us'),
        data_path: Path = Path(os.path.abspath(''))/ 'data',
        date_start: str = '2009-12-31',
        date_end: str = '2023-01-01',
        ) -> dict[str, pd.DataFrame]:
    """Load Bloomberg datasets and return a dictionary with frames for each
    currency.

    Parameters
    ----------
    maturities
        Maturities in years; one workbook
        ``BloombergData/<currency>/<currency><maturity>year.xlsx`` is read
        for each and becomes one column of the currency's frame.
    currencies
        Currency codes; each one becomes a key of the returned dictionary.
    data_path
        Folder containing ``all_dates.xlsx`` and the ``BloombergData`` folder.
        The default is resolved from the working directory at the time the
        function is defined.
    date_start, date_end
        Exclusive bounds of the date window; only dates strictly between the
        two are kept. The defaults reproduce the previously hard-coded window.

    Returns
    -------
    dict[str, pandas.DataFrame]
        For each currency a frame indexed by ``Date`` with one column per
        maturity. Values are the workbook's ``PX_LAST`` column divided by 100
        (presumably converting percent quotes to decimals -- confirm against
        the data). Dates missing from a workbook are left as NaN.
    """
    data_path = Path(data_path)  # also accept a plain string

    possible_dates = pd.read_excel(data_path / 'all_dates.xlsx',
                                   parse_dates=[0],
                                   header=None)
    # The date window is identical for every currency, so build it once.
    dates = possible_dates.iloc[:, 0]
    in_window = (dates > date_start) & (dates < date_end)
    date_index = pd.Index(dates[in_window], name='Date')

    final_dict = dict()
    for currency in currencies:
        currency_df = pd.DataFrame(index=date_index, columns=list(maturities))
        for maturity in maturities:
            # The first six rows of each workbook are metadata; the next row
            # holds the 'Date' / 'PX_LAST' header.
            df = pd.read_excel(
                data_path / f'BloombergData/{currency}/{currency}{maturity}year.xlsx',
                skiprows=6,
                parse_dates=['Date'])
            # Assignment aligns on the Date index.
            currency_df[maturity] = df.set_index('Date')['PX_LAST']
        final_dict[f'{currency}'] = currency_df / 100
    return final_dict


def run_data_load(date_to_plot_fig_a: str,
                  maturity_fig_b: int):
    """Load the data set and show two figures describing it.

    Figure (a) shows the swap curve (rate against maturity) of every currency
    on one date; figure (b) shows the rate of one maturity for every currency
    over the whole loaded date range.

    Parameters
    ----------
    date_to_plot_fig_a
        Date whose row is plotted in figure (a); it must be present in the
        date index of the loaded frames.
    maturity_fig_b
        Maturity (column label of the loaded frames) plotted in figure (b).
    """
    # Folder codes used by load_dataset() -> currency codes shown in the legend.
    code_to_label = {"dk": "DKK", "eu": "EUR", "cd": "CAD", "nk": "NOK",
                     "sw": "SEK", "us": "USD", "jy": "JPY", "uk": "GBP",
                     "ad": "AUD"}
    color_dict = {'DKK': 'red', 'EUR': 'blue', 'GBP': 'green', 'SEK': 'purple',
                  'NOK': 'orange', 'AUD': 'pink', 'CAD': 'grey', 'JPY': 'black',
                  'USD': 'brown'}
    data_dict = {code_to_label[code]: frame
                 for code, frame in load_dataset().items()}

    # Figure (a): swap curve of every currency for a single date.
    # Each figure gets its own axes so the two charts can never end up on the
    # same axes, whatever the matplotlib backend does on plt.show().
    _, ax_curve = plt.subplots()
    for currency, swap_rate in data_dict.items():
        ax_curve.plot(swap_rate.loc[date_to_plot_fig_a, :], label=currency,
                      marker='^', color=color_dict[currency])
    ax_curve.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax_curve.set_ylim(-0.01, 0.04)
    ax_curve.set_xlabel("Maturity")
    ax_curve.set_ylabel("Swap Rate")
    ax_curve.set_title("Swap Rates For All Currencies")
    plt.show()

    # Figure (b): development of a single maturity over the whole date range.
    _, ax_history = plt.subplots()
    for currency, swap_rate in data_dict.items():
        ax_history.plot(swap_rate.loc[:, maturity_fig_b], label=currency,
                        color=color_dict[currency])
    ax_history.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax_history.set_xlabel("Years")
    ax_history.set_ylabel("Swap Rate")
    # The title follows the requested maturity instead of always saying 30.
    ax_history.set_title(f"Swap Rates With {maturity_fig_b} Years to Maturity")
    plt.show()


if __name__ == '__main__':
    # Figure (a): curves of all currencies on 2017-10-31; figure (b): history of the 30-year rate.
    run_data_load(
        date_to_plot_fig_a='2017-10-31',
        maturity_fig_b=30,
    )