In [None]:
import numpy as np
import pandas as pd
import os 

# Sample window used to split the data; both end points are inclusive.
start_date = '2010-01-04' # included
end_date = '2022-12-30' # included

# Working directory of the kernel; the data folder is resolved relative to it.
path = os.getcwd()
print(f"The current absolute file path is: {path}")

# The 'Data' folder is a sibling of the working directory (it lives in the parent folder).
data_path = os.path.join(os.path.dirname(path), 'Data')

# Build the sub-folder paths with os.path.join instead of hard-coded "/" so the
# separators stay consistent with data_path on every platform.
testdata_folder_path = os.path.join(data_path, "SwapData", "TestData")
bloombergdata_folder_path = os.path.join(data_path, "SwapData", "BloombergData")

In [None]:
# Walk through the given data folder (e.g. TestData), visit each country sub-folder and load every Excel file found there.
# Important: make sure none of these files are open while the code is running.

# Sort key: extracts the numeric part of the file name so that files sort as ad1year, ad2year, ad3year, ...
def sort_filenames(filename):
    """Return the integer embedded in a file name, for use as a sort key.

    The text before the first 'year' is taken, its first two characters are
    skipped and the remainder is converted with ``int``
    (e.g. 'ad10year.xlsx' -> 10). ``int`` raises ValueError when that
    remainder is not a number.
    """
    before_year, _, _ = filename.partition('year')
    return int(before_year[2:])

def get_files(folder_path):
    """Load every swap workbook found below ``folder_path``.

    The folder tree is walked top-down with ``os.walk``; sub-folders are
    visited in alphabetical order and, inside each folder, files are processed
    in ascending order of the number returned by ``sort_filenames``.
    File names are expected to look like ``<cc><n>year.<ext>`` (e.g.
    ``ad1year.xlsx``): the first two characters are used as the currency code
    and the text between them and 'year' as the maturity label.

    For each workbook the first six rows of the parsed sheet are discarded
    (presumably export header rows - TODO confirm), the first column becomes
    'Date' (converted with ``pd.to_datetime``), the second column is renamed
    to the maturity label and a constant 'Currency' column is added.

    Parameters
    ----------
    folder_path : str
        Root folder to scan.

    Returns
    -------
    df_list : list of pandas.DataFrame
        One frame per workbook with columns ['Date', 'Currency', <maturity>].
    list_curr : list of str
        Currency code of each loaded workbook, in the same order as ``df_list``.

    Raises
    ------
    ValueError
        From ``sort_filenames`` when a (non-hidden) file name does not contain
        a number in the expected position.
    """
    df_list = []
    list_curr = []

    for dirpath, dirnames, filenames in os.walk(folder_path): # type: ignore

        # Sorting in place makes os.walk descend into the currency folders alphabetically.
        dirnames.sort()

        # Skip files that are not real workbooks:
        #  - hidden files such as ".DS_Store", which macOS adds to folders
        #    (Cmd + Shift + "." shows hidden files in Finder);
        #  - "~$..." lock files that Excel creates next to a workbook while it is open.
        workbook_names = [
            filename for filename in filenames
            if not filename.startswith(('.', '~$'))
        ]
        workbook_names.sort(key=sort_filenames)

        for file_name in workbook_names:

            file_path = os.path.join(dirpath, file_name)
            data = pd.read_excel(file_path)

            curr = file_name[:2]
            # Same parsing as sort_filenames (text between the currency code and
            # 'year'), so the label does not depend on the length of the extension.
            mat = file_name.split('year')[0][2:]

            list_curr.append(curr)

            processed_df = (
                data
                .iloc[6:]                                      # Remove first rows
                .rename(columns={data.columns[0]: 'Date',
                                 data.columns[1]: mat})
                .assign(Date=lambda frame: pd.to_datetime(frame['Date']),
                        Currency=curr)
                .loc[:, ["Date", "Currency", mat]]
                .reset_index(drop=True)
            )

            df_list.append(processed_df)

    return df_list, list_curr

In [None]:
def seperate_currency(data, currency):
    """Build one wide table for a single currency.

    Every frame in ``data`` is filtered to the rows whose 'Currency' equals
    ``currency``; the non-empty results are outer-merged one after another on
    ['Date', 'Currency'], so each source frame contributes its remaining
    column(s) side by side.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Frames that each contain at least 'Date' and 'Currency' columns.
    currency : str
        Currency code to select.

    Returns
    -------
    pandas.DataFrame
        The merged frame. When no frame holds rows for ``currency`` (or
        ``data`` is empty) an empty frame with the key columns
        ['Date', 'Currency'] is returned instead of failing with an IndexError.
    """
    # A boolean mask selects the same rows as query('Currency == @currency').
    matches = [df[df['Currency'] == currency] for df in data]
    matches = [match for match in matches if not match.empty]

    if not matches:
        # Keep the key columns (with a datetime 'Date') so the result can still
        # be concatenated or merged with frames of other currencies.
        return pd.DataFrame({
            'Date': pd.Series(dtype='datetime64[ns]'),
            'Currency': pd.Series(dtype='object'),
        })

    merged_df = matches[0]
    for other in matches[1:]:
        merged_df = pd.merge(merged_df, other, on=['Date', 'Currency'], how='outer')

    return merged_df

def concat_currencies(data, currencies):
    """Stack the per-currency tables into one frame.

    For every entry of ``currencies`` the merged table is built with
    ``seperate_currency``; the tables are concatenated below each other, rows
    that contain any missing value are dropped and the index is renumbered.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Passed through unchanged to ``seperate_currency``.
    currencies : iterable of str
        Currency codes, processed in iteration order. Any iterable works
        (list, tuple, generator, ...), not only index-able sequences.

    Returns
    -------
    pandas.DataFrame
        The stacked frame without rows containing NaN, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``currencies`` is empty.
    """
    per_currency = [seperate_currency(data, currency) for currency in currencies]
    stacked = pd.concat(per_currency, ignore_index=True)
    # dropna() removes every row with at least one NaN, i.e. only rows that
    # have a value in every column are kept.
    return stacked.dropna().reset_index(drop=True)

def find_missing(group):
    """List the business month-ends that are absent from ``group['Date']``.

    A calendar of business month-end dates is generated between the earliest
    and the latest date of the group, and every calendar date that does not
    occur in the 'Date' column is reported.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (e.g. one group of a ``groupby``) with a datetime 'Date' column.

    Returns
    -------
    str
        The missing dates formatted as 'YYYY-MM-DD', in chronological order and
        separated by ', '. Empty string when nothing is missing or when the
        group holds no dates.
    """
    dates = group["Date"].dropna()
    if dates.empty:
        # No bounds to build a calendar from (min/max would be NaT).
        return ""

    # Use the offset object instead of the "BM" string alias, which pandas
    # deprecated in favour of "BME"; the object works across versions.
    expected_dates = pd.date_range(start=dates.min(), end=dates.max(),
                                   freq=pd.offsets.BMonthEnd())

    # Sets have no defined order, so sort to get a stable, chronological result.
    missing_dates = sorted(set(expected_dates) - set(dates))
    return ", ".join(d.strftime("%Y-%m-%d") for d in missing_dates)

In [None]:
### Load the Bloomberg data (training data)
df_bb, list_bb_curr = get_files(bloombergdata_folder_path)
# dict.fromkeys keeps one entry per currency, in order of first appearance.
bbdata = concat_currencies(df_bb, list(dict.fromkeys(list_bb_curr)))
# Keep the rows inside [start_date, end_date] (both inclusive) and drop the "7" column.
bbdata_filtered = (
    bbdata
    .loc[bbdata["Date"].between(pd.to_datetime(start_date), pd.to_datetime(end_date))]
    .drop(columns="7")
)
#bbdata_filtered.to_csv(data_path + "/BloombergData_Swap_Features.csv", index = False)

### Load the test data
df_test, list_test_curr = get_files(testdata_folder_path)

testdata = concat_currencies(df_test, list(dict.fromkeys(list_test_curr)))
# Test data is everything strictly outside the training window: before its start or after its end.
testdata_filtered_old_data = testdata.loc[testdata["Date"].lt(pd.to_datetime(start_date))]
testdata_filtered_new_data = testdata.loc[testdata["Date"].gt(pd.to_datetime(end_date))]

# testdata_filtered_old_data.to_csv(data_path + "/TestData_Swap_Features_pre.csv", index = False)
# testdata_filtered_new_data.to_csv(data_path + "/TestData_Swap_Features_post.csv", index = False)

# testdata_filtered_old_data.groupby("Currency").apply(find_missing).reset_index()