## Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import random # random class
import warnings
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from scipy.stats import linregress
from tqdm import tqdm # loop progress

from fredapi import Fred

In [None]:
# set your Eikon API app key
import os

import eikon as ek

# The key is read from the environment so that no credential is stored in the notebook.
# A key that was ever committed to a repository should be treated as leaked and revoked.
# Without a Refinitiv account / valid key none of the Eikon cells below will work.
_eikon_app_key = os.environ.get("EIKON_APP_KEY")
if not _eikon_app_key:
    raise RuntimeError("Set the EIKON_APP_KEY environment variable to your Refinitiv Eikon app key.")
ek.set_app_key(_eikon_app_key)

## get sp500 constituents from eikon (ignore, get it from file -> next couple of cells)

In [None]:
# function to download data
def GetStockData(ticker, start, end, interval):
    """Download the Eikon time series of a single instrument.

    Calls ``ek.get_timeseries`` for the one-element list ``[ticker]`` with the
    given ``start``/``end`` dates and ``interval`` and returns its result unchanged.
    """
    return ek.get_timeseries(
        [ticker],
        start_date=start,
        end_date=end,
        interval=interval,
    )

In [None]:
# create a list of dates
# NOTE: no `import datetime` in this cell. The import cell binds the name `datetime`
# to the class (`from datetime import datetime`); importing the module again here would
# rebind the name and make every later `datetime.now()` call fail with AttributeError.
start_date = datetime(2023, 4, 1).date()
end_date = start_date  # datetime.now().date()

# First day of every month from start_date's month through end_date's month (inclusive).
# Months are enumerated as a running index (year * 12 + zero-based month).
first_month = start_date.year * 12 + (start_date.month - 1)
last_month = end_date.year * 12 + (end_date.month - 1)
date_list = [
    datetime(month_index // 12, month_index % 12 + 1, 1).strftime('%Y-%m-%d')
    for month_index in range(first_month, last_month + 1)
]
date_list

In [None]:
# get constituents for specific dates
# Maps each date string of date_list to the raw result of the second ek.get_data call
# (fields TR.ISIN and TR.GICSSector for every constituent RIC of '.SPX' on that date).
Ticker_ISIN_GICSSector_dict = {}

for date in tqdm(date_list):
    # [0] keeps only the first element of the get_data result; its 'Constituent RIC' column is read below
    temp_data = ek.get_data('.SPX', ['TR.IndexConstituentRIC', 'TR.IndexConstituentName'], {'SDate':date})[0]
    riclist = temp_data['Constituent RIC'].tolist()
    # stored WITHOUT [0]; the following cell checks for a tuple before reading the frame
    temp_data_2 = ek.get_data(riclist, ['TR.ISIN', 'TR.GICSSector'])
    Ticker_ISIN_GICSSector_dict[date] = temp_data_2



In [None]:
# get the unique lists
# Collect every ISIN and every instrument code seen on any snapshot date.
isin_set, ticker_set = set(), set()
for date, _entry in Ticker_ISIN_GICSSector_dict.items():
    # an entry may be the raw get_data tuple; the frame is then its first element
    _frame = _entry[0] if isinstance(_entry, tuple) else _entry
    isin_data = _frame[['ISIN']]
    ticker_data = _frame[['Instrument']]
    isin_set.update(isin_data['ISIN'].tolist())
    ticker_set.update(ticker_data['Instrument'].tolist())

isin_list = list(isin_set)
ticker_list = list(ticker_set)
isin_df = pd.DataFrame({'ISIN': list(isin_set)})

In [None]:
# create a df with all close prices

# define the parameters for getting the stock data
start_date = "2019-12-31"
end_date = "2023-03-31"
interval = "daily"

# ticker -> downloaded time series; tickers whose download failed are simply missing
result_dict = {}
# loop through the ticker list and call the function for each ticker
for ticker in ticker_list:
    try:
        result_dict[ticker] = GetStockData(ticker, start_date, end_date, interval)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still stop the loop;
        # the reason is printed so failed downloads can be diagnosed
        print(ticker, "not successful:", exc)
    else:
        print(ticker, "downloaded successfully")

In [None]:
# get all close prices to one dataframe
if not result_dict:
    # fail with a clear message instead of an IndexError on df_list[0]
    raise ValueError("result_dict is empty - no price data was downloaded")

# One single-column frame per ticker: index 'Date', column named after the ticker.
df_list = [
    value.reset_index()[['Date', 'CLOSE']].rename(columns={'CLOSE': key}).set_index('Date')
    for key, value in result_dict.items()
]

# Join the DataFrames on the index with an outer join
result_df = df_list[0]
for other_df in df_list[1:]:
    result_df = result_df.join(other_df, how='outer')

# get rid of incomplete columns: keep only tickers that have the maximum number of
# non-missing observations (count() directly, no need for the full describe() summary)
observation_counts = result_df.count()
result_df_clean = result_df.loc[:, observation_counts == observation_counts.max()]
nans_count = result_df_clean.isna().sum().sum()
print("Number of NaN values in result_df:", nans_count)

In [None]:
# get 5 day returns
# Rolling sum of the last 5 daily simple returns (an additive approximation of the
# 5-day return, not a compounded one); the first 5 rows are NaN.
weekly_returns = result_df_clean.pct_change().rolling(window=5).sum()

In [None]:

# set the file path to where you want to save the DataFrame
desktop_path = '~/Desktop/ML2_assignment_5_project/'
# pd.Timestamp.now() works regardless of whether the name `datetime` currently refers to
# the module or to the class (this notebook imports both forms under the same name).
file_name = 'clean_data_sp500_{}.csv'.format(pd.Timestamp.now().strftime('%Y%m%d_%H%M%S'))
file_path = desktop_path + file_name
# save the close prices (rounded to 2 decimals); the date index is written as column 'timestamp'
result_df_clean.round(2).to_csv(file_path, index_label='timestamp', date_format='%Y-%m-%d')

## get sp500 constituents from file (use this, the files are hard copied in our Git repository)

In [None]:
# load data from desktop
path = r"~/Desktop/ML2_assignment_5_project/clean_data_sp500_20230421_150246.csv"
# read the saved close prices and make the 'timestamp' column the index again
result_df_clean = pd.read_csv(path).set_index('timestamp')

In [None]:
# get 5 day returns
# Rolling sum of the last 5 daily simple returns (additive approximation, not compounded).
weekly_returns = result_df_clean.pct_change().rolling(window=5).sum()

# define the type of change should be used
# NOTE: `target` is the same object as `weekly_returns` (no copy is made).
target = weekly_returns

## get GICS sectors (again, the Eikon code is included for completeness, but use the files in the repository)

In [None]:
# eikon (ignore)

# GICS classification of every stock column in `target`, once as codes and once as names,
# on four levels: sector, industry group, industry, sub-industry.
sector_classes_code, err = ek.get_data(target.columns.tolist(), 
                                       ['TR.GICSSectorCode', 'TR.GICSIndustryGroupCode', 
                                        'TR.GICSIndustryCode', 'TR.GICSSubIndustryCode'])
# NOTE: `err` from the first call is overwritten by this second call.
sector_classes, err = ek.get_data(target.columns.tolist(), 
                                  ['TR.GICSSector', 'TR.GICSIndustryGroup', 
                                   'TR.GICSIndustry', 'TR.GICSSubIndustry'])

In [2]:
# Load the saved GICS classification codes from file.
# NOTE(review): presumably the file written from the Eikon result above - confirm the column layout,
# later cells address its columns by position (iloc[:, 1] ... iloc[:, 4]).
path = r"~/Desktop/ML2_assignment_5_project/sector_classes_code_20230503_005638.csv" # update
sector_classes_code = pd.read_csv(path)

## get more idiosyncratic features (again Eikon, files in the repository)

In [3]:
# Eikon field codes requested for every index constituent in the next cell
# (passed as the field list to ek.get_data).
fields = [
    "TR.ROAActValue",
    "TR.PriceToSalesPerShare",
    "TR.TotalDebtToEV",
    "TR.PriceToBVPerShare", 
    "TR.CompanyMarketCapitalization", 
    "TR.PriceNetChg30D", 
    "TR.AvgDailyValTraded30D", 
    "TR.SharpeRatioWkly2Y", 
    "TR.Volatility260D", 
    "TR.BetaWklyUp2Y", 
    "TR.BetaWklyDown2Y", 
    "TR.RSISimple14D", 
    "TR.ShortInterestPCT", 
    "TR.ShortInterestDTC"
]

In [None]:
# Download the idiosyncratic fields for all constituents as of 2022-12-31.
df2, e = ek.get_data('0#.SPX', fields, parameters={'SDate': '2022-12-31', 'EDate':'2022-12-31'})
# the first returned column identifies the instrument and becomes the index
df2 = df2.set_index(df2.columns[0])

# Impute missing ROA with the cross-sectional median. The result is assigned back to the
# column: `df2[col].fillna(..., inplace=True)` acts on a temporary selection, is deprecated
# in pandas 2.x and has no effect once copy-on-write is enabled.
roa_col = 'Return On Assets - Actual'
df2[roa_col] = df2[roa_col].fillna(df2[roa_col].median())

# drop every stock that still has a missing value in any other field and shorten a few column names
df2_without_nan = df2.dropna().rename(columns={
    'Price To Sales Per Share (Daily Time Series Ratio)': 'Price To Sales Per Share',
    'Total Debt To Enterprise Value (Daily Time Series Ratio)': 'Total Debt To Enterprise Value',
    'Price To Book Value Per Share (Daily Time Series Ratio)': 'Price To Book Value Per Share',
})

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with numpy (scalars or arrays)."""
    return 1 / (1 + np.exp(-x))

# Work on a float copy of the cleaned fundamentals; every transform below overwrites a column of it.
idio_feats = df2_without_nan.astype(float)

# log(1 + x) transformation; values <= 0 are set to 0 instead of being logged
ln_transform_list = ["Price To Sales Per Share",
                     "Total Debt To Enterprise Value",
                     "Price To Book Value Per Share",
                     "Company Market Capitalization",
                     "Average Daily Value Traded - 30 Days"]
for i in ln_transform_list:
    idio_feats.loc[:, i] = np.where(idio_feats[i] <= 0, 0, np.log(1+idio_feats[i]))

# price change transform: scale by 0.01 and clip to [-0.5, 0.5]
idio_feats.loc[:, "Trailing 30-day Price Net Change"] = np.clip(0.01*idio_feats["Trailing 30-day Price Net Change"], -0.5, 0.5)

# new feature: convexity = up-beta / down-beta, squashed into (0, 1) with the sigmoid
# NOTE(review): a down-beta of 0 yields inf (or NaN for 0/0) before the sigmoid - confirm this cannot occur
idio_feats.loc[:, "Convexity"] = (idio_feats["Weekly Beta Up - 2 Year"] / idio_feats["Weekly Beta Down - 2 Year"])
idio_feats = idio_feats.drop(['Weekly Beta Up - 2 Year', 'Weekly Beta Down - 2 Year'], axis=1)
idio_feats.loc[:, "Convexity"] = sigmoid(idio_feats.loc[:, "Convexity"])
# short interest features
idio_feats.loc[:, "Days To Cover"] = np.log(1+idio_feats["Days To Cover"])
# NOTE(review): plain log - a short interest of exactly 0 would become -inf; confirm the data excludes it
idio_feats.loc[:, "Short Interest Pct"] = np.log(idio_feats["Short Interest Pct"])

# profitability: log(1 + ROA * 0.01), clipped to [-0.1, 1]
idio_feats.loc[:, "Return On Assets - Actual"] = np.clip(np.log(1+idio_feats["Return On Assets - Actual"]*0.01), -0.1,1)

# vola
idio_feats.loc[:, "Volatility - 260 days"] = np.log(idio_feats["Volatility - 260 days"])

# standardize each column (feature): DataFrame.apply passes one column at a time to zscore
idio_feats = idio_feats.apply(zscore)

In [None]:
# list with all tickers alphabetically
# idiosyncratic_feature_stocks = list(idio_feats.index)

In [None]:
# idiosyncratic feature plot

# Load data
df = idio_feats

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit (filterwarnings("default") would install a new global rule instead).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # One panel per pair of features; the diagonal shows each feature's density
    g = sns.PairGrid(df, diag_sharey=False)

    # Add correlation coefficients to lower triangle with colors
    corr_matrix = df.corr()
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    for i, j in zip(*np.tril_indices_from(g.axes)):
        if i != j:
            coef = corr_matrix.iloc[i, j]
            # coef in [-1, 1] is mapped to [0, 1] to pick the box colour from the diverging palette
            g.axes[i, j].annotate(f"{coef:.2f}", (0.1, 0.9), xycoords='axes fraction', ha='left', va='center', color='w', fontsize=10, bbox=dict(boxstyle="round", facecolor=cmap(coef/2 + 0.5), alpha=1))

    g.map_upper(sns.scatterplot)
    # `fill=True` replaces the deprecated `shade=True`
    g.map_lower(sns.kdeplot, fill=True)
    g.map_diag(sns.kdeplot, fill=True)

    plt.show()

In [None]:
# set the file path to where you want to save the DataFrame
desktop_path = '~/Desktop/ML2_assignment_5_project/'
# Fill the '{}' placeholder with the current date and time; it used to be left unformatted,
# which produced a file literally named 'idi_features{}.csv'.
file_name = 'idi_features{}.csv'.format(pd.Timestamp.now().strftime('%Y%m%d_%H%M%S'))
file_path = desktop_path + file_name
# save the untransformed (but NaN-free) features; the index is written as column 'timestamp'
df2_without_nan.round(5).to_csv(file_path, index_label='timestamp', date_format='%Y-%m-%d')

## get macro variables

In [None]:
# import FRED data
import os

# The API key is read from the environment so that no credential is stored in the notebook.
# A key that was ever committed to a repository should be treated as leaked and revoked.
_fred_api_key = os.environ.get("FRED_API_KEY")
if not _fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to your FRED API key.")
fred = Fred(api_key=_fred_api_key)

In [None]:
# function to get a dataframe with the variables
def create_table(series, column_name):
    """Turn a Series into a one-column DataFrame indexed by 'date'.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by observation date.
    column_name : str
        Name of the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One column called ``column_name``; the index keeps the series' index
        values and is named ``"date"``.

    The names are set explicitly, so this also works for a series (or index)
    that already carries a name; previously such input raised ``KeyError``.
    """
    return series.rename(column_name).rename_axis("date").to_frame()

In [None]:
# variables: "ticker": "clear_name"
# Keys are the FRED series ids that get downloaded; values are the column names used
# for them in this notebook. The trailing comment on each entry is its factor group.
x_factors_dict = {
    "SP500": "S&P500", #risk
    "VIXCLS": "VIX", #risk
    
    "DGS10": "_10_y_UST", #rate
    "T10Y2Y": "_2s10s_UST", #rate
    "T10YIE": "_10_y_BE", #rate
    
    "DEXUSEU": "USD_to_EUR", #fx
    "DEXUSUK": "USD_to_GBP", #fx
    "DEXJPUS": "JPY_to_USD", #fx
    "DEXCHUS": "CNH_to_USD", #fx
    "DEXMXUS": "MXN_to_USD", #fx
    "DEXCAUS": "CAD_to_USD", #fx
    
    "DCOILBRENTEU": "Brent_Crude", #cmdty
    
    "WILLLRGCAP": "Large", #factor
    "WILLSMLCAP": "Small", #factor
    "WILLLRGCAPVAL": "Value", #factor
    "WILLLRGCAPGR": "Growth" #factor
}

why these variables?

#risk: overall equity market exposure

#rate: overall (real)rate and inflation exposure

#fx: major currencies and the currencies of neighbouring countries (some US stocks might be dependent on other regions)

#factor: small cap premium and value premium

In [None]:
# Download every series of x_factors_dict from FRED. No date range is passed to
# get_series, so the complete history of each series is downloaded; the period of
# interest is cut out in a later cell.
dfs = []
for fred_series_id, column_name in x_factors_dict.items():
    series = fred.get_series(fred_series_id)
    df = create_table(series, column_name)
    dfs.append(df)

# one column per series, aligned on the union of all observation dates
df = pd.concat(dfs, axis=1)

# we fill NaN with the previous value (DataFrame.ffill replaces the deprecated fillna(method='ffill'))
df_filled = df.ffill()

In [None]:
#a little feature engineering (takes a few seconds)
# NOTE: the three derived columns are added to df_filled itself (in place).
# style factor: value index relative to growth index
df_filled["ValueOverGrowth"] = df_filled["Value"]/df_filled["Growth"]
# size factor: small-cap index relative to large-cap index
df_filled["SmallOverLarge"] = df_filled["Small"]/df_filled["Large"]
# 10y yield minus 10y breakeven
df_filled["_10_y_UST_real"] = df_filled["_10_y_UST"]-df_filled["_10_y_BE"]
# the four index levels are only needed for the ratios above
df_raw_features = df_filled.drop(['Large', 'Small', 'Value', 'Growth'], axis=1)

# column order used by the plots and all later cells
new_order = ["S&P500", "VIX", "ValueOverGrowth", "SmallOverLarge", "USD_to_EUR", 
             "USD_to_GBP", "JPY_to_USD", "CNH_to_USD", "MXN_to_USD", "CAD_to_USD",
             "_10_y_UST", "_10_y_UST_real", "_10_y_BE", "_2s10s_UST", "Brent_Crude"]

df_raw_features = df_raw_features[new_order]
df_raw_features

In [None]:
#visualizing all raw features (this takes a few seconds)
as_of = '2019-12-31'
df = df_raw_features[df_raw_features.index >= as_of]

# Grid size follows the number of features (5 per row) instead of assuming exactly 15;
# with the 15 current features this is the same 3 x 5 grid and the same figure size.
ncols = 5
nrows = max(1, math.ceil(len(df.columns) / ncols))

# squeeze=False always returns a 2-D array of axes, which is flattened for easier indexing
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(25, 10 * nrows / 3), squeeze=False)
axs = axs.flatten()

# one line plot per feature
for i, col in enumerate(df.columns):
    ax = axs[i]
    sns.lineplot(x=df.index, y=col, data=df, ax=ax)
    ax.set_title(col)
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=45)  # set the x-tick label rotation to 45 degrees
    ax.set_ylabel('')

# hide the unused subplots
for i in range(len(df.columns), nrows*ncols):
    axs[i].axis('off')

# adjust the spacing between subplots
plt.subplots_adjust(wspace=0.3, hspace=0.5)

# add a title to the plot grid
fig.suptitle('Raw Macro Variables as of ' + as_of, fontsize=20, fontweight='bold')

# display the plot
plt.show()

In [None]:
# and slice the dataframe to the period we want
# start_date = "2019-12-31" end_date = "2023-03-31" is used in the initial data sourcing from eikon

# NOTE: start_date / end_date are notebook-wide names that other cells assign as well.
start_date = "2019-12-31"
end_date = "2023-03-31"

# label-based slice: both end points are included
df_raw_features_slice = df_raw_features.loc[start_date:end_date]

In [None]:
#calculate the changes

# variables whose change is measured in absolute terms (first difference) ...
abs_changes = ["VIX", "_10_y_UST", "_10_y_UST_real", "_10_y_BE", "_2s10s_UST"]
# ... and variables whose change is measured in relative terms (percentage change)
rel_changes = ["S&P500", "ValueOverGrowth", "SmallOverLarge", "USD_to_EUR", "USD_to_GBP",
               "JPY_to_USD", "CNH_to_USD", "MXN_to_USD", "CAD_to_USD","Brent_Crude"]

# Work on a copy so the level data in df_raw_features_slice stays untouched.
# Both kinds of change are rolling 5-row sums of the daily change, mirroring the stock returns.
df_raw_features_slice_chg = df_raw_features_slice.copy()
df_raw_features_slice_chg.loc[:, rel_changes] = df_raw_features_slice[rel_changes].pct_change().rolling(window=5).sum()
df_raw_features_slice_chg.loc[:, abs_changes] = df_raw_features_slice[abs_changes].diff().rolling(window=5).sum()

## Plotting the macrovariables

In [None]:
# feature plot changes

# Load data: skip the first 5 rows, which are NaN because of the 5-row rolling window
df = df_raw_features_slice_chg.iloc[5:]

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit (filterwarnings("default") would install a new global rule instead).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # One panel per pair of variables; the diagonal shows each variable's density
    g = sns.PairGrid(df, diag_sharey=False)

    # Add correlation coefficients to lower triangle with colors
    corr_matrix = df.corr()
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    for i, j in zip(*np.tril_indices_from(g.axes)):
        if i != j:
            coef = corr_matrix.iloc[i, j]
            # coef in [-1, 1] is mapped to [0, 1] to pick the box colour from the diverging palette
            g.axes[i, j].annotate(f"{coef:.2f}", (0.1, 0.9), xycoords='axes fraction', ha='left', va='center', color='w', fontsize=10, bbox=dict(boxstyle="round", facecolor=cmap(coef/2 + 0.5), alpha=1))

    g.map_upper(sns.scatterplot)
    # `fill=True` replaces the deprecated `shade=True`
    g.map_lower(sns.kdeplot, fill=True)
    g.map_diag(sns.kdeplot, fill=True)

    plt.show()

In [None]:
# feature plot changes

# Load data
# NOTE(review): df_raw_features_slice_chgLT is not defined anywhere above this cell, so the
# cell fails with NameError on "Restart & Run All". Presumably a longer-window variant of
# df_raw_features_slice_chg (20 leading rows are skipped) - confirm and define it, or drop the cell.
df = df_raw_features_slice_chgLT.iloc[20:]

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit (filterwarnings("default") would install a new global rule instead).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # One panel per pair of variables; the diagonal shows each variable's density
    g = sns.PairGrid(df, diag_sharey=False)

    # Add correlation coefficients to lower triangle with colors
    corr_matrix = df.corr()
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    for i, j in zip(*np.tril_indices_from(g.axes)):
        if i != j:
            coef = corr_matrix.iloc[i, j]
            # coef in [-1, 1] is mapped to [0, 1] to pick the box colour from the diverging palette
            g.axes[i, j].annotate(f"{coef:.2f}", (0.1, 0.9), xycoords='axes fraction', ha='left', va='center', color='w', fontsize=10, bbox=dict(boxstyle="round", facecolor=cmap(coef/2 + 0.5), alpha=1))

    g.map_upper(sns.scatterplot, cmap=cmap)
    # `fill=True` replaces the deprecated `shade=True`
    g.map_lower(sns.kdeplot, fill=True)
    g.map_diag(sns.kdeplot, fill=True)

    plt.show()

In [None]:
# feature plot

# Load data: the macro variables in levels (not changes)
df = df_raw_features_slice

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit (filterwarnings("default") would install a new global rule instead).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # One panel per pair of variables; the diagonal shows each variable's density
    g = sns.PairGrid(df, diag_sharey=False)

    # Add correlation coefficients to lower triangle with colors
    corr_matrix = df.corr()
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    for i, j in zip(*np.tril_indices_from(g.axes)):
        if i != j:
            coef = corr_matrix.iloc[i, j]
            # coef in [-1, 1] is mapped to [0, 1] to pick the box colour from the diverging palette
            g.axes[i, j].annotate(f"{coef:.2f}", (0.1, 0.9), xycoords='axes fraction', ha='left', va='center', color='w', fontsize=10, bbox=dict(boxstyle="round", facecolor=cmap(coef/2 + 0.5), alpha=1))

    g.map_upper(sns.scatterplot, cmap=cmap)
    # `fill=True` replaces the deprecated `shade=True`
    g.map_lower(sns.kdeplot, fill=True)
    g.map_diag(sns.kdeplot, fill=True)

    plt.show()

## merging all raw data to one big dataframe 

In [None]:
# n features: number of macro columns; they end up as the LAST nfeat columns after the merge below
nfeat = df_raw_features_slice_chg.shape[1]

# Rename the index to date
target = target.rename_axis("date")
df_raw_features_slice_chg = df_raw_features_slice_chg.rename_axis("date")

# Convert the index to datetime so both frames can be aligned on it
target.index = pd.to_datetime(target.index)
df_raw_features_slice_chg.index = pd.to_datetime(df_raw_features_slice_chg.index)

# Merge using left join, based on their index (every stock date is kept)
total_raw_data = target.merge(df_raw_features_slice_chg, how='left', left_index=True, right_index=True)

# Detrend stocks by subtracting the S&P 500 return of the same date from every stock column
detrende_stocks = (total_raw_data.iloc[:, :-nfeat].sub(total_raw_data["S&P500"], axis=0))

# recreate the dataframe: detrended stock columns first, macro columns (unchanged) last
total_raw_data_dt = pd.concat([detrende_stocks, total_raw_data.iloc[:,-nfeat:]], axis=1)


In [None]:
# visualizing the detrending
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 7))
upper, lower = ax[0], ax[1]

# upper panel: the 5d returns themselves (raw stock, detrended stock, index)
upper.plot(total_raw_data["NEE.N"], alpha=0.5)
upper.plot(total_raw_data_dt["NEE.N"])
upper.plot(total_raw_data["S&P500"], alpha=0.5)

# lower panel: the same three series, cumulated
lower.plot(total_raw_data["NEE.N"].cumsum(), label="NEE.N", alpha=0.5)
lower.plot(total_raw_data_dt["NEE.N"].cumsum(), label="NEE.N Detrended")
lower.plot(total_raw_data["S&P500"].cumsum(), label="S&P500", alpha=0.5)

fig.suptitle("Stock vs. S&P vs. detrended stock", fontsize=16)

# the upper panel shares the date axis visually, so its own x labels are hidden
upper.set_ylabel("5d Return")
upper.spines["bottom"].set_visible(False)
upper.tick_params(axis='x', which='both', labelbottom=False)
upper.xaxis.grid(True)

lower.set_ylabel("Cumulative 5d Return")
lower.legend(loc="lower right")

# common look for both panels: no top/right frame, vertical grid lines
for panel in (upper, lower):
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    panel.xaxis.grid(True)

plt.show()


# Note: this is not really a correct time series because it cumulates overlapping 5-day returns,
# but for the sake of visualizing it is ok. We are not using the cumulative time series anyway.

## creating usable features
The idea is to calculate the linear sensitivity of each stock to each feature. For this, we first normalize all features (z-scores).
Second, we calculate the slopes of linear regressions (betas) of each stock on each normalized feature. Because the features share a common scale, those betas are of similar scale as well.

In [None]:
def data_split(data, start_date, mid_date, end_date, feature_pos):
    """Split ``data`` chronologically into a first and a second part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows sorted by date; the last ``feature_pos`` columns are feature columns.
    start_date, mid_date, end_date : label
        Index labels (e.g. date strings) used for label-based slicing.
    feature_pos : int
        Number of trailing feature columns removed from the third return value.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(first_part, second_part, second_part_ex_feats)``: rows from
        ``start_date`` to ``mid_date`` (inclusive), the rows after that up to
        ``end_date`` (inclusive), and the second part without the trailing
        feature columns.

    Rows of the second part that already belong to the first part are removed
    by index. Dropping the first row unconditionally would throw away a valid
    observation whenever ``mid_date`` is not itself an index label (e.g. a
    weekend date).
    """
    first_part = data.loc[start_date:mid_date]
    second_part = data.loc[mid_date:end_date]
    # label slices include both end points, so a mid_date row would appear in both parts
    second_part = second_part.loc[~second_part.index.isin(first_part.index)]
    second_part_ex_feats = second_part.iloc[:, :-feature_pos]

    return first_part, second_part, second_part_ex_feats

In [None]:
# train period (used to build the features) and test period (used to evaluate the clusters)
classifying_data, test_data, test_data_ex_feats = data_split(total_raw_data_dt, "2020-01-08", "2022-12-31", "2023-3-31", nfeat)

# for later: correlation matrices of the stock columns only (macro columns removed)
corr_matrix_train = classifying_data.iloc[:, :-nfeat].corr()
corr_matrix_test = test_data_ex_feats.corr()

# covariance matrices of the stock columns; the test matrix gets its own name
# (it used to overwrite var_covar_matrix_train)
var_covar_matrix_train = np.cov(classifying_data.iloc[:, :-nfeat], rowvar=False)
var_covar_matrix_test = np.cov(test_data_ex_feats, rowvar=False)

# save the merged raw data to harddrive
# desktop_path is defined here as well, because the other cells that define it belong to the
# optional Eikon download path and may not have been run.
desktop_path = '~/Desktop/ML2_assignment_5_project/'
# pd.Timestamp.now() works regardless of whether the name `datetime` currently refers to
# the module or to the class (this notebook imports both forms under the same name).
file_name = 'total_raw_data{}.csv'.format(pd.Timestamp.now().strftime('%Y%m%d_%H%M%S'))
total_raw_data.round(5).to_csv(desktop_path + file_name, index_label='timestamp', date_format='%Y-%m-%d')


In [None]:
# Select the last `nfeat` columns of your dataframe (the macro variables)
cols_to_standardize = classifying_data.columns[-nfeat:]
# z-score every macro column over the training period; the stock columns are left as they are
standardized_features = classifying_data[cols_to_standardize].apply(zscore)
stocks = classifying_data.iloc[:,:-nfeat]
# NOTE: classifying_data is overwritten with the standardized version
classifying_data = pd.concat([stocks,standardized_features], axis=1)

In [None]:
# split the training frame into the stock columns and the trailing macro-variable columns
stocks = classifying_data.iloc[:, :-df_raw_features_slice_chg.shape[1]]
variables = classifying_data.iloc[:, -df_raw_features_slice_chg.shape[1]:]
# Empty (NaN-filled) result tables, one row per stock and one column per macro variable.
# dtype=float keeps them numeric; without it the frames are of dtype object.
betas = pd.DataFrame(index = stocks.columns,
                     columns=variables.columns,
                     dtype=float)
correls = pd.DataFrame(index = stocks.columns,
                       columns=variables.columns,
                       dtype=float)

In [None]:
# For every (stock, macro variable) pair compute
#   - the slope of the regression of the stock return on the variable (beta) and
#   - their correlation coefficient,
# in one pass over all pairs.
for col1 in stocks.columns:
    for col2 in variables.columns:
        beta = linregress(variables[col2], stocks[col1]).slope
        betas.loc[col1, col2] = beta
        corr = variables[col2].corr(stocks[col1])
        correls.loc[col1, col2] = corr

# make sure both tables are numeric before taking statistics over all their cells
betas = betas.astype(float)
correls = correls.astype(float)

# raw beta scaling: one global mean / standard deviation over ALL cells of the table,
# so the relative size of the betas across variables is preserved
betas_std = np.std(betas.values, axis=None)
betas_mean = np.mean(betas.values, axis=None)

correls_std = np.std(correls.values, axis=None)
correls_mean = np.mean(correls.values, axis=None)

scaled_betas = (betas-betas_mean)/betas_std
scaled_correls = (correls-correls_mean)/correls_std

# merge scaled idiosyncratic features to scaled macro sensitivities
# (inner join: only stocks present in both tables are kept)
scaled_betas = scaled_betas.join(idio_feats, how = "inner")
scaled_correls = scaled_correls.join(idio_feats, how = "inner")

# calculate the percentile ranks of the values in betas and correls and store it in rank_betas and rank_correls dataframes
ranked_betas = scaled_betas.rank(pct=True)
ranked_correls = scaled_correls.rank(pct=True)

# combined values
# calculate the mean of the scaled values
mean_scaled = (scaled_betas + scaled_correls) / 2
# calculate the mean of the percentile ranks
mean_ranked = (ranked_betas + ranked_correls) / 2


In [None]:
# Define a blue color palette
colors = sns.color_palette("Blues")

# Set the style BEFORE the figure is created, otherwise it does not apply to these axes
sns.set_style("whitegrid")

# (data, title, y label) of the six panels, row by row
boxplot_panels = [
    (scaled_betas, 'Boxplot of Scaled Betas', 'Scaled Beta'),
    (scaled_correls, 'Boxplot of Scaled Correls', 'Scaled Correlation'),
    (mean_scaled, 'Boxplot of Mean_Scaled', 'Mean'),
    (ranked_betas, 'Boxplot of Ranked Betas', 'Rank Beta'),
    (ranked_correls, 'Boxplot of Ranked Correls', 'Rank Correlation'),
    (mean_ranked, 'Boxplot of Mean_Ranked', 'Mean'),
]

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit, so later cells are not left with all warnings switched off.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Create a figure with 6 subplots (2 rows x 3 columns)
    fig, axs = plt.subplots(2, 3, figsize=(15, 10), sharex=True)

    # One boxplot per feature column in every panel
    for ax, (panel_data, panel_title, panel_ylabel) in zip(axs.flat, boxplot_panels):
        sns.boxplot(data=panel_data, ax=ax, palette=colors)
        ax.set_title(panel_title)
        ax.set_ylabel(panel_ylabel)
        # Rotate x-axis labels by 90 degrees
        ax.tick_params(axis='x', labelrotation=90)

    fig.tight_layout()

    # Show the plot
    plt.show()



In [None]:
# Define a blue color palette
colors = sns.color_palette("Blues")

# Set the style BEFORE the figure is created, otherwise it does not apply to these axes
sns.set_style("whitegrid")

# (data, title, y label) of the eight panels, row by row:
# raw values, z-scores and their average on top; raw correlations and percentile ranks below
boxplot_panels = [
    (betas, 'Boxplot of Raw Betas', 'Beta'),
    (scaled_betas, 'Boxplot of Scaled Betas', 'Z-Score'),
    (scaled_correls, 'Boxplot of Scaled Correlations', 'Z-Score'),
    (mean_scaled, 'Boxplot of Mean_Scaled', 'avg. Z-Score'),
    (correls, 'Boxplot of Raw Correlations', 'Correlation'),
    (ranked_betas, 'Boxplot of Ranked Betas', 'Percentile Rank'),
    (ranked_correls, 'Boxplot of Ranked Correlations', 'Percentile Rank'),
    (mean_ranked, 'Boxplot of Mean_Ranked', 'avg. Percentile Rank'),
]

# Silence warnings only while this figure is drawn; the previous warning filters are
# restored on exit, so later cells are not left with all warnings switched off.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Create a figure with 8 subplots (2 rows x 4 columns)
    fig, axs = plt.subplots(2, 4, figsize=(20, 10), sharex=True)

    # One boxplot per feature column in every panel
    for ax, (panel_data, panel_title, panel_ylabel) in zip(axs.flat, boxplot_panels):
        sns.boxplot(data=panel_data, ax=ax, palette=colors)
        ax.set_title(panel_title)
        ax.set_ylabel(panel_ylabel)
        # Rotate x-axis labels by 90 degrees
        ax.tick_params(axis='x', labelrotation=90)

    fig.tight_layout()

    # Show the plot
    plt.show()


# ignore rest from here on down, legacy code. 

In [None]:
# The six alternative feature tables built above (scaled / ranked betas and correlations and their averages).
feature_list_of_dfs = [scaled_betas, ranked_betas, scaled_correls, ranked_correls, mean_scaled, mean_ranked]

## Clusterings

In [None]:
# getting cross correls per cluster

def map_stocks_to_clusters(stock_labels): # stock_labels is an array
    """Group stock positions by their cluster label.

    ``stock_labels`` holds one label per stock, in stock order. The result maps
    every distinct label to the list of 1-based positions carrying that label;
    the keys are in ascending order.
    """
    positions_by_label = {}
    # positions are 1-based: the first stock gets position 1
    for position, label in enumerate(stock_labels, start=1):
        positions_by_label.setdefault(label, []).append(position)
    # rebuild the dictionary with its keys in ascending order
    return {label: positions_by_label[label] for label in sorted(positions_by_label)}

def calculate_avg_interclass_corr(class_stock_mapping, stock_returns): # class_stock_mapping from the previous function, stock_returns is the return data
    """Average pairwise return correlation of the stocks inside each cluster.

    ``class_stock_mapping`` maps a cluster label to the 1-based column positions
    of its members in ``stock_returns``. For every cluster (in mapping order) the
    mean of the off-diagonal entries of the members' correlation matrix is
    computed and rounded to 3 decimals. Returns the list of these values.
    """
    averages = []
    for members in class_stock_mapping.values():
        # positions are 1-based, hence the column slice pos-1:pos for each member
        member_returns = pd.concat(
            [stock_returns.iloc[:, pos - 1:pos] for pos in members], axis=1)
        corr_mtrx = member_returns.corr()
        size = len(corr_mtrx)
        # mean * size**2 is the sum of all entries; subtracting `size` removes the
        # diagonal (self-correlations of 1), leaving size**2 - size off-diagonal entries
        off_diagonal_mean = (corr_mtrx.mean().mean() * size**2 - size) / (size**2 - size)
        averages.append(round(off_diagonal_mean, 3))
    return averages

In [None]:
# Average within-group correlation on the test period for the four GICS levels.
# Columns 1..4 of sector_classes_code hold the classification codes of levels 1..4;
# groups whose value is NaN are dropped.
_gics_level_results = []
for _level_column in (1, 2, 3, 4):
    _level_labels = sector_classes_code.iloc[:, _level_column].to_numpy()
    _level_corrs = calculate_avg_interclass_corr(
        class_stock_mapping = map_stocks_to_clusters(_level_labels),
        stock_returns = test_data_ex_feats)
    _gics_level_results.append([x for x in _level_corrs if not math.isnan(x)])

Level_1, Level_2, Level_3, Level_4 = _gics_level_results

In [None]:

# Average within-cluster correlation on the test period for two data-driven clusterings.
# NOTE(review): labels_kmean_correl and labels_3d are not defined anywhere above this cell;
# presumably they come from clustering cells further down or from removed cells - confirm,
# otherwise this cell fails with NameError on "Restart & Run All".
h_clust_results = calculate_avg_interclass_corr(class_stock_mapping = map_stocks_to_clusters(labels_kmean_correl), stock_returns = test_data_ex_feats)
# clusters without a defined value (NaN) are dropped
h_clust_results = [x for x in h_clust_results if not math.isnan(x)]


h_clust_results_100 = calculate_avg_interclass_corr(class_stock_mapping = map_stocks_to_clusters(labels_3d), stock_returns = test_data_ex_feats)
h_clust_results_100 = [x for x in h_clust_results_100 if not math.isnan(x)]

In [None]:


# Create a figure with six subplots arranged side-by-side
fig, ax = plt.subplots(1, 6, figsize=(15, 5))

# (values, title) of each panel, left to right
panels = [
    (Level_1, 'GICS Level 1'),
    (Level_2, 'GICS Level 2'),
    (Level_3, 'GICS Level 3'),
    (Level_4, 'GICS Level 4'),
    (h_clust_results, 'Hierachical Clustering vs. Level 1'),
    (h_clust_results_100, 'Hierachical Clustering vs. Level 4'),
]
for panel_ax, (values, title) in zip(ax, panels):
    panel_ax.boxplot(values)
    panel_ax.set_title(title)
ax[0].set_ylabel('Average Interclass Correlation')

# same y range everywhere, no x labels; only the left-most panel keeps its y tick labels
for i, panel_ax in enumerate(ax):
    panel_ax.set_ylim([-1,1])
    panel_ax.set_xticklabels([])
    if i + 1 < len(ax):
        ax[i+1].set_yticklabels([])

# Add a shared label below the panels and adjust spacing between subplots
fig.text(0.5, 0.04, 'Average Interclass Correlation', ha='center', fontsize=14)
fig.subplots_adjust(wspace=0.3)

# Show the plot
plt.show()

## Test Correlation Matrix

In [None]:
# Summary statistics of every column of the train / test correlation matrices
corr_stats = corr_matrix_train.describe()
corr_stats_2 = corr_matrix_test.describe()
# Create a figure with two subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# One violin per statistic; iloc[1:-1] drops the first and last row of describe()
# ('count' and 'max'). The values shown are correlation statistics, so the axis is
# labelled 'Correlation' (it used to say 'Beta').
for axis, stats, title in zip(axes, (corr_stats, corr_stats_2), ('train', 'test')):
    sns.violinplot(ax=axis, data=stats.iloc[1:-1].transpose(), color='green')
    axis.set_title(title)
    axis.set_ylim(-1, 1)
    axis.set_ylabel('Correlation')

# Adjust the layout and show the figure
plt.tight_layout()
plt.show()

## unnecessary clusterings

## hierarchical clustering

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Convert all columns of the DataFrame to float64
# NOTE: this rebinds `betas` - from here on it is the SCALED feature table (scaled betas plus
# idiosyncratic features), no longer the raw regression slopes.
betas = scaled_betas.astype('float64')

# Calculate the condensed pairwise distance matrix between stocks based on the features
distance_matrix = pdist(betas.values, metric='euclidean')

# Perform hierarchical clustering using the distance matrix
clusters = linkage(distance_matrix, method='ward')

# Set the number of colorized clusters
num_color_clusters = 10

# Create a dendrogram with the specified number of clusters colorized
plt.figure(figsize=(12, 6))
# merge height (3rd linkage column) of the num_color_clusters-th merge counted from the end
color_threshold = clusters[-num_color_clusters, 2]
dendrogram(clusters, labels=betas.index, leaf_rotation=90, color_threshold=color_threshold)
# dashed line marks the colouring threshold
plt.axhline(y=color_threshold, c='gray', lw=1, linestyle='dashed')


# NOTE(review): the title reports num_color_clusters+1 clusters while fcluster below is asked
# for at most num_color_clusters - confirm which of the two is intended.
plt.title(f"Hierarchical Clustering Dendrogram with First {num_color_clusters+1} Clusters Colorized")
#plt.gca().set_ylim([0.01, 1.5])
#plt.yscale("log")
plt.xlabel("Stocks")
plt.ylabel("Euclidean Distances")
plt.show()

# get array with one cluster label per stock (at most num_color_clusters clusters)
h_clust_labes = fcluster(clusters, num_color_clusters, criterion='maxclust')



In [None]:
# Same pipeline as the 10-cluster dendrogram cell, repeated with a cut at 100.
betas = scaled_betas.astype('float64')

# Condensed pairwise Euclidean distances between the rows of `betas`.
distance_matrix = pdist(betas.to_numpy(), 'euclidean')

# Ward linkage built from the condensed distance vector.
clusters = linkage(distance_matrix, 'ward')

# Position (counted from the last merge) of the merge that defines the colour cut.
num_color_clusters = 100

# Height of that merge; links below it are colourised and a dashed line marks it.
color_threshold = clusters[-num_color_clusters][2]

plt.figure(figsize=(12, 6))
dendrogram(
    clusters,
    color_threshold=color_threshold,
    labels=betas.index,
    leaf_rotation=90,
)
plt.axhline(color_threshold, color='gray', linewidth=1, linestyle='dashed')

plt.title(f"Hierarchical Clustering Dendrogram with First {num_color_clusters+1} Clusters Colorized")
plt.xlabel("Stocks")
plt.ylabel("Euclidean Distances")
plt.show()

# Flat cluster labels, capped at num_color_clusters clusters.
h_clust_labes_100 = fcluster(clusters, t=num_color_clusters, criterion='maxclust')




## PCA & k-means on betas

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Fit a 28-component PCA on the betas, then project the same data onto it.
pca = PCA(n_components=28).fit(betas)
betas_pca = pca.transform(betas)

In [None]:
# Dimensions of the beta matrix that was fed to the PCA.
betas.shape

In [None]:
# Dimensions of the PCA projection, for comparison with betas.shape.
betas_pca.shape

In [None]:
# Cumulative explained variance in percent, with ratios rounded to 9 decimals.
# NOTE(review): the elbow-plot cell recomputes pca_var with decimals=4, overwriting this value.
pca_var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=9)*100)

In [None]:
# Cumulative explained variance (in %) against the number of components kept.
pca_var = (100 * pca.explained_variance_ratio_.round(4)).cumsum()
component_counts = np.arange(1, len(pca_var) + 1)

plt.plot(component_counts, pca_var)
plt.xticks(component_counts)  # one tick per principal component
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance (%)')
plt.title('Elbow Plot')
plt.show()

In [None]:
# apply PCA
# NOTE(review): despite the names, the "2d" model keeps 3 components and the
# "3d" model keeps 8 -- confirm whether the names or the counts are intended.
pca_2d = PCA(n_components=3)
pca_3d = PCA(n_components=8)

# Both models are fitted on the first 15 columns of `betas` only.
pca_2d.fit(betas.iloc[:,:15])
pca_3d.fit(betas.iloc[:,:15])

betas_pca_2d = pca_2d.transform(betas.iloc[:,:15])
betas_pca_3d = pca_3d.transform(betas.iloc[:,:15])

# Define a custom colormap: viridis resampled to 10 discrete colours.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; plt.get_cmap accepts the same (name, lut) arguments.
custom_cmap = plt.get_cmap('viridis', 10)

# Perform k-means clustering on the principal components.
# No random_state is set, so labels can differ between runs.
kmeans_2d = KMeans(n_clusters=10)
kmeans_3d = KMeans(n_clusters=10)

kmeans_2d.fit(betas_pca_2d)
kmeans_3d.fit(betas_pca_3d)

labels_2d = kmeans_2d.labels_
labels_3d = kmeans_3d.labels_

In [None]:
# Side-by-side view of the two PCA / k-means clusterings.
# Each axes is created explicitly: the previous plt.subplots(1, 2) followed by
# fig.add_subplot(122, projection='3d') stacked the 3D axes on top of an unused
# 2D axes occupying the same grid slot.
fig = plt.figure(figsize=(12, 6))
ax = [fig.add_subplot(1, 2, 1), fig.add_subplot(1, 2, 2, projection='3d')]

ax[0].scatter(betas_pca_2d[:,0], betas_pca_2d[:,1], c=kmeans_2d.labels_, cmap=custom_cmap)
ax[0].set_xlabel('PC1')
ax[0].set_ylabel('PC2')
ax[0].set_title('2D Scatter Plot')

# Plot the projection kmeans_3d was actually fitted on (betas_pca_3d) so that
# point coordinates and cluster colours come from the same space; the cell used
# to plot betas_pca, which comes from a different (28-component) PCA.
ax[1].scatter(betas_pca_3d[:,0], betas_pca_3d[:,1], betas_pca_3d[:,2], c=kmeans_3d.labels_, cmap=custom_cmap)
ax[1].set_xlabel('PC1')
ax[1].set_ylabel('PC2')
ax[1].set_zlabel('PC3')
ax[1].set_title('3D Scatter Plot')

plt.show()

## BM Clusterings

## k-means on correlmatrix alone

In [None]:
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture




In [None]:
df_list = [corr_matrix_train, corr_matrix_test]

# Accumulator: one "Stock" key column plus one label column per
# (algorithm, cluster depth, dataset) combination.
results_BM = pd.DataFrame(columns=["Stock"])

cluster_depths = [11, 25, 67, 126]

for n_clusters in cluster_depths:

    for df_name, df in zip(["corr_matrix_train", "corr_matrix_test"], df_list):
        print(f"Working on {df_name} with depth {n_clusters}...")

        # Fresh estimators for every (depth, dataset) pair.
        # KMeans has no random_state, so its labels can differ between runs.
        cluster_algos = {
            "KMeans": KMeans(n_clusters=n_clusters, max_iter=5000, n_init=1000),
            "GMM": GaussianMixture(n_components=n_clusters, random_state=42),
            "Hierarchical": AgglomerativeClustering(n_clusters=n_clusters),
            "Birch": Birch(threshold=0.5, n_clusters=n_clusters)
        }
        # NOTE(review): the first column is used as the stock identifier and the
        # remaining columns as features -- confirm that the correlation matrices
        # really carry the stock names in column 0 at this point.
        df_result = pd.DataFrame({"Stock": df.iloc[:, 0]})
        df_data = df.iloc[:, 1:]

        # loop through clustering algorithms
        for algo_name, algo in cluster_algos.items():
            algo_results = algo.fit_predict(df_data)
            df_result[f"{algo_name}_{n_clusters}_{df_name}"] = algo_results

        # check for missing or duplicated rows in df_result
        if df_result["Stock"].duplicated().any() or df_result["Stock"].isna().any():
            print(f"Warning: {df_name} with depth {n_clusters} contains missing or duplicated rows.")

        # Merge into the accumulator. This used to merge into `results`, a
        # different variable, so every iteration threw away the label columns
        # collected so far and only the last combination was kept.
        results_BM = pd.merge(results_BM, df_result, on="Stock", how="outer")
        # Rewritten after every combination so partial results survive an
        # interrupted run.
        results_BM.to_excel("results_after_algos.xlsx", index=False)
print("done")

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Two independent k-means fits with identical settings on the train correlation
# matrix; their agreement shows how stable the solution is across restarts.
train_kmeans_11_a = KMeans(n_clusters=11, max_iter=1000, n_init=3000)
train_kmeans_11_a.fit(corr_matrix_train)
train_kmeans_11_b = KMeans(n_clusters=11, max_iter=1000, n_init=3000)
train_kmeans_11_b.fit(corr_matrix_train)

# Cluster labels of each fit.
labels_a, labels_b = train_kmeans_11_a.labels_, train_kmeans_11_b.labels_

# Adjusted Rand Index between the two label vectors.
ari = adjusted_rand_score(labels_a, labels_b)
print(f"The Adjusted Rand Index between the two clusterings is {ari}")


## results

In [None]:
# Shape of the correlation matrix of test_data with its last 15 columns excluded.
test_data.iloc[:,:-15].corr().shape


In [None]:
plt.rcParams['figure.figsize'] = [6.4, 4.8]


# List of all pairwise cross-correlations of test_data (last 15 columns excluded).
crosscorr_matrix = test_data.iloc[:,:-15].corr()
corr_values = crosscorr_matrix.to_numpy()

# Kept for backward compatibility: full flattened matrix with the diagonal and
# upper triangle zeroed out.
lower_elements = np.tril(corr_values, k=-1).flatten()

# Select the strictly-lower-triangle entries by position, so each pair appears
# once. The previous value filter (`!= 0 and != 1`) also discarded genuine
# correlations that happen to be exactly 0 or 1.
pair_rows, pair_cols = np.tril_indices_from(corr_values, k=-1)
crosscorrels = pd.Series(corr_values[pair_rows, pair_cols])
crosscorrels.describe()

In [None]:

# Five box plots side by side: four clusterings plus the unclustered baseline.
fig, ax = plt.subplots(1, 5, figsize=(15, 5))

# (panel title, cluster labels) for the four clustering panels, left to right.
clustering_panels = [
    ('H_Clust', stock_labels),
    ('PCA_KMEANS_2d', labels_2d),
    ('PCA_KMEANS_3d', labels_3d),
    ('KMEANS_correl', labels_kmean_correl),
]
for panel, (panel_title, panel_labels) in zip(ax, clustering_panels):
    panel_scores = calculate_avg_interclass_corr(
        class_stock_mapping=map_stocks_to_clusters(panel_labels),
        stock_returns=test_data,
    )
    panel.boxplot(panel_scores)
    panel.set_title(panel_title)
    panel.set_ylim([-1, 1])  # Set y-axis limits

# Last panel: distribution of all pairwise cross-correlations.
ax[4].boxplot(crosscorrels)
ax[4].set_title('All Crosscorrels')
ax[4].set_ylim([-1, 1])  # Set y-axis limits

# Figure-level caption centred near the bottom edge, plus wider panel spacing.
fig.text(0.5, 0.04, 'Average Interclass Correlation', ha='center', fontsize=14)
fig.subplots_adjust(wspace=0.3)

# Show the plot
plt.show()

## tests (not really working yet)

In [None]:
# Number of feature matrices available for the per-feature clustering below.
# NOTE(review): the only visible definition of feature_list_of_dfs is in the
# later "Data download" section, so a fresh top-to-bottom run may fail here.
len(feature_list_of_dfs)

In [None]:
# apply PCA
# One 3-component PCA per feature matrix. Everything is done in stages (all PCA
# fits, then all transforms, then all k-means fits) in the same order as the
# original six copies of each statement.
feature_frames = [feature_list_of_dfs[i] for i in range(6)]

pca_models = [PCA(n_components=3) for _ in feature_frames]
for pca_model, feature_frame in zip(pca_models, feature_frames):
    pca_model.fit(feature_frame)
pca_feat_1, pca_feat_2, pca_feat_3, pca_feat_4, pca_feat_5, pca_feat_6 = pca_models

feat_projections = [
    pca_model.transform(feature_frame)
    for pca_model, feature_frame in zip(pca_models, feature_frames)
]
feat_pca_1, feat_pca_2, feat_pca_3, feat_pca_4, feat_pca_5, feat_pca_6 = feat_projections

# Define a custom colormap: viridis resampled to 10 discrete colours.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
custom_cmap = plt.get_cmap('viridis', 10)

# Perform k-means clustering on the principal components.
# NOTE(review): only the first model uses len(Level_4) clusters, the other five
# are fixed at 10 -- confirm this asymmetry is intended.
kmeans_models = [KMeans(n_clusters=len(Level_4))] + [KMeans(n_clusters=10) for _ in range(5)]
for kmeans_model, projection in zip(kmeans_models, feat_projections):
    kmeans_model.fit(projection)
kmeans_feat_1, kmeans_feat_2, kmeans_feat_3, kmeans_feat_4, kmeans_feat_5, kmeans_feat_6 = kmeans_models

# Cluster labels of each fitted model, exposed under the original names.
(labels_feat_1, labels_feat_2, labels_feat_3,
 labels_feat_4, labels_feat_5, labels_feat_6) = [kmeans_model.labels_ for kmeans_model in kmeans_models]


In [None]:


# Six box plots side by side, one per feature-set clustering, all scored on test_data.
fig, ax = plt.subplots(1, 6, figsize=(15, 5))

# (panel title, cluster labels), left to right.
# NOTE(review): titles are kept verbatim; confirm that they match the order of
# the matrices in feature_list_of_dfs that produced labels_feat_1..6.
panel_specs = [
    ('Betas', labels_feat_1),
    ('Centered Betas', labels_feat_2),
    ('Rank Betas', labels_feat_3),
    ('Correls', labels_feat_4),
    ('Rank Correls', labels_feat_5),
    ('Mean Rank', labels_feat_6),
]
for position, (panel_title, panel_labels) in enumerate(panel_specs):
    panel_scores = calculate_avg_interclass_corr(
        class_stock_mapping=map_stocks_to_clusters(panel_labels),
        stock_returns=test_data,
    )
    ax[position].boxplot(panel_scores)
    ax[position].set_title(panel_title)
    if position == 0:
        # Only the leftmost panel carries the y-axis label.
        ax[position].set_ylabel('Average Interclass Correlation')

for position in range(6):
    ax[position].set_ylim([0,0.75])
    ax[position].set_xticklabels([])  # remove x tick labels
    if position < 5:
        # Hide y tick labels on every panel except the first.
        ax[position + 1].set_yticklabels([])

# Figure-level caption centred near the bottom edge, plus wider panel spacing.
fig.text(0.5, 0.04, 'Average Interclass Correlation', ha='center', fontsize=14)
fig.subplots_adjust(wspace=0.3)

# Show the plot
plt.show()

In [None]:
# Zero-column frame that shares the index of `betas`. The explicit copy detaches
# it from `betas`, so the column assignments below do not raise
# SettingWithCopyWarning.
Clusterings = betas.iloc[:,:0].copy()

# One column of cluster labels per clustering method.
Clusterings['H_Clust'] = stock_labels
Clusterings['PCA_KMEANS_2d'] = labels_2d
Clusterings['PCA_KMEANS_3d'] = labels_3d
Clusterings['KMEANS_correl'] = labels_kmean_correl

# Plot two label series with rows ordered by the hierarchical label (sorted once).
clusterings_by_hclust = Clusterings.sort_values('H_Clust')
plt.plot(clusterings_by_hclust['H_Clust'])
plt.plot(clusterings_by_hclust['PCA_KMEANS_2d'])

In [None]:
# Display the label table (one column per clustering method).
Clusterings

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
# normalized_mutual_info_score was used below without being imported in any
# visible cell; importing both metrics here keeps the cell self-contained.
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Adjusted Rand Index and Normalized Mutual Information for every unordered
# pair of columns (i.e. clusterings), computed in a single pass.
ari_scores = []
nmi_scores = []
for c1, c2 in combinations(Clusterings.columns, 2):
    ari_score = adjusted_rand_score(Clusterings[c1], Clusterings[c2])
    ari_scores.append([c1, c2, ari_score])
    nmi_score = normalized_mutual_info_score(Clusterings[c1], Clusterings[c2])
    nmi_scores.append([c1, c2, nmi_score])

# create dataframes to store the ARI and NMI scores
ari_df = pd.DataFrame(ari_scores, columns=['Clustering 1', 'Clustering 2', 'ARI'])
nmi_df = pd.DataFrame(nmi_scores, columns=['Clustering 1', 'Clustering 2', 'NMI'])

# Pivot to a (Clustering 1 x Clustering 2) grid. Only unordered pairs were
# scored, so cells without a score stay empty.
ari_pivot = ari_df.pivot(index='Clustering 1', columns='Clustering 2', values='ARI')
nmi_pivot = nmi_df.pivot(index='Clustering 1', columns='Clustering 2', values='NMI')

# plot the ARI and NMI heatmaps side-by-side
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.heatmap(ari_pivot, cmap='coolwarm', annot=True, fmt='.3f', vmin=-1, vmax=1, ax=ax[0])
sns.heatmap(nmi_pivot, cmap='coolwarm', annot=True, fmt='.3f', vmin=0, vmax=1, ax=ax[1])
ax[0].set_title('Adjusted Rand Index (ARI)')
ax[1].set_title('Normalized Mutual Information (NMI)')
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Full (symmetric) ARI matrix: every clustering column scored against every
# other column, including itself.
clustering_names = Clusterings.columns
correlation_matrix = pd.DataFrame(
    [
        [
            adjusted_rand_score(Clusterings.iloc[:, row_pos], Clusterings.iloc[:, col_pos])
            for col_pos in range(len(clustering_names))
        ]
        for row_pos in range(len(clustering_names))
    ],
    index=clustering_names,
    columns=clustering_names,
).astype(float)

# Heatmap of the pairwise ARI values.
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, vmin=-1, vmax=1)
plt.title('Similarity of Clustering Results')
plt.show()


In [None]:
import numpy as np

# Cluster labels 1..10, and their one-hot rows taken from a 10x10 identity
# matrix (label k selects row k-1).
clusters = np.arange(1, 11)
one_hot_vectors = np.take(np.eye(10), clusters - 1, axis=0)
clusters, one_hot_vectors

In [None]:
# One-hot rows for the H_Clust labels: label k selects row k-1 of a 10x10 identity matrix.
# assumes H_Clust labels lie in 1..10 -- TODO confirm
np.eye(10)[Clusterings["H_Clust"]-1]

In [None]:

from sklearn.metrics.pairwise import cosine_similarity

# One-hot encode every cluster-label column. `columns=` must be given
# explicitly: by default get_dummies only expands object/string/category
# columns and leaves integer label columns unchanged, so nothing was encoded.
# dtype=float yields a plain numeric indicator matrix.
one_hot_clusters = pd.get_dummies(Clusterings, columns=list(Clusterings.columns), dtype=float)

# Cosine similarity between all pairs of rows of the indicator matrix, i.e.
# between the entries of Clusterings' index rather than between clusters.
similarity_matrix = cosine_similarity(one_hot_clusters)


## Data download

In [None]:
# Directory prefix that the export cells prepend to every output file name.
# NOTE(review): hard-coded, user-specific location; consider a configurable path.
desktop_path = '~/Desktop/ML2_assignment_5_project/'
# Rebinds the name `datetime` to the class. An earlier cell does `import datetime`
# and calls datetime.date(...) on the module, so it needs its own import when re-run.
from datetime import datetime

In [None]:
# Timestamped CSV snapshots, written in this order:
# training data, test data, raw features, raw stocks, sector names, sector codes.
# Each file gets its own timestamp, taken just before it is written.
csv_exports = [
    ('time_series_train_data_{}.csv', classifying_data),
    ('time_series_test_data_{}.csv', test_data),
    ('time_series_df_raw_features_{}.csv', df_raw_features_slice),
    ('time_series_df_raw_stocks_{}.csv', result_df_clean),
    ('sector_classes_{}.csv', sector_classes),
    ('sector_classes_code_{}.csv', sector_classes_code),
]
for name_template, frame in csv_exports:
    file_name = name_template.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
    # Values are rounded to 5 decimals before writing.
    frame.round(5).to_csv(desktop_path + file_name, index_label='timestamp', date_format='%Y-%m-%d')


In [None]:
# Sector mapping codes, written to a timestamped CSV rounded to 5 decimals.
# NOTE(review): repeats the last export of the previous cell and produces a
# second file with a newer timestamp.
file_name = 'sector_classes_code_{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))
sector_classes_code.round(5).to_csv(desktop_path + file_name, index_label='timestamp', date_format='%Y-%m-%d')

In [None]:
# Stock time series: `target` rounded to 5 decimals and written to a fixed,
# non-timestamped file name (an existing file with that name is replaced).
file_name = 'final_stock_timeseries.csv'
target.round(5).to_csv(desktop_path + file_name, index_label='timestamp', date_format='%Y-%m-%d')

In [None]:
# Feature matrices, in the order used wherever feature_list_of_dfs is indexed.
feature_list_of_dfs = [scaled_betas, ranked_betas, scaled_correls, ranked_correls, mean_scaled, mean_ranked]

# Output file for each matrix, position by position.
matrix_file_names = [
    'final_scaled_betas.csv',
    'final_ranked_betas.csv',
    'final_scaled_correls.csv',
    'final_rank_correls.csv',
    'final_mean_scaled.csv',
    'final_mean_ranked.csv',
]
for file_name, feature_df in zip(matrix_file_names, feature_list_of_dfs):
    # Values are rounded to 3 decimals before writing.
    feature_df.round(3).to_csv(desktop_path + file_name, index_label='timestamp', date_format='%Y-%m-%d')


In [None]:
# Display the scaled betas rounded to 3 decimals (same rounding as the export above).
scaled_betas.round(3)

In [None]:
# NOTE(review): this cell reads as a planning sketch, not runnable code. It
# references names with no visible definition (data1..data6_PCA, pandas,
# cluster_array), misspells data_list as data_ist in the loop, and calls algo1
# before it is defined. Presumably it outlines the depth x dataset loop
# implemented in the "BM Clusterings" cell -- confirm before relying on it.

# Planned inputs: six feature matrices ...
data1
data2
data3
data4
data5
data6

# ... and their PCA-reduced counterparts.
data1_PCA
data2_PCA
data3_PCA
data4_PCA
data5_PCA
data6_PCA

# Cluster counts to evaluate.
cluster_depth = [11,25,67,126]
# The literal `...` is a placeholder for the inputs not spelled out.
data_list = [data1, data2, ..., data6_PCA]

result_df = pandas
for cd in cluster_depth:
    for data in data_ist:
        algo1(data, cd)
        # planned: add the returned labels to the result dataframe
        


# Placeholder signature for a clustering routine returning one label array.
def algo1(data, cluster_depth):
    return cluster_array

## results post clustering

In [None]:
# Load the saved cluster labels from the desktop project folder and use the
# first column of the sheet as the index.
# NOTE(review): hard-coded, user-specific paths.
path = r"~/Desktop/ML2_assignment_5_project/results_after_algos.xlsx"
results_after_algos = pd.read_excel(path)
results_after_algos = results_after_algos.set_index(results_after_algos.columns[0])

# Load the model description sheet; later cells read its first column as the
# column order for results_after_algos.
path = r"~/Desktop/ML2_assignment_5_project/results_after_algos_order.xlsx"
model_descr = pd.read_excel(path)


In [None]:
# First column of model_descr, kept as a one-column DataFrame.
new_order = model_descr.iloc[:,0:1]
# Display the full model description table.
model_descr

In [None]:
# Values of the single column of new_order as a flat list, used as column order.
flat_list = new_order.iloc[:, 0].tolist()
results_after_algos = results_after_algos.loc[:, flat_list]
results_after_algos

In [None]:
# Clustered heatmap of the label table using Ward linkage.
clustermap = sns.clustermap((results_after_algos), cmap='coolwarm', method='ward')

# Show the plot
plt.show()

In [None]:
# Score every model (one column of results_after_algos) on the test returns.
n_models = results_after_algos.shape[1]

results = []

# RuntimeWarnings are silenced only while the scores are being computed.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    for model_idx in range(n_models):
        # Labels of one model as a fresh 1-D array.
        model_labels = results_after_algos.iloc[:, model_idx].to_numpy(copy=True)
        raw_scores = calculate_avg_interclass_corr(
            class_stock_mapping=map_stocks_to_clusters(model_labels),
            stock_returns=test_data_ex_feats,
        )
        # Keep only the scores that are not NaN.
        results.append([score for score in raw_scores if not math.isnan(score)])

# Mean score per model, in column order.
result_means = [np.mean(model_scores) for model_scores in results]

In [None]:
# Attach the mean score of each model to its description row and sort ascending.
# result_means is attached by position, so this assumes the rows of model_descr
# are in the same order as the columns of results_after_algos -- TODO confirm.
final_results = (
    model_descr
    .assign(new_col=result_means)
    .rename(columns={'new_col': 'avg. cluster performance'})
    .sort_values(by='avg. cluster performance')
)
final_results

In [None]:
# Raw score values (not used by the plot below).
data = final_results["avg. cluster performance"].to_numpy()

# Switch seaborn to the dark grid style; this affects later figures as well.
sns.set_style('darkgrid')

# One bar per model, in the sorted order of final_results.
fig, ax = plt.subplots(figsize=(30, 8))
ax.bar(final_results['full_name'], final_results['avg. cluster performance'])
ax.set(
    xlabel='Model',
    ylabel='Average Cluster Performance',
    title='Results of Clustering Algorithms',
)

# Vertical tick labels so long model names stay readable.
plt.xticks(rotation=90)

plt.show()

In [None]:
import seaborn as sns

# Pivot the scores: rows are (type, depth) pairs, columns are (inputs, dims) pairs.
# pivot_table aggregates duplicates with its default (mean).
pivot_table = final_results.pivot_table(index=['type', 'depth'], columns=['inputs', 'dims'], values='avg. cluster performance')

# Heatmap of the pivoted scores, without annotations.
sns.heatmap(pivot_table, cmap='coolwarm')

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pivot the scores: rows are (type, depth) pairs, columns are (inputs, dims) pairs.
pivot_table = final_results.pivot_table(index=['type', 'depth'], columns=['inputs', 'dims'], values='avg. cluster performance')

# Add the row averages as a new column keyed ('row_avg', '').
pivot_table.loc[:, ('row_avg', '')] = pivot_table.mean(axis=1)

# Add the column averages as a new row keyed ('', 'col_avg'). This runs after
# row_avg was added, so the new row also holds the mean of the row_avg column.
pivot_table.loc[('', 'col_avg'), :] = pivot_table.mean(axis=0)



# Annotated heatmap including the average row and column.
sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt='.3f')

plt.show()
