## Import data

In [36]:
### Data Processing

## Ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Helper imports
import numpy as np
import pandas as pd
import os
import sys

from functools import reduce
import matplotlib.pyplot as plt

%matplotlib inline

# Resolve the project layout relative to the notebook's working directory:
# the project root is taken to be the parent of the current directory.
current_working_dir = os.getcwd()
print(f"Current Working Directory: {current_working_dir}")
project_root = os.path.dirname(current_working_dir)

# Make the project's custom modules importable (appended to sys.path only once,
# so re-running the cell does not add duplicates).
modules_path = os.path.join(project_root, 'Modules')
if modules_path not in sys.path:
    sys.path.append(modules_path)
    print(f"Added to sys.path for custom modules: {modules_path}")

####################################################################
#### NYSE Daily Open-Close Returns
####################################################################
data_folder_path = os.path.join(project_root, 'Data')
data_file_name = "OPCL_20000103_20201231.csv"
# Full path to the CSV file itself (not just the folder that contains it).
data_file_path = os.path.join(data_folder_path, data_file_name)

# Load the raw table indexed by ticker. No existence/readability check is done.
returns_df = pd.read_csv(data_file_path).set_index('ticker')

# Column headers are dates written as YYYYMMDD with an optional leading 'X';
# normalise them to 'YYYY-MM-DD' strings.
column_dates = pd.to_datetime(returns_df.columns.str.lstrip('X'), format='%Y%m%d')
returns_df.columns = column_dates.strftime('%Y-%m-%d')

# Drop every ticker that has at least one missing value, then transpose so that
# rows are dates and columns are tickers. Assumes the result is non-empty.
returns_df_cleaned = returns_df.dropna().T
returns_df_cleaned.index = pd.to_datetime(returns_df_cleaned.index).rename('date')

print("Data loaded and cleaned. Sample (first 5 rows/cols):")
print(returns_df_cleaned.iloc[:5, :5])
print(f"Shape of the cleaned data: {returns_df_cleaned.shape}")


Current Working Directory: n:\GitHub\ICAIF_25\Current_Code\Script
Data loaded and cleaned. Sample (first 5 rows/cols):
ticker            AA       ABM       ABT       ADI       ADM
date                                                        
2000-01-03 -0.013042 -0.009188 -0.007117 -0.036071  0.000000
2000-01-04  0.010043  0.012346 -0.012786 -0.044261  0.005277
2000-01-05  0.047628 -0.006192  0.011111  0.014493 -0.015915
2000-01-06 -0.011713  0.000000  0.032553 -0.027719  0.010695
2000-01-07 -0.016118  0.003091  0.028573  0.033654  0.005249
Shape of the cleaned data: (5279, 663)


In [2]:
# print(returns_df_cleaned)
# print(returns_df_cleaned[:50])

# Clustering Visualization

In [22]:
from signet.cluster import Cluster
from scipy import sparse

from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix


def calculate_weighted_cluster_portfolio_returns(asset_returns_df, n_clusters_to_form=10, 
                                                 sigma_for_gaussian_weights=.05) -> np.ndarray:
    """
    Cluster assets by the signed Pearson correlation of their returns (SPONGE_sym).

    Despite its name, this function does NOT compute weighted portfolio returns:
    it only returns the cluster label assigned to each asset. The name and the
    ``sigma_for_gaussian_weights`` parameter are kept for backward compatibility
    with existing callers.

    NOTE: This version has no error handling and assumes perfect input data.

    Args:
        asset_returns_df (pd.DataFrame): DataFrame of asset returns.
            Rows are timestamps, columns are asset tickers.
        n_clusters_to_form (int): The desired number of clusters to form.
            Capped at the number of assets (columns).
        sigma_for_gaussian_weights (float): Currently unused; accepted only so
            that existing call sites keep working.

    Returns:
        np.ndarray: 1-D array with one cluster label per asset, in the column
            order of ``asset_returns_df`` (labels are whatever
            ``Cluster.SPONGE_sym`` returns).
    """
    # --- 1. Correlation matrix ---
    # Pearson correlation is invariant to scaling, so no prior standardisation
    # is needed. Undefined correlations (NaN, e.g. from a constant column) are
    # treated as "no relationship".
    correlation_matrix_df = asset_returns_df.corr(method='pearson').fillna(0)

    # --- 2. Signed-graph input for SIGNET ---
    # Split the correlations into a non-negative "positive" part and a
    # non-negative "negative" part (magnitudes of the negative correlations).
    corr_values = correlation_matrix_df.values
    pos_corr = np.maximum(corr_values, 0)
    neg_corr = np.maximum(-corr_values, 0)
    signet_data = (sparse.csc_matrix(pos_corr), sparse.csc_matrix(neg_corr))

    # Never ask for more clusters than there are assets.
    num_assets = correlation_matrix_df.shape[0]
    effective_n_clusters = min(n_clusters_to_form, num_assets)

    # 'Cluster' is the external signet class exposing 'SPONGE_sym'.
    cluster_obj = Cluster(signet_data)
    labels = cluster_obj.SPONGE_sym(effective_n_clusters)

    return np.array(labels)

def compute_clusters_over_time(returns_df_cleaned, num_days_to_cluster, lookback_period=252*4, 
                               n_clusters_to_form=10, sigma_for_gaussian_weights=.05):
    """
    Cluster assets on consecutive rolling windows and align labels across days.

    Window ``day`` covers rows ``[day, day + lookback_period)`` of
    ``returns_df_cleaned`` (positional slicing). Cluster ids produced by the
    clustering step are arbitrary, so from the second window onward the new
    labels are renamed to maximise the overlap with the previous (already
    aligned) window, by solving a linear assignment problem on the confusion
    matrix of the two labelings.

    Args:
        returns_df_cleaned (pd.DataFrame): Returns, rows are dates and columns
            are assets.
        num_days_to_cluster (int): Number of consecutive windows to cluster.
        lookback_period (int): Number of rows in each window.
        n_clusters_to_form (int): Number of clusters requested per window.
        sigma_for_gaussian_weights (float): Forwarded unchanged to
            ``calculate_weighted_cluster_portfolio_returns``.

    Returns:
        np.ndarray: Array of shape ``(num_days_to_cluster, n_assets)`` holding
            the aligned cluster label of every asset for every window.
    """
    labels_list = []
    for day in range(num_days_to_cluster):
        # Explicit positional slice of the rows belonging to this window.
        lookback_window = returns_df_cleaned.iloc[day:day + lookback_period]
        labels_unaligned = np.asarray(
            calculate_weighted_cluster_portfolio_returns(
                lookback_window, n_clusters_to_form, sigma_for_gaussian_weights
            )
        )

        # IZ: Find the optimal label alignment by solving the hungarian matching problem
        if not labels_list:
            # For the first day, the initial labels are fine
            labels_aligned = labels_unaligned
        else:
            # For the second day onward, align the current labels with the most recent day
            labels_last = labels_list[-1]

            # Build the confusion matrix over an explicit, fixed label range.
            # Without `labels=`, sklearn indexes rows/columns by the sorted
            # union of labels that actually occur, so a cluster id missing from
            # either labeling would make the matrix indices differ from the
            # label values and corrupt the mapping below.
            n_labels = int(max(n_clusters_to_form,
                               labels_last.max() + 1,
                               labels_unaligned.max() + 1))
            conf_mat = confusion_matrix(labels_last, labels_unaligned,
                                        labels=np.arange(n_labels))

            # Maximise total overlap (hence the negation) between old and new labels.
            row_ind, col_ind = linear_sum_assignment(-conf_mat)

            # relabel[new_label] = matched old label. The matrix is square, so
            # this is a full permutation and every label has a target.
            relabel = np.empty(n_labels, dtype=labels_unaligned.dtype)
            relabel[col_ind] = row_ind
            labels_aligned = relabel[labels_unaligned]

        labels_list.append(labels_aligned)

    labels_arr = np.stack(labels_list, axis=0)
    return labels_arr


# Cluster the first 20 rolling windows (each 252*4 rows long), requesting 10 clusters per window.
labels = compute_clusters_over_time(
    returns_df_cleaned,
    num_days_to_cluster=20,
    lookback_period=252*4,
    n_clusters_to_form=10,
    sigma_for_gaussian_weights=.05,
)

In [28]:
import numpy as np
import plotly.graph_objects as go

# Cluster labels of the first ten windows: one row per day, one column per asset.
assignments = labels[:10]

unique_clusters = np.unique(assignments)
cluster_to_index = {c: i for i, c in enumerate(unique_clusters)}

# One Sankey node per (day, cluster) pair, numbered day by day.
# `node_lookup` maps (day, cluster) -> position of that node in `nodes`.
nodes = []
node_lookup = {}
for day in range(assignments.shape[0]):
    for cluster in unique_clusters:
        node_lookup[(day, cluster)] = len(nodes)
        nodes.append(f"Cluster {cluster}")

# Links between consecutive days: the value of a link is the number of assets
# that sit in `cluster_from` on one day and in `cluster_to` on the next.
source, target, value = [], [], []
for day, (curr_clusters, next_clusters) in enumerate(zip(assignments[:-1], assignments[1:])):
    for cluster_from in unique_clusters:
        leaving = curr_clusters == cluster_from
        for cluster_to in unique_clusters:
            count = np.sum(leaving & (next_clusters == cluster_to))
            if count == 0:
                # No asset makes this transition, so no link is drawn.
                continue
            source.append(node_lookup[(day, cluster_from)])
            target.append(node_lookup[(day + 1, cluster_to)])
            value.append(count)

# Assemble the Sankey diagram from the nodes and links above.
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,
)
sankey_links = dict(source=source, target=target, value=value)
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Day captions: evenly spaced along the x axis (paper coordinates), placed
# slightly below the plotting area.
num_days = assignments.shape[0]
x_positions = np.linspace(0, 1, num_days)
annotations = [
    dict(
        x=x,
        y=-0.1,
        xref='paper',
        yref='paper',
        text=f"Day {i}",
        showarrow=False,
        font=dict(size=12),
    )
    for i, x in enumerate(x_positions)
]

fig.update_layout(
    title_text="Cluster Assignment Transitions Over Time",
    font_size=10,
    annotations=annotations,
)

fig.show()


# Sector Name Lookup

In [46]:
# import os
# os.path.abspath(os.getcwd())

# Load the sector table. The file's own header row is replaced by the column
# names given here. A forward-slash relative path is used (as for the export
# further down) so the cell runs on Windows and on POSIX systems alike; the
# previous backslash path only resolved on Windows.
sectors_df = pd.read_csv(
    "../Data/Sectors_SP1500.csv",
    header=0,
    names=["ETF_ticker", "Sector_code", "Individual_ticker", "Sector_name"],
)
print(sectors_df.head())

# Distinct (ETF ticker, sector code, sector name) combinations in the file.
all_mappings = sectors_df[["ETF_ticker", "Sector_code", "Sector_name"]].drop_duplicates()

# Count unique ETF_ticker values
num_unique_etf = sectors_df["ETF_ticker"].nunique()

# If there are exactly as many distinct combinations as distinct ETF tickers,
# every ETF ticker appears with a single sector code and sector name.
if len(all_mappings) == num_unique_etf:
    print("Perfect agreement between ETF_ticker, Sector_code, and Sector_name.")
else:
    print("Inconsistencies found in the mappings.")

print(sectors_df["Sector_name"].unique())

# Ticker -> sector code. If a ticker occurs more than once, its last row wins.
ticker_to_sector = dict(zip(sectors_df["Individual_ticker"], sectors_df["Sector_code"]))
print(f"SP1500 sectors dataset has {len(ticker_to_sector)} individual companies in it")

# Tickers present in the returns data but absent from the sector table.
dataset_tickers = returns_df_cleaned.columns.to_list()
missing = [s for s in dataset_tickers if s not in ticker_to_sector]
missing_count = len(missing)
print(f"There are {missing_count} tickers in the time series not found in the SP1500 Sectors dataset out of {len(dataset_tickers)} total")

# Convert to DataFrame
lookup_df = pd.DataFrame(list(ticker_to_sector.items()), columns=["Ticker", "Sector_Code"])

# Export to CSV
lookup_df.to_csv("../Data/ticker_sector_lookup.csv", index=False)

print("Lookup table exported to ticker_sector_lookup.csv")


  ETF_ticker  Sector_code Individual_ticker             Sector_name
0        XLK            1                 A  Information_Technology
1        XLB            2                AA               Materials
2        XLY            3               AAN  Consumer_Discretionary
3        XLI            4              AAON             Industrials
4        XLY            3               AAP  Consumer_Discretionary
Perfect agreement between ETF_ticker, Sector_code, and Sector_name.
['Information_Technology' 'Materials' 'Consumer_Discretionary'
 'Industrials' 'Financials' 'Health_Care' 'Energy' 'Consumer_Staples'
 'Utilities' 'Telecommunications_Services']
SP1500 sectors dataset has 1459 individual companies in it
There are 233 tickers in the time series not found in the SP1500 Sectors dataset out of 663 total
Lookup table exported to ticker_sector_lookup.csv


Perfect agreement between ETF_ticker, Sector_code, and Sector_name.


In [None]:
# Try to pull up-to-date sector codes from yfinance

# import yfinance as yf
# import time
# import math

# tickers = returns_df_cleaned.columns.to_list()

# yfinance_data = pd.DataFrame(columns=["ticker", "sector", "industry"])

# period=50
# for i in range(math.ceil(len(tickers)/period)):
#     time.sleep(1)
#     tickers_sublist = tickers[i*period:(i+1)*period]
#     tickers_sublist_str = " ".join(tickers_sublist)
#     t = yf.Tickers(tickers_sublist_str)
#     for sym in tickers_sublist:
#         sector = t.tickers[sym].info.get('sector')
#         industry = t.tickers[sym].info.get('industry')
#         pd.concat([yfinance_data, pd.Series([sym, sector, industry])], ignore_index=True)
#         print(f"Draw {i}: {sym}: Sector = {sector}, Industry = {industry}")

SyntaxError: incomplete input (2079608571.py, line 20)