In [None]:
import pandas as pd
import numpy as np
import os
import sys

# --- Project layout ---------------------------------------------------------
# All locations are derived from the parent of the current working directory,
# which this notebook treats as the project root.
current_working_dir = os.getcwd()
print(f"Current Working Directory: {current_working_dir}")
project_root = os.path.split(current_working_dir)[0]

# Input data file under <project_root>/Data.
# TODO: confirm whether this file holds log returns or simple returns.
data_file_name = "OPCL_20000103_20201231.csv"
data_folder_path = os.path.join(project_root, 'Data')
data_file_path = os.path.join(project_root, 'Data', data_file_name)

# Expose <project_root>/Modules to the import system (only once, so re-running
# the cell does not add duplicate entries).
modules_path = os.path.join(project_root, 'Modules')
if sys.path.count(modules_path) == 0:
    sys.path.append(modules_path)
    print(f"Added to sys.path for custom modules: {modules_path}")

Current Working Directory: n:\GitHub\ICAIF_25\New Code\Script
Added to sys.path for custom modules: n:\GitHub\ICAIF_25\New Code\Modules


In [None]:
# Standard-library imports come first so they are available even if one of the
# third-party / project imports below fails part-way through this cell.
import importlib
import multiprocessing
import subprocess
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore", category=UserWarning, module='statsmodels')
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Make sure signet is importable BEFORE importing parallelized_runs: per the
# original author's note, importing parallelized_runs already requires the
# signet package, so the install fallback has to run first to be of any use.
try:
    from signet.cluster import Cluster
except ImportError:
    print("Signet package not found. Attempting to install from GitHub...")
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "git+https://github.com/alan-turing-institute/SigNet.git"]
        )
        # A package installed while the interpreter is running may not be
        # visible to the import system until its finder caches are refreshed.
        importlib.invalidate_caches()
        from signet.cluster import Cluster
        print("Signet package installed successfully.")
    except Exception as e:
        # Best effort only: report the problem and let the import below fail
        # with its own error if signet is really required.
        print(f"Error installing Signet package: {e}")
        print("Please install it manually: pip install git+https://github.com/alan-turing-institute/SigNet.git")

# Project module (found via the sys.path entry added in the first cell).
from parallelized_runs import run_sliding_window_var_evaluation_vectorized

In [None]:
# Train-split experiment: load and clean the returns file, split it into
# train/test, run the sliding-window grid search on the train split and plot
# the resulting PnL distribution plus a sample of forecasts vs. actuals.

# Per the multiprocessing docs this only matters for frozen Windows
# executables; it is kept because the evaluation below runs in parallel.
multiprocessing.freeze_support()

# ------------------------------------------------------------------ load data
df = pd.read_csv(data_file_path).set_index('ticker')  # Assumes file exists and is readable

# Column labels look like 'X20000103'; strip the prefix and normalise to ISO dates.
df.columns = pd.to_datetime(df.columns.str.lstrip('X'), format='%Y%m%d').strftime('%Y-%m-%d')
df_cleaned = df.dropna().transpose()  # Assumes dropna results in non-empty df
# Numeric (float) time index. Going through int64 yields the same nanosecond
# values as a direct float cast, but also works on pandas versions that refuse
# to cast a DatetimeIndex straight to float.
df_cleaned.index = pd.to_datetime(df_cleaned.index).astype('int64').astype(float)
print("Data loaded and cleaned. Sample (first 5 rows/cols):")
print(df_cleaned.iloc[0:5,0:5])
print(f"Shape of the cleaned data: {df_cleaned.shape}")

# ----------------------------------------------------------- train/test split
train_data_ratio = 0.8
# The split position must be an integer: iloc rejects float slice bounds.
split_index = int(train_data_ratio * len(df_cleaned))
df_train_dataset = df_cleaned.iloc[:split_index]
df_test_dataset = df_cleaned.iloc[split_index:]

##################################################################### PARAMETERS #####################################################################
initial_lookback_len = 252
evaluation_len = 20 # TODO: open question whether this should be 252
# num_clusters_config = [50, 100, 150]
num_clusters_config = [5, 10, 15, 20]
cluster_method_config = 'SPONGE'
sigma_config = 0.01 # Hyperparameter for SPONGE. Usual: 0.01 (less sparse), 0.1 (more sparse)
num_windows_config = 20 # Kept for other cells; the train run below uses train_num_windows
train_num_windows = 96 # Number of sliding windows evaluated on the train split
repetitions = 3 # This should not be "multiplied" for the number of runs, if cluster initialization is to be refreshed every window.
var_orders_config = [5, 7, 9] # Ensure initial_lookback_len - evaluation_len > max(var_orders_config)
####################################################################################################################################################
# Sanity checks against the data set and window count that are actually used below.
if not (initial_lookback_len - evaluation_len > max(var_orders_config)):
    raise ValueError("Insufficient lookback length for hyperparameter evaluation based on var_orders_config and evaluation_len.")
if not (df_train_dataset.shape[0] >= train_num_windows * evaluation_len + initial_lookback_len):
    raise ValueError("Insufficient total data for the specified number of windows, lookback, and evaluation lengths.")


all_lags_combined_pnl = []
sample_forecast_details = {}


print(f"\n===== Running Parallelized Grid Search Evaluation (Simplified Edges) =====")
results_dict = run_sliding_window_var_evaluation_vectorized(
    asset_returns_df=df_train_dataset,
    initial_lookback_len=initial_lookback_len,
    eval_len=evaluation_len,
    repetitions=repetitions,
    n_clusters_config=num_clusters_config,
    cluster_method=cluster_method_config,
    var_order_config=var_orders_config,
    sigma_intra_cluster=sigma_config,
    num_windows_config=train_num_windows,
    store_sample_forecasts=True,
    run_naive_var_comparison=True,

    max_threads=8
)

all_lags_combined_pnl.extend(results_dict['cluster_avg_pnl_list'])
if 'naive_avg_pnl_list' in results_dict: # Still check as it's optional
    all_lags_combined_pnl.extend(results_dict['naive_avg_pnl_list'])

sample_forecast_details['forecast'] = results_dict.get('sample_forecast_cluster')
sample_forecast_details['actual'] = results_dict.get('sample_actual_cluster')
sample_forecast_details['window_idx'] = results_dict.get('sample_window_idx_cluster')
sample_forecast_details['method'] = 'Clustered VAR (Parallel Grid Search, Simplified)'

# Fail with a clear message instead of an obscure KeyError from pivot_table.
if not all_lags_combined_pnl:
    raise ValueError("The evaluation returned no PnL records; nothing to summarise or plot.")
df_all_pnl_by_lag_method = pd.DataFrame(all_lags_combined_pnl)

print("\n--- Grid Search Completed (Parallelized, Simplified) ---")
print("\nAverage Window PNL per Selected Lag Order and Method:")
# Break the summary down by cluster count only when clustered results exist.
pivot_index_cols = ['VAR_Order']
if 'N_Clusters' in df_all_pnl_by_lag_method.columns and \
    any(item['Method'] == 'Clustered VAR' for item in all_lags_combined_pnl if 'Method' in item):
    pivot_index_cols.append('N_Clusters')

avg_pnl_pivot = df_all_pnl_by_lag_method.pivot_table(
    index=pivot_index_cols,
    columns='Method',
    values='Avg_Window_PNL',
    aggfunc='mean'
)
print(avg_pnl_pivot)

# ------------------------------------------------------ PnL distribution plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(x='VAR_Order', y='Avg_Window_PNL', hue='Method', data=df_all_pnl_by_lag_method, ax=ax)
ax.set_title(f'Distribution of Average Window PNL by Selected VAR Lag Order and Method')
ax.set_xlabel('Selected VAR Lag Order for Window')
ax.set_ylabel(f'Average Window PNL')
ax.legend(title='Forecast Method')
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# --------------------------------------------- sample forecast vs. actual plot
sample_forecast = sample_forecast_details['forecast']
sample_actual = sample_forecast_details['actual']
stored_window_idx = sample_forecast_details.get('window_idx')
method_name = sample_forecast_details.get('method')

# Human-readable (1-based) window label; numpy integers count as integers too.
if isinstance(stored_window_idx, (int, np.integer)):
    window_label = stored_window_idx + 1
else:
    window_label = stored_window_idx

title_var_order, title_n_clusters = "N/A", "N/A" # Keep N/A for title if not found
sample_var_order = None  # integer lag order of the sample window, when known
if {'Window_ID', 'Method'} <= set(df_all_pnl_by_lag_method.columns):
    sample_window_data_rows = df_all_pnl_by_lag_method[
        (df_all_pnl_by_lag_method['Window_ID'] == stored_window_idx) &
        (df_all_pnl_by_lag_method['Method'] == 'Clustered VAR')
    ]
    if not sample_window_data_rows.empty:
        row = sample_window_data_rows.iloc[0]
        title_var_order = row['VAR_Order']
        if pd.notna(title_var_order):
            # A row pulled from a mixed-dtype frame may carry a float here;
            # slicing below needs a real integer.
            sample_var_order = int(title_var_order)
            title_var_order = sample_var_order
        if 'N_Clusters' in row.index:
            title_n_clusters = row['N_Clusters']

print(f"\n--- Plotting Predictions vs. Actuals for Sample Window {window_label} ({method_name}) ---")
print(f"Params for this sample: VAR Lag={title_var_order}, N_Clusters={title_n_clusters}")

if sample_forecast is None or sample_actual is None:
    print("No sample forecast/actual data was returned; skipping prediction plots.")
else:
    if sample_var_order is None:
        # Without the lag order the actual series cannot be shifted as usual.
        print("VAR lag order for the sample window is unknown; plotting actuals without lag offset.")
        actual_offset = 0
    else:
        # Same offset as the original notebook: skip the first (lag - 1) actual rows.
        actual_offset = max(sample_var_order - 1, 0)

    num_items_to_plot = min(3, sample_forecast.shape[1], sample_actual.shape[1])
    for i in range(num_items_to_plot):
        item_name = sample_forecast.columns[i]
        if item_name not in sample_actual.columns:
            print(f"Skipping {item_name}: not present in the actual sample data.")
            continue

        actual_plot_data = sample_actual[item_name].values[actual_offset:]
        forecast_plot_data = sample_forecast[item_name].values
        # Plot only the overlapping part of the two series.
        min_plot_len = min(len(actual_plot_data), len(forecast_plot_data))

        fig, ax = plt.subplots(figsize=(14, 7))
        sns.lineplot(data=actual_plot_data[:min_plot_len], label=f'Actual - {item_name}', marker='o', linestyle='-', ax=ax)
        sns.lineplot(data=forecast_plot_data[:min_plot_len], label=f'Forecast - {item_name}', marker='x', linestyle='--', ax=ax)
        ax.set_title(f'Prediction vs. Actual for {item_name} (Window {window_label}, VAR Lag {title_var_order}, N_Clust {title_n_clusters})')
        ax.set_xlabel('Forecast Step')
        ax.set_ylabel('Return Value (Cluster)')
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()

NameError: name 'multiprocessing' is not defined

In [None]:
# Out-of-sample run: repeat the sliding-window evaluation on the held-out test
# split with the same hyperparameter grid as the train run.
# NOTE: this overwrites `results_dict` from the train cell.

# NOTE(review): the requested window count below is the original expression; it
# subtracts a number of rows from a number of rows, so it is unclear that it was
# meant as a window count - confirm the intended value.
requested_test_windows = 20 * evaluation_len - initial_lookback_len

# Largest window count the test split can hold, using the same requirement as
# the notebook's sanity check: rows >= num_windows * eval_len + lookback.
feasible_test_windows = (len(df_test_dataset) - initial_lookback_len) // evaluation_len
test_num_windows = min(requested_test_windows, feasible_test_windows)
if test_num_windows < 1:
    raise ValueError(
        "Insufficient test data for even one evaluation window "
        f"(rows={len(df_test_dataset)}, lookback={initial_lookback_len}, eval_len={evaluation_len})."
    )
if test_num_windows < requested_test_windows:
    print(f"Requested {requested_test_windows} test windows, but only {test_num_windows} fit in the test split; using {test_num_windows}.")

print(f"\n===== Running Parallelized Grid Search Evaluation (Simplified Edges) =====")
results_dict = run_sliding_window_var_evaluation_vectorized(
    asset_returns_df=df_test_dataset,
    initial_lookback_len=initial_lookback_len,
    eval_len=evaluation_len,
    repetitions=repetitions,
    n_clusters_config=num_clusters_config,
    cluster_method=cluster_method_config,
    var_order_config=var_orders_config,
    sigma_intra_cluster=sigma_config,
    num_windows_config=test_num_windows,
    store_sample_forecasts=True,
    run_naive_var_comparison=True,

    max_threads=8
)

In [None]:
# Show which result collections the most recent evaluation returned.
result_keys = results_dict.keys()
print(result_keys)

# Display the stored sample of actual cluster returns as the cell output
# (use the 'sample_forecast_cluster' key instead to view the forecasts).
sample_actual_cluster = results_dict['sample_actual_cluster']
sample_actual_cluster


dict_keys(['cluster_avg_pnl_list', 'sample_forecast_cluster', 'sample_actual_cluster', 'sample_window_idx_cluster', 'naive_avg_pnl_list'])


Unnamed: 0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20
0,0.008645178,0.01003,-0.005038,0.000977,-0.000418,0.007577,0.005224,0.005244,-0.000455,-0.002318,-0.011875,-0.009528,-0.04406,-0.004329,0.001288,,0.007609,-0.014493,-0.001773,-0.018238
1,-0.003624925,-0.031505,-0.008396,-0.014764,-0.001101,-0.006242,0.006646,-0.005214,-0.007093,-0.006097,-0.025695,0.009553,0.008482,0.003088,-9e-06,,0.008927,-0.008558,0.002027,-0.013505
2,0.004347631,-0.030872,0.005072,0.002966,-0.000289,-0.018778,0.002055,0.011675,0.005217,-0.003047,-0.017271,0.010292,0.025946,-0.027081,0.008618,,-0.014234,0.001894,0.000648,0.013407
3,0.0007221055,-0.019112,0.003361,-0.027399,-0.000352,-0.032529,-0.000734,0.006444,-0.000318,-0.012298,-0.004698,-0.005965,-0.013448,-0.008734,0.006642,,-0.025562,-0.005352,4.3e-05,0.006028
4,0.002879745,-0.02697,0.003384,-0.022002,0.000241,-0.023993,-0.003348,0.02207,-0.002187,-0.00308,-0.020549,-0.006809,-0.05657,-0.0479,0.004012,,-0.001325,-0.010279,0.000593,0.015523
5,-8.924309e-08,-0.026102,0.0,0.012685,0.006848,-0.022934,-0.000658,-0.046867,-0.00071,0.005399,-0.034683,0.003384,-0.038466,-0.065502,-0.003363,,-0.031898,-0.016831,0.001487,0.002367
6,0.004328607,-0.018059,-0.005063,0.001569,0.006529,-0.032003,0.006522,-0.025638,-0.007664,-0.006224,-0.035389,-1.257255e-11,-0.107849,-0.020001,0.002046,,-0.021644,-0.025816,0.000279,-0.017741
7,-0.00505977,0.091874,0.001688,0.062585,-0.01617,0.080888,-0.003979,-0.013438,0.004845,-0.00929,0.018742,0.008333,0.153464,0.084474,-0.013318,,0.001772,0.015602,-0.000996,0.003599
8,1.60073e-07,-0.012606,-0.011775,0.01335,0.013528,-0.003268,-0.000711,0.016783,0.005174,-0.003905,0.008386,-0.024795,-0.031319,0.000952,-0.001975,,0.024142,-0.00446,-0.000726,-0.014546
9,-0.0007225496,0.017005,0.0,0.009334,1.8e-05,0.011125,-0.000607,0.030429,0.001709,0.000786,0.003357,0.009909,-0.021822,0.01848,-0.001995,,0.004818,0.001849,0.000126,0.006057
