In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import multiprocessing as mp

data_dir = 'Data/PriceData'

# Load every Parquet file in data_dir into a {ticker: DataFrame} mapping.
data_frames = {}
# os.listdir returns names in arbitrary order; sorting keeps the ticker order
# (and everything derived from the dict's insertion order) reproducible.
files = sorted(file for file in os.listdir(data_dir) if file.endswith('.parquet'))
for file in tqdm(files, desc="Loading Parquet files"):
    # Strip only the final extension so tickers that themselves contain a dot
    # (e.g. "BRK.B.parquet") keep their full symbol instead of being cut at
    # the first dot and overwriting each other.
    ticker = os.path.splitext(file)[0]
    data_frames[ticker] = pd.read_parquet(os.path.join(data_dir, file))

def calculate_returns(df):
    """Add a 'Return' column holding the row-over-row change of 'Close'.

    The frame is modified in place and the same object is returned.
    Missing closes are not forward-filled before the change is computed
    (``fill_method=None``), so a gap yields NaN returns around it.

    Args:
        df: DataFrame with a 'Close' column.

    Returns:
        The same ``df`` object, now carrying a 'Return' column.
    """
    close_prices = df['Close']
    simple_returns = close_prices.pct_change(fill_method=None)
    df['Return'] = simple_returns
    return df

def resample_and_calculate_returns(df, frequency):
    """Resample to ``frequency`` and compute period-over-period 'Close' returns.

    Each resampling bin is reduced with ``.last()``; the 'Return' column of
    the result is the change in 'Close' between consecutive bins, computed
    without forward-filling. The input frame is left untouched.

    Args:
        df: DataFrame with a 'Close' column and an index that supports
            ``resample``.
        frequency: Resampling rule passed to ``DataFrame.resample``.

    Returns:
        A new DataFrame with one row per bin and a 'Return' column.
    """
    period_last = df.resample(frequency).last()
    period_returns = period_last['Close'].pct_change(fill_method=None)
    return period_last.assign(Return=period_returns)

# Attach a daily 'Return' column to every ticker's price frame.
for ticker, df in tqdm(data_frames.items(), desc="Calculating Returns"):
    data_frames[ticker] = calculate_returns(df)


def _returns_by_ticker(frequency=None):
    """Collect each ticker's 'Return' series into one wide DataFrame.

    With ``frequency=None`` the already-computed 'Return' column is used;
    otherwise the prices are resampled to ``frequency`` first. Rows in which
    every ticker is NaN are dropped.
    """
    columns = {}
    for symbol, prices in data_frames.items():
        source = prices if frequency is None else resample_and_calculate_returns(prices, frequency)
        columns[symbol] = source['Return']
    return pd.DataFrame(columns).dropna(how='all')


# Calculate returns for different timeframes
daily_returns_df = _returns_by_ticker()
weekly_returns_df = _returns_by_ticker('W')
monthly_returns_df = _returns_by_ticker('ME')

# Calculate correlation matrices for different timeframes
daily_correlation_matrix = daily_returns_df.corr()
weekly_correlation_matrix = weekly_returns_df.corr()
monthly_correlation_matrix = monthly_returns_df.corr()





#===============================[ Clustering TimeFrame Weighting ]==================================#
# Share of each return horizon in the blended correlation matrix.
weights = {'daily': 0.65, 'weekly': 0.25, 'monthly': 0.10}

# Scale each horizon's matrix by its weight, then sum daily + weekly + monthly.
daily_component = daily_correlation_matrix.mul(weights['daily'])
weekly_component = weekly_correlation_matrix.mul(weights['weekly'])
monthly_component = monthly_correlation_matrix.mul(weights['monthly'])
weighted_correlation_matrix = daily_component.add(weekly_component).add(monthly_component)





# Handle missing values and scale the weighted correlation matrix:
# gaps are filled with the column mean, then the result is standardised.
imputer = SimpleImputer(strategy='mean')
imputer.fit(weighted_correlation_matrix)
imputed_correlation = imputer.transform(weighted_correlation_matrix)

scaler = StandardScaler()
scaler.fit(imputed_correlation)
scaled_correlation = scaler.transform(imputed_correlation)

# Perform clustering: fit a 6-cluster KMeans with a fixed seed and read the
# per-row cluster labels off the fitted model.
kmeans = KMeans(n_clusters=6, random_state=0).fit(scaled_correlation)
clusters = kmeans.labels_

# Weighted correlation matrix plus a 'Cluster' column with each row's label.
correlation_matrix_df = weighted_correlation_matrix.assign(Cluster=clusters)

# Elbow Method and Silhouette Scores to determine the optimal number of clusters.
# Both diagnostics come from a single fit per k: the model is created with the
# same parameters and seed either way, so fitting once and reading both the
# inertia and the labels avoids repeating every KMeans fit.
inertia = []
silhouette_scores = []
for k in tqdm(range(2, 20), desc="Elbow Method / Silhouette Score"):
    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(scaled_correlation)
    # Within-cluster sum of squares for the elbow curve.
    inertia.append(kmeans.inertia_)
    # Mean silhouette coefficient of this labelling.
    silhouette_scores.append(silhouette_score(scaled_correlation, labels))

# Calculate group correlations
# Start from a two-column (Ticker, Cluster) table: the matrix index becomes
# the 'Ticker' column and the rows get a fresh integer index.
clustered_assets = correlation_matrix_df[['Cluster']].rename_axis('Ticker').reset_index()

def calculate_group_correlations(ticker):
    """Average one ticker's weighted correlation against each cluster's members.

    Reads the module-level ``clustered_assets`` and
    ``weighted_correlation_matrix``. The ticker itself is not excluded from
    the average of the cluster it belongs to.

    Args:
        ticker: Row label to look up in ``weighted_correlation_matrix``.

    Returns:
        A ``(ticker, group_correlations)`` tuple, where ``group_correlations``
        maps ``'correlation_<cluster>'`` to the mean correlation with that
        cluster's tickers; clusters are visited in ascending label order.
    """
    cluster_labels = clustered_assets['Cluster']
    ticker_row = weighted_correlation_matrix.loc[ticker]
    group_correlations = {
        f'correlation_{group}': ticker_row.loc[
            clustered_assets.loc[cluster_labels == group, 'Ticker']
        ].mean()
        for group in sorted(cluster_labels.unique())
    }
    return ticker, group_correlations

# One 'correlation_<cluster>' column per cluster label, in ascending label
# order, pre-filled with NaN until the per-ticker results are written back.
group_corr_columns = [f'correlation_{group}' for group in sorted(clustered_assets['Cluster'].unique())]
for col in group_corr_columns:
    clustered_assets[col] = np.nan

# NOTE(review): only the code below is guarded; the module-level work above
# runs on import, so under the 'spawn' start method each worker process would
# presumably repeat it. Consider moving that work under the guard as well.
if __name__ == "__main__":
    tickers = clustered_assets['Ticker'].unique()

    # Compute every ticker's per-cluster mean correlation in a process pool
    # sized to the CPU count.
    with mp.Pool(mp.cpu_count()) as pool:
        results = list(tqdm(pool.imap(calculate_group_correlations, tickers), total=len(tickers), desc="Calculating Group Correlations"))

    # Turn the (ticker, {column: value}) pairs into a ticker-indexed frame and
    # write it back in a single aligned assignment, instead of scanning the
    # whole 'Ticker' column once per (ticker, column) pair.
    group_corr_df = pd.DataFrame.from_dict(dict(results), orient='index')
    aligned = group_corr_df.reindex(index=clustered_assets['Ticker'], columns=group_corr_columns)
    clustered_assets[group_corr_columns] = aligned.to_numpy()

    # Per cluster: average each correlation_* column over the cluster's
    # members, then average across those columns.
    # NOTE(review): this averages over ALL correlation_* columns, not only the
    # cluster's own one -- confirm this is the intended "intra-group" mean.
    mean_intra_group_corr = clustered_assets.groupby('Cluster')[group_corr_columns].mean().mean(axis=1)
    clustered_assets['mean_intragroup_correlation'] = clustered_assets['Cluster'].map(mean_intra_group_corr)
    # Each ticker's correlation with its own cluster minus that cluster's mean.
    clustered_assets['diff_to_mean_group_corr'] = clustered_assets.apply(lambda row: row[f'correlation_{row.Cluster}'] - row['mean_intragroup_correlation'], axis=1)

    reordered_columns = ['Ticker', 'Cluster', 'mean_intragroup_correlation', 'diff_to_mean_group_corr'] + group_corr_columns
    clustered_assets = clustered_assets[reordered_columns]

    clustered_assets = clustered_assets.round(5)
    clustered_assets.to_parquet('Correlations.parquet', index=False)

    # Report success only on the path that actually wrote the file.
    print("Correlations saved to 'Correlations.parquet'.")


Loading Parquet files: 100%|██████████| 1450/1450 [00:01<00:00, 780.81it/s]
Calculating Returns: 100%|██████████| 1450/1450 [00:00<00:00, 4667.58it/s]
Elbow Method: 100%|██████████| 18/18 [00:01<00:00, 11.66it/s]
Silhouette Score: 100%|██████████| 18/18 [00:01<00:00,  9.02it/s]
Calculating Group Correlations: 100%|██████████| 1450/1450 [00:00<00:00, 9224.71it/s]


Correlations saved to 'Correlations.parquet'.
