# In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from numba import njit

data_dir = 'Data/PriceData'

# Load every Parquet file in data_dir into a {ticker: DataFrame} mapping.
data_frames = {}
# os.listdir returns entries in arbitrary order; sorting makes the ticker
# order (and everything derived from it further down) reproducible.
files = sorted(file for file in os.listdir(data_dir) if file.endswith('.parquet'))
for file in tqdm(files, desc="Loading Parquet files"):
    # Strip only the final extension so that tickers containing dots
    # (e.g. 'BRK.B.parquet') are not truncated and do not overwrite each other.
    ticker = os.path.splitext(file)[0]
    data_frames[ticker] = pd.read_parquet(os.path.join(data_dir, file))

@njit
def calculate_dynamic_weights(volatility, mean_volatility, std_volatility):
    """Return four normalised weights tilted by a volatility z-score.

    The z-score is ``(volatility - mean_volatility) / std_volatility``.
    Starting from 0.25 each, the first two weights shrink and the last two
    grow as the z-score rises; each weight is clamped to [0, 1] and the
    result is rescaled to sum to 1.

    Returns equal weights (0.25 each) when ``std_volatility`` is exactly 0.
    """
    # No dispersion means the z-score is undefined: fall back to equal weights.
    if std_volatility == 0:
        return np.array([0.25, 0.25, 0.25, 0.25])

    z = (volatility - mean_volatility) / std_volatility
    tilted = np.array([
        0.25 - 0.1*z,
        0.25 - 0.05*z,
        0.25 + 0.05*z,
        0.25 + 0.1*z,
    ])
    # Clamp into [0, 1] using minimum/maximum in place of np.clip.
    bounded = np.maximum(np.minimum(tilted, 1), 0)
    total = np.sum(bounded)
    return bounded / total

@njit
def calculate_weighted_returns_fast(returns, volatility, mean_volatility, std_volatility):
    """Blend each row of ``returns`` into one value using dynamic weights.

    For every row index from 21 onward, the weights produced by
    ``calculate_dynamic_weights`` for that row's volatility statistics are
    multiplied element-wise with the row of ``returns`` and summed.
    Rows 0-20 are left at 0.

    Returns a 1-D array with one entry per row of ``returns``.
    """
    n_rows = len(returns)
    blended = np.zeros(n_rows)
    for row in range(21, n_rows):
        row_weights = calculate_dynamic_weights(
            volatility[row], mean_volatility[row], std_volatility[row]
        )
        blended[row] = np.sum(returns[row] * row_weights)
    return blended

def process_stock_data(df):
    """Add return, volatility and weighted-return columns to a price frame.

    ``df`` must have a 'Close' column. The frame is modified in place and
    also returned. Columns added:

    - 'Daily_Return', 'Weekly_Return', 'Monthly_Return', 'Yearly_Return':
      percentage change of 'Close' over 1, 5, 21 and 252 rows, rounded to
      3 decimals.
    - 'Volatility': 21-row rolling standard deviation of 'Daily_Return'.
    - 'Mean_Volatility', 'Volatility_Std': 21-row rolling mean and standard
      deviation of 'Volatility'.
    - 'Weighted_Return': the four returns combined per row by
      ``calculate_weighted_returns_fast``, rounded to 3 decimals.

    NaNs (such as the leading ones left by pct_change and rolling windows)
    are back-filled from the next valid value.
    """
    # Column name -> number of rows the percentage change looks back.
    horizons = {
        'Daily_Return': 1,
        'Weekly_Return': 5,
        'Monthly_Return': 21,
        'Yearly_Return': 252,
    }
    close = df['Close']
    for column, periods in horizons.items():
        # .bfill() replaces the deprecated fillna(method='bfill') spelling.
        df[column] = close.pct_change(periods).bfill().round(3)

    df['Volatility'] = df['Daily_Return'].rolling(window=21).std().bfill()
    df['Mean_Volatility'] = df['Volatility'].rolling(window=21).mean().bfill()
    df['Volatility_Std'] = df['Volatility'].rolling(window=21).std().bfill()

    # One row per date, one column per horizon (daily, weekly, monthly, yearly).
    returns = np.column_stack([df[column] for column in horizons])
    volatility = df['Volatility'].values
    mean_volatility = df['Mean_Volatility'].values
    std_volatility = df['Volatility_Std'].values

    df['Weighted_Return'] = calculate_weighted_returns_fast(returns, volatility, mean_volatility, std_volatility)
    df['Weighted_Return'] = df['Weighted_Return'].round(3)

    return df

# Process all stocks
for ticker, df in tqdm(data_frames.items(), desc="Processing stocks"):
    data_frames[ticker] = process_stock_data(df)

# One column of weighted returns per ticker, aligned on the frames' indexes.
# .bfill() replaces the deprecated fillna(method='bfill') spelling.
returns_df = pd.DataFrame({ticker: df['Weighted_Return'] for ticker, df in data_frames.items()}).bfill()

# Pairwise correlation between tickers; undefined (NaN) entries are filled
# from the neighbouring rows, backward first and then forward.
correlation_matrix = returns_df.corr().bfill().ffill()

# Standardise each column of the correlation matrix before clustering.
scaler = StandardScaler()
scaled_correlation = scaler.fit_transform(correlation_matrix)

# Perform clustering
kmeans = KMeans(n_clusters=8, random_state=0, n_init=10)
clusters = kmeans.fit_predict(scaled_correlation)

# Work on an explicit copy: wrapping with pd.DataFrame(correlation_matrix) is
# not guaranteed to be independent of the original, so adding the 'Cluster'
# column could otherwise also show up on correlation_matrix.
correlation_matrix_df = correlation_matrix.copy()
correlation_matrix_df['Cluster'] = clusters

# Elbow Method
# KMeans cannot fit more clusters than there are samples, so the scan is
# capped for small universes; with 19 or more samples this is range(2, 20).
elbow_k_values = range(2, min(20, scaled_correlation.shape[0] + 1))
inertia = []
for k in tqdm(elbow_k_values, desc="Elbow Method"):
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    kmeans.fit(scaled_correlation)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(elbow_k_values, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

# Silhouette Score
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# scan is capped for small universes; with 20 or more samples this is
# range(2, 20).
silhouette_k_values = range(2, min(20, scaled_correlation.shape[0]))
silhouette_scores = []
for k in tqdm(silhouette_k_values, desc="Silhouette Score"):
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    labels = kmeans.fit_predict(scaled_correlation)
    silhouette_scores.append(silhouette_score(scaled_correlation, labels))

plt.figure(figsize=(10, 6))
plt.plot(silhouette_k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Optimal Number of Clusters')
plt.show()

# Two-column table of cluster assignments: the correlation matrix's index
# becomes the first column ('Ticker'), followed by the 'Cluster' label.
cluster_column = correlation_matrix_df[['Cluster']]
clustered_assets = cluster_column.reset_index()
clustered_assets.columns = ['Ticker', 'Cluster']

def calculate_group_correlations_vectorized(correlation_matrix, clustered_assets):
    """Average each asset's correlations over the members of every cluster.

    Parameters:
        correlation_matrix: DataFrame of pairwise correlations. Its columns
            are matched to the rows of ``clustered_assets`` by position.
        clustered_assets: DataFrame with a 'Cluster' column holding one
            label per asset, in the same order as the matrix columns.

    Returns:
        DataFrame indexed like ``correlation_matrix`` with one column
        ``correlation_<cluster>`` per distinct cluster (in order of first
        appearance), holding the row-wise mean over that cluster's members.
    """
    corr_array = correlation_matrix.values
    labels = clustered_assets['Cluster'].to_numpy()
    unique_clusters = clustered_assets['Cluster'].unique()

    group_means = {}
    for cluster in tqdm(unique_clusters, desc="Calculating Group Correlations"):
        # Use row positions, not index labels, to select matrix columns, so
        # the lookup stays correct when clustered_assets does not carry a
        # default 0..n-1 index.
        member_positions = np.flatnonzero(labels == cluster)
        group_means[f'correlation_{cluster}'] = corr_array[:, member_positions].mean(axis=1)

    return pd.DataFrame(group_means, index=correlation_matrix.index)

group_correlations = calculate_group_correlations_vectorized(correlation_matrix, clustered_assets)

# Attach every asset's per-cluster mean correlations to its cluster label.
clustered_assets = clustered_assets.merge(group_correlations, left_on='Ticker', right_index=True)

# Rows: member cluster; columns: correlation_<cluster>. Each entry is the mean
# correlation between the members of the row cluster and the column cluster.
cluster_corr_means = clustered_assets.groupby('Cluster')[list(group_correlations.columns)].mean()
# The intra-group value is the entry pairing a cluster with itself. Averaging
# across all columns would mix in the correlations with every other cluster.
mean_intra_group_corr = pd.Series(
    {cluster: cluster_corr_means.loc[cluster, f'correlation_{cluster}']
     for cluster in cluster_corr_means.index},
    dtype=float,
)
clustered_assets['mean_intragroup_correlation'] = clustered_assets['Cluster'].map(mean_intra_group_corr)
# How far each asset's correlation with its own cluster sits from that
# cluster's mean intra-group correlation.
clustered_assets['diff_to_mean_group_corr'] = clustered_assets.apply(
    lambda row: row[f'correlation_{row.Cluster}'] - row['mean_intragroup_correlation'], axis=1)

reordered_columns = ['Ticker', 'Cluster', 'mean_intragroup_correlation', 'diff_to_mean_group_corr'] + list(group_correlations.columns)
clustered_assets = clustered_assets[reordered_columns].round(5)

clustered_assets.to_parquet('Correlations.parquet', index=False)

# Lift the row display limit so the full assignment table is printed.
pd.set_option('display.max_rows', None)
print(clustered_assets[['Ticker', 'Cluster']])

print("Correlations saved to 'Correlations.parquet'.")