# Clustering

Group stocks into 4 risk profiles using K-Means.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import sys
sys.path.append('../src')

from clustering import find_optimal_clusters, StockClusterer

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme and a
# 12x5 default figure size for every figure that does not pass its own figsize.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

## Load Features

In [None]:
# Load the processed feature table and report its size and column names.
features_csv = '../Data/Processed/nse_features.csv'
df = pd.read_csv(features_csv)

n_rows = df.shape[0]
column_names = df.columns.tolist()
print(f"Loaded {n_rows} stocks")
print(f"Features: {column_names}")

# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

## Find Optimal K

In [None]:
# Candidate feature columns for clustering, in the order they are passed on.
candidate_features = [
    'volatility_mean',
    'volatility_max',
    'downside_deviation',
    'std_return',
    'var_95',
    'max_drawdown',
    'sharpe_ratio',
    'return_skew',
    'return_kurtosis',
    'rsi_mean',
    'bb_width_mean',
    'macd_volatility',
    'momentum_30d',
    'momentum_90d',
    'trend_strength',
    'trading_frequency',
    'amihud_illiquidity',
    'volume_volatility',
]

# Keep only the candidates that are present in the loaded table; the relative
# order of the list above is preserved.
present_columns = set(df.columns)
feature_cols = [name for name in candidate_features if name in present_columns]
print(f"\nUsing {len(feature_cols)} features")

# Score candidate cluster counts (called with max_clusters=8). The returned
# table is printed here; later cells read its 'n_clusters', 'inertia' and
# 'silhouette' columns.
cluster_metrics = find_optimal_clusters(df, feature_cols, max_clusters=8)
print(cluster_metrics)

In [None]:
# Side-by-side diagnostics for choosing K: inertia (elbow) on the left,
# silhouette score on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
k_values = cluster_metrics['n_clusters']

# (axis, metric column, line format, title, y-axis label) for each panel.
panels = (
    (ax1, 'inertia', 'bo-', 'Elbow Method', 'Inertia'),
    (ax2, 'silhouette', 'ro-', 'Silhouette Score', 'Silhouette Score'),
)
for ax, metric, fmt, title, y_label in panels:
    ax.plot(k_values, cluster_metrics[metric], fmt, linewidth=2, markersize=8)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

# Horizontal reference line at 0.5 on the silhouette panel; it is the only
# labelled artist, so it is the only legend entry.
ax2.axhline(y=0.5, color='g', linestyle='--', label='Good (0.5)', linewidth=2)
ax2.legend()

plt.tight_layout()
plt.show()

# Report the cluster count from the row with the highest silhouette score.
best_row = cluster_metrics['silhouette'].idxmax()
best_k = cluster_metrics.loc[best_row, 'n_clusters']
print(f"\nBest K = {int(best_k)} (highest silhouette)")

## Train Model

In [None]:
# Fit the clusterer with a fixed K of 4 (the notebook works with four named
# risk profiles) and a fixed seed, then label every row of the feature table.
clusterer = StockClusterer(random_state=42, n_clusters=4)
df_clustered = clusterer.fit_predict(df)

# How many rows landed in each 'Risk_Profile' value.
profile_counts = df_clustered['Risk_Profile'].value_counts()
print(f"\nCluster Distribution:")
print(profile_counts)

## Visualize with PCA

In [None]:
# Prepare data: take the clusterer's feature columns, fill missing values with
# each column's median, and scale with the clusterer's scaler.
# NOTE(review): presumably this mirrors the preprocessing done inside
# StockClusterer.fit_predict - confirm against src/clustering.py.
feature_frame = df_clustered[clusterer.feature_columns]
X = feature_frame.fillna(feature_frame.median())
X_scaled = clusterer.scaler.transform(X)

# Project the scaled features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot one scatter series per risk profile, colour-coded from green to red.
plt.figure(figsize=(12, 8))
colors = ['green', 'blue', 'orange', 'red']
labels = ['Low Risk', 'Medium-Low Risk', 'Medium-High Risk', 'High Risk']

for color, label in zip(colors, labels):
    # Convert to a plain boolean ndarray so the row selection on X_pca is
    # explicitly positional and does not rely on implicit Series conversion.
    mask = (df_clustered['Risk_Profile'] == label).to_numpy()
    count = int(mask.sum())
    if count == 0:
        # Profiles with no members would only add an empty legend entry.
        continue
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1],
                c=color, label=f"{label} ({count})",
                alpha=0.7, s=120, edgecolors='black', linewidth=1)

# Axis labels show the share of variance explained by each component.
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})', fontsize=12, fontweight='bold')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})', fontsize=12, fontweight='bold')
plt.title('Stock Risk Clusters', fontsize=14, fontweight='bold')
plt.legend(title='Risk Profile', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Cluster Profiles

In [None]:
# Ask the clusterer for its per-cluster summary of the labelled table and
# print it under a heading.
summary = clusterer.get_cluster_summary(df_clustered)
print("\nCluster Summary:", summary, sep="\n")

## Sample Stocks

In [None]:
# Print up to five example stocks for every non-empty risk profile.
sample_size = 5
rank_col = 'volatility_mean'

# Columns to display, limited to those present in the table. The filter is the
# same for every profile, so it is computed once outside the loop.
display_cols = [
    c for c in ['Stock_code', 'Name', 'Sector', 'volatility_mean', 'sharpe_ratio']
    if c in df_clustered.columns
]
# The ranking column may be absent just like the display columns; check it
# once so a missing column does not raise KeyError inside the loop.
can_rank = rank_col in df_clustered.columns

for risk in ['Low Risk', 'Medium-Low Risk', 'Medium-High Risk', 'High Risk']:
    subset = df_clustered[df_clustered['Risk_Profile'] == risk]
    if subset.empty:
        continue

    print(f"\n{'='*60}")
    print(f"{risk} ({len(subset)} stocks)")
    print('='*60)

    if can_rank:
        # Lowest-volatility members first; nsmallest returns every row when
        # the profile has fewer than sample_size members.
        sample = subset.nsmallest(sample_size, rank_col)
    else:
        # No ranking column available: fall back to the first rows as stored.
        sample = subset.head(sample_size)
    print(sample[display_cols].to_string(index=False))

## Save Results

In [None]:
# Persist the labelled table and the trained clusterer. Each target directory
# is created first so a fresh checkout (e.g. without ../models) does not fail
# on the final cell after all the work above has run.
from pathlib import Path

clustered_csv = '../Data/Processed/nse_clustered.csv'
model_file = '../models/stock_clusterer.pkl'

# Save clustered data
Path(clustered_csv).parent.mkdir(parents=True, exist_ok=True)
df_clustered.to_csv(clustered_csv, index=False)
print("✅ Saved clustered data")

# Save model (the path is passed as a plain string, exactly as before)
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
clusterer.save_model(model_file)
print("✅ Saved model")