In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score, silhouette_samples
import sys
sys.path.append('../src')
from clustering import StockClusterer

df = pd.read_csv('../data/processed/nse_clustered.csv')
print(f"Loaded {len(df)} clustered stocks")

Loaded 57 clustered stocks


## 1. Cluster Quality Metrics

In [3]:
# Load model
clusterer = StockClusterer.load_model('../models/stock_clusterer.pkl')

# Calculate silhouette scores
X = df[clusterer.feature_columns].fillna(df[clusterer.feature_columns].median())
X_scaled = clusterer.scaler.transform(X)

avg_silhouette = silhouette_score(X_scaled, df['Cluster'])
print(f"Average Silhouette Score: {avg_silhouette:.3f}")
print("(Score > 0.5 indicates good separation)\n")

AttributeError: type object 'StockClusterer' has no attribute 'load_model'

## 2. Cluster Profiles

Interpret each risk profile:

In [None]:
cluster_summary = df.groupby('Risk_Profile').agg({
    'Stock_code': 'count',
    'volatility_30d': 'mean',
    'return_std': 'mean',
    'max_drawdown': 'mean',
    'trading_frequency': 'mean',
    'avg_volume': 'median'
}).round(4)

cluster_summary.columns = ['Count', 'Avg Volatility', 'Return Std', 'Avg Drawdown', 'Trading Freq', 'Median Volume']
print("Cluster Summary:")
print(cluster_summary)

## 3. Sector Distribution by Risk

In [None]:
sector_risk = pd.crosstab(df['Sector'], df['Risk_Profile'], normalize='index') * 100
print("\nSector Distribution Across Risk Profiles (%):")
print(sector_risk.round(1))

## 4. Sample Stocks by Risk Profile

In [None]:
for risk_profile in ['Low Risk', 'Medium-Low Risk', 'Medium-High Risk', 'High Risk']:
    stocks = df[df['Risk_Profile'] == risk_profile].nsmallest(5, 'volatility_30d')[['Stock_code', 'Name', 'Sector', 'volatility_30d']]
    print(f"\n{risk_profile} (Top 5 by stability):")
    print(stocks.to_string(index=False))