In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [10]:
# Read the processed 2010 financial feature table from disk.
# The first CSV column is used as the DataFrame index.
financial_features = pd.read_csv(
    filepath_or_buffer='../data/processed/financial_features_2010.csv',
    index_col=0,
)

# Show the available feature names as the cell output.
financial_features.columns

Index(['avg_ROA', 'ROA_volatility', 'avg_ROE', 'ROE_volatility',
       'avg_OperatingMargin', 'OperatingMargin_volatility',
       'avg_AssetTurnover', 'AssetTurnover_volatility', 'avg_CurrentRatio',
       'CurrentRatio_volatility', 'avg_Leverage', 'Leverage_volatility',
       'avg_DebtToEquity', 'DebtToEquity_volatility', 'avg_log_Revenue',
       'log_Revenue_volatility', 'avg_log_Assets', 'log_Assets_volatility',
       'avg_log_NetIncome', 'log_NetIncome_volatility',
       'avg_log_OperatingIncome', 'log_OperatingIncome_volatility',
       'avg_log_Liabilities', 'log_Liabilities_volatility',
       'avg_log_StockholdersEquity', 'log_StockholdersEquity_volatility',
       'avg_log_CurrentAssets', 'log_CurrentAssets_volatility',
       'avg_log_CurrentLiabilities', 'log_CurrentLiabilities_volatility',
       'avg_log_LongTermDebt', 'log_LongTermDebt_volatility', 'avg_log_CapEx',
       'log_CapEx_volatility', 'Revenue_sign_change', 'Revenue_growth',
       'Assets_sign_change', '

In [11]:
# Feature matrix as a NumPy array, one row per index entry of the table.
X = financial_features.to_numpy()

# Row labels of the feature table as a plain Python list, in the same order
# as the rows of X, so cluster assignments can be matched back to them.
tickers = financial_features.index.to_list()

In [12]:
# Baseline: K-means with a fixed k=10 on the raw (unscaled) feature matrix X.
#
# fit_predict() fits the model and returns the cluster label of every row in
# one step, so calling .fit(X) first would only train the same model twice.
# n_init is set explicitly (matching the k sweep in the next cell) because
# scikit-learn's default for it differs between versions.
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10)
labels = kmeans.fit_predict(X)

# Mean silhouette coefficient over all samples for this baseline clustering.
score = silhouette_score(X, labels)

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardize the features (zero mean, unit variance per column) before
# clustering, so no single column dominates the distance computations.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate cluster counts: 2..99, capped at n_samples - 1 because
# silhouette_score requires 2 <= number of labels <= n_samples - 1.
k_values = range(2, min(100, len(X_scaled)))
if len(k_values) == 0:
    raise ValueError(
        f"Need at least 3 samples to compare cluster counts, got {len(X_scaled)}"
    )

best_score = -1
best_k = None
best_labels = None

print("Testing different numbers of clusters...")
for k in k_values:
    # Run K-means
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    # Calculate silhouette score
    score = silhouette_score(X_scaled, labels)

    print(f"k={k}, silhouette score: {score:.4f}")

    # Keep track of the best model (strict '>' keeps the smallest k on ties)
    if score > best_score:
        best_score = score
        best_k = k
        best_labels = labels

print(f"\nBest number of clusters: {best_k} (silhouette score: {best_score:.4f})")

# Count samples in each cluster
unique_clusters, counts = np.unique(best_labels, return_counts=True)
print("\nCluster sizes:")
for cluster, count in zip(unique_clusters, counts):
    print(f"Cluster {cluster}: {count} samples")

# Save the results: one row per ticker with its cluster under the best k.
results_df = pd.DataFrame({
    'ticker': tickers,
    'cluster': best_labels
})
# Actually write the file that the message below reports; 'ticker' is already
# a column, so the positional index is not written.
results_df.to_csv('kmeans_clusters.csv', index=False)

print("\nResults saved to 'kmeans_clusters.csv'")

Testing different numbers of clusters...
k=2, silhouette score: 0.0829
k=3, silhouette score: 0.1299
k=4, silhouette score: 0.1322
k=5, silhouette score: 0.1293
k=6, silhouette score: 0.0850
k=7, silhouette score: 0.0735
k=8, silhouette score: 0.0755
k=9, silhouette score: 0.0829
k=10, silhouette score: 0.0857
k=11, silhouette score: 0.0862
k=12, silhouette score: 0.0872
k=13, silhouette score: 0.0796
k=14, silhouette score: 0.0803
k=15, silhouette score: 0.0811
k=16, silhouette score: 0.0797
k=17, silhouette score: 0.0825
k=18, silhouette score: 0.0847
k=19, silhouette score: 0.0886
k=20, silhouette score: 0.0812
k=21, silhouette score: 0.0976
k=22, silhouette score: 0.0875
k=23, silhouette score: 0.0953
k=24, silhouette score: 0.0901
k=25, silhouette score: 0.1054
k=26, silhouette score: 0.0825
k=27, silhouette score: 0.1078
k=28, silhouette score: 0.0932
k=29, silhouette score: 0.1091
k=30, silhouette score: 0.1098
k=31, silhouette score: 0.1013
k=32, silhouette score: 0.1010
k=33, 