In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Read the processed financial feature CSV; its first column becomes the row index.
financial_features = pd.read_csv(
    filepath_or_buffer='../data/processed/financial_features_2010.csv',
    index_col=0,
)
# Last expression of the cell: display the available feature column names.
financial_features.columns

Index(['avg_ROA', 'ROA_volatility', 'avg_ROE', 'ROE_volatility',
       'avg_OperatingMargin', 'OperatingMargin_volatility',
       'avg_AssetTurnover', 'AssetTurnover_volatility', 'avg_CurrentRatio',
       'CurrentRatio_volatility', 'avg_Leverage', 'Leverage_volatility',
       'avg_DebtToEquity', 'DebtToEquity_volatility', 'avg_log_Revenue',
       'log_Revenue_volatility', 'avg_log_Assets', 'log_Assets_volatility',
       'avg_log_NetIncome', 'log_NetIncome_volatility',
       'avg_log_OperatingIncome', 'log_OperatingIncome_volatility',
       'avg_log_Liabilities', 'log_Liabilities_volatility',
       'avg_log_StockholdersEquity', 'log_StockholdersEquity_volatility',
       'avg_log_CurrentAssets', 'log_CurrentAssets_volatility',
       'avg_log_CurrentLiabilities', 'log_CurrentLiabilities_volatility',
       'avg_log_LongTermDebt', 'log_LongTermDebt_volatility', 'avg_log_CapEx',
       'log_CapEx_volatility', 'Revenue_growth', 'Assets_growth',
       'NetIncome_growth', 'capex_in

In [3]:
# Full feature matrix: every column of the loaded table as a raw array.
X = financial_features.values

# Row labels of the feature table (presumably ticker symbols — verify against
# the CSV), kept so cluster assignments can be mapped back to rows later.
tickers = financial_features.index.to_list()

In [None]:
# Narrow the feature table to a hand-picked subset of ten columns.
df2 = financial_features[
    [
        'avg_ROA',
        'avg_AssetTurnover',
        'avg_CurrentRatio',
        'avg_Leverage',
        'avg_log_Assets',
        'Revenue_growth',
        'ROA_volatility',
        'has_rnd',
        'has_goodwill',
        'core_completeness',
    ]
]
# Bare expression kept from the original cell; it is not the cell's last
# expression, so under IPython's default settings it does not render a table.
df2

# Overwrite the clustering matrix with the reduced feature set.
X = df2.values

In [5]:
# Baseline: cluster the current feature matrix X into 10 groups.
# The model is fitted exactly once; the original code called .fit(X) and then
# .fit_predict(X), which re-ran the whole fit only to reproduce the same
# labels (the seed is fixed via random_state).
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
labels = kmeans.labels_  # cluster index assigned to each row of X by the fit above

# Silhouette score of this clustering on the same matrix it was fitted on.
score = silhouette_score(X, labels)

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize the features (zero mean, unit variance per column) before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate cluster counts: 2..99, capped at n_samples - 1 because
# silhouette_score only accepts 2 <= n_labels <= n_samples - 1.
k_values = range(2, min(100, X_scaled.shape[0]))
best_score = -1  # lowest possible silhouette value, so the first k always wins
best_k = None
best_labels = None

print("Testing different numbers of clusters...")
for k in k_values:
    # Run K-means
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    # Calculate silhouette score
    score = silhouette_score(X_scaled, labels)

    print(f"k={k}, silhouette score: {score:.4f}")

    # Keep track of the best model
    if score > best_score:
        best_score = score
        best_k = k
        best_labels = labels

# With fewer than 3 samples there is no valid k; fail with a clear message
# instead of an opaque error further down.
if best_labels is None:
    raise ValueError(
        f"Need at least 3 samples to evaluate a clustering, got {X_scaled.shape[0]}"
    )

print(f"\nBest number of clusters: {best_k} (silhouette score: {best_score:.4f})")

# Count samples in each cluster
unique_clusters, counts = np.unique(best_labels, return_counts=True)
print("\nCluster sizes:")
for cluster, count in zip(unique_clusters, counts):
    print(f"Cluster {cluster}: {count} samples")

# Save the results: one row per entry of `tickers` with its best-k cluster label
results_df = pd.DataFrame({
    'ticker': tickers,
    'cluster': best_labels
})
# Actually write the file; the original printed the message below without saving.
results_df.to_csv('kmeans_clusters.csv', index=False)

print("\nResults saved to 'kmeans_clusters.csv'")

Testing different numbers of clusters...
k=2, silhouette score: 0.1940
k=3, silhouette score: 0.1984
k=4, silhouette score: 0.2192
k=5, silhouette score: 0.2339
k=6, silhouette score: 0.1945
k=7, silhouette score: 0.2374
k=8, silhouette score: 0.2608
k=9, silhouette score: 0.2383
k=10, silhouette score: 0.2407
k=11, silhouette score: 0.2466
k=12, silhouette score: 0.2315
k=13, silhouette score: 0.2059
k=14, silhouette score: 0.2051
k=15, silhouette score: 0.2019
k=16, silhouette score: 0.2102
k=17, silhouette score: 0.2088
k=18, silhouette score: 0.1997
k=19, silhouette score: 0.1987
k=20, silhouette score: 0.2066
k=21, silhouette score: 0.2046
k=22, silhouette score: 0.1946
k=23, silhouette score: 0.2116
k=24, silhouette score: 0.2059
k=25, silhouette score: 0.2038
k=26, silhouette score: 0.1919
k=27, silhouette score: 0.1939
k=28, silhouette score: 0.1873
k=29, silhouette score: 0.2168
k=30, silhouette score: 0.2065
k=31, silhouette score: 0.1890
k=32, silhouette score: 0.1966
k=33, 

In [9]:
# Weights and biases test

import random

import wandb

# Hyperparameters and run metadata tracked with the run.
# NOTE(review): these values look like placeholders unrelated to the KMeans
# experiments in this notebook — confirm before relying on them.
run_config = {
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
}
# Single source of truth for the loop bound (previously a duplicated literal).
epochs = run_config["epochs"]

# Start a new wandb run to track this script. Using the run as a context
# manager finishes it even if the body raises or is interrupted once the run
# exists, instead of relying on a trailing run.finish() that may never execute.
with wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="solo-mireg",
    # Set the wandb project where this run will be logged.
    project="clustering-portfolios",
    # Track hyperparameters and run metadata.
    config=run_config,
) as run:
    # Simulate training.
    offset = random.random() / 5
    for epoch in range(2, epochs):
        acc = 1 - 2**-epoch - random.random() / epoch - offset
        loss = 2**-epoch + random.random() / epoch + offset

        # Log metrics to wandb.
        run.log({"acc": acc, "loss": loss})

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


KeyboardInterrupt: 