In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import QuantileTransformer
from tqdm import tqdm

def cluster_based_ranking(df, n_clusters=3, weights=None, show_progress=True):
    """
    Rank the rows of a feature table by the KMeans cluster they fall into.

    The features are mapped to an approximately normal distribution with a
    QuantileTransformer and clustered with KMeans. Each cluster is scored by
    the weighted sum of its centroid coordinates (in the transformed space),
    clusters are ordered from highest to lowest score, and every row inherits
    the rank of its cluster.

    The weights only influence how centroids are scored; the clustering itself
    is performed on the unweighted transformed features.

    Args:
        df: DataFrame of numeric features (higher = better). Must contain at
            least one row and one column and no NaN / infinite values.
        n_clusters: number of KMeans clusters.
        weights: optional 1-D array-like with one weight per column of ``df``.
            Weights are normalized to sum to 1. Defaults to equal weights.
        show_progress: whether to print progress messages.

    Returns:
        DataFrame indexed like ``df`` with the columns
            cluster:       KMeans label of the row,
            cluster_score: weighted centroid score of the row's cluster,
            rank:          0-based rank of the row's cluster (0 = best score).

    Raises:
        ValueError: if ``df`` is empty or contains non-finite values, if
            ``weights`` does not have exactly one entry per column, or if the
            weights do not have a finite, non-zero sum.
    """
    X = df.to_numpy(dtype=float)
    if X.size == 0:
        raise ValueError("df must contain at least one row and one column")
    n_samples, n_features = X.shape

    # KMeans cannot handle NaN/inf; fail here with a clear message instead of
    # part-way through the pipeline.
    if not np.isfinite(X).all():
        raise ValueError("df contains NaN or infinite values; clean or impute them first")

    # Validate and normalize the weights *before* the expensive clustering so
    # that a bad argument fails fast.
    if weights is None:
        weights = np.ones(n_features)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (n_features,):
            raise ValueError(
                f"weights must have shape ({n_features},), got {weights.shape}"
            )
    weight_total = weights.sum()
    if not np.isfinite(weight_total) or weight_total == 0:
        raise ValueError("weights must have a finite, non-zero sum")
    weights = weights / weight_total

    # Step 1: Normalize skewed data (QuantileTransform handles heavy tails well)
    if show_progress:
        print("Normalizing data...")
    qt = QuantileTransformer(
        # Same value sklearn falls back to when there are fewer samples than
        # the default 1000 quantiles, but without the warning.
        n_quantiles=min(1000, n_samples),
        output_distribution="normal",
        random_state=42,
    )
    X_scaled = qt.fit_transform(X)

    # Step 2: KMeans clustering on the transformed (unweighted) features
    if show_progress:
        print(f"Clustering into {n_clusters} groups...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    cluster_labels = kmeans.fit_predict(X_scaled)

    # Step 3: Score clusters by the weighted sum of their centroid
    centroid_scores = kmeans.cluster_centers_ @ weights  # (n_clusters, )

    # Step 4: Rank clusters from best to worst (higher score = better).
    # cluster_ranks[c] is the position of cluster c in the sorted order.
    cluster_order = np.argsort(-centroid_scores)
    cluster_ranks = np.empty(len(cluster_order), dtype=int)
    cluster_ranks[cluster_order] = np.arange(len(cluster_order))

    # Step 5: Broadcast cluster-level score and rank to every row
    return pd.DataFrame({
        "cluster": cluster_labels,
        "cluster_score": centroid_scores[cluster_labels],
        "rank": cluster_ranks[cluster_labels],
    }, index=df.index)



In [3]:
# Load the percentile table; every column from position 6 onward is passed
# to cluster_based_ranking as a feature.
df = pd.read_csv("allsources_percentiles.csv")
feature_frame = df.iloc[:, 6:]
ranked_df = cluster_based_ranking(df=feature_frame, n_clusters=10)
print(ranked_df)

Normalizing data...
Clustering into 10 groups...


[WinError 2] The system cannot find the file specified
  File "C:\Users\met48\AppData\Local\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\met48\AppData\Local\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\met48\AppData\Local\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\met48\AppData\Local\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


       cluster  cluster_score  rank
0            2      -4.538440     6
1            2      -4.538440     6
2            4      -4.099552     4
3            6      -2.914991     1
4            3      -2.995402     2
...        ...            ...   ...
24995        3      -2.995402     2
24996        7      -3.658146     3
24997        9      -4.853149     8
24998        7      -3.658146     3
24999        7      -3.658146     3

[25000 rows x 3 columns]


In [4]:
# Append the clustering results to the original table, aligned on the index.
cluster_columns = ["cluster", "cluster_score", "rank"]
df_with_cluster_info = pd.concat(
    [df, ranked_df.loc[:, cluster_columns]],
    axis="columns",
)

In [7]:
# Write the combined table to disk without the index column.
output_csv = "df_with_cluster_info.csv"
df_with_cluster_info.to_csv(output_csv, index=False)