In [15]:
import numpy as np
from tqdm import tqdm

def pareto_front_ranking(objectives, show_progress=True):
    n = len(objectives)
    ranks = np.full(n, np.inf)
    domination_counts = np.zeros(n, dtype=int)
    dominated_sets = [[] for _ in range(n)]
    current_front = []

    total_comparisons = n * (n - 1)
    progress = tqdm(total=total_comparisons, desc="Pareto ranking", disable=not show_progress)

    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            progress.update(1)
            if np.all(objectives[i] >= objectives[j]) and np.any(objectives[i] > objectives[j]):
                dominated_sets[i].append(j)
            elif np.all(objectives[j] >= objectives[i]) and np.any(objectives[j] > objectives[i]):
                domination_counts[i] += 1
        if domination_counts[i] == 0:
            ranks[i] = 0
            current_front.append(i)

    progress.close()

    # Now rank the fronts (this part is usually much faster, so no need for a second bar)
    front = 0
    while current_front:
        next_front = []
        for i in current_front:
            for j in dominated_sets[i]:
                domination_counts[j] -= 1
                if domination_counts[j] == 0:
                    ranks[j] = front + 1
                    next_front.append(j)
        front += 1
        current_front = next_front

    return ranks.astype(int)

In [16]:
def rank_tabular_by_pareto(df):
    # 1. Normalize for skewed distributions
    qt = QuantileTransformer(output_distribution="normal")
    X_scaled = qt.fit_transform(df.values)

    # 2. Run Pareto front ranking (assuming higher = better)
    ranks = pareto_front_ranking(X_scaled)

    return pd.DataFrame({
        "pareto_rank": ranks
    }, index=df.index)

In [17]:
df = pd.read_csv("allsources_percentiles.csv")

ranked = rank_tabular_by_pareto(df.iloc[:,6:])
print(ranked)

Pareto ranking: 100%|██████████| 624975000/624975000 [1:38:27<00:00, 105785.11it/s]


       pareto_rank
0               53
1                6
2               16
3               10
4                6
...            ...
24995            4
24996            5
24997            5
24998            1
24999            6

[25000 rows x 1 columns]


In [18]:
df_with_pareto = pd.concat([df, ranked], axis=1)

In [21]:
df_with_pareto.to_csv('df_pareto.csv', index=False)