**Creating a small synthetic dataset and computing two different distance metrics:**

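- Metric 1: Weighted Euclidean with a strong weight on Mass (to mimic a feature whose large numeric scale dominates the distance)
- Metric 2: Standardized Euclidean (each difference is divided by that feature's standard deviation) so that all features contribute on a comparable scale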

We'll show that the closest and farthest pairs differ between the two metrics.

In [1]:
import numpy as np
import pandas as pd
import itertools
from math import sqrt

# Toy dataset: 8 samples x 4 features. Units are encoded in the column names
# where a feature has one.
cols = ["Length_cm", "Mass_kg", "Temp_C", "ColorIntensity"]

data = np.array([
    # Length_cm, Mass_kg, Temp_C, ColorIntensity (unitless)
    [10.0, 1000.0, 20.0, 0.50],   # sample 0: by far the largest masses (with sample 1)
    [12.0, 1001.0, 22.0, 0.60],   # sample 1: mass differs from sample 0 by only 1 kg
    [40.0,   50.0, 19.0, 2.55],   # sample 2: low mass, highest colour intensity
    [31.0,   49.0, 19.2, 0.53],   # sample 3: near sample 2 in mass and temperature
    [15.0,  300.0, 40.0, 1.20],   # sample 4: mid-range mass, hottest samples (with sample 5)
    [14.5,  305.0, 39.8, 1.22],   # sample 5: near-duplicate of sample 4
    [80.0,   10.0, -5.0, 0.10],   # sample 6: longest, lightest, coldest
    [79.0,   12.0, -4.8, 0.12],   # sample 7: near-duplicate of sample 6
])

# Name the row index so the printed table labels each row as a sample id.
df = pd.DataFrame(data, columns=cols).rename_axis("sample_id")

print(df)

           Length_cm  Mass_kg  Temp_C  ColorIntensity
sample_id                                            
0               10.0   1000.0    20.0            0.50
1               12.0   1001.0    22.0            0.60
2               40.0     50.0    19.0            2.55
3               31.0     49.0    19.2            0.53
4               15.0    300.0    40.0            1.20
5               14.5    305.0    39.8            1.22
6               80.0     10.0    -5.0            0.10
7               79.0     12.0    -4.8            0.12


In [2]:
# Metric 1: Weighted Euclidean.
# One weight per feature, in column order. Each weight multiplies the squared
# difference of its feature, so Mass_kg dominates the resulting distance.
weights = np.array([
    1.0,    # Length_cm
    100.0,  # Mass_kg (heavily weighted)
    1.0,    # Temp_C
    1.0,    # ColorIntensity
])

def weighted_euclidean(x, y, w):
    """Weighted Euclidean distance between two feature vectors.

    Computes ``sqrt(sum(w * (x - y)**2))``. Each weight multiplies the
    *squared* difference of its feature, so a weight of 100 scales that
    feature's contribution to the distance by a factor of 10.

    Parameters
    ----------
    x, y : array-like of numbers
        Feature vectors. Lists and tuples are accepted as well as arrays.
    w : array-like of numbers
        Per-feature weights, broadcast against ``x - y``.

    Returns
    -------
    numpy.float64
        The weighted distance.
    """
    # Coerce up front: `x - y` on two plain Python lists raises TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    return np.sqrt(np.sum(w * (x - y) ** 2))

# Metric 2: Standardized Euclidean.
# Per-feature scale: the standard deviation of every column (pandas' default
# sample estimate), extracted as a plain NumPy array in column order.
stds = df.std(axis="index").values
def standardized_euclidean(x, y, stds):
    """Euclidean distance after dividing each feature difference by its std.

    Computes ``norm((x - y) / stds)``, so every feature is measured in units
    of its own standard deviation.

    Parameters
    ----------
    x, y : array-like of numbers
        Feature vectors. Lists and tuples are accepted as well as arrays.
    stds : array-like of numbers
        Per-feature standard deviations, broadcast against ``x - y``.

    Returns
    -------
    numpy.float64
        The standardized distance.

    Raises
    ------
    ValueError
        If any entry of ``stds`` is zero. Dividing by it would yield inf/nan,
        which makes later min/max comparisons over distances meaningless.
    """
    # Coerce up front: `x - y` on two plain Python lists raises TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    stds = np.asarray(stds, dtype=float)
    if np.any(stds == 0):
        raise ValueError(
            "stds contains zero; a constant feature cannot be standardized"
        )
    return np.linalg.norm((x - y) / stds)

# Compute all pairwise distances
# Every unordered pair (i, j), i < j, of row positions in df.
pairs = list(itertools.combinations(range(len(df)), 2))


def _all_pair_distances(metric, scale):
    """Return one (i, j, distance) tuple per entry of `pairs`, in that order.

    metric: callable taking (row_i, row_j, scale) and returning a distance.
    scale:  third argument handed to `metric` (weights or standard deviations).
    """
    records = []
    for i, j in pairs:
        row_i = df.iloc[i].values
        row_j = df.iloc[j].values
        records.append((i, j, metric(row_i, row_j, scale)))
    return records


def _distance_of(record):
    """Key function: the distance stored in the last slot of a record."""
    return record[2]


distances_m1 = _all_pair_distances(weighted_euclidean, weights)
distances_m2 = _all_pair_distances(standardized_euclidean, stds)

# Closest and farthest pair under each metric (the first occurrence wins a tie).
min_m1 = min(distances_m1, key=_distance_of)
max_m1 = max(distances_m1, key=_distance_of)
min_m2 = min(distances_m2, key=_distance_of)
max_m2 = max(distances_m2, key=_distance_of)

In [3]:
# Prepare results
# Bundle the metric names, the extreme pairs and the scaling parameters.
results = {
    "metric1_name": "Weighted Euclidean (Mass weighted 100x)",
    "metric2_name": "Standardized Euclidean (z-score scaling)",
    "metric1_closest": min_m1,
    "metric1_farthest": max_m1,
    "metric2_closest": min_m2,
    "metric2_farthest": max_m2,
    "stds": stds,
    "weights": weights
}


def _print_pair(label, pair):
    """Print one extreme pair: its sample positions, distance and feature rows.

    label: word placed before "pair:" in the heading (e.g. "Closest").
    pair:  (i, j, distance) tuple where i and j are row positions in `df`.
    """
    i, j, dist = pair
    print(f"  {label} pair: samples {i} & {j}  -> distance = {dist:.6f}")
    # i and j were generated from range(len(df)) and used with .iloc when the
    # distances were computed, so they are positions. Select positionally here
    # too; .loc would only work while the index labels happen to be 0..n-1.
    print(df.iloc[[i, j]])


# Print computed distances and pairs
# Metric 1 results
print(f"Metric 1: {results['metric1_name']}")
_print_pair("Closest", results["metric1_closest"])
print()
_print_pair("Farthest", results["metric1_farthest"])
print("\n" + "-"*60 + "\n")

# Metric 2 results
print(f"Metric 2: {results['metric2_name']}\n")
print(f"Feature std deviations (used for Metric 2):\n{pd.Series(stds, index=cols)}\n")
_print_pair("Closest", results["metric2_closest"])
print()
_print_pair("Farthest", results["metric2_farthest"])

Metric 1: Weighted Euclidean (Mass weighted 100x)
  Closest pair: samples 0 & 1  -> distance = 10.392786
           Length_cm  Mass_kg  Temp_C  ColorIntensity
sample_id                                            
0               10.0   1000.0    20.0             0.5
1               12.0   1001.0    22.0             0.6

  Farthest pair: samples 1 & 6  -> distance = 9910.270090
           Length_cm  Mass_kg  Temp_C  ColorIntensity
sample_id                                            
1               12.0   1001.0    22.0             0.6
6               80.0     10.0    -5.0             0.1

------------------------------------------------------------

Metric 2: Standardized Euclidean (z-score scaling)

Feature std deviations (used for Metric 2):
Length_cm          29.218313
Mass_kg           424.365892
Temp_C             17.011404
ColorIntensity      0.804643
dtype: float64

  Closest pair: samples 4 & 5  -> distance = 0.034463
           Length_cm  Mass_kg  Temp_C  ColorIntensity
sampl