In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.metrics import pairwise_distances

# -------------------------------
# STEP 1: Create example dataset
# -------------------------------
data = pd.DataFrame({
    "nominal": ["red", "blue", "green", "blue", "red"],
    "ordinal": ["low", "medium", "high", "low", "medium"],
    "numeric": [10.5, 12.0, 11.0, 13.5, 10.0]
})

# -------------------------------
# STEP 2: Encode Nominal
# (using Hamming distance)
# -------------------------------
nominal_encoded = pd.get_dummies(data["nominal"])  # One-hot
nominal_dist = pairwise_distances(nominal_encoded, metric="hamming")

# -------------------------------
# STEP 3: Encode Ordinal
# (using order-aware encoding)
# -------------------------------
# Define meaningful order
ordinal_order = [["low", "medium", "high"]]
ordinal_encoder = OrdinalEncoder(categories=ordinal_order)
ordinal_encoded = ordinal_encoder.fit_transform(data[["ordinal"]])

# Normalize to [0,1] range
ordinal_norm = MinMaxScaler().fit_transform(ordinal_encoded)
ordinal_dist = pairwise_distances(ordinal_norm, metric="euclidean")

# -------------------------------
# STEP 4: Normalize Numeric
# (using Euclidean)
# -------------------------------
numeric_norm = MinMaxScaler().fit_transform(data[["numeric"]])
numeric_dist = pairwise_distances(numeric_norm, metric="euclidean")

# -------------------------------
# STEP 5: Combine all distances
# -------------------------------
combined_dissimilarity = (nominal_dist + ordinal_dist + numeric_dist) / 3

# Convert to DataFrame
dissimilarity_df = pd.DataFrame(
    combined_dissimilarity,
    index=[f"Row{i}" for i in range(len(data))],
    columns=[f"Row{i}" for i in range(len(data))]
)

# Display
print("🔁 Combined Dissimilarity Matrix:")
print(dissimilarity_df.round(3))


🔁 Combined Dissimilarity Matrix:
       Row0   Row1   Row2   Row3   Row4
Row0  0.000  0.532  0.603  0.508  0.214
Row1  0.532  0.000  0.484  0.310  0.413
Row2  0.603  0.484  0.000  0.794  0.484
Row3  0.508  0.310  0.794  0.000  0.722
Row4  0.214  0.413  0.484  0.722  0.000


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.metrics import pairwise_distances
from sklearn.datasets import fetch_openml

# -----------------------------------
# STEP 1: Load dataset (from OpenML)
# -----------------------------------
adult = fetch_openml("adult", version=2, as_frame=True)
df = adult.frame

# -----------------------------------
# STEP 2: Select 3 features (1 of each type)
# -----------------------------------
data = df[["education", "sex", "age"]].dropna().head(10)  # small sample for clarity

# education: ordinal
# sex: nominal
# age: numeric

# -----------------------------------
# STEP 3: Encode nominal (sex)
# -----------------------------------
nominal_encoded = pd.get_dummies(data["sex"])
nominal_dist = pairwise_distances(nominal_encoded, metric="hamming")

# -----------------------------------
# STEP 4: Encode ordinal (education)
# Define order manually based on education hierarchy
# -----------------------------------
edu_order = [
    "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th",
    "12th", "HS-grad", "Some-college", "Assoc-voc", "Assoc-acdm",
    "Bachelors", "Masters", "Prof-school", "Doctorate"
]
ordinal_encoder = OrdinalEncoder(categories=[edu_order])
ordinal_encoded = ordinal_encoder.fit_transform(data[["education"]])
ordinal_scaled = MinMaxScaler().fit_transform(ordinal_encoded)
ordinal_dist = pairwise_distances(ordinal_scaled, metric="euclidean")

# -----------------------------------
# STEP 5: Normalize numeric (age)
# -----------------------------------
numeric_scaled = MinMaxScaler().fit_transform(data[["age"]])
numeric_dist = pairwise_distances(numeric_scaled, metric="euclidean")

# -----------------------------------
# STEP 6: Combine all three distances
# -----------------------------------
combined_dissimilarity = (nominal_dist + ordinal_dist + numeric_dist) / 3

# -----------------------------------
# STEP 7: Format and display
# -----------------------------------
dissimilarity_df = pd.DataFrame(
    combined_dissimilarity,
    index=[f"Row{i}" for i in range(len(data))],
    columns=[f"Row{i}" for i in range(len(data))]
)

print("🔁 Dissimilarity Matrix (using Adult Income data):")
print(dissimilarity_df.round(3))


  warn(


🔁 Dissimilarity Matrix (using Adult Income data):
       Row0   Row1   Row2   Row3   Row4   Row5   Row6   Row7   Row8   Row9
Row0  0.000  0.157  0.174  0.232  0.476  0.097  0.090  0.524  0.432  0.313
Row1  0.157  0.000  0.165  0.075  0.512  0.121  0.067  0.367  0.467  0.277
Row2  0.174  0.165  0.000  0.179  0.468  0.226  0.098  0.350  0.424  0.442
Row3  0.232  0.075  0.179  0.000  0.526  0.195  0.141  0.292  0.481  0.263
Row4  0.476  0.512  0.468  0.526  0.000  0.573  0.445  0.818  0.044  0.789
Row5  0.097  0.121  0.226  0.195  0.573  0.000  0.128  0.488  0.529  0.216
Row6  0.090  0.067  0.098  0.141  0.445  0.128  0.000  0.434  0.401  0.344
Row7  0.524  0.367  0.350  0.292  0.818  0.488  0.434  0.000  0.774  0.393
Row8  0.432  0.467  0.424  0.481  0.044  0.529  0.401  0.774  0.000  0.745
Row9  0.313  0.277  0.442  0.263  0.789  0.216  0.344  0.393  0.745  0.000
