In [37]:
# -----------------------------------------
# Step 1: Generate synthetic high-dimensional data
# -----------------------------------------

# Create a dataset with:
# - 300 samples (rows)
# - 6 features (columns)

# 🤖 DBSCAN Clustering on a Human-Like Synthetic Dataset
# This example creates fake "people" with features like height, weight, age, etc.,
# and clusters them into lifestyle groups using DBSCAN.

# -----------------------------------------
# 📦 Step 1: Import Required Libraries
# -----------------------------------------

import numpy as np  # 🔢 NumPy is the standard library for numerical operations in Python
from sklearn.datasets import make_classification  # 🔬 For generating synthetic, labeled datasets
from sklearn.cluster import DBSCAN  # 📊 The core DBSCAN algorithm from scikit-learn
from dbscan import DBSCANCluster  # 🧩 Your custom wrapper for sklearn's DBSCAN (defined elsewhere)

In [38]:
# -----------------------------------------
# 🧠 Step 2: Generate Raw Synthetic Data
# -----------------------------------------

# make_classification returns two arrays:
# - X_raw: features (NumPy array), shape = [n_samples, n_features]
# - the class labels, one per sample (never used for clustering; only the first few are printed below for reference)

# -----------------------------------------
# 📊 What is X_raw and where do its values come from?
# -----------------------------------------

# X_raw is the raw feature matrix generated by sklearn's make_classification().
# It contains 300 samples (rows), each with 6 features (columns), generated to simulate
# real-world classification data. However, we use it for clustering.

# 🔬 The values inside X_raw are not purely random. They're generated using
# multivariate Gaussian (normal) distributions with structured patterns.

# 🧠 Of the 6 total features:
# - 4 are "informative": they contain real signals that separate the (synthetic) classes.
# - 0 are "redundant": we didn’t ask for any features that are linear combinations of others.
# - The remaining 2 are pure random noise: they carry no information about the classes.

# 📈 Range of values:
# - Because the features are built from normal distributions, most values lie
#   within a few units of zero (roughly -3.0 to +3.0).
# - These are not hard limits. The "99.7% within ±3" rule only holds exactly for a
#   standard normal feature; the informative features are shifted and mixed per class,
#   so their spread can be somewhat wider.
# - Each feature may have a different mean and spread (std), depending on the class separation.

# ⚠️ Important: These values are not yet scaled to real-world interpretable units.
# That's why we later rescale them using np.interp() to represent things like:
#   - height (cm) → [150, 200]
#   - weight (kg) → [50, 100]
#   - age (years) → [18, 70]
#   - income (k/year) → [20, 120]
#   - etc.

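# 💡 Summary:
# - X_raw is structured fake data made to mimic real class-based clustering behavior.
# - It is ideal for testing clustering algorithms like DBSCAN.
# - Rescaling makes the features interpretable. Note that it does NOT make them comparable:
#   in distance-based methods such as DBSCAN, features with wider ranges (e.g. income, 20–120)
#   weigh more in the distance than narrow ones (e.g. exercise hours, 0–12).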


# Generate the raw feature matrix and the matching class labels.
# The labels get a real name (y_true) instead of `_`: they are printed below, and in
# IPython/Jupyter `_` is also the "last output" variable, so assigning to it is best avoided.
X_raw, y_true = make_classification(
    n_samples=300,          # 🧪 300 rows = 300 fake "people"
    n_features=6,           # 📐 Each person has 6 features (e.g. height, weight, etc.)
    n_informative=4,        # ✅ 4 features actually contribute to class separation
    n_redundant=0,          # 🔁 No duplicate or combined features
    n_clusters_per_class=1, # 🧩 Each class is generated as one cluster in feature space
    n_classes=3,            # 🎯 Total of 3 lifestyle classes (we ignore these for clustering)
    random_state=42         # 🔁 Fixes randomness for reproducibility (same data every run)
)

print(X_raw.shape)   # 🖨️ Confirm the shape: 300 samples with 6 features
print(X_raw[:5])     # 🖨️ First 5 rows of the generated data
print(y_true[:5])    # 🖨️ First 5 class labels (for reference only; not used in clustering)


(300, 6)
[[-1.70020487 -1.47399908 -2.53505051  0.52830303  1.25343421  2.55157169]
 [ 0.68656601  0.03577646 -2.02979925 -2.21322735 -1.11958064  1.62908285]
 [-1.45071192 -0.56028716 -0.2855498  -0.31675333  0.78710345  0.10678175]
 [-1.3277159   0.43763299 -1.30427838 -0.01581971 -0.55473821  0.99606491]
 [-0.43253226 -1.30982893 -0.56996467  1.3270781  -1.0736925   0.04971871]]
[2 1 2 1 0]


In [39]:
# -----------------------------------------
# ⚙️ Step 3: Rescale Features to Human-Like Ranges
# -----------------------------------------

# 🎯 Goal: turn every abstract column of X_raw into a quantity a human can read.
# Each column is stretched linearly from its own observed [min, max] onto a
# hand-picked target interval with NumPy's `np.interp()`.

# Target interval for each column, listed in column order.
human_ranges = [
    [150, 200],  # column 0: height in cm
    [50, 100],   # column 1: weight in kg
    [18, 70],    # column 2: age in years
    [20, 120],   # column 3: income in thousands/year
    [1, 100],    # column 4: spending score (1–100)
    [0, 12],     # column 5: weekly exercise hours
]

# Output buffer with the same shape/dtype as X_raw. np.empty_like leaves its
# contents uninitialized; the loop below fills in one column per listed range.
X = np.empty_like(X_raw)

for col, target in enumerate(human_ranges):
    raw_col = X_raw[:, col]                      # all rows of this column
    observed = (raw_col.min(), raw_col.max())    # old range
    X[:, col] = np.interp(raw_col, observed, target)

# 🔍 Every row of X now reads like one "person" with realistic features, e.g.:
#   [176.2 cm, 82.5 kg, 29 yrs, $55k/year, 75 score, 5.5 hrs/week]

In [40]:
# -----------------------------------------
# 🧪 Step 4: Apply DBSCAN Clustering
# -----------------------------------------

# DBSCAN hyper-parameters:
# - eps (epsilon): largest distance at which two samples still count as neighbours
# - min_samples: how many points are needed for a region to count as dense (i.e., a cluster)
dbscan_settings = {"eps": 25, "min_samples": 5}
clusterer = DBSCANCluster(**dbscan_settings)

# Fit on the rescaled feature matrix and collect the cluster labels,
# a 1D sequence such as: [0, 0, 1, 1, -1, ...]
labels = clusterer.fit_predict(X)

# 💡 Reading the labels:
# - Equal numbers mean "same cluster".
# - -1 marks a point DBSCAN considered to be noise (an outlier).

In [41]:
# -----------------------------------------
# 📤 Step 5: Print Clustering Results
# -----------------------------------------

# Show which distinct cluster labels were produced (a set drops the duplicates).
print("🏷️ Unique clusters found:", set(labels))
# Example output: {0, 1, -1}
# (NumPy scalar labels may be displayed as np.int64(0), np.int64(-1), ...)

# Count how many points were labeled as noise (-1)
print("❌ Noise points (label = -1):", list(labels).count(-1))

# 👉 You can use these labels to:
# - Group people by lifestyle
# - Flag outliers for further inspection

# NOTE: this cell deliberately leaves X untouched, so X still holds the rescaled
# features and the clustering cell can be re-run safely at any time.

🏷️ Unique clusters found: {np.int64(0), np.int64(-1)}
❌ Noise points (label = -1): 1
