TP : Implémentattion de l'algorithme de nuée dynamique en Python

In [36]:
import numpy as np
import pandas as pd

In [None]:
import numpy as np
import pandas as pd

class DynamicClusteringGeneralized:
    def __init__(self, num_clusters, numeric_features, categorical_features,
                 max_iterations=100, tolerance=1e-4):

        self.num_clusters = num_clusters
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.max_iterations = max_iterations
        self.tolerance = tolerance

        self.col_order = None
        self.numeric_idx = None
        self.categorical_idx = None
        
        self.cluster_prototypes = None

    # -------------------------
    #  PREPARE DATA
    # -------------------------
    def _prepare_data(self, data):

        df = pd.DataFrame(data)
        self.col_order = list(df.columns)

        self.numeric_idx = [self.col_order.index(col) for col in self.numeric_features]
        self.categorical_idx = [self.col_order.index(col) for col in self.categorical_features]

        return df

    # -------------------------
    #  GOWER DISTANCE
    # -------------------------
    def _gower_distance(self, x, y):

        dist = 0
        count = 0

        # Numerical part
        for idx in self.numeric_idx:
            rng = self.numeric_ranges[idx]
            dist += abs(x[idx] - y[idx]) / (rng + 1e-9)
            count += 1

        # Categorical part
        for idx in self.categorical_idx:
            dist += 0 if x[idx] == y[idx] else 1
            count += 1

        return dist / count

    # -------------------------
    #  PROTOTYPES IN ORIGINAL COLUMN ORDER
    # -------------------------
    def _compute_prototypes(self, df, assignments):

        prototypes = []

        for k in range(self.num_clusters):
            cluster_points = df[assignments == k]

            if cluster_points.empty:
                # sample one row and convert to native Python types
                sampled = df.sample(1).iloc[0]
                proto = []
                for col in self.col_order:
                    if col in self.numeric_features:
                        val = sampled[col]
                        if pd.isna(val):
                            proto.append(None)
                        else:
                            f = float(val)
                            proto.append(int(f) if f.is_integer() else f)
                    elif col in self.categorical_features:
                        proto.append(sampled[col])
                prototypes.append(np.array(proto, dtype=object))
                continue

            proto = []

            for col in self.col_order:
                if col in self.numeric_features:
                    val = cluster_points[col].mean()
                    # convert numpy numeric to native Python int/float
                    if pd.isna(val):
                        proto.append(None)
                    else:
                        f = float(val)
                        proto.append(int(f) if f.is_integer() else f)
                elif col in self.categorical_features:
                    mode_val = cluster_points[col].mode()[0]
                    proto.append(mode_val)

            prototypes.append(np.array(proto, dtype=object))

        return np.array(prototypes)

    # -------------------------
    #  ASSIGN POINTS
    # -------------------------
    def _assign_clusters(self, df, prototypes):

        assignments = []

        for row in df.values:
            dists = [self._gower_distance(row, proto) for proto in prototypes]
            assignments.append(np.argmin(dists))

        return np.array(assignments)

    # -------------------------
    #  FIT
    # -------------------------
    def fit(self, data):

        df = self._prepare_data(data)

        # Compute numeric ranges
        self.numeric_ranges = {
            self.col_order.index(col): df[col].max() - df[col].min()
            for col in self.numeric_features
        }

        assignments = np.random.randint(0, self.num_clusters, size=len(df))

        for it in range(self.max_iterations):

            new_prototypes = self._compute_prototypes(df, assignments)
            new_assignments = self._assign_clusters(df, new_prototypes)

            if np.array_equal(assignments, new_assignments):
                print(f"Convergence reached at iteration {it}")
                break

            assignments = new_assignments
            self.cluster_prototypes = new_prototypes

        return assignments


In [26]:
# Example usage
data = np.array([
    [1, 2], [1, 3], [2, 2],
    [8, 8], [9, 8], [8, 9]
])

model = DynamicClustering(num_clusters=2)
assignments = model.fit(data)

print("Cluster Assignments:", assignments)
print("Cluster Centers:", model.cluster_centers)


Convergence reached at iteration 1
Cluster Assignments: [1 1 1 0 0 0]
Cluster Centers: [[6.         6.33333333]
 [3.66666667 4.33333333]]


Je vais maintenant mettre en lumière la différence entre nuée dynamique et k-means par un exemple


In [5]:
pip install pandas


Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 744.3 kB/s eta 0:00:15
   - -------------------------------------- 0.5/11.0 MB 744.3 kB/s eta 0:00:15
   - -------------------------------------- 0.5/11.0 MB 744.3 kB/s eta 0:00:15
   - -------------------------------------- 0.5/11.0 MB 744.3 kB/s eta 0:00:15
   - -------------------------------------- 0.5/11.0 MB 744.3 kB/s eta 0:00:15
   - --------------------------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
import pandas as pd

data = pd.DataFrame({
    "age": [20, 45, 30, 50],
    "income": [1500, 4000, 2200, 5000],
    "children": [0, 2, 1, 3],
    "housing": ["rent", "own", "rent", "own"],
    "transport": ["bus", "car", "bus", "car"]
})

print(data)


   age  income  children housing transport
0   20    1500         0    rent       bus
1   45    4000         2     own       car
2   30    2200         1    rent       bus
3   50    5000         3     own       car


In [41]:
numeric_cols = ["age", "income", "children"]
categorical_cols = ["housing", "transport"]

model = DynamicClusteringGeneralized(
    num_clusters=2,
    numeric_features=numeric_cols,
    categorical_features=categorical_cols
)

assignments = model.fit(data)

print("Assignments:", assignments)
print("Prototypes:", model.cluster_prototypes)


Convergence reached at iteration 1
Assignments: [0 1 0 1]
Prototypes: [[np.float64(35.0) np.float64(3250.0) np.float64(1.5) 'own' 'bus']
 [np.float64(37.5) np.float64(3100.0) np.float64(1.5) 'own' 'bus']]
