# Лабораторная работа №5: Кластеризация

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification, make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score, davies_bouldin_score, calinski_harabasz_score

import warnings
warnings.filterwarnings('ignore')


In [None]:

# Three synthetic classification problems with 3, 4 and 5 classes
# (seeds 42, 43, 44); each entry is the (X, y) tuple returned by sklearn.
classification_sets = [
    make_classification(
        n_samples=300,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_clusters_per_class=1,
        n_classes=n_classes,
        random_state=seed,
    )
    for n_classes, seed in zip((3, 4, 5), (42, 43, 44))
]

# Two isotropic blob datasets with 3 and 4 centers (seeds 100, 101).
blob_sets = [
    make_blobs(n_samples=300, centers=n_centers, cluster_std=1.0, random_state=seed)
    for n_centers, seed in zip((3, 4), (100, 101))
]

datasets = [*classification_sets, *blob_sets]

# One scatter plot per dataset, points coloured by their true label.
fig, axs = plt.subplots(1, 5, figsize=(20, 4))
for i, (X, y) in enumerate(datasets):
    panel = axs[i]
    panel.scatter(X[:, 0], X[:, 1], c=y, cmap='Set1')
    panel.set_title(f'Dataset {i+1}')
plt.show()


In [None]:

# Make sure the file has been uploaded to Colab before running this cell.
df = pd.read_csv('/content/data_newKredit.csv')

# Remove the 'class' column when it exists; errors='ignore' makes the drop
# a no-op if the column is absent.
df_clean = df.drop(columns=['class'], errors='ignore')

# Standardise every remaining column (zero mean, unit variance).
scaler = StandardScaler()
scaler.fit(df_clean)
X_real = scaler.transform(df_clean)


In [None]:

def apply_clustering(X, method, **kwargs):
    """Fit the requested clustering algorithm on X and return its labels.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to the estimator's ``fit``.
    method : str
        One of ``'kmeans'``, ``'dbscan'``, ``'gmm'``, ``'affinity'`` or
        ``'agglomerative'``.
    **kwargs
        Extra keyword arguments forwarded to the estimator constructor.
        They override this function's defaults (3 clusters/components and
        ``random_state=42`` where the estimator is seeded here).

    Returns
    -------
    Cluster label per sample: ``labels_`` of the fitted estimator, or
    ``predict(X)`` for the Gaussian mixture.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'kmeans':
        # Defaults first so that caller-supplied kwargs win instead of
        # triggering a "multiple values for keyword argument" TypeError.
        params = {'n_clusters': 3, 'random_state': 42, **kwargs}
        return KMeans(**params).fit(X).labels_
    if method == 'dbscan':
        return DBSCAN(**kwargs).fit(X).labels_
    if method == 'gmm':
        params = {'n_components': 3, 'random_state': 42, **kwargs}
        return GaussianMixture(**params).fit(X).predict(X)
    if method == 'affinity':
        return AffinityPropagation(**kwargs).fit(X).labels_
    if method == 'agglomerative':
        params = {'n_clusters': 3, **kwargs}
        return AgglomerativeClustering(**params).fit(X).labels_

    # Fail loudly rather than returning None, which would only surface later
    # as a confusing error inside a metric or a plotting call.
    raise ValueError(
        f"Unknown clustering method {method!r}; expected one of "
        "'kmeans', 'dbscan', 'gmm', 'affinity', 'agglomerative'."
    )


In [None]:

# Scan k = 2..9 on the first synthetic dataset and record, for each k,
# the K-Means inertia and the silhouette score of the resulting labels.
X, _ = datasets[0]
k_values = range(2, 10)
inertias = []
silhouettes = []

for k in k_values:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X)
    inertias.append(model.inertia_)
    silhouettes.append(silhouette_score(X, model.labels_))

# Left panel: elbow curve; right panel: silhouette curve.
panels = [
    (inertias, 'Метод локтя', 'Inertia'),
    (silhouettes, 'Метод силуэта', 'Silhouette Score'),
]

plt.figure(figsize=(12, 4))
for position, (values, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(k_values, values, marker='o')
    plt.title(title)
    plt.xlabel('k')
    plt.ylabel(y_label)
plt.show()


In [None]:

def evaluate_clustering(X, labels, true_labels=None):
    """Compute clustering quality metrics for one labelling of X.

    Parameters
    ----------
    X : array-like
        Data the labels were produced for.
    labels : array-like
        Predicted cluster label of every sample.
    true_labels : array-like, optional
        Ground-truth labels. When given, the external metrics
        ('Adjusted Rand', 'NMI') are added to the result.

    Returns
    -------
    dict
        Keys 'Silhouette', 'Davies-Bouldin' and 'Calinski-Harabasz', plus
        'Adjusted Rand' and 'NMI' when ``true_labels`` is provided. The three
        internal metrics are NaN when the labelling has fewer than two
        distinct labels or as many distinct labels as samples.
    """
    labels = np.asarray(labels)
    n_samples = labels.shape[0]
    n_labels = np.unique(labels).size

    result = {}
    if 1 < n_labels < n_samples:
        result['Silhouette'] = silhouette_score(X, labels)
        result['Davies-Bouldin'] = davies_bouldin_score(X, labels)
        result['Calinski-Harabasz'] = calinski_harabasz_score(X, labels)
    else:
        # The internal metrics are undefined for a degenerate labelling (for
        # example a single cluster); report NaN instead of raising so that a
        # comparison over several algorithms can still complete.
        result['Silhouette'] = np.nan
        result['Davies-Bouldin'] = np.nan
        result['Calinski-Harabasz'] = np.nan

    if true_labels is not None:
        result['Adjusted Rand'] = adjusted_rand_score(true_labels, labels)
        result['NMI'] = normalized_mutual_info_score(true_labels, labels)
    return result


In [None]:

# Cluster the standardised real data into three groups with K-Means.
labels_real = KMeans(n_clusters=3, random_state=42).fit(X_real).labels_

# Attach the assignment to the unscaled frame and show the per-cluster mean
# of every column.
df_clean['Cluster'] = labels_real
df_clean.groupby('Cluster').mean()


In [None]:

class MyKMeans:
    """Minimal K-Means (Lloyd's algorithm) with random-sample initialisation.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iter : int
        Maximum number of assignment/update iterations.
    tol : float
        Iteration stops once the Frobenius norm of the centroid shift
        between two consecutive iterations drops below this value.
    random_state : int or None
        Seed for choosing the initial centroids; ``None`` gives a
        non-reproducible initialisation.

    Attributes set by ``fit``
    -------------------------
    centroids_ : ndarray of shape (n_clusters, n_features)
    labels_ : ndarray of shape (n_samples,)
        Index of the nearest centroid for every training sample.
    inertia_ : float
        Sum of squared distances of the samples to their assigned centroid.
    """

    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    def fit(self, X):
        """Estimate the centroids from X and return ``self``.

        Raises
        ------
        ValueError
            If X is not two-dimensional or has fewer samples than
            ``n_clusters``.
        """
        # Accept DataFrames and lists as well as ndarrays.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples = X.shape[0]
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} exceeds the number of "
                f"samples ({n_samples})"
            )

        # A private RandomState draws the same indices as seeding the global
        # generator would, without overwriting the caller's global RNG state.
        rng = np.random.RandomState(self.random_state)
        initial_idx = rng.choice(n_samples, self.n_clusters, replace=False)
        self.centroids_ = X[initial_idx]

        for _ in range(self.max_iter):
            labels = self.predict(X)
            new_centroids = self.centroids_.copy()
            for j in range(self.n_clusters):
                members = X[labels == j]
                # An empty cluster keeps its previous centroid; taking the
                # mean of zero rows would produce NaN and poison every
                # following distance computation.
                if len(members):
                    new_centroids[j] = members.mean(axis=0)
            if np.linalg.norm(self.centroids_ - new_centroids) < self.tol:
                break
            self.centroids_ = new_centroids

        # Recompute the assignment against the final centroids so labels_ and
        # inertia_ are consistent with centroids_ even when the loop stopped
        # on max_iter (or never ran because max_iter == 0).
        self.labels_ = self.predict(X)
        self.inertia_ = np.sum((X - self.centroids_[self.labels_]) ** 2)
        return self

    def predict(self, X):
        """Return the index of the nearest centroid for every row of X."""
        X = np.asarray(X, dtype=float)
        # Broadcasting yields an (n_samples, n_clusters) matrix of Euclidean
        # distances between every sample and every centroid.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids_, axis=2)
        return np.argmin(distances, axis=1)


In [None]:

# Fit the hand-written K-Means on the standardised real data.
my_kmeans = MyKMeans(n_clusters=3, random_state=42)
my_kmeans.fit(X_real)

# Plot the first two feature columns coloured by cluster, with the
# centroids overlaid as black crosses.
first_feature = X_real[:, 0]
second_feature = X_real[:, 1]
centers = my_kmeans.centroids_

plt.scatter(first_feature, second_feature, c=my_kmeans.labels_, cmap='Set1')
plt.scatter(centers[:, 0], centers[:, 1], c='black', marker='x', s=100)
plt.title("Собственная реализация K-Means")
plt.show()

print(f"Inertia: {my_kmeans.inertia_}")
