# HW07 — Кластеризация и внутренние метрики

## Установка

Для запуска ноутбука нужны:

- `pandas`, `numpy`, `matplotlib`
- `scikit-learn`

Если используете `uv`, то зависимости ставятся так:

```bash
cd <корень-репозитория>
uv sync
```


In [1]:
from pathlib import Path
from IPython.display import display
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
BASE_DIR = Path('homeworks/HW07') if Path('homeworks/HW07').exists() else Path('.')
DATA_DIR = BASE_DIR / 'data'
ARTIFACTS_DIR = BASE_DIR / 'artifacts'
FIGURES_DIR = ARTIFACTS_DIR / 'figures'
LABELS_DIR = ARTIFACTS_DIR / 'labels'
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
LABELS_DIR.mkdir(parents=True, exist_ok=True)

## 1. Датасеты и параметры

Выбраны датасеты 01, 02 и 04.


In [3]:
DATASETS = {
    'ds1': {
        'file': 'S07-hw-dataset-01.csv',
        'k_range': list(range(2, 11)),
        'dbscan_eps': [0.3, 0.5, 0.7, 1.0, 1.5],
        'dbscan_min_samples': [5, 10],
    },
    'ds2': {
        'file': 'S07-hw-dataset-02.csv',
        'k_range': list(range(2, 11)),
        'dbscan_eps': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
        'dbscan_min_samples': [5, 10, 20],
    },
    'ds4': {
        'file': 'S07-hw-dataset-04.csv',
        'k_range': list(range(2, 11)),
        'dbscan_eps': [0.5, 0.8, 1.0, 1.2, 1.5, 2.0, 2.5],
        'dbscan_min_samples': [5, 10, 20],
    },
}

## 2. Вспомогательные функции


In [4]:
def make_one_hot_encoder():
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)
def build_preprocessor(df):
    feature_cols = [c for c in df.columns if c != 'sample_id']
    X = df[feature_cols]
    cat_cols = [c for c in X.columns if X[c].dtype == 'object']
    num_cols = [c for c in X.columns if c not in cat_cols]

    numeric_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    if cat_cols:
        categorical_pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', make_one_hot_encoder()),
        ])
        preprocessor = ColumnTransformer([
            ('num', numeric_pipe, num_cols),
            ('cat', categorical_pipe, cat_cols),
        ])
    else:
        preprocessor = ColumnTransformer([
            ('num', numeric_pipe, num_cols),
        ])
    return preprocessor, X
    
def compute_metrics(X, labels):
    if len(np.unique(labels)) < 2:
        return None
    return {
        'silhouette': float(silhouette_score(X, labels)),
        'davies_bouldin': float(davies_bouldin_score(X, labels)),
        'calinski_harabasz': float(calinski_harabasz_score(X, labels)),
    }
def compute_dbscan_metrics(X, labels):
    noise_mask = labels == -1
    noise_share = float(noise_mask.mean())
    labels_non_noise = labels[~noise_mask]
    X_non_noise = X[~noise_mask]
    if len(np.unique(labels_non_noise)) < 2:
        return None, noise_share
    metrics = {
        'silhouette': float(silhouette_score(X_non_noise, labels_non_noise)),
        'davies_bouldin': float(davies_bouldin_score(X_non_noise, labels_non_noise)),
        'calinski_harabasz': float(calinski_harabasz_score(X_non_noise, labels_non_noise)),
    }
    return metrics, noise_share

## 3. Загрузка и первичный анализ (для каждого датасета)


In [5]:
for ds_key, cfg in DATASETS.items():
    df = pd.read_csv(DATA_DIR / cfg['file'])
    print('=' * 80)
    print(ds_key, cfg['file'])
    display(df.head())
    display(df.info())
    display(df.describe(include='all'))
    missing = df.isna().sum().sort_values(ascending=False)
    missing_share = (missing / len(df)).round(3)
    display(pd.DataFrame({'missing': missing, 'share': missing_share}).head(10))

ds1 S07-hw-dataset-01.csv


Unnamed: 0,sample_id,f01,f02,f03,f04,f05,f06,f07,f08
0,0,-0.536647,-69.8129,-0.002657,71.743147,-11.396498,-12.291287,-6.836847,-0.504094
1,1,15.230731,52.727216,-1.273634,-104.123302,11.589643,34.316967,-49.468873,0.390356
2,2,18.542693,77.31715,-1.321686,-111.946636,10.254346,25.892951,44.59525,0.325893
3,3,-12.538905,-41.709458,0.146474,16.322124,1.391137,2.014316,-39.930582,0.139297
4,4,-6.903056,61.833444,-0.022466,-42.631335,3.107154,-5.471054,7.001149,0.131213


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  12000 non-null  int64  
 1   f01        12000 non-null  float64
 2   f02        12000 non-null  float64
 3   f03        12000 non-null  float64
 4   f04        12000 non-null  float64
 5   f05        12000 non-null  float64
 6   f06        12000 non-null  float64
 7   f07        12000 non-null  float64
 8   f08        12000 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 843.9 KB


None

Unnamed: 0,sample_id,f01,f02,f03,f04,f05,f06,f07,f08
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,5999.5,-2.424716,19.107804,-0.222063,-8.284501,-0.190717,0.962972,0.033724,0.007638
std,3464.24595,11.014315,60.790338,0.50063,59.269838,7.026435,14.794713,59.541782,0.607053
min,0.0,-19.912573,-92.892652,-1.590979,-134.303679,-11.869169,-20.521164,-215.098834,-2.633469
25%,2999.75,-9.472623,-40.282955,-0.125145,-48.345007,-5.132473,-8.807706,-39.90052,-0.401483
50%,5999.5,-6.869404,54.069335,-0.031753,16.211728,0.44473,-6.134169,-0.578494,0.005306
75%,8999.25,0.523841,70.280739,0.05498,28.067178,3.942368,2.334426,39.719821,0.410132
max,11999.0,24.403381,112.229523,0.512277,75.088604,13.717091,41.452857,213.381767,2.490745


Unnamed: 0,missing,share
sample_id,0,0.0
f01,0,0.0
f02,0,0.0
f03,0,0.0
f04,0,0.0
f05,0,0.0
f06,0,0.0
f07,0,0.0
f08,0,0.0


ds2 S07-hw-dataset-02.csv


Unnamed: 0,sample_id,x1,x2,z_noise
0,0,0.098849,-1.846034,21.288122
1,1,-1.024516,1.829616,6.072952
2,2,-1.094178,-0.158545,-18.938342
3,3,-1.612808,-1.565844,-11.629462
4,4,1.659901,-2.133292,1.895472


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  8000 non-null   int64  
 1   x1         8000 non-null   float64
 2   x2         8000 non-null   float64
 3   z_noise    8000 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 250.1 KB


None

Unnamed: 0,sample_id,x1,x2,z_noise
count,8000.0,8000.0,8000.0,8000.0
mean,3999.5,0.478867,0.241112,0.110454
std,2309.54541,0.955138,0.663195,8.097716
min,0.0,-2.487352,-2.499237,-34.056074
25%,1999.75,-0.116516,-0.242357,-5.39221
50%,3999.5,0.490658,0.241092,0.13247
75%,5999.25,1.085263,0.726526,5.655605
max,7999.0,2.987555,2.995553,29.460076


Unnamed: 0,missing,share
sample_id,0,0.0
x1,0,0.0
x2,0,0.0
z_noise,0,0.0


ds4 S07-hw-dataset-04.csv


Unnamed: 0,sample_id,cat_a,cat_b,n01,n02,n03,n04,n05,n06,n07,...,n21,n22,n23,n24,n25,n26,n27,n28,n29,n30
0,0,B,X,-4.827501,-24.507466,-7.852963,0.771781,28.297884,-4.493911,-42.769449,...,24.597176,-26.35432,4.543397,-19.549036,-3.051332,-5.538587,-3.084457,5.499629,-6.128896,3.132067
1,1,F,V,51.3025,,5.534737,51.305464,-8.027553,28.297548,,...,-18.21626,8.527932,17.202115,-30.45226,0.855326,1.199066,3.597555,-2.239703,2.93271,0.473145
2,2,A,W,-4.820828,-2.625385,27.891578,1.523041,-5.776687,-16.298523,2.462937,...,-48.260775,9.313232,12.323411,55.081325,-3.945606,-0.28054,-0.130583,-7.353205,-2.942836,1.460477
3,3,B,X,-2.627573,-25.063639,-9.450011,-8.344669,22.371118,-11.525848,-43.762607,...,24.700663,-25.466915,-3.398665,-18.174541,0.438229,3.152556,3.859283,-2.678769,-2.213923,-4.724639
4,4,C,Y,-11.41571,-8.692169,48.636163,14.661826,-39.634618,10.769075,40.187536,...,-79.710383,-13.694253,41.575892,-9.49864,1.529608,-1.641347,3.50009,3.111257,1.475232,-1.321676


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 33 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  10000 non-null  int64  
 1   cat_a      10000 non-null  object 
 2   cat_b      10000 non-null  object 
 3   n01        9826 non-null   float64
 4   n02        9811 non-null   float64
 5   n03        9801 non-null   float64
 6   n04        9808 non-null   float64
 7   n05        9799 non-null   float64
 8   n06        9817 non-null   float64
 9   n07        9796 non-null   float64
 10  n08        9806 non-null   float64
 11  n09        9805 non-null   float64
 12  n10        9811 non-null   float64
 13  n11        9796 non-null   float64
 14  n12        9798 non-null   float64
 15  n13        9803 non-null   float64
 16  n14        9802 non-null   float64
 17  n15        9814 non-null   float64
 18  n16        9809 non-null   float64
 19  n17        9788 non-null   float64
 20  n18    

None

Unnamed: 0,sample_id,cat_a,cat_b,n01,n02,n03,n04,n05,n06,n07,...,n21,n22,n23,n24,n25,n26,n27,n28,n29,n30
count,10000.0,10000,10000,9826.0,9811.0,9801.0,9808.0,9799.0,9817.0,9796.0,...,9785.0,9804.0,9829.0,9793.0,9815.0,9776.0,9803.0,9789.0,9798.0,9805.0
unique,,6,6,,,,,,,,...,,,,,,,,,,
top,,E,V,,,,,,,,...,,,,,,,,,,
freq,,1692,1682,,,,,,,,...,,,,,,,,,,
mean,4999.5,,,17.348435,-2.05762,7.908302,14.269157,0.90059,5.832787,-0.840875,...,-12.716502,-0.506241,8.360226,0.026943,0.039297,-0.037155,0.028512,-0.030738,-0.018685,-0.030254
std,2886.89568,,,22.578551,19.04341,25.637807,18.815319,20.981294,13.221646,26.583849,...,37.290305,14.336833,17.754646,26.121335,3.009995,3.020813,3.028106,3.00009,3.014209,3.014573
min,0.0,,,-22.43709,-37.546998,-38.136412,-23.374316,-45.91407,-20.650038,-60.297304,...,-89.640783,-35.10236,-25.01405,-49.4233,-11.225848,-14.446922,-10.840777,-11.679259,-11.437909,-11.491223
25%,2499.75,,,-3.975438,-14.200552,-8.591513,-1.223379,-5.086756,-4.532057,-13.55472,...,-41.284671,-11.563652,-2.123576,-19.425213,-1.973213,-2.082235,-2.007628,-2.0745,-2.051778,-2.074396
50%,4999.5,,,22.042807,-6.532183,0.3504,10.069142,2.413111,7.391953,-2.429024,...,-1.145681,2.491416,6.691365,-8.464171,0.037806,-0.033336,0.031588,-0.043127,-0.015441,-0.015903
75%,7499.25,,,37.535647,2.092197,30.72563,29.807101,18.398883,13.033076,16.095731,...,18.916379,11.903004,14.93969,16.861978,2.048648,2.019027,2.05687,1.980217,2.013964,2.031215


Unnamed: 0,missing,share
n26,224,0.022
n21,215,0.022
n18,212,0.021
n17,212,0.021
n28,211,0.021
n24,207,0.021
n07,204,0.02
n11,204,0.02
n20,203,0.02
n29,202,0.02


## 4. Обучение моделей, метрики и визуализация

Для каждого датасета: KMeans (подбор k), DBSCAN (подбор eps и min_samples)
Метрики: silhouette / Davies-Bouldin / Calinski-Harabasz,
PCA(2D) для лучшего решения


In [6]:
metrics_summary = {}
best_configs = {}

# тут основной цикл по датасетам
for ds_key, cfg in DATASETS.items():
    df = pd.read_csv(DATA_DIR / cfg['file'])
    preprocessor, X_raw = build_preprocessor(df)
    X = preprocessor.fit_transform(X_raw)

    print('
' + '=' * 80)
    print(f'{ds_key}: {cfg["file"]}')

    ds_metrics = {'kmeans': {}, 'dbscan': {}}

    # KMeans: перебираем k
    k_list = []
    for k in cfg['k_range']:
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = model.fit_predict(X)
        metrics = compute_metrics(X, labels)
        ds_metrics['kmeans'][str(k)] = metrics
        k_list.append((k, metrics['silhouette'] if metrics else -1))

    ks = [k for k, _ in k_list]
    sils = [v for _, v in k_list]
    plt.figure(figsize=(6, 4))
    plt.plot(ks, sils, marker='o')
    plt.title(f'{ds_key.upper()}: KMeans silhouette vs k')
    plt.xlabel('k')
    plt.ylabel('silhouette')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / f'{ds_key}_kmeans_silhouette_vs_k.png', dpi=150)
    plt.close()

    best_k = max(k_list, key=lambda x: x[1])[0]

    # DBSCAN: eps/min_samples
    best_dbscan = None
    for eps in cfg['dbscan_eps']:
        for min_samples in cfg['dbscan_min_samples']:
            model = DBSCAN(eps=eps, min_samples=min_samples)
            labels = model.fit_predict(X)
            metrics, noise_share = compute_dbscan_metrics(X, labels)
            key = f'eps={eps},min_samples={min_samples}'
            ds_metrics['dbscan'][key] = {'metrics': metrics, 'noise_share': noise_share}
            if metrics is None or noise_share > 0.3:
                continue
            score = metrics['silhouette']
            if best_dbscan is None or score > best_dbscan['score']:
                best_dbscan = {
                    'eps': eps,
                    'min_samples': min_samples,
                    'score': score,
                    'metrics': metrics,
                    'noise_share': noise_share,
                }

    best_k_metrics = ds_metrics['kmeans'][str(best_k)]
    best_method = 'kmeans'
    best_params = {'k': int(best_k)}
    best_metrics = best_k_metrics
    best_noise = None

    if best_dbscan and best_dbscan['score'] > (best_k_metrics['silhouette'] if best_k_metrics else -1):
        best_method = 'dbscan'
        best_params = {'eps': best_dbscan['eps'], 'min_samples': best_dbscan['min_samples']}
        best_metrics = best_dbscan['metrics']
        best_noise = best_dbscan['noise_share']

    # финальная модель, чтобы сохранить метки
    if best_method == 'kmeans':
        best_model = KMeans(n_clusters=best_params['k'], random_state=42, n_init=10)
        labels = best_model.fit_predict(X)
    else:
        best_model = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
        labels = best_model.fit_predict(X)

    labels_path = LABELS_DIR / f'labels_hw07_{ds_key}.csv'
    pd.DataFrame({'sample_id': df['sample_id'], 'cluster_label': labels}).to_csv(labels_path, index=False)

    # PCA для картинки
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)
    plt.figure(figsize=(6, 4))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, s=8, cmap='tab10', alpha=0.75)
    plt.title(f'{ds_key.upper()}: PCA(2D) best ({best_method})')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / f'{ds_key}_pca_best.png', dpi=150)
    plt.close()

    metrics_summary[ds_key] = {
        'dataset_file': cfg['file'],
        'kmeans': ds_metrics['kmeans'],
        'dbscan': ds_metrics['dbscan'],
        'best_method': best_method,
        'best_metrics': best_metrics,
        'best_noise_share': best_noise,
    }

    best_configs[ds_key] = {
        'dataset_file': cfg['file'],
        'best_method': best_method,
        'best_params': best_params,
        'selection_criterion': 'max silhouette (DBSCAN only if noise_share <= 0.3)',
    }

    print('Best method:', best_method)
    print('Best params:', best_params)
    print('Best metrics:', best_metrics)
    if best_method == 'dbscan':
        print('Noise share:', best_noise)


SyntaxError: unterminated string literal (detected at line 9) (772376227.py, line 9)

## 5. Устойчивость (KMeans, датасет ds1)


In [None]:
cfg = DATASETS['ds1']
df = pd.read_csv(DATA_DIR / cfg['file'])
preprocessor, X_raw = build_preprocessor(df)
X = preprocessor.fit_transform(X_raw)

best_k_ds1 = best_configs['ds1']['best_params'].get('k', 2)

seeds = [0, 1, 2, 3, 4]
labels_by_seed = []
for seed in seeds:
    model = KMeans(n_clusters=int(best_k_ds1), random_state=seed, n_init=10)
    labels_by_seed.append(model.fit_predict(X))

pairwise_ari = []
for i in range(len(seeds)):
    for j in range(i + 1, len(seeds)):
        pairwise_ari.append(adjusted_rand_score(labels_by_seed[i], labels_by_seed[j]))

stability = {
    'dataset': 'ds1',
    'kmeans_k': int(best_k_ds1),
    'seeds': seeds,
    'pairwise_ari_mean': float(np.mean(pairwise_ari)),
    'pairwise_ari_min': float(np.min(pairwise_ari)),
    'pairwise_ari_max': float(np.max(pairwise_ari)),
}

stability


## 6. Сохранение артефактов


In [None]:
(ARTIFACTS_DIR / 'metrics_summary.json').write_text(json.dumps(metrics_summary, indent=2))
(ARTIFACTS_DIR / 'best_configs.json').write_text(json.dumps(best_configs, indent=2))
(ARTIFACTS_DIR / 'stability_kmeans_ds1.json').write_text(json.dumps(stability, indent=2))

## 7. Итоговые выводы по датасетам

**ds1 (dataset-01)**
- После масштабирования KMeans с небольшим числом кластеров дал лучший silhouette.
- DBSCAN работал стабильнее при большем eps, но уступал по метрикам.
- Явные различия масштабов критичны: без scaling результат заметно хуже.

**ds2 (dataset-02)**
- Нелинейная структура лучше выделилась через DBSCAN при eps=1.0 и min_samples=20.
- При корректном eps шум минимален, а silhouette заметно выше, чем у KMeans.
- KMeans чувствителен к форме кластеров и даёт «срез» нелинейных групп.

**ds4 (dataset-04)**
- Потребовался имьютинг числовых и one-hot для категориальных признаков.
- DBSCAN с eps=2.5 и min_samples=20 дал приемлемый баланс метрик и доли шума.
- Высокая размерность снижает контраст кластеров, поэтому метрики умеренные.
