<div style="font-size:18pt; padding-top:20px; text-align:center">СЕМИНАР. <b>Кластеризация и </b> <span style="font-weight:bold; color:green">NumPy/SciPy/Sklearn</span></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin.study@yandex.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Генерация данных</a></li>
        <li><a href="#2">Метод k-средних (KMeans)</a></li>
        <li><a href="#3">Иерархическая кластеризация (Agglomerative Clustering)</a></li>
        <li><a href="#4">Кластеризация по плотности (DBSCAN)</a></li>
        <li><a href="#5">Источники</a></li>
    </ol>
</div>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

from sklearn.datasets import make_blobs

%matplotlib inline

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
# Five-colour discrete palette shared by every labelled scatter plot in the notebook.
clrMap = ListedColormap(
    colors=["blue", "red", "green", "yellow", "purple"],
)

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Генерация данных</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
from sklearn import datasets

In [None]:
n = 500  # sample count handed to the dataset generators in the cells below

In [None]:
# Synthetic 2-D, three-class dataset; the result is kept as a (features, labels) tuple.
cl_class = datasets.make_classification(
    n_samples=n,
    n_classes=3,
    n_clusters_per_class=1,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    class_sep=2,
    random_state=1234,
)

In [None]:
# Left panel: raw points. Right panel: the same points coloured by their generated class.
plt.figure(figsize=[10, 4])
for panel, scatter_kwargs in enumerate([{}, {"c": cl_class[1], "cmap": clrMap}], start=1):
    plt.subplot(1, 2, panel)
    plt.title("Class")
    plt.scatter(cl_class[0][:, 0], cl_class[0][:, 1], **scatter_kwargs)
    plt.grid(True)

plt.show()

In [None]:
# Blob dataset kept as a tuple: element 0 is plotted as 2-D points, element 1 is used for colouring below.
cl_blobs = datasets.make_blobs(n_samples=n, random_state=1000)

In [None]:
# Two side-by-side views of the blobs: unlabelled first, then coloured by true label.
plt.figure(figsize=[10, 4])
for position in (1, 2):
    plt.subplot(1, 2, position)
    plt.title("Blobs")
    if position == 1:
        plt.scatter(cl_blobs[0][:, 0], cl_blobs[0][:, 1])
    else:
        plt.scatter(cl_blobs[0][:, 0], cl_blobs[0][:, 1], c=cl_blobs[1], cmap=clrMap)
    plt.grid(True)

plt.show()

In [None]:
# Noisy two-moons dataset as a (features, labels) tuple.
# random_state is fixed so that Restart & Run All reproduces the same points
# (the previous call was unseeded and produced different data on every run).
cl_moons = datasets.make_moons(n_samples=n, noise=0.1, random_state=1234)

In [None]:
# Two views of the moons dataset: unlabelled, then coloured by true label.
# The panels used to be titled "Circles" (copy-paste slip); they now name the dataset shown.
plt.figure(figsize=[10, 4])

plt.subplot(1,2,1)
plt.title("Moons")
plt.scatter(cl_moons[0][:,0], cl_moons[0][:,1])
plt.grid(True)

plt.subplot(1,2,2)
plt.title("Moons")
plt.scatter(cl_moons[0][:,0], cl_moons[0][:,1], c=cl_moons[1], cmap=clrMap)
plt.grid(True)

plt.show()

In [None]:
# Noisy concentric-circles dataset as a (features, labels) tuple.
# random_state is fixed so the generated points are identical on every run
# (the previous call was unseeded).
cl_circles = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=1234)

In [None]:
# Circles dataset twice: plain on the left, coloured by true label on the right.
plt.figure(figsize=[10, 4])
for panel, colour_args in ((1, {}), (2, {"c": cl_circles[1], "cmap": clrMap})):
    plt.subplot(1, 2, panel)
    plt.title("Circles")
    plt.scatter(cl_circles[0][:, 0], cl_circles[0][:, 1], **colour_args)
    plt.grid(True)

plt.show()

In [None]:
# Unstructured random points with no labels, kept as a (features, None) tuple
# to mirror the shape of the other datasets.
# A dedicated seeded generator makes the cell reproducible without touching NumPy's global RNG.
rand = np.random.RandomState(1234).rand(n, 2), None

In [None]:
# Two views of the random points. The titles used to say "Circles" (copy-paste slip).
plt.figure(figsize=[10, 4])

rand_points, rand_labels = rand
# This dataset has no labels; only pass colour arguments when labels exist, because
# giving matplotlib a cmap together with c=None makes it emit a "cmap will be ignored" warning.
label_kwargs = {} if rand_labels is None else {"c": rand_labels, "cmap": clrMap}

plt.subplot(1,2,1)
plt.title("Random")
plt.scatter(rand_points[:,0], rand_points[:,1])
plt.grid(True)

plt.subplot(1,2,2)
plt.title("Random")
plt.scatter(rand_points[:,0], rand_points[:,1], **label_kwargs)
plt.grid(True)

plt.show()

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Метод k-средних</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<p><a href="http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html">KMeans</a></p>

In [None]:
# Seeded blob dataset with three centres, used for the step-by-step KMeans walkthrough.
n, random_state = 1000, 100
X, y = make_blobs(
    n_samples=n,
    centers=3,
    cluster_std=1,
    center_box=(-5, 5),
    random_state=random_state,
)

In [None]:
# Unlabelled view of the generated points.
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.grid(True)
plt.title("Initial data")
plt.show()

In [None]:
# KMeans for 3 clusters: random centroid initialisation, a single run (n_init=1), fixed seed.
kM_cl = KMeans(
    n_clusters=3,
    init="random",
    n_init=1,
    max_iter=300,
    random_state=10,
)

In [None]:
# Fit the estimator on X; as the last expression of the cell, the call's result is displayed.
kM_cl.fit(X)

In [None]:
# Display the fitted model's inertia_ (per the scikit-learn docs: within-cluster sum of squared distances).
kM_cl.inertia_

In [None]:
# Cluster index for every row of X; used to colour the scatter plot in the next cell.
y_pred = kM_cl.predict(X)

In [None]:
# Points coloured by the cluster KMeans assigned them to.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.title("Number of clusters: 3")
plt.show()

In [None]:
# Same 3-cluster setup but capped at max_iter=1 with a different seed —
# presumably to show what a not-yet-converged partition looks like (confirm intent).
y_pred = KMeans(n_clusters=3, max_iter=1, init="random", random_state=100, n_init=1).fit_predict(X)

In [None]:
# Cluster assignment produced by the single-iteration KMeans run.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.title("Number of clusters: 3")
plt.grid(True)
plt.show()

In [None]:
# Ask for fewer clusters (2) than the 3 centres the data was generated with.
y_pred = KMeans(n_clusters=2, random_state=10, n_init=1).fit_predict(X)

In [None]:
# Result of forcing the data into two clusters.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.title("Number of clusters: 2")
plt.show()

In [None]:
# Ask for more clusters (5) than the 3 centres the data was generated with.
y_pred = KMeans(n_clusters=5, random_state=10, n_init=1).fit_predict(X)

In [None]:
# Result of forcing the data into five clusters.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.title("Number of clusters: 5")
plt.grid(True)
plt.show()

<p>Примеры</p>

In [None]:
n = 500  # sample count for the example datasets generated in the following cells

In [None]:
# Seeded 2-D, three-class classification dataset used as the first KMeans example.
X, y = datasets.make_classification(
    n_samples=n,
    n_classes=3,
    n_clusters_per_class=1,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    class_sep=2,
    random_state=1234,
)

In [None]:
# Unlabelled input points, drawn in the left half of a 1x2 grid.
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.title("Class")
plt.grid(True)

plt.show()

In [None]:
# Fit a 3-cluster KMeans (fit() returns the estimator itself), then label every point.
kM_cl = KMeans(
    n_clusters=3, init="random", n_init=1, max_iter=300, random_state=10
).fit(X)
y_pred = kM_cl.predict(X)

In [None]:
# KMeans assignment for the classification dataset.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.title("Number of clusters: 3")
plt.show()

In [None]:
# Second example: seeded blob dataset; X holds the points, y the generated labels.
X, y = datasets.make_blobs(n_samples=n, random_state=1000)

In [None]:
# Unlabelled view of the blob dataset.
# The title used to read "Class" (left over from the previous example); it now names the data shown.
plt.figure(figsize=[10, 4])

plt.subplot(1,2,1)
plt.title("Blobs")
plt.scatter(X[:,0], X[:,1])
plt.grid(True)

plt.show()

In [None]:
# 3-cluster KMeans on the blob data; fit() hands back the estimator, predict() labels the points.
kM_cl = KMeans(
    n_clusters=3, init="random", n_init=1, max_iter=300, random_state=10
).fit(X)
y_pred = kM_cl.predict(X)

In [None]:
# KMeans assignment for the blob dataset.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.title("Number of clusters: 3")
plt.grid(True)
plt.show()

In [None]:
# Third example: noisy two-moons dataset.
# random_state is fixed so the clustering result below is reproducible (the call was unseeded before).
X, y = datasets.make_moons(n_samples=n, noise=0.1, random_state=1234)

In [None]:
# Unlabelled view of the moons dataset.
# The title used to read "Class" (copy-paste from the first example); it now names the data shown.
plt.figure(figsize=[10, 4])

plt.subplot(1,2,1)
plt.title("Moons")
plt.scatter(X[:,0], X[:,1])
plt.grid(True)

plt.show()

In [None]:
# 2-cluster KMeans on the moons data; fit() returns the estimator, predict() labels the points.
kM_cl = KMeans(
    n_clusters=2, init="random", n_init=1, max_iter=300, random_state=10
).fit(X)
y_pred = kM_cl.predict(X)

In [None]:
# KMeans assignment for the moons dataset.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.title("Number of clusters: 2")
plt.show()

In [None]:
# Fourth example: noisy concentric circles.
# random_state is fixed so the clustering result below is reproducible (the call was unseeded before).
X, y = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=1234)

In [None]:
# Unlabelled view of the circles dataset.
# The title used to read "Class" (copy-paste from the first example); it now names the data shown.
plt.figure(figsize=[10, 4])

plt.subplot(1,2,1)
plt.title("Circles")
plt.scatter(X[:,0], X[:,1])
plt.grid(True)

plt.show()

In [None]:
# 2-cluster KMeans on the circles data; fit() returns the estimator, predict() labels the points.
kM_cl = KMeans(
    n_clusters=2, init="random", n_init=1, max_iter=300, random_state=10
).fit(X)
y_pred = kM_cl.predict(X)

In [None]:
# KMeans assignment for the circles dataset.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.title("Number of clusters: 2")
plt.grid(True)
plt.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Иерархическая кластеризация</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<a href="http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html">AgglomerativeClustering</a>

In [None]:
# Seeded blob dataset with three centres for the agglomerative-clustering walkthrough.
n, random_state = 1000, 100
X, y = make_blobs(
    n_samples=n,
    centers=3,
    cluster_std=1,
    center_box=(-5, 5),
    random_state=random_state,
)

In [None]:
# Unlabelled view of the points before clustering.
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.grid(True)
plt.title("Initial data")
plt.show()

In [None]:
# Average-linkage agglomerative clustering into 3 clusters.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `metric`).
# Euclidean distance is the default, so omitting the argument keeps the same behaviour
# and works on both old and new scikit-learn versions.
aggl_avr = AgglomerativeClustering(n_clusters=3, linkage="average")

In [None]:
# Fit on X and keep the cluster label of every point for the plot below.
y_pred = aggl_avr.fit_predict(X)

In [None]:
# Points coloured by their agglomerative-clustering label.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.title("Number of clusters: 3")
plt.show()

<p>Примеры</p>

In [None]:
# Five example datasets for agglomerative clustering.
# Every generator is seeded so that Restart & Run All reproduces the same figures;
# moons, circles and the uniform noise were unseeded before and changed on every run.
X_class, y_class = datasets.make_classification(n_samples=n, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=1, n_classes=3, class_sep=2,
                           random_state=1234)
X_blob, y_blob = datasets.make_blobs(n_samples=n, random_state=1000)
X_moon, y_moon = datasets.make_moons(n_samples=n, noise=0.1, random_state=1234)
X_circle, y_circle = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=1234)
# Unlabelled uniform noise from a dedicated seeded generator (leaves NumPy's global RNG untouched).
X_rand, y_rand = np.random.RandomState(1234).rand(n, 2), None

In [None]:
# One row of five panels, one per example dataset, all drawn without labels.
plt.figure(figsize=[18, 4])

for column, points in enumerate((X_class, X_blob, X_moon, X_circle, X_rand), start=1):
    plt.subplot(1, 5, column)
    plt.title("Initial data")
    plt.scatter(points[:, 0], points[:, 1])
    plt.grid(True)

plt.tight_layout()

plt.show()

In [None]:
# Average-linkage agglomerative clustering of the five example datasets.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `metric`).
# Euclidean distance is the default, so the argument is simply omitted, which behaves
# identically and runs on both old and new scikit-learn versions.

# Three clusters for the classification and blob datasets.
aggl_avr = AgglomerativeClustering(n_clusters=3, linkage="average")

y_pred_class = aggl_avr.fit_predict(X_class)
y_pred_blob = aggl_avr.fit_predict(X_blob)

# Two clusters for the moons, circles and uniform-noise datasets.
aggl_avr_2 = AgglomerativeClustering(n_clusters=2, linkage="average")
y_pred_moon = aggl_avr_2.fit_predict(X_moon)

y_pred_circle = aggl_avr_2.fit_predict(X_circle)
y_pred_rand = aggl_avr_2.fit_predict(X_rand)

In [None]:
# Clustering results for the five example datasets, one panel each.
# The panels used to be titled "Initial data" (copied from the input figure) although they
# show cluster assignments; each title now names the dataset and the number of clusters found.
plt.figure(figsize=[18, 4])

clustered = (
    ("Classification", X_class, y_pred_class),
    ("Blobs", X_blob, y_pred_blob),
    ("Moons", X_moon, y_pred_moon),
    ("Circles", X_circle, y_pred_circle),
    ("Random", X_rand, y_pred_rand),
)

for column, (name, points, labels) in enumerate(clustered, start=1):
    plt.subplot(1, 5, column)
    # The number of distinct labels equals the number of clusters produced.
    plt.title("{}: {} clusters".format(name, len(set(labels))))
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap=clrMap)
    plt.grid(True)

plt.tight_layout()

plt.show()

<a name="4"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">4. Кластеризация по плотности</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<a href="http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html">DBSCAN</a>

In [None]:
# Seeded blob dataset with three centres for the DBSCAN walkthrough.
n, random_state = 1000, 100
X, y = make_blobs(
    n_samples=n,
    centers=3,
    cluster_std=1,
    center_box=(-5, 5),
    random_state=random_state,
)

In [None]:
# Unlabelled view of the points before clustering.
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.grid(True)
plt.title("Initial data")
plt.show()

In [None]:
# Density-based clusterer: Euclidean neighbourhood radius eps=0.5, at least 4 samples per neighbourhood.
dbscan = DBSCAN(min_samples=4, eps=0.5, metric="euclidean")

In [None]:
# Fit on X and keep the label of every point for the plot below.
y_pred = dbscan.fit_predict(X)

In [None]:
# DBSCAN decides the number of clusters itself, so the title is derived from the labels
# instead of the previously hard-coded "3". Label -1 marks noise points and is not a cluster.
n_found = len(set(y_pred) - {-1})

plt.title("Number of clusters: {}".format(n_found))
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=clrMap)
plt.grid(True)
plt.show()

<p>Примеры</p>

In [None]:
# Five example datasets for DBSCAN.
# Every generator is seeded so the eps/min_samples values used below act on the same
# points on every run; moons, circles and the uniform noise were unseeded before.
X_class, y_class = datasets.make_classification(n_samples=n, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=1, n_classes=3, class_sep=2,
                           random_state=1234)
X_blob, y_blob = datasets.make_blobs(n_samples=n, random_state=1000)
X_moon, y_moon = datasets.make_moons(n_samples=n, noise=0.1, random_state=1234)
X_circle, y_circle = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=1234)
# Unlabelled uniform noise from a dedicated seeded generator (leaves NumPy's global RNG untouched).
X_rand, y_rand = np.random.RandomState(1234).rand(n, 2), None

In [None]:
# One row of five panels, one per example dataset, all drawn without labels.
plt.figure(figsize=[18, 4])

inputs = (X_class, X_blob, X_moon, X_circle, X_rand)
for offset, points in enumerate(inputs):
    plt.subplot(1, 5, offset + 1)
    plt.title("Initial data")
    plt.scatter(points[:, 0], points[:, 1])
    plt.grid(True)

plt.tight_layout()

plt.show()

In [None]:
# Wider neighbourhood (eps=0.6, min_samples=15) for the classification and blob datasets.
dbscan_avr = DBSCAN(eps=0.6, min_samples=15, metric="euclidean")
y_pred_class, y_pred_blob = (
    dbscan_avr.fit_predict(pts) for pts in (X_class, X_blob)
)

# Tighter neighbourhood (eps=0.2, min_samples=20) for moons, circles and uniform noise.
dbscan_avr_2 = DBSCAN(eps=0.2, min_samples=20, metric="euclidean")
y_pred_moon, y_pred_circle, y_pred_rand = (
    dbscan_avr_2.fit_predict(pts) for pts in (X_moon, X_circle, X_rand)
)

In [None]:
# DBSCAN results for the five example datasets, one panel each.
# The panels used to be titled "Initial data" (copied from the input figure) although they
# show cluster assignments; each title now names the dataset and the number of clusters found.
plt.figure(figsize=[18, 4])

clustered = (
    ("Classification", X_class, y_pred_class),
    ("Blobs", X_blob, y_pred_blob),
    ("Moons", X_moon, y_pred_moon),
    ("Circles", X_circle, y_pred_circle),
    ("Random", X_rand, y_pred_rand),
)

for column, (name, points, labels) in enumerate(clustered, start=1):
    plt.subplot(1, 5, column)
    # DBSCAN labels noise points -1; they are excluded from the cluster count.
    plt.title("{}: {} clusters".format(name, len(set(labels) - {-1})))
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap=clrMap)
    plt.grid(True)

plt.tight_layout()

plt.show()