In [1]:
from sklearn.datasets import make_blobs
import numpy as np
blob_centers = np.array([
    [0.2, 2.3], 
    [-1.5, 2.3], 
    [-2.8, 1.8], 
    [-2.8, 2.8], 
    [-2.8, 1.3]
])
blobs_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
X, y = make_blobs(n_samples=2000, centers=blob_centers, 
                 cluster_std=blobs_std, random_state=7)

In [2]:
from sklearn.cluster import KMeans
k=5
kmeans = KMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

In [3]:
y_pred is kmeans.labels_, kmeans.cluster_centers_

(True,
 array([[-2.80037642,  1.30082566],
        [ 0.20876306,  2.25551336],
        [-2.79290307,  2.79641063],
        [-1.46679593,  2.28585348],
        [-2.80389616,  1.80117999]]))

In [4]:
X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
kmeans.predict(X_new)

array([1, 1, 2, 2])

In [5]:
kmeans.transform(X_new)

array([[2.88633901, 0.32995317, 2.9042344 , 1.49439034, 2.81093633],
       [5.84236351, 2.80290755, 5.84739223, 4.4759332 , 5.80730058],
       [1.71086031, 3.29399768, 0.29040966, 1.69136631, 1.21475352],
       [1.21567622, 3.21806371, 0.36159148, 1.54808703, 0.72581411]])

In [6]:
good_init = np.array([[-3, 3], [-3, 2], [-3, 1], [-1, 2], [0, 2]])
kmeans = KMeans(n_clusters=5, init=good_init, n_init=1)
kmeans.fit_predict(X)

array([2, 1, 4, ..., 0, 4, 1])

In [7]:
kmeans.inertia_

211.5985372581684

In [8]:
kmeans.score(X)

-211.5985372581684

In [9]:
from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
minibatch_kmeans.fit(X)

MiniBatchKMeans(n_clusters=5)

In [10]:
from sklearn.metrics import silhouette_score
silhouette_score(X, kmeans.labels_)

0.655517642572828

In [15]:
import os
import urllib

images_path = os.path.join(os.getcwd(), "images")
os.makedirs(images_path, exist_ok=True)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename ="ladybug.png"
url = DOWNLOAD_ROOT + "images/unsupervised_learning/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))

('C:\\Users\\SG704\\ml\\images\\ladybug.png',
 <http.client.HTTPMessage at 0x2324ca88908>)

In [18]:
from matplotlib.image import imread
image = imread(os.path.join(images_path, filename))
image.shape

(533, 800, 3)

In [27]:
X = image.reshape(-1,3)
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

In [44]:
from sklearn.datasets import load_digits
X_digits, y_digits = load_digits(return_X_y=True)

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

In [46]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", 
                             max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train)

LogisticRegression(max_iter=5000, multi_class='ovr', random_state=42)

In [47]:
log_reg_score = log_reg.score(X_test, y_test)
log_reg_score

0.9666666666666667

In [48]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50, random_state=42)),
    ("logis", LogisticRegression(multi_class="ovr", solver="lbfgs", 
                                max_iter=5000, random_state=42))
])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('kmeans', KMeans(n_clusters=50, random_state=42)),
                ('logis',
                 LogisticRegression(max_iter=5000, multi_class='ovr',
                                    random_state=42))])

In [49]:
pipeline_score = pipeline.score(X_test, y_test)
pipeline_score

0.9844444444444445

In [50]:
# from sklearn.model_selection import GridSearchCV

# param_grid = dict(kmeans__n_clusters=range(2, 100))
# grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)

In [55]:
n_labeled = 50
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", 
                            max_iter=5000, random_state=42)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
log_reg.score(X_test, y_test)

0.7511111111111111

In [56]:
k=50
kmeans = KMeans(n_clusters=k)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

In [57]:
y_representative_digits = y_train[representative_digit_idx]

In [58]:
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", 
                            max_iter=5000, random_state=42)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

0.9044444444444445

In [65]:
y_train_propagated = np.empty(len(X_train), dtype=np.int32)
for i in range(k):
    y_train_propagated[kmeans.labels_==i] = y_representative_digits[i]

In [66]:
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", 
                            max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train_propagated)
log_reg.score(X_test, y_test)

0.9377777777777778

In [71]:
percentile_closest = 20
X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
    in_cluster = (kmeans.labels_==i)
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)
    above_cutoff = (X_cluster_dist > cutoff_distance)
    X_cluster_dist[in_cluster & above_cutoff] = -1
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train[partially_propagated]

In [72]:
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", 
                            max_iter=5000, random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
log_reg.score(X_test, y_test)

0.94

In [1]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.05)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.05)

In [3]:
dbscan.labels_[:10]

array([0, 1, 1, 1, 0, 2, 0, 2, 3, 1], dtype=int64)

In [6]:
dbscan.core_sample_indices_[:10], len(dbscan.core_sample_indices_)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64), 803)

In [7]:
dbscan.components_

array([[ 1.4750708 , -0.43509414],
       [-0.45503034,  0.87673331],
       [-0.74500613,  0.74741187],
       ...,
       [ 0.24076121, -0.11688908],
       [-0.8014058 ,  0.55266003],
       [-1.00094442,  0.37804719]])

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

KNeighborsClassifier(n_neighbors=50)

In [10]:
import numpy as np
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)

array([5, 4, 3, 0], dtype=int64)

In [11]:
knn.predict_proba(X_new)

array([[0.02, 0.  , 0.  , 0.  , 0.18, 0.8 , 0.  ],
       [0.06, 0.04, 0.02, 0.  , 0.88, 0.  , 0.  ],
       [0.36, 0.  , 0.  , 0.64, 0.  , 0.  , 0.  ],
       [0.98, 0.  , 0.  , 0.  , 0.  , 0.  , 0.02]])

In [12]:
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx]
y_pred[y_dist > 0.2] = -1
y_pred.ravel()

array([-1,  4,  3, -1], dtype=int64)

## Gaussian Mixture

In [3]:
import numpy as np
from sklearn.datasets import make_blobs
X1, y1 = make_blobs(n_samples=1000, centers=((-4, 4), (0, 0)), 
                    random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]

In [4]:
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=3, n_init=10, random_state=42)
gm.fit(X)

GaussianMixture(n_components=3, n_init=10, random_state=42)

In [5]:
gm.weights_, gm.means_, gm.covariances_

(array([0.3691158 , 0.24310082, 0.38778338]),
 array([[ 1.36694892, -1.52070558],
        [ 3.25921214,  0.93270945],
        [ 0.04704152,  0.07882469]]),
 array([[[0.5545037 , 0.61612027],
         [0.61612027, 0.97543142]],
 
        [[1.20764909, 0.12733675],
         [0.12733675, 0.94475273]],
 
        [[0.69619211, 0.81103875],
         [0.81103875, 1.23251215]]]))

In [6]:
gm.converged_

True

In [7]:
gm.n_iter_

5

In [8]:
gm.predict(X)

array([2, 2, 0, ..., 1, 1, 1], dtype=int64)

In [9]:
gm.predict_proba(X)

array([[1.72424784e-09, 3.26883336e-02, 9.67311665e-01],
       [1.41777842e-12, 2.55562542e-02, 9.74443746e-01],
       [9.99847726e-01, 1.52273811e-04, 2.53139635e-12],
       ...,
       [1.11445603e-01, 8.88554166e-01, 2.30441543e-07],
       [6.11583639e-04, 9.99388416e-01, 2.38685293e-16],
       [2.68243188e-02, 9.73175681e-01, 8.71741534e-16]])

In [10]:
X_new, y_new = gm.sample(6)
y_new

array([0, 0, 1, 1, 1, 2])

In [11]:
gm.score_samples(X)

array([-2.58839759, -3.55461436, -3.33388476, ..., -3.23884872,
       -4.34697093, -3.61170653])

In [14]:
densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 4)
anomalies = X[densities < density_threshold]
anomalies[:10]

array([[-0.11118987,  1.61334992],
       [ 2.28127232,  2.98020674],
       [-2.53940854, -2.76549965],
       [ 1.23952031, -3.27370022],
       [-1.74680316, -0.78385111],
       [-0.71908316, -4.09849917],
       [-0.51480414, -2.16270124],
       [ 1.05384796, -3.40526015],
       [-1.79651788, -0.97031471],
       [-2.11225785, -2.09712817]])

In [15]:
gm.bic(X), gm.aic(X)

(7951.201562565168, 7863.97628245013)

In [4]:
from sklearn.mixture import BayesianGaussianMixture
bgm = BayesianGaussianMixture(n_components=10, n_init=10, 
                              random_state=42)
bgm.fit(X)
np.round(bgm.weights_, 2)

array([0.4 , 0.22, 0.39, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])