In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift, \
    Birch, AffinityPropagation, MiniBatchKMeans
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load the iris measurements; the first line of the file is skipped and the
# column names below are used instead (presumably that line is a header row).
iris_df = pd.read_csv(
    "./datasets/iris.csv",
    skiprows=1,
    names=["sepal-length", "sepal-width", "petal-length", "petal-width", "class"],
)

# Shuffle the rows with a fixed seed so that the row order -- and everything
# computed from it further down -- is reproducible across kernel restarts.
iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,3.1,1.5,0.1,Iris-setosa
1,5.1,2.5,3.0,1.1,Iris-versicolor
2,5.1,3.8,1.5,0.3,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,4.8,3.0,1.4,0.3,Iris-setosa


In [10]:
# Convert the categorical "class" column into integer codes
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()
class_as_text = iris_df["class"].astype(str)
iris_df["class"] = label_encoding.fit_transform(class_as_text)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,3.1,1.5,0.1,0
1,5.1,2.5,3.0,1.1,1
2,5.1,3.8,1.5,0.3,0
3,4.6,3.1,1.5,0.2,0
4,4.8,3.0,1.4,0.3,0


In [11]:
# Feature frame: every column of iris_df except the "class" target
iris_features = iris_df.drop(columns="class")

iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,4.9,3.1,1.5,0.1
1,5.1,2.5,3.0,1.1
2,5.1,3.8,1.5,0.3
3,4.6,3.1,1.5,0.2
4,4.8,3.0,1.4,0.3


In [12]:
# Ground-truth labels: the encoded "class" column, used to score the clusterings
iris_labels = iris_df["class"]

# Peek at five randomly chosen labels
iris_labels.sample(n=5)

98    2
29    1
54    0
23    2
14    0
Name: class, dtype: int32

In [31]:
# Helper to fit a clustering model and print its evaluation scores
def build_model(clustering_model, data, labels):
    """Fit a clustering model on ``data`` and print a one-line score table.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object
        exposing the predicted cluster assignments as ``labels_``.
    data : array-like
        Samples to cluster; also passed to the silhouette computation.
    labels : array-like
        Ground-truth labels the predicted assignments are compared against.

    Prints homogeneity, completeness, V-measure, adjusted Rand index (ARI),
    adjusted mutual information (AMI) and the silhouette score. Returns None.
    """
    model = clustering_model(data)
    predicted = model.labels_

    # Silhouette is only defined for 2 <= n_labels <= n_samples - 1; report
    # NaN in the degenerate cases instead of letting silhouette_score raise.
    n_labels = len(set(predicted))
    if 2 <= n_labels <= len(data) - 1:
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float("nan")

    # The fifth column is adjusted mutual information, hence "AMI".
    print("homo\tcompl\tv-score\tARI\tAMI\tsilhouette")
    print(50 * "-")
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             silhouette))

In [29]:
def k_means(data, n_clusters = 3, max_iter = 1000):
    """Fit a KMeans estimator on ``data`` and return the fitted estimator."""
    estimator = KMeans(n_clusters=n_clusters, max_iter=max_iter)
    return estimator.fit(data)
    

In [32]:
# Score K-means on the iris features against the true classes
build_model(clustering_model=k_means, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


In [33]:
def agglomerative_fn(data, n_clusters = 3):
    """Fit AgglomerativeClustering on ``data`` and return the fitted estimator."""
    estimator = AgglomerativeClustering(n_clusters=n_clusters)
    return estimator.fit(data)

build_model(clustering_model=agglomerative_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


In [34]:
def dbscan_fn(data, eps = 0.45, min_samples = 4):
    """Fit DBSCAN on ``data`` and return the fitted estimator.

    eps: points within this distance of each other count as neighbours and
        may end up in the same cluster.
    min_samples: a region needs at least this many close points to be
        treated as a cluster.
    """
    estimator = DBSCAN(eps=eps, min_samples=min_samples)
    return estimator.fit(data)

build_model(clustering_model=dbscan_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.577	0.609	0.593	0.508	0.584	0.372


In [36]:
def mean_shift_fn(data, bandwidth = 0.85):
    """Fit MeanShift with the given ``bandwidth`` and return the fitted estimator."""
    estimator = MeanShift(bandwidth=bandwidth)
    return estimator.fit(data)

build_model(clustering_model=mean_shift_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.763	0.551


In [37]:
def birch_fn(data, n_clusters = 3):
    """Fit Birch on ``data`` and return the fitted estimator."""
    estimator = Birch(n_clusters=n_clusters)
    return estimator.fit(data)

build_model(clustering_model=birch_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.635	0.792	0.705	0.566	0.700	0.534


In [38]:
def affinity_propagation_fn(data, damping=0.6, max_iter = 1000):
    """Fit AffinityPropagation on ``data`` and return the fitted estimator.

    damping acts somewhat like a learning rate: it governs whether a data
    point stays with its current exemplar or switches to a new one.
    """
    estimator = AffinityPropagation(damping=damping, max_iter=max_iter)
    return estimator.fit(data)

build_model(clustering_model=affinity_propagation_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.851	0.492	0.623	0.437	0.612	0.349


In [39]:
def mini_batch_kmeans_fn(data, n_clusters = 3):
    """Fit MiniBatchKMeans on ``data`` and return the fitted estimator.

    Trades a little accuracy for speed compared with full KMeans.
    """
    estimator = MiniBatchKMeans(n_clusters=n_clusters)
    return estimator.fit(data)

build_model(clustering_model=mini_batch_kmeans_fn, data=iris_features, labels=iris_labels)

homo	compl	v-score	ARI	ANI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


**Spectral Clustering using a defined Affinity/Similarity Matrix**

In [40]:
# spectral clustering can accept a precomputed affinity matrix or raw data as before
from sklearn.cluster import SpectralClustering

In [42]:
SS = 1000  # self-similarity (diagonal entries)
IS = 10    # intra-cluster similarity
LS = 0.01  # similarity between points in different clusters


# Row i / column j holds the similarity between point i and point j, so the
# matrix must be symmetric. Intended groups: {0, 1, 2}, {3, 4} and {5}.
# Entry (3, 5) was IS while (5, 3) was LS; it is LS here so both directions
# agree and point 5 stays on its own.
similarity_mat = [
    [SS, IS, IS, LS, LS, LS],
    [IS, SS, IS, LS, LS, LS],
    [IS, IS, SS, LS, LS, LS],
    [LS, LS, LS, SS, IS, LS],
    [LS, LS, LS, IS, SS, LS],
    [LS, LS, LS, LS, LS, SS]
]

# Fail loudly on an asymmetric matrix: warnings are silenced at the top of the
# notebook, so a symmetry warning from the library would never be seen.
assert np.allclose(similarity_mat, np.transpose(similarity_mat)), \
    "similarity_mat must be symmetric"

# affinity="precomputed" because an affinity matrix is passed rather than raw
# samples; leave affinity at its default when fitting on raw data.
spectral_model = SpectralClustering(n_clusters = 3, affinity="precomputed").fit(similarity_mat)
spectral_model.labels_

array([1, 1, 1, 2, 2, 0])