In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import preprocessing

## Load Data

In [2]:
data, target = load_iris(return_X_y=True, as_frame=True)

In [3]:
df = pd.concat((data, target), axis=1)

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
X, y = data.to_numpy(), target.to_numpy()

In [6]:
X_normalized = preprocessing.normalize(X)

In [7]:
df = pd.concat((pd.DataFrame(X_normalized, columns = df.columns[:-1]), target), axis=1)

In [8]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0.803773,0.551609,0.220644,0.031521,0
1,0.828133,0.50702,0.236609,0.033801,0
2,0.805333,0.548312,0.222752,0.034269,0
3,0.80003,0.539151,0.260879,0.034784,0
4,0.790965,0.569495,0.22147,0.031639,0


## Data Visualization

In [9]:
from sklearn.manifold import TSNE

In [10]:
model = TSNE(n_components=2)

In [11]:
X_proj = model.fit_transform(X)

In [12]:
X_proj[:2]

array([[-26.935776 ,  -2.4674087],
       [-29.600248 ,  -1.3549646]], dtype=float32)

The `altair` library expects a `pd.DataFrame` input, so let's make one.

In [13]:
df_proj_without_label = pd.DataFrame(X_proj, columns=["x", "y"])
df_proj_with_label = pd.concat((df_proj_without_label, df['target']), axis=1)

In [14]:
df_proj_with_label.head()

Unnamed: 0,x,y,target
0,-26.935776,-2.467409,0
1,-29.600248,-1.354965,0
2,-29.418329,-2.56925,0
3,-29.890512,-2.17533,0
4,-26.896187,-2.817233,0


In [15]:
import altair as alt

In [16]:
alt.Chart(df_proj_without_label).mark_circle(size=60).encode(
    x = "x",
    y = "y",
).interactive().properties(width=400, height=400)

The diagram indicates that `K=3` is a good choice.

## K-Means
Let's train a `KMeans` model with `K=3` to check if it makes sense.

In [17]:
from sklearn.cluster import KMeans

In [18]:
model = KMeans(n_clusters=3, random_state=42)

In [19]:
model.fit(X_normalized)

0,1,2
,n_clusters,3
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [20]:
df_with_new_pred = pd.concat((df_proj_without_label, pd.Series(model.labels_)), axis=1)
df_with_new_pred.columns = ["x", "y", "target"]

In [21]:
df_with_new_pred

Unnamed: 0,x,y,target
0,-26.935776,-2.467409,1
1,-29.600248,-1.354965,1
2,-29.418329,-2.569250,1
3,-29.890512,-2.175330,1
4,-26.896187,-2.817233,1
...,...,...,...
145,12.684958,-0.377166,0
146,9.319782,-0.501736,0
147,11.684256,-0.398839,0
148,12.688314,-2.358396,0


In [22]:
alt.Chart(df_with_new_pred).mark_circle(size=60).encode(
    x = "x",
    y = "y",
    color='target:N',
).interactive().properties(width=400, height=400)

As the diagram suggests, the `K=3` is a quite reasonable choice.

## Find Best K
Let's use the measurements (inertia and silhouette score) to determine the best `K`.

In [23]:
from sklearn.metrics import silhouette_score

In [24]:
ks = range(1, 12)
inertias = []
silhouette_scores = []
models = []
for k in ks:
    model = KMeans(k, random_state=42)
    model.fit(X_normalized)

    models.append(model)
    
    inertias.append(model.inertia_)
    if k != 1:
        silhouette_scores.append(silhouette_score(X, model.labels_))

Note that the calculation of silhouette score *requires* at least 2 clusters. Thus, I insert a dummy value manually.

In [25]:
silhouette_scores = [silhouette_scores[0]] + silhouette_scores

In [26]:
curve = pd.DataFrame(
    zip(ks, inertias, silhouette_scores), columns=["K", "inertia", "silhouette"]
)

In [27]:
curve.head()

Unnamed: 0,K,inertia,silhouette
0,1,6.675331,0.686735
1,2,0.554405,0.686735
2,3,0.322682,0.486812
3,4,0.285832,0.367194
4,5,0.236902,0.316559


In [28]:
base = alt.Chart(curve).encode(x='K')

alt.layer(
    base.mark_line(color='blue').encode(y='inertia'),
    base.mark_line(color='red').encode(y='silhouette')
)

What about the ground truth?

In [29]:
alt.Chart(df_proj_with_label).mark_circle(size=60).encode(
    x = "x",
    y = "y",
    color='target:N',
).interactive().properties(width=400, height=400)