# Grouping Observations Using Clustering

If you know that you have k groups, you can use k-means clustering to group similar
observations and output a new feature containing each observation’s group
membership:

In [1]:
# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Simulate a feature matrix: 50 observations with 2 features each,
# drawn around 3 centers; the second return value is discarded
features, _ = make_blobs(
    n_samples=50,
    n_features=2,
    centers=3,
    random_state=1,
)

# View feature matrix
features

array([[ -9.87755355,  -3.33614544],
       [ -7.28721033,  -8.35398617],
       [ -6.94306091,  -7.0237442 ],
       [ -7.44016713,  -8.79195851],
       [ -6.64138783,  -8.07588804],
       [ -0.79415228,   2.10495117],
       [ -2.76017908,   5.55121358],
       [ -9.94690475,  -4.59034419],
       [ -0.52579046,   3.3065986 ],
       [ -1.98197711,   4.02243551],
       [ -5.8659643 ,  -7.96807169],
       [ -6.83478745,  -7.39121692],
       [ -6.74924724, -10.17542932],
       [-10.75211044,  -2.70048039],
       [ -8.50899599,  -8.65769397],
       [ -2.33080604,   4.39382527],
       [ -0.19745197,   2.34634916],
       [  0.08525186,   3.64528297],
       [-10.20660674,  -3.36672536],
       [ -9.15872909,  -3.02224647],
       [ -1.34052081,   4.15711949],
       [ -1.83198811,   3.52863145],
       [ -9.80679702,  -1.85309341],
       [ -0.75870396,   3.72276201],
       [-11.1402307 ,  -4.30269127],
       [ -7.8121371 ,  -5.34984488],
       [ -2.35122066,   4.00973634],
 

In [2]:
# Wrap the feature matrix in a DataFrame, naming one column per feature
dataframe = pd.DataFrame(
    data=features,
    columns=["feature_1", "feature_2"],
)
dataframe

Unnamed: 0,feature_1,feature_2
0,-9.877554,-3.336145
1,-7.28721,-8.353986
2,-6.943061,-7.023744
3,-7.440167,-8.791959
4,-6.641388,-8.075888
5,-0.794152,2.104951
6,-2.760179,5.551214
7,-9.946905,-4.590344
8,-0.52579,3.306599
9,-1.981977,4.022436


In [3]:
# Build a k-means clusterer for 3 clusters, seeded so results are repeatable
clusterer = KMeans(n_clusters=3, random_state=0)

In [4]:
# Fit the k-means clusterer on the simulated feature matrix
clusterer.fit(features)

KMeans(n_clusters=3, random_state=0)

In [5]:
# Predict each observation's cluster and store it in a new "group" column
dataframe["group"] = clusterer.predict(features)

In [6]:
# View the DataFrame, which now includes the "group" column
dataframe


Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,0
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1
