In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

import wrangle_mall

In [None]:
# Load seaborn's 'iris' example dataset; every iris cell below works on this frame.
iris = sns.load_dataset(name='iris')

In [None]:
# Cluster the iris rows on their two petal measurements (k = 3).
X = iris[['petal_length', 'petal_width']]

# random_state pins the centroid initialisation so cluster ids (and therefore
# the plot colours) are identical on every Restart & Run All. n_init is spelled
# out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)

# Store the labels as a categorical column so they are treated as discrete
# groups rather than as a numeric quantity.
iris['cluster'] = pd.Categorical(kmeans.labels_)

# Trailing semicolon hides the FacetGrid repr; only the figure is shown.
sns.relplot(data=iris, y='petal_length', x='petal_width', hue='cluster');

In [None]:
# Elbow plot: fit one KMeans per k in 2..10 on the current X and record its
# inertia (within-cluster sum of squared distances to the closest centroid).
# A fixed random_state keeps the curve identical between runs; n_init is
# explicit because its default differs between scikit-learn releases.
inertias = {
    k: KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_
    for k in range(2, 11)
}

ax = pd.Series(inertias).plot(marker='o')
ax.set(
    xlabel='k (number of clusters)',
    ylabel='inertia',
    title=f'KMeans inertia by k for {X.columns.tolist()}',
)
ax.grid()

In [None]:
# Same petal-based clustering as before, this time with k = 4.
X = iris[['petal_length', 'petal_width']]

# Seeded so the cluster ids and plot colours are reproducible; n_init is
# explicit because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(X)

# Overwrites any earlier 'cluster' column with the labels of this fit,
# stored as a categorical so they are treated as discrete groups.
iris['cluster'] = pd.Categorical(kmeans.labels_)

# Trailing semicolon hides the FacetGrid repr; only the figure is shown.
sns.relplot(data=iris, y='petal_length', x='petal_width', hue='cluster');

In [None]:
# Cluster on the two sepal measurements instead (k = 4).
X = iris[['sepal_length', 'sepal_width']]

# Seeded so the cluster ids and plot colours are reproducible; n_init is
# explicit because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(X)

# Overwrites any earlier 'cluster' column with the labels of this fit.
iris['cluster'] = pd.Categorical(kmeans.labels_)

# Show the sepal-based clusters in three different feature pairs: only the
# middle panel uses exactly the features the model was fitted on.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))

sns.scatterplot(data=iris, y='petal_length', x='petal_width', hue='cluster', ax=ax1)
sns.scatterplot(data=iris, y='sepal_length', x='sepal_width', hue='cluster', ax=ax2)
sns.scatterplot(data=iris, y='sepal_width', x='petal_width', hue='cluster', ax=ax3)
fig.suptitle(f'clusters based on {X.columns.tolist()}');

## Mall Customers

In [None]:
# Load the mall customer data and derive the modelling splits via the
# project-local wrangle_mall helpers.
customers = wrangle_mall.acquire()
(train, validate, test) = wrangle_mall.split(customers)

# Only the first of the three values returned by scale() is kept.
# NOTE(review): train_scaled is not referenced again in the visible cells;
# the clustering below is fitted on the unscaled `train` -- confirm intended.
(train_scaled, _, _) = wrangle_mall.scale(train, validate, test)

In [None]:
# Cluster the training customers on income and spending score (k = 5).
# NOTE(review): this fits on the unscaled `train` columns -- confirm that is
# intended rather than using a scaled version of the data.
X = train[['annual_income', 'spending_score']]

# Seeded so that cluster ids are stable between runs: any written commentary
# that refers to clusters by number is only meaningful if the ids do not
# change. Cluster numbering is arbitrary, so re-check such commentary against
# the output after changing the seed. n_init is explicit because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)

# Build a new frame instead of writing a column into `train` in place: if the
# split returned a slice of `customers` (cannot tell from here), an in-place
# write would raise SettingWithCopyWarning. The labels are stored as a
# categorical so they are treated as discrete groups.
train = train.assign(cluster=pd.Categorical(kmeans.labels_))

In [None]:
# Show the clusters in three feature pairs; only the middle panel uses exactly
# the two features named in the figure title.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))

sns.scatterplot(data=train, y='spending_score', x='age', hue='cluster', ax=ax1)
sns.scatterplot(data=train, y='spending_score', x='annual_income', hue='cluster', ax=ax2)
sns.scatterplot(data=train, y='annual_income', x='age', hue='cluster', ax=ax3)
fig.suptitle(f'clusters based on {X.columns.tolist()}')

# Per-cluster mean of every remaining column. `is_female` is a boolean, so its
# mean is the share of female customers in the cluster.
# observed=True restricts the groups to categories that actually occur and
# avoids the FutureWarning recent pandas emits when a categorical grouper is
# used without an explicit `observed` argument.
means_by_cluster = (
    train
    .assign(is_female=train.gender == 'Female')
    .drop(columns=['gender', 'customer_id'])
    .groupby('cluster', observed=True)
    .mean()
)

# Number of customers per cluster, ordered by cluster id to line up with the means.
cluster_sizes = train.cluster.value_counts().sort_index().rename('count')

# Last expression of the cell: the summary table (means plus counts) is displayed.
pd.concat([means_by_cluster, cluster_sizes], axis=1)

Takeaways:

- most data points in cluster 4
- clusters 0, 2, and 4 tend to be quite varied wrt age
- clusters 1 and 3, the high spenders, tend to be younger

In [None]:
# Income vs. spending score, one facet per gender, coloured by cluster.
sns.relplot(
    data=train,
    x='annual_income',
    y='spending_score',
    hue='cluster',
    col='gender',
)

## Bonus: Scaling

In [None]:
# CSV export link of the Google Sheet holding the scaling demo data
# (needs network access when the cell runs).
url = 'https://docs.google.com/spreadsheets/d/1j5EgXVTR5ikUj3G5ZCQmkq6ziz_gvtASGAdw23-5_6M/export?format=csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
df.head(n=5)

In [None]:
# Raw scatter of the two columns before any clustering.
df.plot(kind='scatter', x='x', y='y')

In [None]:
# Baseline: two clusters fitted on the raw (unscaled) x and y columns.
# Seeded so the labels are reproducible; n_init is explicit because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(df[['x', 'y']])

# Categorical labels, as in the other clustering cells, so the hue is treated
# as discrete groups rather than as a numeric quantity.
df['cluster'] = pd.Categorical(kmeans.labels_)

# Trailing semicolon hides the FacetGrid repr; only the figure is shown.
sns.relplot(data=df, y='y', x='x', hue='cluster');

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [None]:
# Standardise x and y on a copy so the original values in `df` stay untouched.
df_scaled = df.copy()
scaler = StandardScaler()
df_scaled[['x', 'y']] = scaler.fit_transform(df[['x', 'y']])

# Same two-cluster model, now fitted on the standardised columns.
# Seeded so the labels are reproducible; n_init is explicit because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(df_scaled[['x', 'y']])

# The labels from the scaled fit are written onto the *unscaled* frame
# (replacing any earlier 'cluster' column) so the plot stays in the original
# units of x and y.
df['cluster'] = pd.Categorical(kmeans.labels_)

# Trailing semicolon hides the FacetGrid repr; only the figure is shown.
sns.relplot(data=df, y='y', x='x', hue='cluster');

In [None]:
# Inspect the fitted model. A notebook cell displays only its last expression,
# so the three attributes are bundled into one tuple; as separate statements
# the first two would be evaluated and silently discarded.
# Order: (cluster centres, per-row labels, inertia). The centres are in the
# units of whatever data `kmeans` was last fitted on.
kmeans.cluster_centers_, kmeans.labels_, kmeans.inertia_