# Clustering

Creating sample data for clustering.

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

In [None]:
# Synthetic 2-D dataset: 150 points generated around 3 blob centres.
# X holds the coordinates, y the blob each point was generated from.
X, y = make_blobs(
    n_samples=150,     # total number of points
    n_features=2,      # dimensions per point
    centers=3,         # number of blobs
    cluster_std=0.5,   # spread of each blob
    shuffle=True,      # shuffle the generated samples
    random_state=0,    # fixed seed so the data is reproducible
)

In [None]:
# Show the shapes and the first few samples rather than dumping all 150 rows.
X.shape, y.shape, X[:5], y[:5]

---
Visualise the data using `matplotlib`.

In [None]:
# Scatter the unlabelled points: first feature on the x axis, second on the y axis.
plt.scatter(
    X[:, 0], X[:, 1],
    s=50,
    marker='o',
    c='white',
    edgecolor='black',
)
plt.show()

## Running K-means

We will be running K-Means, but there are other clustering methods available in scikit-learn:

https://scikit-learn.org/stable/modules/clustering.html

In [None]:
from sklearn.cluster import KMeans # required import

In [None]:
# K-means estimator; nothing is fitted until fit/fit_predict is called.
km = KMeans(
    n_clusters=3,     # how many clusters to look for
    init='random',    # choose the starting centroids at random
    n_init=10,        # independent runs with different starting centroids (best run is kept)
    max_iter=300,     # upper bound on iterations within a single run
    random_state=0,   # fixed seed so the result is reproducible
)

In [None]:
# Fit K-means on X and get the cluster index assigned to each sample.
y_km = km.fit_predict(X)

`y_km` contains the prediction of `y` for our input data. In a clustering application, we don't normally know the values of `y` in advance.

In [None]:
# Cluster index assigned to each sample, in the same order as the rows of X.
y_km

## Visualising the clusters

In [None]:
# (cluster index, fill colour, marker, legend label) for each cluster
cluster_styles = [
    (0, 'lightgreen', 's', 'cluster 1'),
    (1, 'orange', 'o', 'cluster 2'),
    (2, 'lightblue', 'v', 'cluster 3'),
]

# One scatter call per cluster so each gets its own colour, marker and legend entry.
for cluster_id, colour, marker, label in cluster_styles:
    in_cluster = y_km == cluster_id  # boolean mask selecting this cluster's samples
    plt.scatter(
        X[in_cluster, 0],
        X[in_cluster, 1],
        s=50,
        c=colour,
        marker=marker,
        edgecolor='black',
        label=label,
    )

plt.legend(scatterpoints=1)
plt.grid()
plt.show()

Adding the centroids.

In [None]:
# Coordinates of the fitted centroids, one row per cluster.
km.cluster_centers_

In [None]:
# Per-cluster styling, indexed by cluster number (0, 1, 2).
colours = ('lightgreen', 'orange', 'lightblue')
markers = ('s', 'o', 'v')
legend_labels = ('cluster 1', 'cluster 2', 'cluster 3')

# Draw the members of each cluster with that cluster's colour and marker.
for k, (colour, marker, label) in enumerate(zip(colours, markers, legend_labels)):
    members = X[y_km == k]  # samples assigned to cluster k
    plt.scatter(
        members[:, 0],
        members[:, 1],
        s=50,
        c=colour,
        marker=marker,
        edgecolor='black',
        label=label,
    )

# Overlay the fitted centroids as large red stars.
centroids = km.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    s=250,
    marker='*',
    c='red',
    edgecolor='black',
    label='centroids',
)

plt.legend(scatterpoints=1)
plt.grid()
plt.show()

# Handling Missing values

In [None]:
import pandas as pd

# Titanic dataset (the train.csv file at the URL below).
# NOTE: read_csv downloads the file over plain HTTP every time this cell runs,
# so the cell needs network access.
data_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
df = pd.read_csv(data_url)
df

In [None]:
# number of missing (NaN) values in each column
df.isna().sum()

----

There are a couple of ways to handle missing values:
- impute missing values (when a sensible value can be inferred)
- remove row/column with missing values

### Impute values

In [None]:
# Replace missing ages with the mean age.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df['Age']: an in-place call on a selected column is chained assignment, which
# pandas warns about and which leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# display the frame again to inspect the 'Age' column after imputation
df

### Remove missing

In [None]:
# A column with too many missing values may not be worth imputing;
# here the whole 'Cabin' column is dropped.
df = df.drop('Cabin', axis='columns')
df

In [None]:
# select the rows where 'Embarked' is missing
df.loc[df['Embarked'].isna()]

In [None]:

# Remove the rows whose 'Embarked' value is missing.
# dropna returns a new frame, so the result is assigned back to df; without the
# assignment the rows would only be missing from the displayed output, not from df.
df = df.dropna(subset=['Embarked'])
df

# Categorical data

We can use `OneHotEncoder` to create numeric (0/1 indicator) attributes derived from categorical ones.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder that returns a dense array; categories not seen during
# fitting are ignored when transforming.
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the 'Sex' column (passed as a one-column frame) and encode it.
# The encoded array is kept in `values`; df itself is not modified here.
values = enc.fit_transform(df[['Sex']])

# categories found during fitting: ['female', 'male']
enc.categories_

In [None]:
# encoded array: one row per row of df, one 0/1 column per category
values

In [None]:
# Add the indicator columns to the dataframe.
# The column order of `values` follows enc.categories_ (['female', 'male']).
df['Female'] = values[:, 0]
df['Male'] = values[:, 1]

# Remove the original categorical column. drop returns a new frame, so the result
# is assigned back to df; otherwise 'Sex' would still be present in df.
df = df.drop(columns=['Sex'])
df