<a href="https://colab.research.google.com/github/LaylaGrisell/CourseraDevelopingDataProducts/blob/main/Day_2_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 2: Clustering

### K-Means Clustering

In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Importing the dataset
# The CSV is read through a relative path, so 'Mall_Customers.csv' must be present
# in the kernel's working directory (e.g. uploaded to the Colab session) before this runs.
dataset = pd.read_csv('Mall_Customers.csv')
# Bare last expression: the notebook renders the DataFrame as this cell's output.
dataset

In [None]:
# Feature matrix for clustering: the columns at positions 3 and 4 of `dataset`,
# extracted as a NumPy array.
# NOTE(review): presumably 'Annual Income (k$)' and 'Spending Score (1-100)', judging by
# the axis labels used in the cluster plots of this notebook; confirm against the CSV header.
X = dataset.iloc[:, [3, 4]].values
# Clustering is unsupervised, so no target vector `y` is extracted.

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each fitted model's
# within-cluster sum of squares (WCSS), which scikit-learn exposes as `inertia_`.
from sklearn.cluster import KMeans

cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    wcss.append(kmeans.fit(X).inertia_)

# Plot WCSS against k; the bend ("elbow") in the curve suggests a cluster count.
fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()


In [None]:
# Final K-Means model with 5 clusters, using the same k-means++ initialisation
# and fixed seed as the elbow search.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    random_state=42,
)
# fit_predict fits the model on X and returns the cluster index of every row.
y_kmeans = kmeans.fit_predict(X)
y_kmeans

In [None]:
# Visualising the clusters: one scatter series per K-Means label (0..4),
# followed by the fitted centroids drawn larger on top.
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {cluster_id + 1}')

centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=300, c='yellow', label='Centroids')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

### Hierarchical Clustering

In [None]:
# Dendrogram of X under Ward linkage, used to pick a cluster count visually.
import scipy.cluster.hierarchy as sch

# Build the hierarchical merge tree first, then draw it on the current axes.
ward_linkage = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
# Fitting Hierarchical Clustering to the dataset
from sklearn.cluster import AgglomerativeClustering

# The distance measure is left at its default (Euclidean) instead of being passed as
# `affinity='euclidean'`: that keyword was renamed to `metric` and later removed from
# scikit-learn, so passing it raises a TypeError on current releases. Omitting it
# works on both old and new versions and still gives Euclidean distances.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
# fit_predict builds the hierarchy on X and returns one cluster index per row.
y_hc = hc.fit_predict(X)



In [None]:
# Visualising the clusters: one scatter series per hierarchical-clustering label (0..4).
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = X[y_hc == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {cluster_id + 1}')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

## Calculating Silhouette Score
#### Importing libraries:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
%matplotlib inline

#### Generating some random data:

###### To run the clustering algorithm, we generate 100 random points: 50 in the unit square and 50 more shifted by 2 along both axes.

In [None]:
# Seed NumPy's global RNG so the synthetic data, and every plot and score derived
# from it, is identical on each Restart & Run All. 42 matches the random_state used
# by the K-Means cells in the mall-customer section.
np.random.seed(42)

# Two well-separated blobs of 50 points each.
# NOTE: this rebinds X, which the mall-customer cells used for their feature matrix.
X = np.random.rand(50, 2)        # uniform in [0, 1) on both axes
Y = 2 + np.random.rand(50, 2)    # same shape, shifted into [2, 3) on both axes
Z = np.concatenate((X, Y))       # stacked row-wise: 100 rows, 2 columns
Z = pd.DataFrame(Z)  # converting into data frame for ease (columns are named 0 and 1)

###### Plotting the data:

In [None]:
# x and y are passed by keyword: seaborn 0.12+ accepts them as keyword-only
# arguments, so the positional form raises a TypeError there.
sns.scatterplot(x=Z[0], y=Z[1])

###### Applying KMeans Clustering with 2 clusters:

In [None]:
# K-Means with 2 clusters on the synthetic points. random_state is fixed, as in the
# mall-customer K-Means cells, so the labels are the same on every run.
KMean = KMeans(n_clusters=2, random_state=42)
KMean.fit(Z)
# Cluster index for each row of Z (the same data the model was fitted on).
label = KMean.predict(Z)

###### Calculating the silhouette score:

In [None]:
# Silhouette score of the 2-cluster labelling of Z.
score_n2 = silhouette_score(Z, label)
print(f'Silhouette Score(n=2): {score_n2}')

The silhouette score is close to 1, which indicates that the two clusters are well separated from each other.

To check whether the silhouette score is telling us the right thing, let’s draw another scatter plot with the points coloured by their cluster label.

In [None]:
# Same scatter plot, coloured by cluster label. x and y are passed by keyword because
# seaborn 0.12+ makes them keyword-only; the positional form raises a TypeError there.
sns.scatterplot(x=Z[0], y=Z[1], hue=label)

Let’s try with 3 clusters:

In [None]:
# Repeat the experiment with 3 clusters. random_state is fixed so the way K-Means
# splits the data, and therefore the printed score, is the same on every run.
KMean = KMeans(n_clusters=3, random_state=42)
KMean.fit(Z)
label = KMean.predict(Z)
print(f"Silhouette Score(n=3): {silhouette_score(Z, label)}")
# x and y are passed by keyword: seaborn 0.12+ makes them keyword-only, so the
# positional form raises a TypeError there.
sns.scatterplot(x=Z[0], y=Z[1], hue=label, palette="inferno_r")

### K-Means vs DBSCAN

In [None]:
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# reading the classic iris dataset into a df
# NOTE: `sns` is not imported in this cell; it relies on the `import seaborn as sns`
# executed in the silhouette-score section, so that cell must have run first.
iris_df = sns.load_dataset('iris')
# Bare last expression: the notebook renders the DataFrame as this cell's output.
iris_df

In [None]:
# Input features: every column of the iris frame except the "species" label,
# converted to a NumPy array.
iris_features = iris_df.drop(columns="species")
X = iris_features.values

In [None]:
# Arbitrarily selecting a range of values for K
K = range(1, 10)
sum_of_squared_distances = []
# Using scikit-learn's KMeans to record the sum of squared distances (inertia_) for each K.
# random_state is fixed, as in the mall-customer K-Means cells, so the elbow curve
# is identical on every run.
for k in K:
    model = KMeans(n_clusters=k, random_state=42).fit(X)
    sum_of_squared_distances.append(model.inertia_)
# "bx-" draws blue x markers joined by a solid line.
plt.plot(K, sum_of_squared_distances, "bx-")
plt.xlabel("K values")
plt.ylabel("Sum of Squared Distances")
plt.title("Elbow Method")
plt.show()

In [None]:
# Creating the KMeans object and fitting it to the Iris data.
# random_state is fixed, as in the mall-customer K-Means cells, so the labels and
# centroids reported by the following cells are the same on every run.
iris_kmeans = KMeans(n_clusters=3, random_state=42)
iris_kmeans.fit(X)


In [None]:
# Predicting the cluster labels
# X is the same array the model was fitted on, so this yields the cluster index
# assigned to each row of the iris feature matrix.
labels = iris_kmeans.predict(X)
print(labels)


In [None]:
# Finding the final centroids
# cluster_centers_ holds the cluster centres learned by the fitted K-Means model.
centroids = iris_kmeans.cluster_centers_
# Bare last expression: displayed as this cell's output.
centroids

In [None]:
# Evaluating the quality of clusters
# Silhouette score of the K-Means labelling of X, computed with Euclidean distances.
s = metrics.silhouette_score(X, labels, metric="euclidean")
# Reported rounded to two decimal places.
print(f"Silhouette Coefficient for the Iris Dataset Clusters: {s:.2f}")

In [None]:
# plotting the clusters using sepal_length and sepal_width
# (columns 0 and 1 of X; presumably the first two columns left after dropping
# "species"; confirm against iris_df.columns).
fig, ax = plt.subplots()
# Colour each point by its K-Means cluster label.
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap="rainbow")
# Label the figure so it can be read on its own.
ax.set_xlabel("sepal_length")
ax.set_ylabel("sepal_width")
ax.set_title("K-Means clusters of the Iris dataset")
plt.show()

In [None]:
# DBSCAN Clustering for the Iris Dataset using Scikit Learn
import pandas as pd
from sklearn import metrics
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
# reading the classic iris dataset into a df
# This reloads the same dataset the K-Means part loaded, rebinding iris_df.
# NOTE: `sns` comes from the seaborn import in the silhouette-score section.
iris_df = sns.load_dataset("iris")
# print() emits the frame as plain text, prefixed by the literal tag 'iris_df'.
print('iris_df', iris_df)

In [None]:
# Input features for DBSCAN: every iris column except the "species" label, as a NumPy array.
iris_features = iris_df.drop(columns="species")
X = iris_features.values

In [None]:
# Fit DBSCAN on the iris features with eps=0.5 and min_samples=5.
iris_dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X)

# One label per sample: -1 marks an outlier (noise); every other value is a
# cluster number, starting from 0.
labels = iris_dbscan.labels_
print(labels)


In [None]:
# Creating a boolean numpy array with one entry per sample, all False by default
core_samples_mask = np.zeros_like(labels, dtype=bool)
# Setting only DBSCAN's core samples to True. core_sample_indices_ lists core samples
# only, so border points and noise points (label -1) stay False.
core_samples_mask[iris_dbscan.core_sample_indices_] = True


In [None]:
# Finding the number of clusters in labels (ignoring noise if present).
# DBSCAN marks noise with the label -1, which is not a cluster, so one is subtracted
# from the count of distinct labels whenever -1 occurs.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Number of samples flagged as noise (label -1).
n_noise_ = list(labels).count(-1)

In [None]:
# Printing the number of clusters and number of noise points (outliers)
# Both values are formatted as integers (%d).
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Evaluating the quality of clusters
# The full DBSCAN label array is passed, including any -1 (noise) labels, so noise
# points are scored as if they formed a cluster of their own.
# NOTE(review): consider excluding noise before scoring; confirm the intended metric.
s = metrics.silhouette_score(X, iris_dbscan.labels_)
# Reported rounded to two decimal places.
print(f"Silhouette Coefficient for the Iris Dataset Clusters: {s:.2f}")