# GUC Clustering Project 

**Objective:** 
The objective of this project is to teach students how to apply clustering to real data sets.

The project aims to teach students: 
* Which clustering approach to use
* How to compare K-means, hierarchical clustering, DBSCAN, and Gaussian mixtures  
* How to tune the parameters of each clustering approach
* What the effect of different distance functions is (optional) 
* How to evaluate clustering approaches 
* How to display the output
* What the effect of normalizing the data is 

Students in this project will use ready-made functions from Sklearn, plotnine, numpy and pandas 
 



In [None]:
# If plotnine is not installed in Jupyter, run the following command to install it:
# !pip install plotnine


Running this project requires the following imports:

In [None]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
from sklearn.datasets import make_blobs
from plotnine import *   
# StandardScaler is a function to normalize the data 
# You may also check MinMaxScaler and MaxAbsScaler 
#from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from sklearn.metrics import silhouette_score
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN



%matplotlib inline

In [None]:
# Helper that displays data in 2 dimensions and highlights the clusters.
def display_cluster(X, km=None, num_clusters=0):
    """Scatter-plot the first two columns of ``X``, optionally coloured per cluster.

    Parameters
    ----------
    X : array
        Data indexed as ``X[:, 0]`` and ``X[:, 1]``; only these two columns
        are drawn.
    km : fitted clustering model, optional
        Only read when ``num_clusters`` is non-zero, in which case it must
        expose ``labels_`` and ``cluster_centers_``.
    num_clusters : int
        Number of cluster labels (``0 .. num_clusters - 1``) to draw.
        ``0`` draws every point in a single colour.

    Returns
    -------
    None
        Points are drawn on the current matplotlib axes.
    """
    palette = 'brgcmyk'  # one matplotlib colour code per cluster
    alpha = 0.5          # point opacity
    size = 20

    if num_clusters == 0:
        plt.scatter(X[:, 0], X[:, 1], c=palette[0], alpha=alpha, s=size)
        return

    for i in range(num_clusters):
        # Wrap around the palette so more than len(palette) clusters do not
        # raise IndexError; colours simply repeat.
        shade = palette[i % len(palette)]
        members = km.labels_ == i
        plt.scatter(X[members, 0], X[members, 1], c=shade, alpha=alpha, s=size)
        centre = km.cluster_centers_[i]
        plt.scatter(centre[0], centre[1], c=shade, marker='x', s=100)

## Multi Blob Data Set 
* The data set generated below has 6 clusters with varying numbers of samples and varying densities
* Cluster the data set below using each of the approaches described in the following sections



In [None]:
# Global plot appearance: 8x8 figures, seaborn "whitegrid" style, "talk" context.
plt.rcParams['figure.figsize'] = [8, 8]
sns.set_style("whitegrid")
sns.set_context("talk")

# Synthetic data: one blob per centre, each with its own sample count and
# standard deviation. shuffle=False keeps the samples grouped by blob and
# random_state pins the draw.
n_bins = 6  # not referenced again in the visible part of the notebook
centers = [(-3, -3), (0, 0), (5, 2.5), (-1, 4), (4, 6), (9, 7)]
Multi_blob_Data, y = make_blobs(
    n_samples=[100, 150, 300, 400, 300, 200],
    n_features=2,
    cluster_std=[1.3, 0.6, 1.2, 1.7, 0.9, 1.7],
    centers=centers,
    shuffle=False,
    random_state=42,
)

# No model is passed, so every point is drawn in a single colour.
display_cluster(Multi_blob_Data)

### Kmeans 
* Use Kmeans with different values of K to cluster the above data 
* Display the outcome of each value of K 
* Plot distortion function versus K and choose the appropriate value of k 
* Plot the silhouette_score versus K and use it to choose the best K 
* Store the silhouette_score for the best K for later comparison with other clustering techniques. 

In [None]:
def Kmeans(DATASET):
    """Run K-means for K = 2..10, plot every clustering and the selection curves.

    For each K the fitted labels are drawn in a 3x3 grid of scatter plots
    (first two columns of the data). Afterwards the distortion (inertia)
    and the silhouette score are plotted against K, and the best silhouette
    score is printed.

    Parameters
    ----------
    DATASET : array
        Samples to cluster, indexed as ``DATASET[:, 0]`` / ``DATASET[:, 1]``
        for plotting.

    Returns
    -------
    float
        The highest silhouette score found over all K, so it can be stored
        for comparison with the other clustering techniques.
    """
    X = DATASET
    K_range = range(2, 11)
    DATASET_distortion = []
    DATASET_silhouette_scores = []
    best_score = -1  # lower bound of the silhouette score

    # One subplot per value of K (9 values -> 3x3 grid).
    fig, axs = plt.subplots(3, 3, figsize=(15, 15))
    axs = axs.ravel()

    for ax, k in zip(axs, K_range):
        # Seeded so the reported best score is reproducible, like the seeded
        # make_blobs and GaussianMixture calls elsewhere in this notebook.
        model = KMeans(n_clusters=k, random_state=42)
        model.fit(X)

        # inertia_: sum of squared distances of samples to their centroid.
        DATASET_distortion.append(model.inertia_)

        # Computed once and reused for both the curve and the running best.
        score = silhouette_score(X, model.labels_)
        DATASET_silhouette_scores.append(score)
        if score > best_score:
            best_score = score

        ax.scatter(X[:, 0], X[:, 1], c=model.labels_)
        ax.set_title(f"K={k}")

    plt.show()

    plt.plot(K_range, DATASET_distortion, 'bx-')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Distortion Value')
    plt.title('Elbow Method for Optimal K')
    plt.show()

    plt.plot(K_range, DATASET_silhouette_scores, 'bx-')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.show()

    print("Best Shilhouette Score is")
    print(best_score)
    return best_score


In [None]:
# Run the K-means sweep (K = 2..10) on the synthetic multi-blob data.
Kmeans(Multi_blob_Data)


### Hierarchical Clustering
* Use the AgglomerativeClustering function to cluster the above data 
* In AgglomerativeClustering change the following parameters 
    * Affinity (use euclidean, manhattan and cosine)
    * Linkage (use average and single)
    * Distance_threshold (try different values)
* For each of these trials plot the dendrogram, calculate the silhouette_score and display the resulting clusters  
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques. 
* Record your observation 

In [None]:
def _agglomerative(metric_name, linkage_name, distance_threshold):
    """Build an AgglomerativeClustering that works across scikit-learn versions."""
    shared = dict(
        linkage=linkage_name,
        n_clusters=None,
        compute_full_tree=True,
        distance_threshold=distance_threshold,
    )
    try:
        # Current keyword; ``affinity`` was deprecated in favour of ``metric``.
        return AgglomerativeClustering(metric=metric_name, **shared)
    except TypeError:
        # Older releases only accept the ``affinity`` keyword.
        return AgglomerativeClustering(affinity=metric_name, **shared)


def HierarchalClustering(Multi_blob_Data):
    """Sweep agglomerative clustering over metric, linkage and distance threshold.

    For every (metric, linkage) pair a truncated dendrogram is plotted, then
    the data is clustered once per distance threshold, each result is shown
    as a scatter plot of the first two columns, and the silhouette score is
    used to track the best parameter set, which is printed at the end.

    Parameters
    ----------
    Multi_blob_Data : array
        Samples to cluster, indexed as ``[:, 0]`` / ``[:, 1]`` for plotting.

    Returns
    -------
    float
        Best silhouette score found (-1 if no trial produced a scorable
        clustering), for comparison with the other techniques.
    """
    Affinity = ['euclidean', 'cosine']
    Linkage = ['average', 'single']
    # Thresholds are chosen per combination instead of overwriting one shared
    # variable, so the loop order can no longer leak the cosine range into
    # later combinations.
    default_thresholds = np.linspace(0.01, 10, 10)
    cosine_single_thresholds = np.linspace(0.001, 1, 10)
    n_samples = len(Multi_blob_Data)

    best_score = -1
    best_params = {}
    for a in Affinity:
        for l in Linkage:
            if a == "cosine" and l == "single":
                thresholds = cosine_single_thresholds
            else:
                thresholds = default_thresholds

            # Linkage matrix used only for the dendrogram of this combination.
            linkage_data = linkage(Multi_blob_Data, method=l, metric=a)
            plt.figure()
            dendrogram(linkage_data, truncate_mode='lastp', p=20,
                       leaf_rotation=65., leaf_font_size=15.,
                       show_contracted=True)
            plt.title(f"{l.capitalize()} linkage with {a} distance")
            plt.xlabel('Points index')
            plt.ylabel('Distance')
            plt.show()

            for d in thresholds:
                clustering = _agglomerative(a, l, d)
                clustering.fit(Multi_blob_Data)
                plt.scatter(Multi_blob_Data[:, 0], Multi_blob_Data[:, 1],
                            c=clustering.labels_, cmap='viridis')
                plt.title(f"{l.capitalize()} linkage with {a} distance , Distance_Threshold= {d}")
                plt.show()

                # silhouette_score needs between 2 and n_samples - 1 labels;
                # a tiny threshold can leave every point in its own cluster.
                n_found = len(np.unique(clustering.labels_))
                if 1 < n_found < n_samples:
                    score = silhouette_score(Multi_blob_Data, clustering.labels_)
                    if score > best_score:
                        best_score = score
                        best_params = {'affinity': a, 'linkage': l, 'distance_threshold': d}

    print("Set of paramters that would find result in the best silhouette score")
    print(best_params)
    return best_score
    

In [None]:
# Run the agglomerative-clustering parameter sweep on the synthetic multi-blob data.
HierarchalClustering(Multi_blob_Data)


### DBSCAN
* Use the DBSCAN function to cluster the above data 
* In DBSCAN change the following parameters 
    * EPS (from 0.1 to 3)
    * Min_samples (from 5 to 25)
* Plot the silhouette_score versus the variation in the EPS and the min_samples
* Plot the resulting clusters in this case 
* Find the set of parameters that results in the best silhouette_score and store this score for later comparison with other clustering techniques. 
* Record your observations and comments 

In [None]:
def dbscan(X):
    """Grid-search DBSCAN over ``eps`` and ``min_samples`` and plot the results.

    Every (eps, min_samples) combination is fitted and shown as a scatter
    plot of the first two columns of ``X``. Combinations that yield more
    than one label are scored with the silhouette score; the others keep a
    score of 0 in the grid. The best parameters are printed and the score
    grid is plotted against each parameter.

    Parameters
    ----------
    X : array
        Samples to cluster, indexed as ``X[:, 0]`` / ``X[:, 1]`` for plotting.

    Returns
    -------
    float
        Best silhouette score found (-1 if no combination could be scored).
    """
    eps_values = np.arange(0.1, 3.1, 0.1)
    min_samples_values = np.arange(5, 26)
    best_silhouette_score = -1
    best_params = {}
    # Score grid: rows follow eps_values, columns follow min_samples_values.
    silhouette_scores = np.zeros((len(eps_values), len(min_samples_values)))

    for i, eps in enumerate(eps_values):
        for j, min_samples in enumerate(min_samples_values):
            # Named ``model`` so the estimator does not shadow this function.
            model = DBSCAN(eps=eps, min_samples=min_samples)
            model.fit(X)
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_)
            plt.title(f'DBSCAN Clustering (eps={eps}, min_samples={min_samples})')
            plt.show()

            # The silhouette score is undefined for a single label.
            # NOTE(review): all labels are passed to the score unchanged, so
            # any noise label is scored like a cluster - confirm this is intended.
            if len(np.unique(model.labels_)) > 1:
                score = silhouette_score(X, model.labels_)
                silhouette_scores[i, j] = score
                if score > best_silhouette_score:
                    best_silhouette_score = score
                    best_params = {'EPS': eps, 'min_samples_values': min_samples}

    print("------------------------------------------------------------")
    print("Set of paramters that would find result in the best silhouette score")
    print(best_params)
    print("------------------------------------------------------------")

    # Silhouette score versus eps, one line per min_samples value.
    fig, ax = plt.subplots(figsize=(10, 6))
    for j, min_samples in enumerate(min_samples_values):
        ax.plot(eps_values, silhouette_scores[:, j], label=f'min_samples = {min_samples}')
    ax.set_xlabel('eps')
    ax.set_ylabel('Silhouette Score')
    ax.set_title('Silhouette Score versus eps for different min_samples values')
    ax.legend()
    plt.show()

    # Silhouette score versus min_samples, one line per eps value.
    fig, ax = plt.subplots(figsize=(10, 6))
    for i, eps in enumerate(eps_values):
        ax.plot(min_samples_values, silhouette_scores[i, :], label=f'eps = {eps}')
    ax.set_xlabel('min_samples')
    ax.set_ylabel('Silhouette Score')
    ax.set_title('Silhouette Score versus min_samples for different eps values')
    ax.legend()
    plt.show()
    return best_silhouette_score


In [None]:
# Run the DBSCAN eps/min_samples grid search on the synthetic multi-blob data;
# the call returns the best silhouette score found.
dbscan(Multi_blob_Data)


### Gaussian Mixture
* Use GaussianMixture function to cluster the above data 
* In GMM change the covariance_type and check the difference in the resulting probability fit 
* Use a 2D contour plot to plot the resulting distribution (the components of the GMM) as well as the total Gaussian mixture 

In [None]:
from sklearn.metrics import silhouette_score

def gaussian(normalized_data):
    """Fit Gaussian mixtures for K = 2..5 and four covariance types, and plot them.

    For each K a 2x2 figure is drawn with one panel per covariance type,
    showing the predicted labels (first two columns of the data) with a KDE
    contour overlay. The silhouette scores per K are printed, followed by
    the best K and its score.

    Parameters
    ----------
    normalized_data : array
        Samples to cluster, indexed as ``[:, 0]`` / ``[:, 1]`` for plotting.

    Returns
    -------
    float
        Best silhouette score found (-1.0 if no fit could be scored).
    """
    covariance_types = ['full', 'tied', 'diag', 'spherical']
    K_range = range(2, 6)
    # Start at the silhouette lower bound and pre-bind best_K so the final
    # print cannot fail when no score is positive.
    max_silhouette_score = -1.0
    best_K = None

    for K in K_range:
        silhouette_scores = []
        fig, axes = plt.subplots(2, 2, figsize=(12, 12))
        axes = axes.ravel()
        for ax, cov_type in zip(axes, covariance_types):
            gmm = GaussianMixture(n_components=K, covariance_type=cov_type, random_state=42)
            gmm.fit(normalized_data)
            labels = gmm.predict(normalized_data)

            # The silhouette score is undefined for a single label; record
            # NaN instead of letting silhouette_score raise.
            if len(np.unique(labels)) > 1:
                score = silhouette_score(normalized_data, labels)
            else:
                score = float('nan')
            silhouette_scores.append(score)

            # NaN compares False here, so unscored fits never become the best.
            if score > max_silhouette_score:
                max_silhouette_score = score
                best_K = K

            sns.scatterplot(x=normalized_data[:, 0], y=normalized_data[:, 1], hue=labels, ax=ax)
            sns.kdeplot(x=normalized_data[:, 0], y=normalized_data[:, 1], levels=5, color='w', linewidths=1.5, ax=ax)
            ax.set_title('Covariance Type: {}, K={}'.format(cov_type, K))
        plt.tight_layout()
        plt.show()
        print("K={}, Silhouette Scores: {}".format(K, silhouette_scores))

    print("Best K value: {}, Best Silhouette Score: {}".format(best_K, max_silhouette_score))
    return max_silhouette_score
    
    

In [None]:
# Fit the Gaussian-mixture sweep on the synthetic multi-blob data
# (passed as generated; no scaler is applied to it in this notebook).
gaussian(Multi_blob_Data)


## iris data set 
The iris data set is a test data set that is part of the Sklearn module 
which contains 150 records each with 4 features. All the features are represented by real numbers 

The data represents three classes 


In [None]:
from sklearn.datasets import load_iris

# Load the iris data set object returned by scikit-learn.
iris_data = load_iris()
# The remaining lines are bare expressions: they are evaluated but not stored.
# Look up the class index of three sample rows.
iris_data.target[[10, 25, 50]]
# Result recorded by the original author: array([0, 0, 1])
# List the class names.
list(iris_data.target_names)
# Literal list of the three class names; presumably pasted example output - TODO confirm.
['setosa', 'versicolor', 'virginica']


In [None]:
# Reload iris and keep only the feature matrix (this rebinds ``iris_data``).
iris = load_iris()
iris_data = iris.data

# Fit the min-max scaler on the features, then apply it to the same data.
scaler = MinMaxScaler().fit(iris_data)
iris_data_normalized = scaler.transform(iris_data)

# K-means sweep on the scaled iris features.
Kmeans(iris_data_normalized)



In [None]:
# Agglomerative-clustering parameter sweep on the min-max-scaled iris features.
HierarchalClustering(iris_data_normalized)

In [None]:
# DBSCAN eps/min_samples grid search on the min-max-scaled iris features.
dbscan(iris_data_normalized)

In [None]:
# Gaussian-mixture sweep on the min-max-scaled iris features.
gaussian(iris_data_normalized)


* Repeat all the above clustering approaches and steps on the above data 
* Normalize the data then repeat all the above steps 
* Compare between the different clustering approaches 

## Customer dataset
Repeat all the above on the customer data set 

In [None]:
import os

# Location of the customer CSV. The original machine-specific path stays the
# default; set the CUSTOMER_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
customer_csv_path = os.environ.get(
    "CUSTOMER_DATA_PATH",
    r"C:\Users\MostafaShehab\Desktop\GU\ML-1\Customerdata.csv",
)
Customer_dataset = pd.read_csv(customer_csv_path)

# Keep only the three columns used for clustering.
Customer_dataset1 = Customer_dataset[['Age', 'Income', 'Settlement size']]

# Min-max scale the selected columns; the result is what the clustering
# helpers below receive.
scaler = MinMaxScaler()
Customer_dataset_normalized = scaler.fit_transform(Customer_dataset1)

In [None]:
# K-means sweep (K = 2..10) on the min-max-scaled customer columns.
Kmeans(Customer_dataset_normalized)



In [None]:
# Agglomerative-clustering parameter sweep on the min-max-scaled customer columns.
HierarchalClustering(Customer_dataset_normalized)

In [None]:
# DBSCAN eps/min_samples grid search on the min-max-scaled customer columns.
dbscan(Customer_dataset_normalized)

In [None]:
# Gaussian-mixture sweep on the min-max-scaled customer columns.
gaussian(Customer_dataset_normalized)