**Clustering Analysis of Mall Customers**

<img src="https://www.treebo.com/blog/wp-content/uploads/2018/02/Shopping-Malls-in-Delhi.jpg" width="900px">



**Importing some Basic Libraries**

In [None]:

# for basic operations: numerical arrays (numpy) and tabular data (pandas)
import numpy as np 
import pandas as pd 

# for data visualization: matplotlib/seaborn for static plots, plotly for interactive ones
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
# switch plotly to offline notebook mode so figures render inside the notebook
py.offline.init_notebook_mode(connected = True)
import plotly.graph_objs as go
# for path handling; list what is available in the input directory
import os
print(os.listdir("../input"))


**Reading the Data**

In [None]:
# Read the mall-customers CSV into the DataFrame `data`.
# The IPython `%time` magic reports how long the read takes.

%time data = pd.read_csv('../input/Mall_Customers.csv')

# print the shape of the loaded data as (rows, columns)
print(data.shape)


**Analysis of the Data**

In [None]:
# preview the first five rows of the data

data.head(n=5)

In [None]:
# summary statistics of the data, displayed as the cell output

data.describe()

In [None]:
# per-column flag: True if that column contains at least one missing value

pd.isnull(data).any()

**Data Visualization**

**KDE Plot**

In [None]:
# KDE pair grid: kernel density estimates on the diagonal and
# density contours (8 levels) on the off-diagonal panels

plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (15, 8)

pair_columns = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
g = sns.PairGrid(data[pair_columns])
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=8);

**Distribution Plots**

In [None]:
# one histogram-with-density panel per numeric feature, side by side
plt.style.use('bmh')

plt.figure(1, figsize=(18, 7))
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for position, column in enumerate(numeric_columns, start=1):
    plt.subplot(1, 3, position)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.distplot(data[column], color='red', bins=20)
    plt.title('Distplot of ' + column)
plt.show()


**Swarm and Boxen Plots**

In [None]:
# for each numeric feature, overlay a swarm plot on a boxen plot, split by gender
plt.style.use('seaborn-deep')
plt.figure(1, figsize=(15, 7))

features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for panel, feature in enumerate(features, start=1):
    plt.subplot(1, 3, panel)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.boxenplot(x=feature, y='Gender', data=data, palette='spring')
    sns.swarmplot(x=feature, y='Gender', data=data, palette='rainbow')

    # only the first panel keeps the y label
    if panel == 1:
        plt.ylabel('Gender')
    else:
        plt.ylabel('')

    # only the middle panel carries the title
    if panel == 2:
        plt.title('Boxplots & Swarmplots')
    else:
        plt.title('')
plt.show()


**Pie Chart**

In [None]:
# number of customers in each gender category

gender_column = data['Gender']
gender_column.value_counts()

In [None]:
# drawing a pie chart to represent the ratio of male and female customers in the mall

# Take labels and slice sizes from the data instead of hard-coding them,
# so the chart stays correct if the dataset changes.
gender_counts = data['Gender'].value_counts()
labels = gender_counts.index.tolist()
size = gender_counts.values.tolist()
colors = ['lightgreen', 'orange']
# the largest slice stays in place; every other slice is pulled out slightly
# (explode must have one entry per slice)
explode = [0] + [0.1] * (len(size) - 1)

plt.style.use('seaborn-dark')
plt.rcParams['figure.figsize'] = (7, 7)
plt.pie(size, colors = colors, explode = explode, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('A pie chart Representing the Gender', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# bar chart of how many customers there are at each age

plt.rcParams['figure.figsize'] = (18, 7)
plt.style.use('seaborn-dark-palette')
# pass the column as `x=`: in recent seaborn releases the first positional
# argument of countplot is `data`, not `x`
sns.countplot(x = data['Age'], palette = 'spring')
plt.title('Age Variations', fontsize = 20)

In [None]:
# bar chart of how many customers there are at each spending score

plt.rcParams['figure.figsize'] = (18, 7)
plt.style.use('seaborn-deep')
# pass the column as `x=`: in recent seaborn releases the first positional
# argument of countplot is `data`, not `x`
sns.countplot(x = data['Spending Score (1-100)'], palette = 'rainbow')
plt.title('Spend Score Variations', fontsize = 20)
# rotate the tick labels so the many score values stay readable
plt.xticks(rotation = 90)

In [None]:
# plotting a pairplot of the columns in the data

plt.style.use('seaborn-deep')
sns.pairplot(data)
# pairplot draws a grid of subplots; plt.title would only title the last
# subplot, so set a figure-level title (y > 1 keeps it above the top row)
plt.suptitle('Pair plot for the Data', fontsize = 20, y = 1.02)

In [None]:
# plotting a scatter matrix from the pandas library

plt.style.use('dark_background')
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias is no longer available in current pandas
pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
# figure-level title: plt.title would only title the last subplot of the grid
plt.suptitle('Scatter Matrix', fontsize = 20)

In [None]:
# plotting a line plot of spending score against annual income

y = data['Spending Score (1-100)']
x = data['Annual Income (k$)']

plt.style.use('seaborn-muted')
plt.plot(x, y, color = 'green')
# the title names the two variables that are actually plotted (x vs y)
plt.title('Annual Income vs Spending Score', fontsize = 20)
plt.ylabel('Spending Score (1-100)')
plt.xlabel('Annual Income')

In [None]:
# line plot of age (y axis) against annual income (x axis)

plt.rcParams['figure.figsize'] = (18, 7)
plt.style.use('Solarize_Light2')

plt.plot(data['Annual Income (k$)'], data['Age'], color='red')
plt.xlabel('Annual Income')
plt.ylabel('Age')
plt.title('Annual Income vs Age', fontsize=20)

**Clustering Analysis**

In [None]:
# build the feature matrix to be clustered: annual income and spending score
# (two features are used so the clusters can be drawn on a 2-D scatter plot)

# select the columns by name rather than by position, so the selection does
# not silently change if the column order of the CSV changes
x = data[['Annual Income (k$)', 'Spending Score (1-100)']].values

print(x.shape)

**Kmeans Clustering**

**The Elbow Method**

The elbow method is used to estimate the optimal number of clusters in a dataset.

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the within-cluster sum
# of squares (the fitted model's inertia_) for each k.
# A good k is where the curve bends and the decrease starts to level off.

from sklearn.cluster import KMeans

cluster_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(x)
    .inertia_
    for k in cluster_counts
]

plt.style.use('fivethirtyeight')
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.show()

**Visualizing the Cluster**

In [None]:
# fit K-means with five clusters and draw each cluster in its own colour

km = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_means = km.fit_predict(x)

plt.style.use('fivethirtyeight')

# (colour, legend label) for cluster ids 0..4, in that order
segments = [
    ('pink', 'miser'),
    ('yellow', 'general'),
    ('cyan', 'target'),
    ('magenta', 'spendthrift'),
    ('orange', 'careful'),
]
for cluster_id, (colour, segment) in enumerate(segments):
    members = x[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=200, c=colour, label=segment)

# overlay the fitted cluster centres
centres = km.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=50, c='blue', label='centeroid')

plt.title('K Means Clustering', fontsize=20)
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()
plt.show()



**Hierarchical Clustering**

**Dendrograms**

A dendrogram is another way to estimate the optimal number of clusters in a dataset.

In [None]:
# method number 2
# hierarchical clustering: build the Ward linkage and draw its dendrogram

import scipy.cluster.hierarchy as sch

ward_linkage = sch.linkage(x, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')
plt.title('Dendrogam', fontsize=20)
plt.show()

**Visualizing the Cluster**

In [None]:
# plotting the clusters found by agglomerative (hierarchical) clustering

from sklearn.cluster import AgglomerativeClustering

# Ward linkage only works with Euclidean distance, which is also the default,
# so the distance argument is left out. It is named `affinity` in older
# scikit-learn releases and `metric` in newer ones; omitting it works on both.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
y_hc = hc.fit_predict(x)

# NOTE(review): cluster ids are arbitrary, so these segment names (the same
# ones used for the K-means plot) may not describe the same customer groups
# here; confirm against the rendered plot.
segments = [
    ('pink', 'miser'),
    ('yellow', 'general'),
    ('cyan', 'target'),
    ('magenta', 'spendthrift'),
    ('orange', 'careful'),
]
for cluster_id, (colour, segment) in enumerate(segments):
    members = x[y_hc == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 200, c = colour, label = segment)

# Agglomerative clustering has no fitted centres, so plot the mean of each
# hierarchical cluster rather than the centres of the K-means model.
hc_centers = np.array([x[y_hc == cluster_id].mean(axis = 0) for cluster_id in range(len(segments))])
plt.scatter(hc_centers[:, 0], hc_centers[:, 1], s = 50, c = 'blue' , label = 'centeroid')

plt.title('Hierarchial Clustering', fontsize = 20)
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()
plt.show()

We can see that K-means and hierarchical clustering give almost the same result here, with only very minor differences between them. In most cases where the dataset is small, as it is here, the two methods work similarly.

**Thank you for Reading the Kernel!**