In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every figure below.
# set_theme() is the preferred spelling; sns.set() is only a legacy alias of it.
sns.set_theme()

In [None]:
# Load the mall-customers dataset (path is relative to the notebook's working directory).
df = pd.read_csv('Mall_Customers.csv')
# head must be *called*: the bare attribute `df.head` only echoes the bound-method
# repr instead of showing the first rows.
df.head()

# EDA

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics produced by pandas' describe(), shown as the cell output.
df.describe()

In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

## CustomerID

In [None]:
# Remove the CustomerID column before the analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns='CustomerID', errors='ignore')
# head must be *called*; the bare attribute `df.head` only echoes the method repr.
df.head()

## Gender

In [None]:
# Number of customers per gender.
# Trailing ';' suppresses the Axes repr, matching the other plotting cells.
sns.countplot(data=df, x='Gender');

## Age

In [None]:
# Distribution of customer age, with a kernel-density overlay.
sns.histplot(data=df, x='Age', kde=True);

## Annual Income (k$)

In [None]:
# Shorten the income column name so it can be used with attribute access (df.Income).
df.rename(columns={'Annual Income (k$)': 'Income'}, inplace=True)

In [None]:
# Distribution of annual income, with a kernel-density overlay.
sns.histplot(data=df, x='Income', kde=True);

In [None]:
# Income against age, coloured by gender.
sns.scatterplot(data=df, x='Age', y='Income', hue='Gender');

## Spending Score (1-100)

In [None]:
# Shorten the spending-score column name so it can be used with attribute access (df.Score).
df.rename(columns={'Spending Score (1-100)': 'Score'}, inplace=True)

In [None]:
# Distribution of spending score, with a kernel-density overlay.
sns.histplot(data=df, x='Score', kde=True);

In [None]:
# Income against spending score.
sns.scatterplot(data=df, x='Score', y='Income');

In [None]:
# Age against spending score.
sns.scatterplot(data=df, x='Score', y='Age');

In [None]:
# Income against spending score again, shown right above the discussion of its clusters.
sns.scatterplot(data=df, x='Score', y='Income');

This is the most informative visualization so far: we can observe roughly 5 clusters:
1. Low Score, Low Income
2. Low Score, High Income
3. Mid Score, Mid Income
4. High Score, Low Income
5. High Score, High Income

**Considering only those two features, we can build our first clustering model.**

In [None]:
# Feature matrix for clustering: column 0 is Income, column 1 is Score.
X = df[['Income', 'Score']].to_numpy()

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X first, then use the fitted scaler to transform the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Clustering using K-means

In [None]:
from sklearn.cluster import KMeans

# Fit a 5-cluster K-means model on the scaled features and keep the cluster
# label returned for each row. random_state is fixed so the run is repeatable.
km5 = KMeans(n_clusters= 5, init='k-means++', random_state=0)
y_pred = km5.fit_predict(X_scaled)

In [None]:
# Same model configuration, this time fitted with fit() rather than fit_predict().
km5 = KMeans(n_clusters=5, init='k-means++', random_state=0).fit(X_scaled)
# Echo the fitted estimator as the cell output.
km5

In [None]:
# True when every label from fit_predict() equals the refitted model's labels_.
np.all(y_pred == km5.labels_)

In [None]:
# Shape of the label array stored on the fitted model.
np.shape(km5.labels_)

In [None]:
# Cluster labels stored on the fitted model.
km5.labels_

In [None]:
# Inertia of the fitted model: the sum of squared distances from each sample
# to its closest cluster centre (lower means tighter clusters).
km5.inertia_

In [None]:
# Refit the same configuration, then evaluate it on the training data.
# For K-means, score() is the negative of the inertia (higher is better).
km5 = KMeans(n_clusters=5, init='k-means++', random_state=0).fit(X_scaled)
km5.score(X_scaled)

In [None]:
# Shape of the unscaled feature matrix.
np.shape(X)

In [None]:
# Boolean-mask indexing: the rows of X whose predicted cluster label is 0.
X[y_pred == 0]

In [None]:
# Two-step form: select the cluster-0 rows first, then take column 0 of the result.
X[y_pred == 0][:,0]

In [None]:
# Single-step form: row mask and column index combined in one indexing expression.
X[y_pred == 0,0]

In [None]:
# Column 1 of the cluster-0 rows, using the same single-step indexing.
X[y_pred == 0,1]

In [None]:
# Plot only the points labelled 0: column 0 of X on the x-axis, column 1 on the y-axis.
plt.scatter(X[y_pred == 0, 0], X[y_pred == 0, 1], s = 100, c = 'red', label = 'Cluster 1')

In [None]:
# Visualizing all the clusters
# One scatter layer per cluster label, drawn in label order so that the colours
# and legend entries ('Cluster 1' .. 'Cluster 5') line up with labels 0 .. 4.
cluster_colours = ['red', 'blue', 'green', 'yellow', 'brown']

plt.figure(figsize=(10, 10))
for label, colour in enumerate(cluster_colours):
    members = X[y_pred == label]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=f'Cluster {label + 1}')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

**We can try different numbers of clusters and use the Elbow Method to find the optimal one.**

## Elbow Method

In [None]:
# Within-cluster Sum of Squares (Inertia)
# The models are fitted on X_scaled, the same standardized features used for the
# 5-cluster model, so the elbow is measured in the feature space that is
# actually being clustered (the raw X lives on a different scale).
inertia = []
k_range = range(2, 11)

for i in k_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Visualizing the ELBOW method to get the optimal value of K
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(k_range), y=inertia, ax=ax)
ax.set_title('The Elbow Method')
ax.set_xlabel('No of clusters "k"')
ax.set_ylabel('Inertia')

# Annotate arrow
# The arrow tip is anchored to the computed inertia at k = 5 rather than to
# hard-coded y values, so it stays on the curve whatever the scale of the inertia.
elbow_k = 5
elbow_inertia = inertia[k_range.index(elbow_k)]
label_height = elbow_inertia + 0.2 * (max(inertia) - min(inertia))
ax.annotate('Optimal No. of clusters', xy=(elbow_k, elbow_inertia),
            xytext=(elbow_k, label_height), xycoords='data',
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2))

plt.show()


> ## The Elbow Method shows that the optimal number of clusters is 5