In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [2]:
# Load the raw customer table from the CSV expected next to the notebook.
source_csv = 'Mall_Customers.csv'
data = pd.read_csv(source_csv)

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
# Print the column dtypes and non-null counts (used here to check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [5]:
# Summary statistics for the numeric columns (Gender is still text here, so it is excluded).
data.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [6]:
# Look at the last rows of the table as well.
data.tail()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18
199,200,Male,30,137,83


In [7]:
# Check which Gender labels occur, and how often, before encoding the column.
data['Gender'].value_counts()

Gender
Female    112
Male       88
Name: count, dtype: int64

In [8]:
# Encode Gender as an integer feature: 'Male' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run. Without it, a second run
# compares the already-encoded integers to 'Male' and silently turns every
# row into 0.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = (data['Gender'] == 'Male').astype('int64')

In [9]:
# Frequency of each individual age; used to choose the age brackets below.
data['Age'].value_counts()

Age
32    11
35     9
19     8
31     8
30     7
49     7
40     6
38     6
47     6
27     6
36     6
23     6
34     5
20     5
29     5
50     5
48     5
21     5
24     4
18     4
28     4
67     4
59     4
54     4
43     3
60     3
45     3
39     3
33     3
37     3
22     3
25     3
46     3
68     3
52     2
44     2
66     2
57     2
26     2
53     2
42     2
63     2
70     2
51     2
58     2
65     2
41     2
55     1
69     1
64     1
56     1
Name: count, dtype: int64

In [10]:
# Bucket Age into ordinal brackets:
#   0 -- 18 - 28
#   1 -- 29 - 38
#   2 -- 39 - 48
#   3 -- 49 - 60
#   4 -- 61 - 70
#   5 -- anything outside 18-70, or a missing age (catch-all, as before)
# pd.cut replaces the nested-ternary lambda. Bins are right-inclusive and
# include_lowest=True keeps 18 in the first bracket, so integer ages map
# exactly as before. Fractional ages between brackets (e.g. 28.5) now fall
# into the upper bracket instead of the catch-all code.
age_bin_edges = [18, 28, 38, 48, 60, 70]
data['Age_Group'] = (
    pd.cut(data['Age'], bins=age_bin_edges, labels=False, include_lowest=True)
    .fillna(5)          # out-of-range / missing ages come back as NaN from pd.cut
    .astype('int64')
)

In [11]:
# Number of customers in each age bracket; code 5 appearing here would mean an age outside 18-70.
data['Age_Group'].value_counts()

Age_Group
1    63
0    50
3    35
2    35
4    17
Name: count, dtype: int64

In [12]:
# Age is now represented by Age_Group, so drop the raw column.
# Rebinding instead of inplace=True, together with errors='ignore', lets
# the cell be re-run without raising KeyError once 'Age' is already gone.
data = data.drop(columns='Age', errors='ignore')

In [13]:
# Inspect the last ten rows to confirm the encoded Gender and the new Age_Group column.
data.tail(10)

Unnamed: 0,CustomerID,Gender,Annual Income (k$),Spending Score (1-100),Age_Group
190,191,0,103,23,1
191,192,0,103,69,1
192,193,1,113,8,1
193,194,0,113,91,1
194,195,0,120,16,2
195,196,0,120,79,1
196,197,0,126,28,2
197,198,1,126,74,1
198,199,1,137,18,1
199,200,1,137,83,1


In [14]:
# Assemble the feature matrix for clustering (CustomerID is deliberately left out).
# NOTE(review): the columns are used unscaled. Income and spending score span
# far wider ranges than Gender (0/1) and Age_Group (0-4), so they will dominate
# the Euclidean distances k-means uses -- consider standardising the features.
feature_columns = ['Gender', 'Annual Income (k$)', 'Spending Score (1-100)', 'Age_Group']
X = data.loc[:, feature_columns].to_numpy()

In [15]:
# Fit k-means with five clusters on X; random_state is fixed so repeated runs
# start from the same initialisation.
kmeans = KMeans(random_state=42, n_clusters=5).fit(X)
kmeans

KMeans(n_clusters=5, random_state=42)

In [16]:
# Attach each customer's cluster label, then order the table by cluster.
# kmeans.labels_ follows the row order X was built in, i.e. the default
# 0..n-1 index from read_csv. Wrapping it in a Series with that index makes
# the assignment align on index rather than position. Assigning the raw array
# is positional, so re-running this cell after the frame has been sorted
# would attach labels to the wrong customers.
cluster_labels = pd.Series(kmeans.labels_, index=pd.RangeIndex(len(kmeans.labels_)))
data['Cluster'] = cluster_labels
data = data.sort_values(by='Cluster')

In [17]:
# First ten rows after sorting by cluster label.
data.head(10)

Unnamed: 0,CustomerID,Gender,Annual Income (k$),Spending Score (1-100),Age_Group,Cluster
99,100,1,61,49,0,0
83,84,0,54,44,2,0
82,83,1,54,41,4,0
81,82,1,54,55,1,0
80,81,1,54,51,3,0
79,80,0,54,42,3,0
78,79,0,54,52,0,0
77,78,1,54,48,2,0
76,77,0,54,53,2,0
75,76,1,54,54,0,0


In [18]:
# Turn the integer Gender codes back into labels for the exported file:
# 1 -> 'Male', anything else -> 'Female'.
# The dtype guard makes the cell safe to re-run. Without it, a second run
# compares the already-decoded strings to 1 and turns every row into 'Female'.
if pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = np.where(data['Gender'] == 1, 'Male', 'Female')
data.head(6)

Unnamed: 0,CustomerID,Gender,Annual Income (k$),Spending Score (1-100),Age_Group,Cluster
99,100,Male,61,49,0,0
83,84,Female,54,44,2,0
82,83,Male,54,41,4,0
81,82,Male,54,55,1,0
80,81,Male,54,51,3,0
79,80,Female,54,42,3,0


In [19]:
# Number of customers assigned to each cluster.
data['Cluster'].value_counts()

Cluster
0    81
4    39
1    35
2    23
3    22
Name: count, dtype: int64

In [20]:
# Write the labelled table to disk; index=False leaves the row index out of the file.
output_csv = 'Mall_Customers_Grouped.csv'
data.to_csv(output_csv, index=False)