In [1]:
# import and read the data
import pandas
data =  pandas.read_csv("datasets/Mall_Customers.csv")
data

Unnamed: 0,CustomerID,Gender,Age,Annual Income,Spending Score
0,1,Male,19,15.0,39.0
1,2,Male,21,15.0,81.0
2,3,Female,20,16.0,6.0
3,4,Female,23,16.0,77.0
4,5,Female,31,17.0,40.0
...,...,...,...,...,...
195,196,Female,35,120.0,79.0
196,197,Female,45,126.0,
197,198,Male,32,126.0,74.0
198,199,Male,32,137.0,18.0


In [3]:
data.tail(2)

Unnamed: 0,CustomerID,Gender,Age,Annual Income,Spending Score
198,199,Male,32,137.0,18.0
199,200,Male,30,137.0,83.0


In [4]:
#check if dataset contains empties
data.isnull().sum()

CustomerID        0
Gender            4
Age               0
Annual Income     1
Spending Score    3
dtype: int64

In [6]:
#we need to clean the data
#we are going to replace the empty fields of gender with "unknown"
data["Gender"].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Gender"].fillna("unknown", inplace=True)


In [7]:
#Confirm if the gender is empty
data.isnull().sum()

CustomerID        0
Gender            0
Age               0
Annual Income     1
Spending Score    3
dtype: int64

In [10]:
#we are going to use the mean to replace the annual income empty.
mean = data["Annual Income"].mean()
mean

60.35678391959799

In [11]:
#since we have the mean as 60.356, we will use it
data["Annual Income"].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Annual Income"].fillna("60.356", inplace=True)
  data["Annual Income"].fillna("60.356", inplace=True)


In [12]:

data.isnull().sum()

CustomerID        0
Gender            0
Age               0
Annual Income     0
Spending Score    3
dtype: int64

In [13]:
#we replace the spending score with the mean
mean = data["Spending Score"].mean()
mean

50.3248730964467

In [14]:
#replace 
data["Spending Score"].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Spending Score"].fillna(mean, inplace=True)


In [15]:
data.isnull().sum()

CustomerID        0
Gender            0
Age               0
Annual Income     0
Spending Score    0
dtype: int64

In [16]:
#Split the data
#Here we want age,annual income and the spending score.
array = data.values
array

array([[1, 'Male', 19, 15.0, 39.0],
       [2, 'Male', 21, 15.0, 81.0],
       [3, 'Female', 20, 16.0, 6.0],
       [4, 'Female', 23, 16.0, 77.0],
       [5, 'Female', 31, 17.0, 40.0],
       [6, 'Female', 22, 17.0, 76.0],
       [7, 'Female', 35, 18.0, 6.0],
       [8, 'Female', 23, 18.0, 94.0],
       [9, 'Male', 64, 19.0, 3.0],
       [10, 'Female', 30, 19.0, 72.0],
       [11, 'Male', 67, 19.0, 14.0],
       [12, 'Female', 35, 19.0, 99.0],
       [13, 'Female', 58, 20.0, 15.0],
       [14, 'Female', 24, 20.0, 77.0],
       [15, 'Male', 37, 20.0, 13.0],
       [16, 'Male', 22, 20.0, 79.0],
       [17, 'Female', 35, 21.0, 35.0],
       [18, 'Male', 20, 21.0, 66.0],
       [19, 'Male', 52, 23.0, 29.0],
       [20, 'Female', 35, 23.0, 98.0],
       [21, 'Male', 35, 24.0, 35.0],
       [22, 'Male', 25, 24.0, 73.0],
       [23, 'Female', 46, 25.0, 5.0],
       [24, 'Male', 31, 25.0, 73.0],
       [25, 'Female', 54, 28.0, 14.0],
       [26, 'Male', 29, 28.0, 82.0],
       [27, 'Female', 4

In [18]:
#unsupervised learning(clustering) contains only X variable 
#it does NOT contain y variable
X = array[:,2:5]
X

array([[19, 15.0, 39.0],
       [21, 15.0, 81.0],
       [20, 16.0, 6.0],
       [23, 16.0, 77.0],
       [31, 17.0, 40.0],
       [22, 17.0, 76.0],
       [35, 18.0, 6.0],
       [23, 18.0, 94.0],
       [64, 19.0, 3.0],
       [30, 19.0, 72.0],
       [67, 19.0, 14.0],
       [35, 19.0, 99.0],
       [58, 20.0, 15.0],
       [24, 20.0, 77.0],
       [37, 20.0, 13.0],
       [22, 20.0, 79.0],
       [35, 21.0, 35.0],
       [20, 21.0, 66.0],
       [52, 23.0, 29.0],
       [35, 23.0, 98.0],
       [35, 24.0, 35.0],
       [25, 24.0, 73.0],
       [46, 25.0, 5.0],
       [31, 25.0, 73.0],
       [54, 28.0, 14.0],
       [29, 28.0, 82.0],
       [45, 28.0, 32.0],
       [35, 28.0, 61.0],
       [40, 29.0, 31.0],
       [23, 29.0, 87.0],
       [60, 30.0, 4.0],
       [21, 30.0, 73.0],
       [53, 33.0, 4.0],
       [18, 33.0, 92.0],
       [49, 33.0, 14.0],
       [21, 33.0, 81.0],
       [42, 34.0, 17.0],
       [30, 34.0, 73.0],
       [36, 37.0, 26.0],
       [20, 37.0, 75.0],
      

In [20]:
#We are going to fit data into a model
# we are going to use Kmeans
# k means organizes data into disjoint groups
#specify the number of clusters
# NB: The more the clusters the better the groups will contain good selection
#we have random state which helps in randomizing the data during grouping


from sklearn.cluster import KMeans
model = KMeans(n_clusters=10, random_state= 42)
model.fit(X)

In [21]:
# Find clusters
# we need to find the clusters created by KMeans(10)
means = model.cluster_centers_
clusters = pandas.DataFrame(means, columns=['Age','Annual Income','Spending Score'])
clusters

Unnamed: 0,Age,Annual Income,Spending Score
0,56.340909,53.704545,49.386364
1,33.1,74.95,82.4
2,45.928571,98.428571,22.428571
3,25.272727,25.727273,79.363636
4,49.0,24.461538,11.076923
5,27.166667,58.371,50.703469
6,38.05,77.05,14.2
7,32.0,92.384615,82.307692
8,35.416667,30.75,36.083333
9,35.5,122.5,71.274958


In [22]:
X = array[:, 1:5]
X

array([['Male', 19, 15.0, 39.0],
       ['Male', 21, 15.0, 81.0],
       ['Female', 20, 16.0, 6.0],
       ['Female', 23, 16.0, 77.0],
       ['Female', 31, 17.0, 40.0],
       ['Female', 22, 17.0, 76.0],
       ['Female', 35, 18.0, 6.0],
       ['Female', 23, 18.0, 94.0],
       ['Male', 64, 19.0, 3.0],
       ['Female', 30, 19.0, 72.0],
       ['Male', 67, 19.0, 14.0],
       ['Female', 35, 19.0, 99.0],
       ['Female', 58, 20.0, 15.0],
       ['Female', 24, 20.0, 77.0],
       ['Male', 37, 20.0, 13.0],
       ['Male', 22, 20.0, 79.0],
       ['Female', 35, 21.0, 35.0],
       ['Male', 20, 21.0, 66.0],
       ['Male', 52, 23.0, 29.0],
       ['Female', 35, 23.0, 98.0],
       ['Male', 35, 24.0, 35.0],
       ['Male', 25, 24.0, 73.0],
       ['Female', 46, 25.0, 5.0],
       ['Male', 31, 25.0, 73.0],
       ['Female', 54, 28.0, 14.0],
       ['Male', 29, 28.0, 82.0],
       ['Female', 45, 28.0, 32.0],
       ['Male', 35, 28.0, 61.0],
       ['Female', 40, 29.0, 31.0],
       ['Female

In [23]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=10, random_state= 42)
model.fit(X)

ValueError: could not convert string to float: 'Male'