In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw dataset from a CSV file located relative to the working directory.
df = pd.read_csv("RawDataset.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(200, 5)

In [4]:
# Preview the first rows to check the column layout and spot missing entries.
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19.0,15.0,39.0
1,2,Male,21.0,,81.0
2,3,Female,20.0,16.0,6.0
3,4,Female,23.0,,77.0
4,5,Female,31.0,17.0,40.0


In [5]:
# Column dtypes and non-null counts; columns with fewer non-null entries than
# rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CustomerID              200 non-null    int64  
 1   Gender                  200 non-null    object 
 2   Age                     151 non-null    float64
 3   Annual Income (k$)      160 non-null    float64
 4   Spending Score (1-100)  156 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 7.9+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,151.0,160.0,156.0
mean,100.5,38.953642,61.09375,51.211538
std,57.879185,14.602894,26.068559,25.520898
min,1.0,18.0,15.0,1.0
25%,50.75,28.0,42.75,35.0
50%,100.5,35.0,62.0,51.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [7]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

CustomerID                 0
Gender                     0
Age                       49
Annual Income (k$)        40
Spending Score (1-100)    44
dtype: int64

## Handling Missing Values

In [8]:
def handle_missing_values(df, variable, strategy):
    """Fill the missing entries of a single column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to impute. It is modified in place.
    variable : str
        Name of the column whose NaN entries are replaced.
    strategy : str
        Imputation strategy passed straight to ``SimpleImputer``
        (for example "mean" or "median").

    Returns
    -------
    None
        The imputed values are written back to ``df[variable]``.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    # Learn the fill value from the column and apply it in a single pass.
    imputed = imputer.fit_transform(df[[variable]])
    # The imputer returns a 2-D, single-column result; flatten it so the
    # column assignment receives plain 1-D data.
    df[variable] = np.asarray(imputed).ravel()

# Imputation strategy per column: median for Age, mean for the income and
# spending-score columns.
imputation_plan = {
    "Age": "median",
    "Annual Income (k$)": "mean",
    "Spending Score (1-100)": "mean",
}
for column_name, column_strategy in imputation_plan.items():
    handle_missing_values(df, column_name, column_strategy)

# Confirm that no missing values remain after imputation.
df.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

## Encoding Categorical Variable - Gender

In [9]:
# Binary encoding of Gender: Male -> 0, Female -> 1.
gender_codes = {'Male': 0, 'Female': 1}

# Encode only while the column still holds the raw labels, so that re-running
# this cell does not map the already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    gender_encoded = df['Gender'].map(gender_codes)
    # Series.map turns every label absent from the mapping into NaN; fail
    # loudly instead of silently re-introducing missing values.
    unmapped_labels = df.loc[gender_encoded.isna(), 'Gender'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unexpected Gender labels: {list(unmapped_labels)}")
    df['Gender'] = gender_encoded
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,0,19.0,15.00000,39.0
1,2,0,21.0,61.09375,81.0
2,3,1,20.0,16.00000,6.0
3,4,1,23.0,61.09375,77.0
4,5,1,31.0,17.00000,40.0
...,...,...,...,...,...
195,196,1,35.0,120.00000,79.0
196,197,1,45.0,126.00000,28.0
197,198,0,32.0,126.00000,74.0
198,199,0,32.0,137.00000,18.0


In [12]:
# Build the feature matrix from every column except CustomerID, selecting by
# name so the result does not depend on the column order.
X = df.drop(columns=["CustomerID"]).to_numpy()