In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the customer records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='CustomerData.csv')

In [3]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,CustID,Age,Gender,Country,Employed,Income,ItemsPurchased(monthly),ProductType,PaymentType,Mode
0,1,37.0,Male,USA,True,,99.0,BabyCare,Cash,Offline
1,2,44.0,,Russia,False,463034.0,21.0,BabyCare,Bank Transfer,
2,3,900.0,Female,India,True,402865.0,31.0,Medical,UPI,Offline
3,4,46.0,,USA,True,,21.0,BabyCare,UPI,Offline
4,5,19.0,Female,Australia,True,221868.0,45.0,Medical,Bank Transfer,Online


In [4]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
CustID                     50 non-null int64
Age                        43 non-null float64
Gender                     42 non-null object
Country                    46 non-null object
Employed                   47 non-null object
Income                     46 non-null float64
ItemsPurchased(monthly)    46 non-null float64
ProductType                46 non-null object
PaymentType                47 non-null object
Mode                       46 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 2.8+ KB


# Data Preprocessing

Removing anomalous data

In [5]:
# Ages above 100 are treated as anomalous and blanked out (set to NaN),
# so they are handled together with the other missing values later on.
age_out_of_range = df['Age'] > 100
df.loc[age_out_of_range, 'Age'] = np.nan

In [6]:
# Re-inspect the first five rows: any age above 100 should now show as NaN.
df.head(n=5)

Unnamed: 0,CustID,Age,Gender,Country,Employed,Income,ItemsPurchased(monthly),ProductType,PaymentType,Mode
0,1,37.0,Male,USA,True,,99.0,BabyCare,Cash,Offline
1,2,44.0,,Russia,False,463034.0,21.0,BabyCare,Bank Transfer,
2,3,,Female,India,True,402865.0,31.0,Medical,UPI,Offline
3,4,46.0,,USA,True,,21.0,BabyCare,UPI,Offline
4,5,19.0,Female,Australia,True,221868.0,45.0,Medical,Bank Transfer,Online


Handling null values (filling them with the column mean or mode)

In [7]:
# Count the missing values in each column before imputation.
df.isna().sum()

CustID                     0
Age                        9
Gender                     8
Country                    4
Employed                   3
Income                     4
ItemsPurchased(monthly)    4
ProductType                4
PaymentType                3
Mode                       4
dtype: int64

In [8]:
# Impute missing values column by column.
# Categorical columns take their most frequent value (the mode).
categorical_columns = ['Gender', 'Country', 'Employed', 'ProductType', 'PaymentType', 'Mode']
fill_values = {name: df[name].mode()[0] for name in categorical_columns}

# Age and the monthly item count take the column mean truncated to a whole
# number; Income keeps the untruncated mean.
for name in ('Age', 'ItemsPurchased(monthly)'):
    fill_values[name] = int(df[name].mean())
fill_values['Income'] = df['Income'].mean()

# All fill values are computed from the pre-imputation columns; each one
# depends only on its own column, so the order of filling does not matter.
for name, replacement in fill_values.items():
    df[name] = df[name].fillna(replacement)

In [9]:
# Preview the first five rows after imputation.
df.head(n=5)

Unnamed: 0,CustID,Age,Gender,Country,Employed,Income,ItemsPurchased(monthly),ProductType,PaymentType,Mode
0,1,37.0,Male,USA,True,242613.630435,99.0,BabyCare,Cash,Offline
1,2,44.0,Male,Russia,False,463034.0,21.0,BabyCare,Bank Transfer,Online
2,3,50.0,Female,India,True,402865.0,31.0,Medical,UPI,Offline
3,4,46.0,Male,USA,True,242613.630435,21.0,BabyCare,UPI,Offline
4,5,19.0,Female,Australia,True,221868.0,45.0,Medical,Bank Transfer,Online


In [10]:
# Re-count the missing values; every column should now report zero.
df.isna().sum()

CustID                     0
Age                        0
Gender                     0
Country                    0
Employed                   0
Income                     0
ItemsPurchased(monthly)    0
ProductType                0
PaymentType                0
Mode                       0
dtype: int64

Scaling the values

In [11]:
# Min-max scaler mapping each fitted column onto the [0, 1] range
# (feature_range=(0, 1) is the library default, spelled out for clarity).
scaler = MinMaxScaler(feature_range=(0, 1))

In [12]:
# Fit the scaler on the two numeric columns and overwrite them in place
# with their rescaled values.
scaled_columns = ['Income', 'ItemsPurchased(monthly)']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [13]:
# Preview the first five rows after scaling Income and ItemsPurchased(monthly).
df.head(n=5)

Unnamed: 0,CustID,Age,Gender,Country,Employed,Income,ItemsPurchased(monthly),ProductType,PaymentType,Mode
0,1,37.0,Male,USA,True,0.470969,1.0,BabyCare,Cash,Offline
1,2,44.0,Male,Russia,False,0.925669,0.1875,BabyCare,Bank Transfer,Online
2,3,50.0,Female,India,True,0.801548,0.291667,Medical,UPI,Offline
3,4,46.0,Male,USA,True,0.470969,0.1875,BabyCare,UPI,Offline
4,5,19.0,Female,Australia,True,0.428174,0.4375,Medical,Bank Transfer,Online


Discretizing or binning the values

In [14]:
# Age bracket edges. pd.cut treats intervals as right-inclusive by default,
# so the brackets are [0, 12], (12, 19], (19, 59] and (59, 100].
bins = [0, 12, 19, 59, 100]
labels = ['Child', 'Teen', 'Middle-Aged', 'Senior-Citizen']

# include_lowest=True closes the first bracket on the left. Without it an age
# of exactly 0 falls outside (0, 12] and silently becomes NaN in CustCategory.
df['CustCategory'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

In [15]:
# Preview the first five rows, now including the derived CustCategory column.
df.head(n=5)

Unnamed: 0,CustID,Age,Gender,Country,Employed,Income,ItemsPurchased(monthly),ProductType,PaymentType,Mode,CustCategory
0,1,37.0,Male,USA,True,0.470969,1.0,BabyCare,Cash,Offline,Middle-Aged
1,2,44.0,Male,Russia,False,0.925669,0.1875,BabyCare,Bank Transfer,Online,Middle-Aged
2,3,50.0,Female,India,True,0.801548,0.291667,Medical,UPI,Offline,Middle-Aged
3,4,46.0,Male,USA,True,0.470969,0.1875,BabyCare,UPI,Offline,Middle-Aged
4,5,19.0,Female,Australia,True,0.428174,0.4375,Medical,Bank Transfer,Online,Teen
