## Libraries

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [15]:
# Read the raw table; later cells read `data`, `X` and `y` by these names.
data = pd.read_csv('DataPreprocessing.csv')

# Split by position: every column except the last becomes the 2-D feature
# array X, and the last column becomes the 1-D target array y.
feature_frame = data.iloc[:, :-1]
target_series = data.iloc[:, -1]
X = feature_frame.values
y = target_series.values

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes


# Normalization

In [3]:
from sklearn.preprocessing import Normalizer

In [4]:
# Drop every row that has a missing value in any column, then keep only the
# two numeric columns that are going to be normalised.
complete_numeric_rows = data.dropna()[['Age', 'Income']]

# Normalizer works row by row: with norm="l2" each row is divided by its own
# Euclidean length.
norm = Normalizer(norm="l2")

# NOTE: `data` is rebound here from the loaded DataFrame to the transformed
# array; the next cell squares `data` and relies on this.
data = norm.fit_transform(complete_numeric_rows)

data

array([[5.67129538e-04, 9.99999839e-01],
       [5.55555470e-04, 9.99999846e-01],
       [5.40123378e-04, 9.99999854e-01],
       [5.87431593e-04, 9.99999827e-01],
       [5.74712549e-04, 9.99999835e-01],
       [5.59071643e-04, 9.99999844e-01],
       [5.52208751e-04, 9.99999848e-01],
       [5.22387988e-04, 9.99999864e-01]])

In [5]:
# Sanity check for the L2 normalisation above: a row scaled to unit Euclidean
# length has *squared* entries that sum to 1 (the raw entries do not).
# The squares are stored under a new name instead of overwriting `data`, so
# re-running this cell does not square the values a second time.
data_squared = data ** 2

# Sum of squares of the first row; expected to be 1.0 up to rounding.
data_squared[0].sum()

1.0

# MinMax Scaling

In [6]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Rescale the numeric columns of X (every column after the first, which holds
# the region string) to the range [0, 1], column by column.
scaler = MinMaxScaler(feature_range=(0, 1))

# Write the scaled values into a copy rather than into X itself, so X keeps
# whatever values it had before this cell and the cell can be re-run, or run
# in any order relative to other cells that read X, without changing the
# input they see.
X_minmax = X.copy()
X_minmax[:, 1:] = scaler.fit_transform(X[:, 1:])

X_minmax

array([['India', 0.7391304347826089, 0.6857142857142859],
       ['Brazil', 0.0, 0.0],
       ['USA', 0.13043478260869557, 0.17142857142857149],
       ['Brazil', 0.4782608695652173, 0.37142857142857144],
       ['USA', 0.5652173913043479, nan],
       ['India', 0.34782608695652173, 0.2857142857142858],
       ['Brazil', nan, 0.11428571428571432],
       ['India', 0.9130434782608696, 0.8857142857142857],
       ['USA', 1.0, 1.0],
       ['India', 0.4347826086956521, 0.5428571428571429]], dtype=object)

# Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Standardise the numeric columns of X (every column after the first) with
# StandardScaler's default settings, column by column.
ss = StandardScaler()

# Write the result into a copy instead of overwriting X in place, so this
# cell does not alter the array other cells read and can be re-run safely.
# Standardisation gives the same values (up to rounding) whether or not the
# columns were min-max rescaled beforehand, because it is unaffected by a
# positive linear rescaling of a column.
X_standard = X.copy()
X_standard[:, 1:] = ss.fit_transform(X[:, 1:])

X_standard

array([['India', 0.7199314321591977, 0.7110127588722818],
       ['Brazil', -1.6236751448696793, -1.3643758345927561],
       ['USA', -1.2100975136292897, -0.8455286862264967],
       ['Brazil', -0.10722383032158307, -0.24020701313252762],
       ['USA', 0.1684945905053441, nan],
       ['India', -0.5208014615619728, -0.499630587315657],
       ['Brazil', nan, -1.0184777356819166],
       ['India', 1.2713682738130507, 1.3163344319662502],
       ['USA', 1.5470866946399773, 1.6622325308770898],
       ['India', -0.2450830407350463, 0.2786401352337319]], dtype=object)

# Binarizer

In [10]:
from sklearn.preprocessing import Binarizer

In [14]:
# Build the input from the CSV instead of from `data`: earlier cells rebind
# `data` to a NumPy array, so calling data.dropna() here fails when the
# notebook is run top to bottom on a fresh kernel.
binarizer_input = (
    pd.read_csv('DataPreprocessing.csv')
    .dropna()                      # drop rows with any missing value
    [['Age', 'Income']]            # keep only the two numeric columns
)

# Values strictly greater than the threshold become 1, all others become 0.
# The same threshold is applied to both columns; the Income values shown in
# the loaded data are far above 50, so that column comes out as all ones.
binarizer = Binarizer(threshold=50.0)

# Store the result under its own name so the cell stays re-runnable and does
# not replace `data` with an array.
data_binarized = binarizer.fit_transform(binarizer_input)
data_binarized

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 1.],
       [1., 1.],
       [0., 1.]])

# qcut (partition scale)

In [16]:
# Build the input from the CSV instead of from `data`: earlier cells rebind
# `data` to a NumPy array, so data.dropna() / data['Age'] here only worked
# after manually re-running the load cell.
qcut_input = (
    pd.read_csv('DataPreprocessing.csv')
    .dropna()                      # drop rows with any missing value
    [['Age', 'Income']]            # keep only the two numeric columns
)

# Age: split into 4 quantile bins labelled 4, 3, 2, 1 — labels are assigned
# to bins in ascending order, so the lowest-age bin gets 4 and the highest 1.
range_labels_age = range(4, 0, -1)
age_groups = pd.qcut(qcut_input['Age'], q=4, labels=list(range_labels_age))

# Income: split into 4 quantile bins labelled 1, 2, 3, 4 — the lowest-income
# bin gets 1 and the highest gets 4.
range_labels_Income = range(1, 5, 1)
Income_groups = pd.qcut(qcut_input['Income'], q=4, labels=list(range_labels_Income))

# .assign returns a new frame, which avoids writing columns into a frame that
# was derived by selection (a pattern that can raise SettingWithCopyWarning)
# and keeps the cell re-runnable.
data_binned = qcut_input.assign(Age=age_groups, Income=Income_groups)

data_binned

Unnamed: 0,Age,Income
0,2,3
1,4,1
2,4,1
3,2,2
5,3,2
7,1,4
8,1,4
9,3,3
