# Data Preprocessing

## Importing and Slicing

In [40]:
import pandas as pd
import numpy as np

In [41]:
# Read the sample CSV into a DataFrame; the bare name on the last line makes
# the notebook render it.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [42]:
# Feature matrix: every column except the last one (the target).
# .copy() gives x its own data, so the in-place imputation performed on x
# further down does not write into a slice of `dataset` (which would raise
# pandas' SettingWithCopyWarning and could leak into the raw frame).
x = dataset.iloc[:, :-1].copy()
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [43]:
# Target vector: the last column, i.e. exactly the column that `:-1` leaves
# out of x. Using -1 instead of a hard-coded position keeps features and
# target complementary if the column count ever changes.
y = dataset.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Handling Missing Values

In [6]:
# Boolean mask of the whole frame: True wherever a value is missing.
dataset.isna()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [7]:
# Number of missing values in each column.
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# Is there at least one missing value anywhere in the frame?
dataset.isna().values.any()

True

In [9]:
# Total number of missing values: per-column counts, then summed.
dataset.isna().sum().sum()

2

In [11]:
# Spellings that should be parsed as missing values when reading the CSV.
missing = [
    "n/a",
    "na",
    "--",
    "NaN",
    "NA",
]
df = pd.read_csv("Data.csv", na_values=missing)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [44]:
from sklearn.impute import SimpleImputer

# Imputer that replaces every NaN with the mean of its column.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
imputer

SimpleImputer()

In [45]:
# Learn the column means of columns 1 and 2 (Age and Salary).
# fit() hands back the estimator itself, so there is no need to rebind it.
imputer.fit(x.iloc[:, 1:3])
imputer

SimpleImputer()

In [46]:
# Fill the NaNs in columns 1 and 2 (Age and Salary) with the column means
# learned by the fitted imputer, writing the result back into x in place.
x.iloc[:, 1:3] = imputer.transform(x.iloc[:, 1:3])
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## Encoding Categorical Data

### Label Encoding

In [15]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer codes (here: No -> 0, Yes -> 1).
lEncoder = LabelEncoder()
lEncoder.fit(y)
y = lEncoder.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [16]:
# Show the encoded labels as a one-column table.
pd.DataFrame(data=y)

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


### One-Hot Encoding

<img src="categorical.png">

In [47]:
# Show the imputed features before Country is one-hot encoded.
pd.DataFrame(data=x)

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [50]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (Country) and pass the remaining columns through.
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing `sparse=False` fails on
# current releases. The encoder is therefore left at its defaults and
# sparse_threshold=0 tells the ColumnTransformer to always return a dense
# array, which behaves the same on old and new scikit-learn versions.
columnTransformer = ColumnTransformer(
    [("Name", OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x2 = columnTransformer.fit_transform(x)

pd.DataFrame(x2)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [23]:
from numpy import asarray

# Minimal one-hot example: three colours, one column.
data = asarray([["red"], ["green"], ["blue"]])
print(data)

# `sparse=False` was renamed to `sparse_output=False` in scikit-learn 1.2 and
# the old keyword was removed in 1.4. To stay runnable on every version the
# encoder keeps its default (sparse) output and the result is converted to a
# dense array explicitly with .toarray().
encoder = OneHotEncoder()
oneHot = encoder.fit_transform(data).toarray()

oneHot

[['red']
 ['green']
 ['blue']]


array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

## Splitting

In [24]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Synthetic data for the splitting demo: 1000 two-feature points.
# random_state pins the generator so the same points are produced on every
# run (same seed value as the other seeded calls in this notebook).
x, y = make_blobs(n_samples=1000, random_state=1)
pd.DataFrame(x)

Unnamed: 0,0,1
0,2.330693,3.917960
1,3.348677,4.855597
2,-5.388933,1.195586
3,-3.102281,2.091181
4,4.653132,3.107182
...,...,...
995,-4.677278,3.611178
996,-4.869745,2.245624
997,-5.179009,1.436348
998,-2.808180,-7.175990


In [25]:
# 70/30 split with no random_state, so the rows are shuffled differently on
# each execution.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.7,
)

print(*(part.shape for part in (xTrain, xTest, yTrain, yTest)))
pd.DataFrame(data=xTrain)

(700, 2) (300, 2) (700,) (300,)


Unnamed: 0,0,1
0,-5.299403,3.123672
1,-1.742058,-6.469566
2,-2.236238,-6.876597
3,-2.679625,2.030174
4,-4.770076,1.350481
...,...,...
695,3.279155,3.241408
696,-4.492951,1.457414
697,-3.788852,3.415317
698,-3.576251,2.237679


In [27]:
# Same unseeded 70/30 split again: the shapes match the previous run, but the
# rows that land in the training set do not.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.7,
)

print(*[subset.shape for subset in (xTrain, xTest, yTrain, yTest)])
pd.DataFrame(data=xTrain)

(700, 2) (300, 2) (700,) (300,)


Unnamed: 0,0,1
0,1.012113,4.971278
1,-1.122191,-8.173722
2,2.876842,4.032413
3,3.340886,4.828758
4,-4.229369,0.345981
...,...,...
695,-4.130710,2.939927
696,4.689105,2.548748
697,-4.034002,-7.061673
698,4.490709,2.909490


In [28]:
# Reload the raw CSV so the split examples below start from the original,
# un-imputed data.
dataset = pd.read_csv(filepath_or_buffer="Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
# Features are every column but the last; the target is that last column.
# Indexing the target with -1 (instead of a hard-coded 3) keeps it the exact
# complement of the `:-1` feature slice.
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [30]:
# 70/30 split with a fixed random_state, so the same rows are selected on
# every run.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    random_state=1,
    train_size=0.7,
)
pd.DataFrame(data=xTrain)

Unnamed: 0,Country,Age,Salary
4,Germany,40.0,
0,France,44.0,72000.0
3,Spain,38.0,61000.0
1,Spain,27.0,48000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
5,France,35.0,58000.0


In [32]:
# Same split without random_state: the selected rows change from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.7,
)
pd.DataFrame(data=xTrain)

Unnamed: 0,Country,Age,Salary
9,France,37.0,67000.0
3,Spain,38.0,61000.0
1,Spain,27.0,48000.0
6,Spain,,52000.0
4,Germany,40.0,
8,Germany,50.0,83000.0
2,Germany,30.0,54000.0


### Stratified

In [33]:
from collections import Counter
from sklearn.datasets import make_classification

# Imbalanced two-class data set: roughly 94% of the samples get class 0.
x, y = make_classification(
    n_samples=1000,
    weights=[0.94],
    flip_y=0,
    random_state=1,
)

print(f"distribution {Counter(y)}")

distribution Counter({0: 940, 1: 60})


In [34]:
# Plain 50/50 split: nothing forces the class ratio to be the same in the
# training and test halves.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.5,
    random_state=1,
)

print(f"Train distribution {Counter(yTrain)}")
print(f"Test distribution {Counter(yTest)}")

Train distribution Counter({0: 475, 1: 25})
Test distribution Counter({0: 465, 1: 35})


In [35]:
# Stratified 50/50 split: stratify=y keeps the class proportions of y in both
# the training and the test half.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.5,
    random_state=1,
    stratify=y,
)

print(f"Train distribution {Counter(yTrain)}")
print(f"Test distribution {Counter(yTest)}")

Train distribution Counter({0: 470, 1: 30})
Test distribution Counter({0: 470, 1: 30})


### Scaling

#### Normalization

In [36]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale Age and Salary column by column into the range [0, 1].
# The missing entries of the raw frame stay NaN in the result.
mm_scale = MinMaxScaler().fit_transform(dataset.loc[:, ['Age', 'Salary']])
pd.DataFrame(data=mm_scale)

Unnamed: 0,0,1
0,0.73913,0.685714
1,0.0,0.0
2,0.130435,0.171429
3,0.478261,0.371429
4,0.565217,
5,0.347826,0.285714
6,,0.114286
7,0.913043,0.885714
8,1.0,1.0
9,0.434783,0.542857


$$x' = \frac{x - \min(x)}{\max(x)-\min(x)}$$

#### Standardization

In [37]:
from sklearn.preprocessing import StandardScaler

# Standardize Age and Salary column by column (subtract the mean, divide by
# the standard deviation). The missing entries of the raw frame stay NaN.
std_scale = StandardScaler().fit_transform(dataset.loc[:, ['Age', 'Salary']])
pd.DataFrame(data=std_scale)

Unnamed: 0,0,1
0,0.719931,0.711013
1,-1.623675,-1.364376
2,-1.210098,-0.845529
3,-0.107224,-0.240207
4,0.168495,
5,-0.520801,-0.499631
6,,-1.018478
7,1.271368,1.316334
8,1.547087,1.662233
9,-0.245083,0.27864


$$z = \frac{x - \mu}{\sigma}$$

$$\mu = \frac{\sum_{i=1}^n{x_i}}{n}$$

$$\sigma = \sqrt{\frac{\sum_{i=1}^n{(x_i-\mu)^2}}{n}}$$