# Standardization

* ## Standardization is a scaling technique in which each feature is rescaled so that its
    * ### mean is equal to zero
    * ### and its standard deviation is equal to one
* # Standardization is also called Z-score Normalization

## Formulae
### Xnew = (Xi - Xmean) / std

In [1]:
import pandas as pd

# Load the dataset; the preview shows Age, EstimatedSalary and a
# Purchased label (0/1 in the rows displayed).
data = pd.read_csv('sna.csv')
data.head(n=10)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
5,27,58000,0
6,27,84000,0
7,32,150000,1
8,25,33000,0
9,35,65000,0


In [2]:
# Features are every column except the last; the last column is the target.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [3]:
# Preview the first five feature rows (5 is head()'s default).
x.head(n=5)

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000


In [4]:
# Preview the first ten target values.
y.head(n=10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    1
8    0
9    0
Name: Purchased, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=0 pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# with_mean / with_std are written out at their default values: centre each
# feature on zero and scale it to unit variance.
sc = StandardScaler(with_mean=True, with_std=True)

## Fit the scaler only on X_train

In [8]:
# Learn each column's mean and standard deviation from the training rows only,
# so the held-out test rows do not influence the scaling parameters.
sc.fit(X_train)

## Now transform both X_train and X_test

In [10]:
# Apply the statistics learned from X_train to both splits.
X_train_sc, X_test_sc = sc.transform(X_train), sc.transform(X_test)

## Convert into DataFrames

In [11]:
# Wrap the transformed values in DataFrames so the column names come back.
# No index is passed, so the new frames are labelled 0..n-1 rather than with
# the original row labels.
X_train_sc = pd.DataFrame(data=X_train_sc, columns=X_train.columns)
X_test_sc = pd.DataFrame(data=X_test_sc, columns=X_test.columns)

## Before Scaling X_train

In [12]:
# Display the unscaled training features.
X_train

Unnamed: 0,Age,EstimatedSalary
92,26,15000
223,60,102000
234,38,112000
232,40,107000
377,42,53000
...,...,...
323,48,30000
192,29,43000
117,36,52000
47,27,54000


## After Scaling X_train

In [13]:
# Display the standardized training features.
X_train_sc

Unnamed: 0,Age,EstimatedSalary
0,-1.163172,-1.584970
1,2.170181,0.930987
2,0.013305,1.220177
3,0.209385,1.075582
4,0.405465,-0.486047
...,...,...
275,0.993704,-1.151185
276,-0.869053,-0.775237
277,-0.182774,-0.514966
278,-1.065133,-0.457127


## Before Scaling X_test

In [14]:
# Display the unscaled test features.
X_test

Unnamed: 0,Age,EstimatedSalary
132,30,87000
309,38,50000
341,35,75000
196,30,79000
246,35,50000
...,...,...
216,49,65000
259,45,131000
49,31,89000
238,46,82000


## After Scaling X_test

In [15]:
# Display the test features after applying the scaler fitted on X_train.
X_test_sc

Unnamed: 0,Age,EstimatedSalary
0,-0.771013,0.497201
1,0.013305,-0.572804
2,-0.280814,0.150172
3,-0.771013,0.265849
4,-0.280814,-0.572804
...,...,...
115,1.091743,-0.139018
116,0.699584,1.769639
117,-0.672973,0.555039
118,0.797624,0.352606


In [16]:
import numpy as np

# Summary statistics of the raw training features, rounded to one decimal.
np.round(X_train.describe(), decimals=1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,37.9,69807.1
std,10.2,34641.2
min,18.0,15000.0
25%,30.0,43000.0
50%,37.0,70500.0
75%,46.0,88000.0
max,60.0,150000.0


In [17]:
# Same summary for the standardized features; mean and std should round to
# 0.0 and 1.0 respectively.
np.round(X_train_sc.describe(), decimals=1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,0.0,0.0
std,1.0,1.0
min,-1.9,-1.6
25%,-0.8,-0.8
50%,-0.1,0.0
75%,0.8,0.5
max,2.2,2.3


## Build a model

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with default settings, trained on the
# unscaled features.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

In [21]:
# Predict labels for the unscaled test features and display them.
y_pred = clf.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy itself is symmetric, so the value is unchanged, but the same
# argument order matters for metrics such as precision or recall.
accuracy_score(y_test, y_pred)

0.6583333333333333

## Build a model with scaled data

In [23]:
# Refit the same estimator object on the standardized features; this replaces
# the model previously fitted on the unscaled data.
clf.fit(X_train_sc,y_train)

In [25]:
# Predict labels for the standardized test features and display them.
pred = clf.predict(X=X_test_sc)
pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1])

In [26]:
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy itself is symmetric, so the value is unchanged, but the same
# argument order matters for metrics such as precision or recall.
accuracy_score(y_test, pred)

0.8666666666666667

# Standardization is most often used with these algorithms
* ## K-Means
* ## K-Nearest-Neighbours
* ## Principal Component Analysis (PCA)
* ## Artificial Neural Network
* ## Gradient Descent