# Advanced Python 
## **Data Wrangling**  

----

 ### **Topics**
**2. Feature Scaling**
 * Normalization
 * Standardization

**3. Feature Encoding**

 * Category Codes
 * Label Encoding
 * One-Hot Encoding

----

## 2. Feature Scaling
### **Normalization**

1. simple feature scaling 
    - X(new)= X(old)/ X(max)
2. Min-Max Method 
3. Z-score (standard score): most values fall between -3 and +3
4. Log transformation

In [202]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

# load seaborn's 'titanic' example dataset into a DataFrame
data = sns.load_dataset('titanic')
data.head(n=10)  # preview the first 10 rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [203]:
# 1- Simple feature scaling: divide every fare by the largest fare
data["fare"].div(data["fare"].max())

0      0.014151
1      0.139136
2      0.015469
3      0.103644
4      0.015713
         ...   
886    0.025374
887    0.058556
888    0.045771
889    0.058556
890    0.015127
Name: fare, Length: 891, dtype: float64

In [204]:
# 2- Min-Max scaling
#    formula: df["fare"] = (df["fare"] - df["fare"].min()) / (df["fare"].max() - df["fare"].min())

# import libraries
from sklearn.preprocessing import MinMaxScaler

X = data[["fare"]]  # double brackets keep X as a one-column DataFrame
mms = MinMaxScaler()
mms.fit(X)          # fit the scaler on the fare column
mms.transform(X)    # scale the fare column with the fitted scaler

array([[0.01415106],
       [0.13913574],
       [0.01546857],
       [0.1036443 ],
       [0.01571255],
       [0.0165095 ],
       [0.10122886],
       [0.04113566],
       [0.02173075],
       [0.05869429],
       [0.03259623],
       [0.05182215],
       [0.01571255],
       [0.06104473],
       [0.01533038],
       [0.03122992],
       [0.05684821],
       [0.02537431],
       [0.03513366],
       [0.01410226],
       [0.05074862],
       [0.02537431],
       [0.01567195],
       [0.06929139],
       [0.04113566],
       [0.06126432],
       [0.01410226],
       [0.51334181],
       [0.01537917],
       [0.01541158],
       [0.0541074 ],
       [0.28598956],
       [0.01512699],
       [0.02049464],
       [0.16038672],
       [0.10149724],
       [0.01411046],
       [0.01571255],
       [0.03513366],
       [0.02194234],
       [0.01849397],
       [0.04098927],
       [0.01541158],
       [0.08115719],
       [0.01537917],
       [0.01571255],
       [0.03025399],
       [0.015

In [205]:
# 3- Z-score
#    formula: df["fare"] = (df["fare"] - df["fare"].mean()) / df["fare"].std(ddof=0)
#    (scipy's zscore divides by the population standard deviation, i.e. ddof=0)
# import libraries
from scipy.stats import zscore

zscore(data["fare"], ddof=0)

0     -0.502445
1      0.786845
2     -0.488854
3      0.420730
4     -0.486337
         ...   
886   -0.386671
887   -0.044381
888   -0.176263
889   -0.044381
890   -0.492378
Name: fare, Length: 891, dtype: float64

In [206]:
# 4- Log Transformation: natural logarithm of every fare
# NOTE(review): np.log maps a value of 0 to -inf and emits a RuntimeWarning;
# the fare column presumably contains zeros (a warning was recorded for this
# cell) -- confirm. np.log1p(data["fare"]), i.e. log(1 + x), stays finite at 0.
np.log(data["fare"])

  result = getattr(ufunc, method)(*inputs, **kwargs)


0      1.981001
1      4.266662
2      2.070022
3      3.972177
4      2.085672
         ...   
886    2.564949
887    3.401197
888    3.154870
889    3.401197
890    2.047693
Name: fare, Length: 891, dtype: float64

### **Standardization**
- Standardization is the process of transforming data into a common format (scale), which allows the researcher to make meaningful comparisons.


In [207]:
# import libraries
from sklearn.preprocessing import StandardScaler

X = data[["fare"]]  # double brackets keep X as a one-column DataFrame
ss = StandardScaler()
ss.fit(X)           # fit the scaler on the fare column
ss.transform(X)     # standardize the fare column with the fitted scaler

array([[-5.02445171e-01],
       [ 7.86845294e-01],
       [-4.88854258e-01],
       [ 4.20730236e-01],
       [-4.86337422e-01],
       [-4.78116429e-01],
       [ 3.95813561e-01],
       [-2.24083121e-01],
       [-4.24256141e-01],
       [-4.29555021e-02],
       [-3.12172378e-01],
       [-1.13845709e-01],
       [-4.86337422e-01],
       [-1.87093118e-02],
       [-4.90279793e-01],
       [-3.26266659e-01],
       [-6.19988892e-02],
       [-3.86670720e-01],
       [-2.85997284e-01],
       [-5.02948539e-01],
       [-1.24919787e-01],
       [-3.86670720e-01],
       [-4.86756223e-01],
       [ 6.63597416e-02],
       [-2.24083121e-01],
       [-1.64441595e-02],
       [-5.02948539e-01],
       [ 4.64700108e+00],
       [-4.89776426e-01],
       [-4.89442190e-01],
       [-9.02720170e-02],
       [ 2.30172882e+00],
       [-4.92377828e-01],
       [-4.37007438e-01],
       [ 1.00606170e+00],
       [ 3.98582080e-01],
       [-5.02863973e-01],
       [-4.86337422e-01],
       [-2.8

## 3. Feature Encoding

In [208]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

# load seaborn's 'titanic' example dataset into a DataFrame
data = sns.load_dataset('titanic')
data.head(n=10)  # preview the first 10 rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


### **Category Codes**

In [209]:
# convert to numerical codes (First class=0, Second class=1, Third class=2)
(
    data["class"]
    .astype("category")  # make sure the column has categorical dtype
    .cat.codes           # integer code of each row's category
)

0      2
1      0
2      2
3      0
4      2
      ..
886    1
887    0
888    2
889    0
890    2
Length: 891, dtype: int8

### **Label Encoding**
- label encoding is used to convert categorical data into numerical data 

In [210]:
# import libraries
from sklearn.preprocessing import LabelEncoder

X = data["class"]  # column of class labels to encode
label_encoder = LabelEncoder()
label_encoder.fit_transform(X)  # fit on the labels and return their integer codes

array([2, 0, 2, 0, 2, 2, 0, 2, 2, 1, 2, 0, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1,
       2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 1, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 0, 1, 2, 1, 2, 2, 0, 0, 2, 0, 2,
       1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 1, 2, 2, 2,
       0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 0, 0, 2, 0, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 0, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2,
       2, 0, 1, 2, 2, 1, 2, 1, 2, 0, 2, 0, 2, 2, 1, 1, 2, 1, 0, 0, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 1, 2, 0, 2, 1, 0, 1,
       2, 1, 2, 2, 0, 2, 1, 2, 1, 2, 0, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2,
       1, 2, 2, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 2, 0, 0, 0, 1, 2, 2, 0, 0,
       2, 1, 2, 2, 0, 0, 0, 2, 1, 0, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 1, 2, 0, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2,

### **Dummies / One-Hot**
- dummies are used to convert categorical variables into dummy/indicator variables (0 or 1)

In [211]:
# dummy variables: one 0/1 indicator column per passenger class
# dtype=int keeps the indicators as integers (0 or 1); without it, newer
# pandas versions return boolean (False or True) columns instead.
pd.get_dummies(data["class"], dtype=int)

Unnamed: 0,First,Second,Third
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
886,0,1,0
887,1,0,0
888,0,0,1
889,1,0,0


### **Binarizer**
- A binarizer converts values into binary (0/1) indicators; `LabelBinarizer`, used below, turns each class label into a row with one indicator column per class.

In [212]:
# import libraries
from sklearn.preprocessing import LabelBinarizer

X = data["class"]            # column of class labels to binarize
lb = LabelBinarizer().fit(X)  # create the binarizer and fit it on the labels
lb.transform(X)              # one indicator column per class

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1]])

### **Label Encoding to One-Hot Encoding**
- Label encoding followed by one-hot encoding (`LabelEncoder`, then `OneHotEncoder`) is commonly used to prepare categorical data for deep learning.

In [213]:
# import libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le, ohe = LabelEncoder(), OneHotEncoder(handle_unknown='ignore')

# step 1: integer-encode the class labels and store them as a new column
class_codes = le.fit_transform(data["class"])
data["class_encoding"] = class_codes

# step 2: one-hot encode the integer codes, then convert the result to a dense array
one_hot = ohe.fit_transform(data[["class_encoding"]])
one_hot.toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])