# Normalizando o Conjunto de Dados - Machine Learning 03

### Importação

In [1]:
# Libs
import numpy as np
import pandas as pd

### Base de dados

In [2]:
# Load the admissions dataset; the CSV uses ';' as its field separator.
ds = pd.read_csv(
    filepath_or_buffer='../data/admission.csv',
    sep=';',
)
# Preview the first rows of the loaded frame.
ds.head()

Unnamed: 0,Name,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Approval
0,Lucas,337,118,4,4.5,4.5,9.65,1,1
1,Ana,324,107,4,4.0,4.5,8.87,1,1
2,Jose,316,104,3,3.0,3.5,8.0,1,1
3,Carlos,322,110,3,3.5,2.5,8.67,1,1
4,Zileide,314,103,2,2.0,3.0,8.21,0,0


### Separando os Valores

In [3]:
# Feature matrix: the first eight columns of the frame (positions 0-7),
# converted to a NumPy array. Column 0 holds strings, so the array dtype is object.
X = ds.iloc[:, :8].to_numpy()
X

array([['Lucas', 337, 118, 4, 4.5, 4.5, 9.65, 1],
       ['Ana', 324, 107, 4, 4.0, 4.5, 8.87, 1],
       ['Jose', 316, 104, 3, 3.0, 3.5, 8.0, 1],
       ['Carlos', 322, 110, 3, 3.5, 2.5, 8.67, 1],
       ['Zileide', 314, 103, 2, 2.0, 3.0, 8.21, 0],
       ['Joana', 330, 115, 5, 4.5, 3.0, 9.34, 1],
       ['Davi', 321, 109, 3, 3.0, 4.0, 8.2, 1],
       ['Daniel', 308, 101, 2, 3.0, 4.0, 7.9, 0],
       ['Marcelo', 302, 102, 1, 2.0, 1.5, 8.0, 0]], dtype=object)

In [4]:
# Target vector: the column at position 8 (Approval in the preview above).
# This is the dependent variable, i.e. the value the features in X are used to predict.
y = ds.iloc[:, 8].to_numpy()
y

array([1, 1, 1, 1, 0, 1, 1, 0, 0], dtype=int64)

### Transformando os valores

In [5]:
# One-hot encode the column at position 0 and pass every other column through unchanged.
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense array instead of a sparse matrix.
name_encoder = OneHotEncoder(categories='auto', sparse_output=False)
onehotencoder = make_column_transformer(
    (name_encoder, [0]),
    remainder="passthrough",
)
# NOTE: this overwrites X, so re-running only this cell would try to encode
# the already-encoded array; re-run the cell that builds X first.
X = onehotencoder.fit_transform(X)
X

array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 337, 118, 4, 4.5,
        4.5, 9.65, 1],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 324, 107, 4, 4.0,
        4.5, 8.87, 1],
       [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 316, 104, 3, 3.0,
        3.5, 8.0, 1],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 322, 110, 3, 3.5,
        2.5, 8.67, 1],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 314, 103, 2, 2.0,
        3.0, 8.21, 0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 330, 115, 5, 4.5,
        3.0, 9.34, 1],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 321, 109, 3, 3.0,
        4.0, 8.2, 1],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 308, 101, 2, 3.0,
        4.0, 7.9, 0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 302, 102, 1, 2.0,
        1.5, 8.0, 0]], dtype=object)

### Dividindo a base em Treino e Teste

In [6]:
# Split the data into training and test sets, holding out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

# train_test_split shuffles the rows before splitting. Without a fixed
# random_state every run produces a different partition, so the outputs of
# this and all following cells would not be reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train

array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 337, 118, 4, 4.5,
        4.5, 9.65, 1],
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 330, 115, 5, 4.5,
        3.0, 9.34, 1],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 324, 107, 4, 4.0,
        4.5, 8.87, 1],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 322, 110, 3, 3.5,
        2.5, 8.67, 1],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 302, 102, 1, 2.0,
        1.5, 8.0, 0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 308, 101, 2, 3.0,
        4.0, 7.9, 0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 314, 103, 2, 2.0,
        3.0, 8.21, 0]], dtype=object)

### Normalizando o Conjunto de Dados

In [7]:
# Standardize the features (zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler

scale_X = StandardScaler()
# Learn each column's mean and standard deviation from the training rows only,
# then scale the training rows with them.
X_train = scale_X.fit_transform(X_train)
# Reuse the statistics learned on the training set for the test rows.
# Calling fit_transform here would re-fit the scaler on the test set, scaling
# train and test inconsistently and leaking test information into preprocessing.
X_test = scale_X.transform(X_test)
X_train

array([[-0.40824829, -0.40824829, -0.40824829,  0.        , -0.40824829,
         0.        ,  2.44948974, -0.40824829, -0.40824829,  1.52714907,
         1.62834737,  0.76376262,  1.15470054,  1.17873792,  1.58876185,
         0.8660254 ],
       [-0.40824829, -0.40824829, -0.40824829,  0.        ,  2.44948974,
         0.        , -0.40824829, -0.40824829, -0.40824829,  0.91378592,
         1.13984316,  1.52752523,  1.15470054, -0.2773501 ,  1.08983085,
         0.8660254 ],
       [ 2.44948974, -0.40824829, -0.40824829,  0.        , -0.40824829,
         0.        , -0.40824829, -0.40824829, -0.40824829,  0.38804607,
        -0.16283474,  0.76376262,  0.64951905,  1.17873792,  0.33338707,
         0.8660254 ],
       [-0.40824829,  2.44948974, -0.40824829,  0.        , -0.40824829,
         0.        , -0.40824829, -0.40824829, -0.40824829,  0.21279946,
         0.32566947,  0.        ,  0.14433757, -0.76271277,  0.01149611,
         0.8660254 ],
       [-0.40824829, -0.40824829, -0