# **USANDO ENCODERS COM PANDAS E SKLEARN**

### **Import dos dados**

In [1]:
import pandas as pd
import numpy as np

In [2]:
base = pd.read_csv('census.csv')
base.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
previsores = base.iloc[:,0:14].values
previsores

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [23, ' Private', 193090, ..., 0, 40, ' United-States'],
       [64, ' Private', 151364, ..., 0, 40, ' United-States'],
       [70, ' Local-gov', 88638, ..., 0, 50, ' United-States']],
      dtype=object)

In [4]:
classe = base.iloc[:,14].values
classe

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >5'],
      dtype=object)

### **Substituir strings por encoders**

In [5]:
# Import da função e instancia do objeto
from sklearn.preprocessing import LabelEncoder
labelencoder_previsores = LabelEncoder()

In [6]:
# Vendo antes do encoder o resultado em uma das linhas:
previsores[0,:]

array([39, ' State-gov', 77516, ' Bachelors', 13, ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', 2174, 0, 40,
       ' United-States'], dtype=object)

In [7]:
# Aplicando objeto encoder nas colunas desejadas:
previsores[:, 1] = labelencoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelencoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelencoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelencoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelencoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelencoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelencoder_previsores.fit_transform(previsores[:, 13])

In [8]:
# Vendo o resultado final em uma das linhas:
# Nota-se que todas os campos foram cobertos pelo encoder.
previsores[0,:]

array([39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 37], dtype=object)

### **USANDO O ONEHOTENCODER (MELHOR PARA DUMMY VARIABLES)**

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
onehotencorder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1,3,5,6,7,8,9,13])],remainder='passthrough')

In [10]:
# passando o Hotenconder para os previsores:
previsores = onehotencorder.fit_transform(previsores).toarray()

In [11]:
previsores[0,:]

array([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [12]:
# Encode na classe
labelencorder_classe = LabelEncoder()
classe = labelencorder_classe.fit_transform(classe)

In [13]:
classe[0:10,]

array([0, 0, 0, 0, 0, 0, 0, 2, 2, 2])

### **ESCALONAMENTO DE ATRIBUTOS**

In [14]:
# Import da função StandardScaler StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [15]:
previsores =  scaler.fit_transform(previsores)

In [16]:
previsores

array([[-0.25085523, -0.16831613, -0.25919835, ...,  0.15401978,
        -0.22195594, -0.04149311],
       [-0.25085523, -0.16831613, -0.25919835, ..., -0.14506114,
        -0.22195594, -2.24187969],
       [-0.25085523, -0.16831613, -0.25919835, ..., -0.14506114,
        -0.22195594, -0.04149311],
       ...,
       [-0.25085523, -0.16831613, -0.25919835, ...,  0.36037736,
        -0.22195594, -0.04149311],
       [-0.25085523, -0.16831613, -0.25919835, ..., -0.14506114,
        -0.22195594, -0.04149311],
       [-0.25085523, -0.16831613,  3.85804922, ...,  0.94120516,
        -0.22195594,  0.77346488]])

### **DIVISÃO TREINO E TESTE**

In [18]:
from sklearn.model_selection import train_test_split
previsores_train, previsores_test, classe_train, classe_test = train_test_split(previsores, classe, test_size=0.3, random_state=0)