In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Padronizando os dados com o tensorFlow 
## Aplicando no exemplo anterior (arquivo 13)

O objetivo é conseguirmos predizer se a pessoa terá uma renda maior ou menor que 50 mil por ano.
E vamos aplicar a RNA no banco de dados chamado 'census.csv', **porém, vamos padronizar os dados.**

In [2]:
# Lendo o banco de dados
base = pd.read_csv("census.csv")

# Para confirmarmos se está tudo certo com o banco olhamos o começinho dele.
base.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Observando a quantidade de classes da coluna que queremos prever
base['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [4]:
# Função para convertermos os labels em 0 e 1
def converte_classe(rotulo):
    if rotulo == ' >50K':
        return 1
    else:
        return 0

In [5]:
# Então aplicamos a conversão do labels 
base.income = base.income.apply(converte_classe)

In [6]:
# Visualizando a conversão
base.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
# Agora vamos separar os atributos das labels 
X = base.drop('income', axis = 1) # Atributos
Y = base.income                   # Labels

In [8]:
# Visualizando as labels
print(Y)
type(Y)

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income, Length: 32561, dtype: int64


pandas.core.series.Series

In [9]:
# Visualizando os atributos
print(type(X))
X.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [10]:
# Agora vamos separar o conjunto em treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

# Analisando as dimensões
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("Y_train: ", Y_train.shape)
print("Y_test: ", Y_test.shape)

X_train:  (22792, 14)
X_test:  (9769, 14)
Y_train:  (22792,)
Y_test:  (9769,)


In [11]:
# Obtem todas as colunas do conjunto de dados
base.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [12]:
# Agora vamos ver todas as possibilidades de classes que o atributo workclass possui
base.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [13]:
# Agora vamos criar as colunas categoricas utilizando o hash bucket
workclass = tf.feature_column.categorical_column_with_hash_bucket(key = 'workclass', 
                                                                  hash_bucket_size = 100) # N° máx de categorias permitidas
education = tf.feature_column.categorical_column_with_hash_bucket(key = 'education', hash_bucket_size = 100)
marital_status = tf.feature_column.categorical_column_with_hash_bucket(key = 'marital-status', hash_bucket_size = 100)
occupation = tf.feature_column.categorical_column_with_hash_bucket(key = 'occupation', hash_bucket_size = 100)
relationship = tf.feature_column.categorical_column_with_hash_bucket(key = 'relationship', hash_bucket_size = 100)
race = tf.feature_column.categorical_column_with_hash_bucket(key = 'race', hash_bucket_size = 100)
country = tf.feature_column.categorical_column_with_hash_bucket(key = 'native-country', hash_bucket_size = 100)

In [14]:
# Vamos criar a coluna sex de um jeito um pouco diferente (com vocabulary list)
sex = tf.feature_column.categorical_column_with_vocabulary_list(key = 'sex', vocabulary_list=[' Male', ' Female'])

## Aqui vamos fazer funções manuais para padronizar os dados 


In [15]:
# Testando...
print("Média da coluna age: ", base.age.mean())
print("\nDesvio padrão da coluna age: ", base.age.std())

Média da coluna age:  38.58164675532078

Desvio padrão da coluna age:  13.640432553581341


In [16]:
# Testando...
print("Média da coluna age: ", base.age.mean())
print("\nDesvio padrão da coluna age: ", base.age.std())

Média da coluna age:  38.58164675532078

Desvio padrão da coluna age:  13.640432553581341


In [17]:
def padroniza_age(valor):
    dividendo = tf.subtract( tf.cast(valor, tf.float32), tf.constant(38.58) ) # valor de x menos a média
    divisor   = tf.constant(13.64) # É o desvio padrão
    return tf.divide(dividendo, divisor)

In [24]:
# Agora vamos fazer as outras colunas numéricas de modo menos manual
def padroniza_finalweight(valor):
    return tf.divide(tf.subtract(tf.cast(valor, tf.float32), tf.constant(189778.36)), tf.constant(105549.977))

def padroniza_education(valor):
    return tf.divide(tf.subtract(tf.cast(valor, tf.float32), tf.constant(10.08)), tf.constant(2.57))

def padroniza_capitalgain(valor):
    return tf.divide(tf.subtract(tf.cast(valor, tf.float32), tf.constant(1077.64)), tf.constant(7385.29))

def padroniza_capitalloos(valor):
    return tf.divide(tf.subtract(tf.cast(valor, tf.float32), tf.constant(87.30)), tf.constant(402.96))

def padroniza_hour(valor):
    return tf.divide(tf.subtract(tf.cast(valor, tf.float32), tf.constant(40.43)), tf.constant(12.34))

# Como já fizemos as funções de padronização, agora vamos aplicá-las.

In [25]:
# Como já definimos os atributos categóricos, vamos criar para os atributos numéricos
age = tf.feature_column.numeric_column(key = 'age', normalizer_fn = padroniza_age)
final_weight = tf.feature_column.numeric_column(key = 'final-weight', normalizer_fn = padroniza_finalweight)
education_num = tf.feature_column.numeric_column(key = 'education-num', normalizer_fn = padroniza_education)
capital_gain = tf.feature_column.numeric_column(key = 'capital-gain', normalizer_fn = padroniza_capitalgain)
capital_loos = tf.feature_column.numeric_column(key = 'capital-loos', normalizer_fn = padroniza_capitalloos)
hour = tf.feature_column.numeric_column(key = 'hour-per-week', normalizer_fn = padroniza_hour)

In [26]:
# Agora reunimos todas as colunas
colunas = [age, workclass, final_weight, education, education_num,
           marital_status, occupation, relationship, race, sex,
           capital_gain, capital_loos, hour, country]
print(colunas[0])

NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function padroniza_age at 0x000001C3DFF5CBF8>)


### *Para não termos o erro no treino, precisamos converter as colunas categóricas para valores embedded*

Podemos ver como esse processo é feito com a imagem:
![](embedding_column.jpg)

In [27]:
# Realizando o Embedding
embedded_workclass = tf.feature_column.embedding_column(workclass, 
                                                        # dimension é a quantidade de categorias da coluna 
                                                        dimension = len(base.workclass.unique()))
embedded_education = tf.feature_column.embedding_column(education, dimension = len(base.education.unique()))
embedded_marital = tf.feature_column.embedding_column(marital_status, dimension = len(base['marital-status'].unique()))
embedded_occupation = tf.feature_column.embedding_column(occupation, dimension = len(base.occupation.unique()))
embedded_relationship = tf.feature_column.embedding_column(relationship, dimension = len(base.relationship.unique()))
embedded_race = tf.feature_column.embedding_column(race, dimension = len(base.race.unique()))
embedded_sex = tf.feature_column.embedding_column(sex, dimension = len(base.sex.unique()))
embedded_country = tf.feature_column.embedding_column(country, dimension = len(base['native-country'].unique()))

In [28]:
# Agora vamos definir as colunas com embedding
colunas_rna = [age, embedded_workclass, final_weight, embedded_education, education_num,
               embedded_marital, embedded_occupation, embedded_relationship, 
               embedded_race, embedded_sex,
               capital_gain, capital_loos, hour, embedded_country]

In [29]:
# Agora fazer de novo os passos para treinarmos o modelo (com o detalhe de que vamos usar a colunas_rna)

# Vamos utilizar o estimator agora para implementarmos o modelo em si
funcao_treinamento = tf.estimator.inputs.pandas_input_fn(x = X_train, y = Y_train,
                                                        batch_size=32,# pega em lotes de 32
                                                        num_epochs=None, # quantidade de vezes que vai rodar
                                                        shuffle=True) # pega ordem aleatória

# Criando nosso modelo com DNN  (dense neural network), que é quando todos neurônios estão ligados a todos da próxima camada
classificador = tf.estimator.DNNClassifier(hidden_units = [8, 8], # quantidade de camadas ocultas e qtde de neurônios em cada
                                           feature_columns=colunas_rna,  # passando as colunas
                                           n_classes=2)  # maior que 50 mil ou menor. Temos duas categorias

# Agora vamos fazer o treinamento
classificador.train(input_fn = funcao_treinamento, steps = 10000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Natielle\\AppData\\Local\\Temp\\tmpa7_k2o7s', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C3E0A73780>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
Instructions for 

INFO:tensorflow:global_step/sec: 447.613
INFO:tensorflow:loss = 8.665455, step = 6101 (0.229 sec)
INFO:tensorflow:global_step/sec: 368.618
INFO:tensorflow:loss = 12.847151, step = 6201 (0.261 sec)
INFO:tensorflow:global_step/sec: 416.046
INFO:tensorflow:loss = 11.915162, step = 6301 (0.240 sec)
INFO:tensorflow:global_step/sec: 421.295
INFO:tensorflow:loss = 8.637221, step = 6401 (0.236 sec)
INFO:tensorflow:global_step/sec: 405.835
INFO:tensorflow:loss = 14.118211, step = 6501 (0.258 sec)
INFO:tensorflow:global_step/sec: 328.607
INFO:tensorflow:loss = 10.94862, step = 6601 (0.296 sec)
INFO:tensorflow:global_step/sec: 336.466
INFO:tensorflow:loss = 9.525547, step = 6701 (0.300 sec)
INFO:tensorflow:global_step/sec: 272.467
INFO:tensorflow:loss = 10.905117, step = 6801 (0.365 sec)
INFO:tensorflow:global_step/sec: 322.285
INFO:tensorflow:loss = 10.066402, step = 6901 (0.308 sec)
INFO:tensorflow:global_step/sec: 264.558
INFO:tensorflow:loss = 9.761349, step = 7001 (0.375 sec)
INFO:tensorflow

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x1c3e0a733c8>

Notamos que agora temos o treinamento.


In [30]:
# Como já fizemos o treinamento, iremos fazer o teste agora

# Criando a função para passarmos os dados do teste com tensorflow
funcao_teste = tf.estimator.inputs.pandas_input_fn(x = X_test, y = Y_test, 
                                                   batch_size = 32,
                                                   num_epochs = 1, 
                                                   shuffle = False)  # pega em ordem 

# Realizando de fato o teste do modelo
classificador.evaluate(input_fn=funcao_teste)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-05-08T22:55:41Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpa7_k2o7s\model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-05-08-22:55:43
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.8505477, accuracy_baseline = 0.754632, auc = 0.9085787, auc_precision_recall = 0.781474, average_loss = 0.31630942, global_step = 10000, label/mean = 0.245368, loss = 10.098127, precision = 0.7247002, prediction/mean = 0.24428028, recall = 0.6303713
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10000: C:\Users\Natielle\AppData\Loc

{'accuracy': 0.8505477,
 'accuracy_baseline': 0.754632,
 'auc': 0.9085787,
 'auc_precision_recall': 0.781474,
 'average_loss': 0.31630942,
 'label/mean': 0.245368,
 'loss': 10.098127,
 'precision': 0.7247002,
 'prediction/mean': 0.24428028,
 'recall': 0.6303713,
 'global_step': 10000}

Sem a padronização podemos notar uma taxa de acerto (accuracy) é de 72% mais ou menos.

Ao aplicarmos a padronização podemos notar uma taxa de acerto (accuracy) é de 85% mais ou menos.
Ou seja, houve uma melhora no algoritmo com o tratamento dos dados.