In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Redes neurais artificiais com a utilização de estimators

O objetivo é prever se uma pessoa terá uma renda anual maior ou menor que 50 mil.
Para isso, vamos aplicar uma RNA (rede neural artificial) à base de dados 'census.csv'.

In [2]:
# Load the census dataset from the CSV file (path is relative to the working directory).
base = pd.read_csv("census.csv")

# Sanity check: display the first rows to confirm the file was parsed as expected.
base.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# List the distinct values of the column we want to predict ('income').
base['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [4]:
# Helper that converts the income labels into 0 and 1
def converte_classe(rotulo):
    """Map an income label to its binary class.

    Args:
        rotulo: Raw label. Normally a string such as ' >50K' or ' <=50K'
            (the values in the dataset carry a leading space). A value that
            was already converted (0 or 1) is also accepted.

    Returns:
        int: 1 when the label means "more than 50K", 0 for anything else.
    """
    if isinstance(rotulo, str):
        # Ignore surrounding whitespace so both ' >50K' and '>50K' are recognised;
        # an exact match on ' >50K' would silently label '>50K' as 0.
        return 1 if rotulo.strip() == '>50K' else 0
    # Already-numeric labels pass through unchanged, so applying the conversion
    # a second time (e.g. re-running the cell) does not turn every 1 into 0.
    return 1 if rotulo == 1 else 0

In [5]:
# Replace each textual income label with its numeric class, element by element.
base['income'] = base['income'].apply(converte_classe)

In [6]:
# Display the first rows again to check the converted 'income' column.
base.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
# Separate the labels from the attributes.
Y = base['income']                 # labels
X = base.drop(columns = 'income')  # attributes (every column except the label)

In [8]:
# Inspect the labels: print the values, then show the object's type as the cell output.
print(Y)
type(Y)

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income, Length: 32561, dtype: int64


pandas.core.series.Series

In [9]:
# Inspect the attributes: print the object's type, then display the first rows.
print(type(X))
X.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [10]:
# Split the data into a training set (70%) and a test set (30%).
# A fixed random_state makes the shuffle deterministic, so the split -- and every
# metric computed from it further down -- is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

# Report the dimensions of each partition.
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("Y_train: ", Y_train.shape)
print("Y_test: ", Y_test.shape)

X_train:  (22792, 14)
X_test:  (9769, 14)
Y_train:  (22792,)
Y_test:  (9769,)


In [11]:
# List every column name of the dataset.
base.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [12]:
# List every distinct value that the 'workclass' attribute takes.
base.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [13]:
# Categorical feature columns built with a hash bucket.
def _coluna_hash(chave, n_buckets = 100):
    """Return a hashed categorical feature column for the DataFrame column `chave`.

    `n_buckets` is the number of hash buckets the category strings are mapped into.
    """
    return tf.feature_column.categorical_column_with_hash_bucket(key = chave,
                                                                 hash_bucket_size = n_buckets)

workclass = _coluna_hash('workclass')
education = _coluna_hash('education')
marital_status = _coluna_hash('marital-status')
occupation = _coluna_hash('occupation')
relationship = _coluna_hash('relationship')
race = _coluna_hash('race')
country = _coluna_hash('native-country')

In [14]:
# Build the 'sex' column differently: with an explicit vocabulary list instead of hashing.
# NOTE(review): the vocabulary entries carry a leading space, presumably to match the raw
# values in the CSV (the 'workclass' values shown in this notebook do) -- confirm for 'sex'.
sex = tf.feature_column.categorical_column_with_vocabulary_list(key = 'sex', vocabulary_list=[' Male', ' Female'])

In [15]:
# With the categorical attributes defined, create one numeric feature column per
# numeric attribute. The keys are the DataFrame column names, spelled exactly as
# they appear in the dataset; the results are unpacked in the same order.
chaves_numericas = ('age', 'final-weight', 'education-num',
                    'capital-gain', 'capital-loos', 'hour-per-week')
age, final_weight, education_num, capital_gain, capital_loos, hour = [
    tf.feature_column.numeric_column(key = chave) for chave in chaves_numericas
]

In [16]:
# Gather every feature column (numeric and categorical) into a single list,
# then print the first entry to inspect its definition.
colunas = [age, workclass, final_weight, education, education_num,
           marital_status, occupation, relationship, race, sex,
           capital_gain, capital_loos, hour, country]
print(colunas[0])

NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)


In [17]:
# Input function that feeds the training attributes and labels to the estimator.
funcao_treinamento = tf.estimator.inputs.pandas_input_fn(x = X_train, y = Y_train,
                                                        batch_size=32,# rows are served in batches of 32
                                                        num_epochs=None, # None = no limit on passes over the data (per the TF docs); bound training with train(steps=...)
                                                        shuffle=True) # rows are read in random order

In [18]:
# Build the model: a DNN (dense neural network) classifier, i.e. every neuron is
# connected to every neuron of the next layer.
classificador = tf.estimator.DNNClassifier(hidden_units = [8, 8], # two hidden layers with 8 neurons each
                                           feature_columns=colunas,  # the feature columns gathered in `colunas`
                                           n_classes=2)  # two classes: income above or below 50K

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Natielle\\AppData\\Local\\Temp\\tmp0eehsevf', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002208DD77518>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
# Attempt to train the model. The categorical columns in `colunas` are not wrapped in
# embedding/indicator columns, so DNNClassifier rejects them with a ValueError.
# The error is the point of this cell, so it is caught and displayed instead of being
# left uncaught: the message stays visible and "Restart & Run All" can continue.
try:
    classificador.train(input_fn = funcao_treinamento)
except ValueError as erro:
    print(f"ValueError: {erro}")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.


ValueError: Items of feature_columns must be a DenseColumn. You can wrap a categorical column with an embedding_column or indicator_column. Given: HashedCategoricalColumn(key='education', hash_bucket_size=100, dtype=tf.string)

### *Para não termos o erro acima, precisamos converter as colunas categóricas para valores embedded*

Podemos ver como esse processo é feito com a imagem:
![](embedding_column.jpg)

In [20]:
# Wrap every categorical column in an embedding column.
def _embutir(coluna, chave):
    """Return an embedding column for the categorical column `coluna`.

    The embedding dimension is set to the number of distinct values found in
    base[chave], the DataFrame column the categorical column was built from.
    """
    n_categorias = len(base[chave].unique())
    return tf.feature_column.embedding_column(coluna, dimension = n_categorias)

embedded_workclass = _embutir(workclass, 'workclass')
embedded_education = _embutir(education, 'education')
embedded_marital = _embutir(marital_status, 'marital-status')
embedded_occupation = _embutir(occupation, 'occupation')
embedded_relationship = _embutir(relationship, 'relationship')
embedded_race = _embutir(race, 'race')
embedded_sex = _embutir(sex, 'sex')
embedded_country = _embutir(country, 'native-country')

In [21]:
# Feature-column list for the network: same order as `colunas`, but with each
# categorical column replaced by its embedded counterpart.
colunas_rna = [age, embedded_workclass, final_weight, embedded_education, education_num,
               embedded_marital, embedded_occupation, embedded_relationship, 
               embedded_race, embedded_sex,
               capital_gain, capital_loos, hour, embedded_country]

In [22]:
# Repeat the model-building steps, this time passing `colunas_rna` as the feature columns.

# Input function that feeds the training attributes and labels to the estimator.
funcao_treinamento = tf.estimator.inputs.pandas_input_fn(x = X_train, y = Y_train,
                                                        batch_size=32,# rows are served in batches of 32
                                                        num_epochs=None, # None = no limit on passes over the data (per the TF docs); training is bounded by `steps` below
                                                        shuffle=True) # rows are read in random order

# Build the model: a DNN (dense neural network) classifier, i.e. every neuron is
# connected to every neuron of the next layer.
classificador = tf.estimator.DNNClassifier(hidden_units = [8, 8], # two hidden layers with 8 neurons each
                                           feature_columns=colunas_rna,  # feature columns with the categorical ones embedded
                                           n_classes=2)  # two classes: income above or below 50K

# Train for 10000 steps.
classificador.train(input_fn = funcao_treinamento, steps = 10000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Natielle\\AppData\\Local\\Temp\\tmpwwr6oe18', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002208ED26898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Instructions for updating:
The old _FeatureColumn APIs are being 

INFO:tensorflow:global_step/sec: 384.227
INFO:tensorflow:loss = 143.05055, step = 4901 (0.262 sec)
INFO:tensorflow:global_step/sec: 382.701
INFO:tensorflow:loss = 64.80531, step = 5001 (0.259 sec)
INFO:tensorflow:global_step/sec: 329.471
INFO:tensorflow:loss = 473.6425, step = 5101 (0.306 sec)
INFO:tensorflow:global_step/sec: 306.2
INFO:tensorflow:loss = 233.95566, step = 5201 (0.327 sec)
INFO:tensorflow:global_step/sec: 307.735
INFO:tensorflow:loss = 121.714066, step = 5301 (0.326 sec)
INFO:tensorflow:global_step/sec: 337.155
INFO:tensorflow:loss = 49.61904, step = 5401 (0.305 sec)
INFO:tensorflow:global_step/sec: 330.643
INFO:tensorflow:loss = 118.76278, step = 5501 (0.302 sec)
INFO:tensorflow:global_step/sec: 372.547
INFO:tensorflow:loss = 70.58853, step = 5601 (0.257 sec)
INFO:tensorflow:global_step/sec: 308.514
INFO:tensorflow:loss = 111.413124, step = 5701 (0.326 sec)
INFO:tensorflow:global_step/sec: 351.392
INFO:tensorflow:loss = 156.95445, step = 5801 (0.284 sec)
INFO:tensorflo

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x2208ed26080>

Notamos que, com as colunas categóricas convertidas em embeddings, o treinamento agora é executado sem erros.


In [23]:
# Training is done, so now evaluate the model on the test set.

# Input function that feeds the test attributes and labels to the estimator.
funcao_teste = tf.estimator.inputs.pandas_input_fn(x = X_test, y = Y_test, 
                                                   batch_size = 32,
                                                   num_epochs = 1, 
                                                   shuffle = False)  # rows are read in their original order

# Run the evaluation; the cell output is the dictionary of metrics returned by evaluate().
classificador.evaluate(input_fn=funcao_teste)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-05-08T21:59:08Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpwwr6oe18\model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-05-08-21:59:10
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.7225919, accuracy_baseline = 0.7582148, auc = 0.8110743, auc_precision_recall = 0.7012839, average_loss = 5.4838147, global_step = 10000, label/mean = 0.24178524, loss = 175.06989, precision = 0.46251616, prediction/mean = 0.47506657, recall = 0.9089754
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10000: C:\Users\Natielle\AppData

{'accuracy': 0.7225919,
 'accuracy_baseline': 0.7582148,
 'auc': 0.8110743,
 'auc_precision_recall': 0.7012839,
 'average_loss': 5.4838147,
 'label/mean': 0.24178524,
 'loss': 175.06989,
 'precision': 0.46251616,
 'prediction/mean': 0.47506657,
 'recall': 0.9089754,
 'global_step': 10000}

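Podemos notar que a taxa de acerto (accuracy) é de aproximadamente 72%, abaixo inclusive do `accuracy_baseline` de cerca de 75,8% (o que se obteria prevendo sempre a classe majoritária). Isso é menor do que conseguimos atingir com a regressão logística.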