## Build a linear model with Estimators.
This tutorial uses the tf.estimator API in TensorFlow to solve a benchmark binary classification problem. Estimators are TensorFlow's most scalable and production-oriented model type.

## Overview
Using census data which contains data a person's age, education, marital status, and occupation (the features), we will try to predict whether or not the person earns more than 50,000 dollars a year (the target label). We will train a logistic regression model that, given an individual's information, outputs a number between 0 and 1—this can be interpreted as the probability that the individual has an annual income of over 50,000 dollars.

In [2]:
# Setup.
import tensorflow as tf
import tensorflow.feature_column as fc 

import os
import sys

import matplotlib.pyplot as plt
from IPython.display import clear_output

tf.enable_eager_execution()

  from ._conv import register_converters as _register_converters


In [3]:
# Download the official implementation.
! pip install -q requests
! git clone --depth 1 https://github.com/tensorflow/models

fatal: destination path 'models' already exists and is not an empty directory.


In [4]:
# Add the root directory of the repository to the Python path.
models_path = os.path.join(os.getcwd(), 'models')

sys.path.append(models_path)

In [5]:
# Download the dataset.
from official.wide_deep import census_dataset
from official.wide_deep import census_main

census_dataset.download("/tmp/census_data/")

In [6]:
# Command line usage.
if "PYTHONPATH" in os.environ:
  os.environ['PYTHONPATH'] += os.pathsep +  models_path
else:
  os.environ['PYTHONPATH'] = models_path

In [7]:
!python -m official.wide_deep.census_main --help

  from ._conv import register_converters as _register_converters
Train DNN on census income dataset.
flags:

/Users/gmontes/Projects/Facultad/75.06 - Organización de Datos/TP1/75.06-Datos-Grupo22-TP1/tp2/TensorFlow/models/official/wide_deep/census_main.py:
  -bs,--batch_size:
    Batch size for training and evaluation. When using multiple gpus, this is
    the
    global batch size for all devices. For example, if the batch size is 32 and
    there are 4 GPUs, each GPU will get 8 examples on each step.
    (default: '40')
    (an integer)
  --[no]clean:
    If set, model_dir will be removed if it exists.
    (default: 'false')
  -dd,--data_dir:
    The location of the input data.
    (default: '/tmp/census_data')
  --[no]download_if_missing:
    Download data to data_dir if it is not already present.
    (default: 'true')
  -ebe,--epochs_between_evals:
    The number of training epochs to run between evaluations.
    (default: '2')
    (an integer)
  -ed,--export_dir:
    If set, a Sa

In [8]:
# Now run the model.
!python -m official.wide_deep.census_main --model_type=wide --train_epochs=2

  from ._conv import register_converters as _register_converters
I0627 06:42:42.674832 4430263744 estimator.py:201] Using config: {'_model_dir': '/tmp/census_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
  value: 0
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb2c47ea90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
W0627 06:42:42.675733 4430263744 tf_logging.py:161] 'cpuinfo' not imported. CPU info will not be logged.
2019-06-27 06:42:42.675931: I tensorflo

I0627 06:42:58.712058 4430263744 basic_session_run_hooks.py:680] global_step/sec: 99.0343
I0627 06:42:58.712459 4430263744 basic_session_run_hooks.py:247] average_loss = 0.33914065, loss = 13.565626 (1.010 sec)
I0627 06:42:58.712664 4430263744 basic_session_run_hooks.py:247] loss = 13.565626, step = 6088 (1.010 sec)
I0627 06:42:59.722688 4430263744 basic_session_run_hooks.py:680] global_step/sec: 98.9479
I0627 06:42:59.723067 4430263744 basic_session_run_hooks.py:247] average_loss = 0.38155884, loss = 15.262354 (1.011 sec)
I0627 06:42:59.723221 4430263744 basic_session_run_hooks.py:247] loss = 15.262354, step = 6188 (1.011 sec)
I0627 06:43:00.759825 4430263744 basic_session_run_hooks.py:680] global_step/sec: 96.4192
I0627 06:43:00.760270 4430263744 basic_session_run_hooks.py:247] average_loss = 0.33988985, loss = 13.595594 (1.037 sec)
I0627 06:43:00.760413 4430263744 basic_session_run_hooks.py:247] loss = 13.595594, step = 6288 (1.037 sec)
I0627 06:43:01.758954 4430263744 basic_session

In [9]:
# Read the U.S. Census data.
# Since the task is a binary classification problem, 
# we'll construct a label column named "label" whose value is 1 if the income is over 50K, and 0 otherwise.
!ls  /tmp/census_data/

adult.data adult.test


In [10]:
train_file = "/tmp/census_data/adult.data"
test_file = "/tmp/census_data/adult.test"

In [11]:
import pandas

train_df = pandas.read_csv(train_file, header = None, names = census_dataset._CSV_COLUMNS)
test_df = pandas.read_csv(test_file, header = None, names = census_dataset._CSV_COLUMNS)

train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Converting Data into Tensors
When building a tf.estimator model, the input data is specified by using an input function (or input_fn). This builder function returns a tf.data.Dataset of batches of (features-dict, label) pairs. It is not called until it is passed to tf.estimator.Estimator methods such as train and evaluate.

The input builder function returns the following pair:

1 - features: A dict from feature names to Tensors or SparseTensors containing batches of features.

2 - labels: A Tensor containing batches of labels.

The keys of the features are used to configure the model's input layer.

In [12]:
# For small problems like this, it's easy to make a tf.data.Dataset by slicing the pandas.DataFrame.
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
  label = df[label_key]
  ds = tf.data.Dataset.from_tensor_slices((dict(df),label))

  if shuffle:
    ds = ds.shuffle(10000)

  ds = ds.batch(batch_size).repeat(num_epochs)

  return ds

In [13]:
# Since we have eager execution enabled, it's easy to inspect the resulting dataset.
ds = easy_input_function(train_df, label_key='income_bracket', num_epochs=5, shuffle=True, batch_size=10)

for feature_batch, label_batch in ds.take(1):
  print('Some feature keys:', list(feature_batch.keys())[:5])
  print()
  print('A batch of Ages  :', feature_batch['age'])
  print()
  print('A batch of Labels:', label_batch )

Instructions for updating:
Colocations handled automatically by placer.


W0627 06:43:10.142467 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/data/ops/iterator_ops.py:532: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


Some feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']

A batch of Ages  : tf.Tensor([18 51 37 28 32 39 32 52 46 22], shape=(10,), dtype=int32)

A batch of Labels: tf.Tensor(
[b'<=50K' b'>50K' b'>50K' b'<=50K' b'>50K' b'<=50K' b'<=50K' b'<=50K'
 b'<=50K' b'<=50K'], shape=(10,), dtype=string)


In [14]:
# But this approach has severly-limited scalability. 
# Larger datasets should be streamed from disk. 
# The census_dataset.input_fn provides an example of how to do this using tf.decode_csv and tf.data.TextLineDataset.
import inspect
print(inspect.getsource(census_dataset.input_fn))

def input_fn(data_file, num_epochs, shuffle, batch_size):
  """Generate an input function for the Estimator."""
  assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have run census_dataset.py and '
      'set the --data_dir argument to the correct path.' % data_file)

  def parse_csv(value):
    tf.logging.info('Parsing {}'.format(data_file))
    columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
    features = dict(zip(_CSV_COLUMNS, columns))
    labels = features.pop('income_bracket')
    classes = tf.equal(labels, '>50K')  # binary classification
    return features, classes

  # Extract lines from input files using the Dataset API.
  dataset = tf.data.TextLineDataset(data_file)

  if shuffle:
    dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])

  dataset = dataset.map(parse_csv, num_parallel_calls=5)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = 

In [15]:
ds = census_dataset.input_fn(train_file, num_epochs=5, shuffle=True, batch_size=10)

for feature_batch, label_batch in ds.take(1):
  print('Feature keys:', list(feature_batch.keys())[:5])
  print()
  print('Age batch   :', feature_batch['age'])
  print()
  print('Label batch :', label_batch )

INFO:tensorflow:Parsing /tmp/census_data/adult.data


I0627 06:43:10.530493 4650505664 census_dataset.py:167] Parsing /tmp/census_data/adult.data


Feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']

Age batch   : tf.Tensor([30 63 42 43 26 36 21 69 28 33], shape=(10,), dtype=int32)

Label batch : tf.Tensor([False False False False False False False False False False], shape=(10,), dtype=bool)


In [16]:
# Because Estimators expect an input_fn that takes no arguments, we typically wrap configurable input function 
# into an obejct with the expected signature. 
# For this notebook configure the train_inpf to iterate over the data twice.
import functools

train_inpf = functools.partial(census_dataset.input_fn, train_file, num_epochs=2, shuffle=True, batch_size=64)
test_inpf = functools.partial(census_dataset.input_fn, test_file, num_epochs=1, shuffle=False, batch_size=64)

## Selecting and Engineering Features for the Model
Estimators use a system called feature columns to describe how the model should interpret each of the raw input features. An Estimator expects a vector of numeric inputs, and feature columns describe how the model should convert each feature.

Selecting and crafting the right set of feature columns is key to learning an effective model. A feature column can be either one of the raw inputs in the original features dict (a base feature column), or any new columns created using transformations defined over one or multiple base columns (a derived feature columns).

A feature column is an abstract concept of any raw or derived variable that can be used to predict the target label.

### Base Feature Columns
#### Numeric columns
The simplest feature_column is numeric_column. This indicates that a feature is a numeric value that should be input to the model directly. For example:

In [17]:
age = fc.numeric_column('age')

# The model will use the feature_column definitions to build the model input. 
# You can inspect the resulting output using the input_layer function.
fc.input_layer(feature_batch, [age])

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:10.755248 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:205: NumericColumn._get_dense_tensor (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:10.756400 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:2121: NumericColumn._transform_feature (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
Use tf.cast instead.


W0627 06:43:10.758100 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:2703: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:10.759333 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:206: NumericColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


<tf.Tensor: id=136, shape=(10, 1), dtype=float32, numpy=
array([[30.],
       [63.],
       [42.],
       [43.],
       [26.],
       [36.],
       [21.],
       [69.],
       [28.],
       [33.]], dtype=float32)>

In [18]:
# The following will train and evaluate a model using only the age feature.
classifier = tf.estimator.LinearClassifier(feature_columns=[age])
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

clear_output()  # used for display in notebook
print(result)

{'accuracy': 0.7562189, 'accuracy_baseline': 0.76377374, 'auc': 0.6783409, 'auc_precision_recall': 0.3113884, 'average_loss': 0.5238994, 'label/mean': 0.23622628, 'loss': 33.449432, 'precision': 0.16756757, 'prediction/mean': 0.25558934, 'recall': 0.008060322, 'global_step': 1018}


In [19]:
# Similarly, we can define a NumericColumn for each continuous feature column that we want to use in the model.
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week = tf.feature_column.numeric_column('hours_per_week')

my_numeric_columns = [age,education_num, capital_gain, capital_loss, hours_per_week]

fc.input_layer(feature_batch, my_numeric_columns)

<tf.Tensor: id=2129, shape=(10, 5), dtype=float32, numpy=
array([[  30.,    0.,    0.,    9.,   40.],
       [  63.,    0.,    0.,   10.,   12.],
       [  42.,    0.,    0.,    9.,   15.],
       [  43.,    0.,    0.,   14.,   55.],
       [  26., 3325.,    0.,   13.,   40.],
       [  36.,    0.,    0.,    9.,   38.],
       [  21.,    0.,    0.,    9.,   60.],
       [  69.,    0.,    0.,   14.,   25.],
       [  28.,    0., 2002.,   12.,   40.],
       [  33.,    0.,    0.,   13.,   60.]], dtype=float32)>

In [20]:
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

clear_output()

for key,value in sorted(result.items()):
  print('%s: %s' % (key, value))

accuracy: 0.7815245
accuracy_baseline: 0.76377374
auc: 0.795743
auc_precision_recall: 0.55417717
average_loss: 1.755469
global_step: 1018
label/mean: 0.23622628
loss: 112.081535
precision: 0.56943774
prediction/mean: 0.31169283
recall: 0.30811232


#### Categorical columns
To define a feature column for a categorical feature, create a CategoricalColumn using one of the tf.feature_column.categorical_column* functions.

If you know the set of all possible feature values of a column—and there are only a few of them—use categorical_column_with_vocabulary_list. Each key in the list is assigned an auto-incremented ID starting from 0. For example, for the relationship column we can assign the feature string Husband to an integer ID of 0 and "Not-in-family" to 1, etc.

This creates a sparse one-hot vector from the raw input feature.

The input_layer function we're using is designed for DNN models and expects dense inputs. To demonstrate the categorical column we must wrap it in a tf.feature_column.indicator_column to create the dense one-hot output (Linear Estimators can often skip this dense-step).

In [21]:
relationship = fc.categorical_column_with_vocabulary_list(
    'relationship',
    ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'])

fc.input_layer(feature_batch, [age, fc.indicator_column(relationship)])

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.216730 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:205: IndicatorColumn._get_dense_tensor (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.218270 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:2121: IndicatorColumn._transform_feature (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.220144 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4295: VocabularyListCategoricalColumn._get_sparse_tensors (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.221554 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:2121: VocabularyListCategoricalColumn._transform_feature (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
Use tf.cast instead.


W0627 06:43:52.224778 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/lookup_ops.py:1137: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.227194 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4266: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.228356 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4321: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


<tf.Tensor: id=4866, shape=(10, 7), dtype=float32, numpy=
array([[30.,  0.,  0.,  0.,  1.,  0.,  0.],
       [63.,  0.,  0.,  1.,  0.,  0.,  0.],
       [42.,  1.,  0.,  0.,  0.,  0.,  0.],
       [43.,  1.,  0.,  0.,  0.,  0.,  0.],
       [26.,  0.,  1.,  0.,  0.,  0.,  0.],
       [36.,  0.,  1.,  0.,  0.,  0.,  0.],
       [21.,  0.,  0.,  0.,  0.,  0.,  1.],
       [69.,  1.,  0.,  0.,  0.,  0.,  0.],
       [28.,  1.,  0.,  0.,  0.,  0.,  0.],
       [33.,  0.,  0.,  0.,  0.,  1.,  0.]], dtype=float32)>

In [22]:
# If we don't know the set of possible values in advance, use the categorical_column_with_hash_bucket instead.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    'occupation', hash_bucket_size=1000)

# Here, each possible value in the feature column occupation is hashed to an integer ID as we encounter 
# them in training.
for item in feature_batch['occupation'].numpy():
    print(item.decode())

Transport-moving
Prof-specialty
Farming-fishing
Prof-specialty
Craft-repair
Adm-clerical
Exec-managerial
Prof-specialty
Protective-serv
Protective-serv


In [23]:
occupation_result = fc.input_layer(feature_batch, [fc.indicator_column(occupation)])

occupation_result.numpy()

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.241648 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4295: HashedCategoricalColumn._get_sparse_tensors (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.242828 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:2121: HashedCategoricalColumn._transform_feature (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:43:52.244911 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4321: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
tf.argmax(occupation_result, axis=1).numpy()

array([420, 979, 936, 979, 466,  96, 800, 979, 684, 684])

No matter how we choose to define a SparseColumn, each feature string is mapped into an integer ID by looking up a fixed mapping or by hashing. Under the hood, the LinearModel class is responsible for managing the mapping and creating tf.Variable to store the model parameters (model weights) for each feature ID. The model parameters are learned through the model training process described later.

Let's do the similar trick to define the other categorical features:

In [25]:
education = tf.feature_column.categorical_column_with_vocabulary_list(
    'education', [
        'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
        'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
        '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    'marital_status', [
        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])

workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    'workclass', [
        'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
        'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])


my_categorical_columns = [relationship, occupation, education, marital_status, workclass]

In [26]:
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns+my_categorical_columns)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

clear_output()

for key,value in sorted(result.items()):
  print('%s: %s' % (key, value))

accuracy: 0.82003564
accuracy_baseline: 0.76377374
auc: 0.8219939
auc_precision_recall: 0.61432624
average_loss: 0.93544346
global_step: 1018
label/mean: 0.23622628
loss: 59.725315
precision: 0.6433041
prediction/mean: 0.22795415
recall: 0.53458136


### Derived feature columns
#### Make Continuous Features Categorical through Bucketization
Sometimes the relationship between a continuous feature and the label is not linear. For example, age and income—a person's income may grow in the early stage of their career, then the growth may slow at some point, and finally, the income decreases after retirement. In this scenario, using the raw age as a real-valued feature column might not be a good choice because the model can only learn one of the three cases:

1 - Income always increases at some rate as age grows (positive correlation),

2 - Income always decreases at some rate as age grows (negative correlation), or

3 - Income stays the same no matter at what age (no correlation).

If we want to learn the fine-grained correlation between income and each age group separately, we can leverage bucketization. Bucketization is a process of dividing the entire range of a continuous feature into a set of consecutive buckets, and then converting the original numerical feature into a bucket ID (as a categorical feature) depending on which bucket that value falls into. So, we can define a bucketized_column over age as:

In [27]:
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
age_buckets

BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))

In [28]:
# With bucketing, the model sees each bucket a one-hot feature.
fc.input_layer(feature_batch, [age, age_buckets])

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:44:18.555160 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:205: BucketizedColumn._get_dense_tensor (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:44:18.556488 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:2121: BucketizedColumn._transform_feature (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


W0627 06:44:18.558437 4650505664 deprecation.py:323] From /Users/gmontes/anaconda3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py:206: BucketizedColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed after 2018-11-30.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


<tf.Tensor: id=9435, shape=(10, 12), dtype=float32, numpy=
array([[30.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [63.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [42.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [43.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [26.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [36.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [21.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [69.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [28.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [33.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]],
      dtype=float32)>

#### Learn complex relationships with crossed column
Using each base feature column separately may not be enough to explain the data. For example, the correlation between education and the label (earning > 50,000 dollars) may be different for different occupations. Therefore, if we only learn a single model weight for education="Bachelors" and education="Masters", we won't capture every education-occupation combination (e.g. distinguishing between education="Bachelors" AND occupation="Exec-managerial" AND education="Bachelors" AND occupation="Craft-repair").

To learn the differences between different feature combinations, we can add crossed feature columns to the model:

In [29]:
education_x_occupation = tf.feature_column.crossed_column(
    ['education', 'occupation'], hash_bucket_size=1000)

In [30]:
# We can also create a crossed_column over more than two columns. 
# Each constituent column can be either a base feature column that is categorical (SparseColumn), 
# a bucketized real-valued feature column, or even another CrossColumn.
# These crossed columns always use hash buckets to avoid the exponential explosion in the number of categories, 
# and put the control over number of model weights in the hands of the user.
age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
    [age_buckets, 'education', 'occupation'], hash_bucket_size=1000)

### Define the logistic regression model
After processing the input data and defining all the feature columns, we can put them together and build a logistic regression model. The previous section showed several types of base and derived feature columns, including:

- CategoricalColumn
- NumericColumn
- BucketizedColumn
- CrossedColumn

All of these are subclasses of the abstract FeatureColumn class and can be added to the feature_columns field of a model:

In [31]:
import tempfile

base_columns = [
    education, marital_status, relationship, workclass, occupation,
    age_buckets,
]

crossed_columns = [
    tf.feature_column.crossed_column(
        ['education', 'occupation'], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
]

model = tf.estimator.LinearClassifier(
    model_dir=tempfile.mkdtemp(), 
    feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.1))

INFO:tensorflow:Using default config.


I0627 06:44:18.579401 4650505664 estimator.py:1739] Using default config.


INFO:tensorflow:Using config: {'_model_dir': '/var/folders/k8/jlk2vp8n2q1cb314v8ypl_dw0000gp/T/tmpfnl1gd1u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb33827860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0627 06:44:18.580864 4650505664 estimator.py:201] Using config: {'_model_dir': '/var/folders/k8/jlk2vp8n2q1cb314v8ypl_dw0000gp/T/tmpfnl1gd1u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb33827860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Train and evaluate the model.

In [32]:
train_inpf = functools.partial(census_dataset.input_fn, train_file, 
                               num_epochs=40, shuffle=True, batch_size=64)

model.train(train_inpf)

clear_output()  # used for notebook display

In [33]:
results = model.evaluate(test_inpf)

clear_output()

for key,value in sorted(result.items()):
  print('%s: %0.2f' % (key, value))

accuracy: 0.82
accuracy_baseline: 0.76
auc: 0.82
auc_precision_recall: 0.61
average_loss: 0.94
global_step: 1018.00
label/mean: 0.24
loss: 59.73
precision: 0.64
prediction/mean: 0.23
recall: 0.53


In [34]:
# After the model is evaluated, we can use it to predict whether an individual has an annual income of over 
# 50,000 dollars given an individual's information input.
import numpy as np

predict_df = test_df[:20].copy()

pred_iter = model.predict(
    lambda:easy_input_function(predict_df, label_key='income_bracket',
                               num_epochs=1, shuffle=False, batch_size=10))

classes = np.array(['<=50K', '>50K'])
pred_class_id = []

for pred_dict in pred_iter:
  pred_class_id.append(pred_dict['class_ids'])

predict_df['predicted_class'] = classes[np.array(pred_class_id)]
predict_df['correct'] = predict_df['predicted_class'] == predict_df['income_bracket']

clear_output()

predict_df[['income_bracket','predicted_class', 'correct']]

Unnamed: 0,income_bracket,predicted_class,correct
0,<=50K,<=50K,True
1,<=50K,<=50K,True
2,>50K,<=50K,False
3,>50K,<=50K,False
4,<=50K,<=50K,True
5,<=50K,<=50K,True
6,<=50K,<=50K,True
7,>50K,>50K,True
8,<=50K,<=50K,True
9,<=50K,<=50K,True


### Adding Regularization to Prevent Overfitting
Regularization is a technique used to avoid overfitting. Overfitting happens when a model performs well on the data it is trained on, but worse on test data that the model has not seen before. Overfitting can occur when a model is excessively complex, such as having too many parameters relative to the number of observed training data. Regularization allows you to control the model's complexity and make the model more generalizable to unseen data.

In [35]:
model_l1 = tf.estimator.LinearClassifier(
    feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=10.0,
        l2_regularization_strength=0.0))

model_l1.train(train_inpf)

results = model_l1.evaluate(test_inpf)
clear_output()
for key in sorted(results):
  print('%s: %0.2f' % (key, results[key]))

accuracy: 0.84
accuracy_baseline: 0.76
auc: 0.88
auc_precision_recall: 0.69
average_loss: 0.35
global_step: 20351.00
label/mean: 0.24
loss: 22.47
precision: 0.69
prediction/mean: 0.24
recall: 0.57


In [36]:
model_l2 = tf.estimator.LinearClassifier(
    feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.0,
        l2_regularization_strength=10.0))

model_l2.train(train_inpf)

results = model_l2.evaluate(test_inpf)
clear_output()
for key in sorted(results):
  print('%s: %0.2f' % (key, results[key]))

accuracy: 0.84
accuracy_baseline: 0.76
auc: 0.88
auc_precision_recall: 0.69
average_loss: 0.35
global_step: 20351.00
label/mean: 0.24
loss: 22.46
precision: 0.69
prediction/mean: 0.24
recall: 0.55


In [41]:
# These regularized models don't perform much better than the base model. 
# Let's look at the model's weight distributions to better see the effect of the regularization.
def get_flat_weights(model):
  weight_names = [
      name for name in model.get_variable_names()
      if "linear_model" in name and "Ftrl" not in name]

  weight_values = [model.get_variable_value(name) for name in weight_names]

  weights_flat = np.concatenate([item.flatten() for item in weight_values], axis=0)

  return weights_flat

weights_flat = get_flat_weights(model)
weights_flat_l1 = get_flat_weights(model_l1)
weights_flat_l2 = get_flat_weights(model_l2)

In [None]:
# The models have many zero-valued weights caused by unused hash bins (there are many more hash bins than categories in some columns). We can mask these weights when viewing the weight distributions:
