In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
# Load heart.csv straight from the public bucket (needs network access)
# and preview the first rows.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [3]:
# Split into train / validation / test (80/20, then 80/20 of the remainder).
# A fixed random_state makes the partition repeatable across kernel restarts.
SPLIT_SEED = 42
train_valid, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, valid = train_test_split(train_valid, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(valid), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


In [4]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    '''Wrap a pandas DataFrame as a batched tf.data.Dataset of (features, labels).

    Args:
        dataframe: DataFrame holding the feature columns plus the label column.
            It is copied first, so the caller's frame is left untouched.
        shuffle: when True, shuffle with a buffer as large as the frame
            before batching.
        batch_size: number of rows per emitted batch.
        label_column: name of the column to split off as the label
            (defaults to 'target').

    Returns:
        A tf.data.Dataset whose elements are
        (dict of column name -> feature batch, label batch).

    Raises:
        KeyError: if label_column is not a column of dataframe.
    '''
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        # Buffer covers every row, so the whole frame is shuffled together.
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [5]:
batch_size = 5  # small batches keep the demo output easy to read
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation and test data are batched without shuffling.
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (valid, test)
)

In [6]:
# Peek at a single batch to confirm the (feature dict, labels) structure.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature: ', list(feature_batch.keys()))
    print('A batch of ages: ', feature_batch['age'])
    print('A batch of targets: ', label_batch)

Every feature:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages:  tf.Tensor([58 56 44 71 41], shape=(5,), dtype=int32)
A batch of targets:  tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)


In [8]:
# Keep only the feature dict of one training batch (index 0 drops the labels);
# demo() feeds this batch through each feature column.
example_batch = next(iter(train_ds))[0]
example_batch

{'age': <tf.Tensor: id=106, shape=(5,), dtype=int32, numpy=array([52, 58, 57, 44, 46], dtype=int32)>,
 'sex': <tf.Tensor: id=114, shape=(5,), dtype=int32, numpy=array([1, 1, 1, 1, 1], dtype=int32)>,
 'cp': <tf.Tensor: id=109, shape=(5,), dtype=int32, numpy=array([4, 4, 4, 3, 2], dtype=int32)>,
 'trestbps': <tf.Tensor: id=118, shape=(5,), dtype=int32, numpy=array([128, 150, 110, 120, 101], dtype=int32)>,
 'chol': <tf.Tensor: id=108, shape=(5,), dtype=int32, numpy=array([255, 270, 201, 226, 197], dtype=int32)>,
 'fbs': <tf.Tensor: id=111, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 1], dtype=int32)>,
 'restecg': <tf.Tensor: id=113, shape=(5,), dtype=int32, numpy=array([0, 2, 0, 0, 0], dtype=int32)>,
 'thalach': <tf.Tensor: id=117, shape=(5,), dtype=int32, numpy=array([161, 111, 126, 169, 156], dtype=int32)>,
 'exang': <tf.Tensor: id=110, shape=(5,), dtype=int32, numpy=array([1, 1, 1, 0, 0], dtype=int32)>,
 'oldpeak': <tf.Tensor: id=112, shape=(5,), dtype=float64, numpy=array([0. , 

In [9]:
def demo(feature_column):
    '''Print the dense tensor that `feature_column` yields for `example_batch`.

    The column is wrapped in a fresh DenseFeatures layer, which is then applied
    to the notebook-level `example_batch` feature dict.

    Note: inside this function the parameter name shadows the imported
    `feature_column` module.
    '''
    print(keras.layers.DenseFeatures(feature_column)(example_batch))

In [10]:
# Numeric column: exposes the 'age' feature as a real-valued input.
age = feature_column.numeric_column('age')
age

NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [11]:
# Show what the numeric column produces for the example batch.
demo(age)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

tf.Tensor(
[[52.]
 [58.]
 [57.]
 [44.]
 [46.]], shape=(5, 1), dtype=float32)


In [12]:
# Bucketize 'age' at the listed boundaries so it is treated as ranges
# rather than a raw number.
age_buckets = feature_column.bucketized_column(age,
                                               boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) # discretize

In [13]:
# The 10 boundaries define 11 buckets, so each row is shown as a one-hot vector of width 11.
demo(age_buckets)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]], shape=(5, 11), dtype=float32)


In [14]:
# Inspect the bucketized column definition.
age_buckets

BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))

In [15]:
# Categorical column: maps the string feature 'thal' onto a fixed vocabulary.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal

VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversible'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [16]:
# One-hot encode the categorical column by wrapping it in an indicator column.
thal_one_hot = feature_column.indicator_column(thal)
thal_one_hot

IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversible'), dtype=tf.string, default_value=-1, num_oov_buckets=0))

In [17]:
# One row per example, one slot per vocabulary entry.
demo(thal_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
tf.Tensor(
[[0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]], shape=(5, 3), dtype=float32)


In [18]:
# Embedding column: represents each 'thal' category as an 8-dimensional dense vector.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embedding)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

tf.Tensor(
[[ 0.43215394 -0.6873984   0.33058783 -0.12892193  0.21784589  0.24694319
   0.07820354 -0.24338765]
 [ 0.43215394 -0.6873984   0.33058783 -0.12892193  0.21784589  0.24694319
   0.07820354 -0.24338765]
 [-0.3239316  -0.6430655   0.40910184 -0.5459068   0.38673645 -0.4113456
   0.00210527 -0.65231144]
 [-0.10424187 -0.36746833  0.58041143 -0.17895205 -0.18225203 -0.47601387
   0.14664781  0.52510333]
 [ 0.43215394 -0.6873984   0.33058783 -0.12892193  0.21784589  0.24694319
   0.07820354 -0.24338765]], shape=(5, 8), dtype=float32)


In [19]:
# Hash-bucket alternative to a vocabulary list: 'thal' values are hashed into
# 1000 buckets, then wrapped in an indicator column for display.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    'thal', hash_bucket_size=1000)
demo(feature_column.indicator_column(thal_hashed))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(5, 1000), dtype=float32)


In [20]:
# Feature cross: combine the age buckets with 'thal', hash the combinations
# into 1000 buckets, then wrap in an indicator column for display.
crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                hash_bucket_size=1000)
demo(feature_column.indicator_column(crossed_feature))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(5, 1000), dtype=float32)


In [21]:
# numeric cols: start the model's column list with one numeric column per header
feature_columns = [
    feature_column.numeric_column(header)
    for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']
]

In [22]:
# bucketized cols
# Same age bucketization as in the demo cell above, rebuilt here and added
# to the model's column list.
age_buckets = feature_column.bucketized_column(age,
                                               boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

In [23]:
# indicator cols
# Rebuild the 'thal' vocabulary column and add its one-hot (indicator) form
# to the model's column list.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

In [24]:
# embedding cols
# Also add an 8-dimensional embedding of 'thal'; it sits alongside the
# one-hot form of the same feature in the column list.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

In [25]:
# crossed cols
# Cross the age buckets with 'thal', hashing the combinations into 1000 buckets.
crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                               hash_bucket_size=1000)
# The name is rebound from the crossed column to its indicator wrapper,
# which is what gets appended to the column list.
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

In [26]:
# Review the full list of columns collected for the model.
feature_columns

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='trestbps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='chol', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='thalach', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='oldpeak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='slope', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='ca', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversi

In [27]:
# A single DenseFeatures layer built from all collected columns; it is used
# as the first layer of the model.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [29]:
# Rebuild the three datasets with a larger batch size than the demo cells used.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation and test data are batched without shuffling.
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (valid, test)
)

In [30]:
# Binary classifier: feature layer -> two 128-unit ReLU layers -> one sigmoid unit.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Sigmoid output pairs with binary cross-entropy; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 5 epochs, validating on val_ds after each one.
model.fit(train_ds, epochs=5, validation_data=val_ds)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f03b43e28d0>

In [31]:
# Score the trained model on the test split; evaluate() yields the loss
# followed by the 'accuracy' metric requested in compile().
loss, accuracy = model.evaluate(test_ds)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7868852615356445
