# Classify structured data

* [source site](https://www.tensorflow.org/beta/tutorials/keras/feature_columns)

In [1]:
# %pip install -q scikit-learn  # uncomment only if scikit-learn is not already installed

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [3]:
# Download the heart-disease CSV into a DataFrame and preview its first rows.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(filepath_or_buffer=URL)
dataframe.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0



|Column | Description |Feature Type | Data Type
|-|-|-|-|
|Age|Age in years|Numerical|integer
|Sex|(1 = male; 0 = female)|Categorical|integer
|CP|Chest pain type (0, 1, 2, 3, 4)|Categorical|integer
|Trestbps|Resting blood pressure (in mm Hg on admission to the hospital)|Numerical|integer
|Chol|Serum cholesterol in mg/dl|Numerical|integer
|FBS|(fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)|Categorical|integer
|RestECG|Resting electrocardiographic results (0, 1, 2)|Categorical|integer
|Thalach|Maximum heart rate achieved|Numerical|integer
|Exang|Exercise induced angina (1 = yes; 0 = no)|Categorical|integer
|Oldpeak|ST depression induced by exercise relative to rest|Numerical|float
|Slope|The slope of the peak exercise ST segment|Numerical|integer
|CA|Number of major vessels (0-3) colored by fluoroscopy|Numerical|integer
|Thal|3 = normal; 6 = fixed defect; 7 = reversible defect|Categorical|string
|Target|Diagnosis of heart disease (1 = true; 0 = false)|Classification|integer


In [4]:
# Hold out 20% of the rows for testing, then 20% of the remaining rows for
# validation. A fixed random_state keeps the split identical across kernel
# restarts, so the numbers reported further down can be reproduced.
train, test = train_test_split(dataframe, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


In [41]:
# Share of positive labels in each split. The mean of the 0/1 `target` column
# is sum / row count, so the denominator always matches the actual split size
# instead of relying on hard-coded row counts.
print('Prevalence of heart disease of train set:{}'.format(train['target'].mean()))
print('Prevalence of heart disease of validation set:{}'.format(val['target'].mean()))
print('Prevalence of heart disease of test set:{}'.format(test['target'].mean()))


Prevalence of heart disease of train set:0.27979274611398963
Prevalence of heart disease of validation set:0.2653061224489796
Prevalence of heart disease of test set:0.26229508196721313


In [43]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: DataFrame holding the feature columns plus the label column.
        shuffle: If True, shuffle the rows using a buffer as large as the frame.
        batch_size: Number of rows per emitted batch.
        label_column: Name of the column used as the label. Defaults to
            'target', which preserves the previous behaviour.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    # Work on a copy so popping the label does not alter the caller's frame.
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [50]:
# A small batch size is used here purely for demonstration, so the tensors
# printed while exploring the dataset stay short.
batch_size = 5
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

In [51]:
# Pull a single batch from the training set and show what it contains.
for feature_batch, label_batch in train_ds.take(1):
    # Iterating the feature dict yields its keys, i.e. the column names.
    print('Every feature:', list(feature_batch))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch)

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([64 66 67 64 48], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 0 1 1], shape=(5,), dtype=int32)


In [55]:
# Keep only the feature dict of the first training batch; the demo cells
# below transform this batch with each feature column.
first_batch = next(iter(train_ds))
example_batch = first_batch[0]

In [72]:
# Helper that applies a feature column to the example batch and prints the result.
def demo(feature_column):
    """Transform ``example_batch`` with the given feature column and print it.

    Note: inside this function the parameter name hides the imported
    ``feature_column`` module; only the column object passed in is used here.

    Args:
        feature_column: Feature column (or columns) handed to ``DenseFeatures``.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [73]:
# Numeric column: built with tf.feature_column, represents the raw "age" values.
age = feature_column.numeric_column(key="age")
demo(age)

[[64.]
 [66.]
 [67.]
 [64.]
 [48.]]


In [85]:
# Bucketizing the numeric column: each age is one-hot encoded by the range it falls in.
# Boundaries are 18, then every 5 years from 25 through 65.
age_buckets = feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, *range(25, 66, 5)],
)
demo(age_buckets)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]


In [84]:
# One-hot encoding of a categorical column: map each 'thal' string to a
# vocabulary index, then expand that index into an indicator vector.
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)
thal_one_hot = feature_column.indicator_column(categorical_column=thal)
demo(thal_one_hot)

[[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [86]:
# When a feature has too many categories, one-hot encoding becomes impractical;
# an embedding column maps each category to a dense vector instead.
# Note that the input to the embedding column is the categorical column
# created previously.
thal_embedding = feature_column.embedding_column(categorical_column=thal, dimension=8)
demo(thal_embedding)

[[ 0.27156672 -0.15858811 -0.19449891  0.34011993 -0.4182869   0.28448778
  -0.19300485 -0.25333697]
 [-0.05981684 -0.6688228  -0.5995852   0.5653792   0.26861045 -0.10921428
   0.06330353 -0.36407447]
 [ 0.27156672 -0.15858811 -0.19449891  0.34011993 -0.4182869   0.28448778
  -0.19300485 -0.25333697]
 [-0.05981684 -0.6688228  -0.5995852   0.5653792   0.26861045 -0.10921428
   0.06330353 -0.36407447]
 [ 0.27156672 -0.15858811 -0.19449891  0.34011993 -0.4182869   0.28448778
  -0.19300485 -0.25333697]]


In [87]:
# Hashed feature column: 'thal' values are hashed into one of 1000 buckets,
# then shown as an indicator (one-hot) vector.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(categorical_column=thal_hashed))

W0625 10:48:48.766055 140617342199616 deprecation.py:323] From /home/hyunsu/anaconda3/envs/tf20_py36/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [88]:
# Crossed feature column: combines several features into a single feature.
# Here the bucketized numeric column (age) is crossed with the categorical
# column (thal), and the pair is hashed into 1000 buckets.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, thal],
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(categorical_column=crossed_feature))

W0625 12:47:25.043232 140617342199616 deprecation.py:323] From /home/hyunsu/anaconda3/envs/tf20_py36/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [89]:
# Select the features to train on and declare how each one is preprocessed.

# numeric cols
feature_columns = []
for header in ('age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'):
    feature_columns.append(feature_column.numeric_column(key=header))

# bucketized cols (reuses the `age` numeric column defined in an earlier cell)
age_buckets = feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)

# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)
thal_one_hot = feature_column.indicator_column(categorical_column=thal)

# embedding cols
thal_embedding = feature_column.embedding_column(categorical_column=thal, dimension=8)

# crossed cols: the age-bucket x thal cross, wrapped in an indicator column.
# `crossed_feature` ends up bound to the indicator wrapper, not the raw cross.
crossed_feature = feature_column.indicator_column(
    categorical_column=feature_column.crossed_column(
        keys=[age_buckets, thal],
        hash_bucket_size=1000,
    )
)

# Append the derived columns after the numeric ones, in the same order as before.
feature_columns.extend([age_buckets, thal_one_hot, thal_embedding, crossed_feature])


In [90]:
# Display the assembled list of feature columns to check what was registered.
feature_columns

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='trestbps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='chol', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='thalach', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='oldpeak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='slope', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='ca', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversi

In [91]:
# Define the input layer: DenseFeatures applies the selected feature columns
# to each incoming batch of raw features.
feature_layer = layers.DenseFeatures(feature_columns)

In [94]:
# Rebuild the three datasets with a larger batch size for training.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

In [95]:
# Stack the network: the feature layer, two hidden ReLU layers, and a
# single sigmoid output unit for the binary target.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
    run_eagerly=True,
)

# Train for 5 epochs, evaluating on the validation set after each one.
model.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe380153eb8>

In [96]:
# Score the trained model on the held-out test set and report its accuracy.
loss, accuracy = model.evaluate(x=test_ds)
print("Accuracy", accuracy)

Accuracy 0.7704918
