In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset, then split twice: full-train/test, then train/validation.
housing = fetch_california_housing()
x_train_full, x_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),  # targets as a single-column 2-D array
    random_state=42,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_full,
    y_train_full,
    random_state=42,
)

# Fit the scaler on the training split only and keep its statistics;
# x_train itself is not transformed here.
scaler = StandardScaler().fit(x_train)
x_mean, x_std = scaler.mean_, scaler.scale_

In [3]:
import os
import pandas as pd

def load_housing_data(housing_path):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [4]:
# NOTE: this rebinds `housing` (previously the scikit-learn dataset object)
# to the DataFrame read from the CSV file.
housing = load_housing_data(housing_path='../datasets/housing/')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Plain numeric feature column keyed on the 'housing_median_age' input.
housing_median_age = tf.feature_column.numeric_column(
    key='housing_median_age',
)

In [6]:
# Entry 1 of the fitted scaler statistics supplies the mean/std for this feature.
age_mean = x_mean[1]
age_std = x_std[1]

# Same column as before, but standardized on the fly by the normalizer.
housing_median_age = tf.feature_column.numeric_column(
    key='housing_median_age',
    normalizer_fn=lambda raw_age: (raw_age - age_mean) / age_std,
)

In [7]:
# Numeric income column, then a bucketized view of it split at the listed boundaries.
median_income = tf.feature_column.numeric_column(key='median_income')
bucketized_income = tf.feature_column.bucketized_column(
    source_column=median_income,
    boundaries=[1.5, 3, 4.5, 6],
)
bucketized_income

BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3, 4.5, 6))

In [8]:
# Fixed vocabulary for the categorical 'ocean_proximity' feature.
ocean_prox_vocab = [
    '<1H OCEAN',
    'INLAND',
    'ISLAND',
    'NEAR BAY',
    'NEAR OCEAN',
]
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    key='ocean_proximity',
    vocabulary_list=ocean_prox_vocab,
)
ocean_proximity

VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [9]:
# Categorical column for 'city' that hashes values into 1000 buckets
# instead of using an explicit vocabulary.
city_hash = tf.feature_column.categorical_column_with_hash_bucket(
    key='city',
    hash_bucket_size=1000,
)
city_hash

HashedCategoricalColumn(key='city', hash_bucket_size=1000, dtype=tf.string)

In [10]:
# Bucketize the housing_median_age column, then cross the buckets with
# ocean proximity.
bucketized_age = tf.feature_column.bucketized_column(
    source_column=housing_median_age,
    boundaries=[-1, -0.5, 0., 0.5, 1.],
)
age_and_ocean_proximity = tf.feature_column.crossed_column(
    keys=[bucketized_age, ocean_proximity],
    hash_bucket_size=100,
)

In [11]:
# Bucketize latitude and longitude separately, then cross the two bucketized
# columns into a single hashed 'location' feature.
latitude = tf.feature_column.numeric_column(key='latitude')
longitude = tf.feature_column.numeric_column(key='longitude')

bucketized_latitude = tf.feature_column.bucketized_column(
    source_column=latitude,
    boundaries=list(np.linspace(32., 42., 20 - 1)),
)
bucketized_longitude = tf.feature_column.bucketized_column(
    source_column=longitude,
    boundaries=list(np.linspace(-125., -114., 20 - 1)),
)

location = tf.feature_column.crossed_column(
    keys=[bucketized_latitude, bucketized_longitude],
    hash_bucket_size=1000,
)

In [12]:
# Wrap the categorical column in an indicator column.
ocean_proximity_one_hot = tf.feature_column.indicator_column(
    categorical_column=ocean_proximity,
)

In [13]:
# Alternative encoding of the same categorical column: a 2-dimensional embedding.
ocean_proximity_embed = tf.feature_column.embedding_column(
    categorical_column=ocean_proximity,
    dimension=2,
)

In [14]:
# Numeric column for the 'median_house_value' input.
median_house_value = tf.feature_column.numeric_column(
    key='median_house_value',
)

In [15]:
# Derive the Example parsing spec directly from the feature columns.
columns = [
    housing_median_age,
    median_house_value,
]
feature_descriptions = tf.feature_column.make_parse_example_spec(
    feature_columns=columns,
)
feature_descriptions

{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),
 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}

In [16]:
from tensorflow.train import FloatList, Features, Feature, Example

# Write one Example per training row, holding column 1 of x_train as
# 'housing_median_age' and the matching target as 'median_house_value'.
#
# Iterate over 1-D data so every item is a scalar: FloatList takes a sequence
# of plain floats, and wrapping one-element arrays in a list relies on NumPy's
# implicit array-to-scalar conversion, which is deprecated since NumPy 1.25.
with tf.io.TFRecordWriter('my_data_with_features.tfrecord') as f:
    for age, value in zip(x_train[:, 1], y_train.ravel()):
        example = Example(features=Features(feature={
            'housing_median_age': Feature(
                float_list=FloatList(value=[float(age)])),
            'median_house_value': Feature(
                float_list=FloatList(value=[float(value)])),
        }))
        f.write(example.SerializeToString())

In [17]:
def parse_examples(serialized_examples):
    """Parse serialized examples and separate the target from the inputs.

    Parsing uses the module-level ``feature_descriptions`` spec. Returns a
    ``(features, labels)`` pair where ``labels`` is the
    ``'median_house_value'`` entry removed from the parsed features.
    """
    features = tf.io.parse_example(serialized_examples, feature_descriptions)
    labels = features.pop('median_house_value')
    return features, labels

In [18]:
batch_size = 32

# Pipeline: read records -> repeat indefinitely -> shuffle (buffer of 10000)
# -> batch -> parse each batch of serialized examples.
dataset = (
    tf.data.TFRecordDataset(['my_data_with_features.tfrecord'])
    .repeat()
    .shuffle(10000)
    .batch(batch_size)
    .map(parse_examples)
)

In [19]:
# The target column is the last entry of `columns`; drop it so that only the
# input feature reaches the model.
columns_without_target = columns[:-1]

model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns=columns_without_target),
    keras.layers.Dense(1)
])

# This is a regression model (MSE loss on a continuous target), so report
# RMSE; classification accuracy is not a meaningful metric here.
model.compile(loss='mse',
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=[keras.metrics.RootMeanSquaredError()])

# The dataset repeats indefinitely, so steps_per_epoch defines the epoch length.
model.fit(dataset, steps_per_epoch=len(x_train) // batch_size, epochs=5)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x242f558f748>

In [20]:
# Run a DenseFeatures layer on a tiny hand-written batch to inspect how the
# embedding and bucketized columns are turned into a dense tensor.
some_columns = [
    ocean_proximity_embed,
    bucketized_income,
]
dense_features = keras.layers.DenseFeatures(feature_columns=some_columns)
dense_features({
    'ocean_proximity': [
        ['NEAR OCEAN'],
        ['INLAND'],
        ['INLAND'],
    ],
    'median_income': [
        [3.],
        [7.2],
        [1.],
    ],
})

<tf.Tensor: shape=(3, 7), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.99385285,  1.2581351 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        -0.11480857,  0.4313302 ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.11480857,  0.4313302 ]], dtype=float32)>