In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow log messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)
# Keep rendered DataFrames compact: truncate long frames to 10 rows and
# format floats with a single decimal place.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

california_housing_dataframe = pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv", sep=",")

# Shuffle the row order before the positional head/tail split performed in a
# later cell. A dedicated, seeded RandomState makes the shuffle (and therefore
# the train/validation split and the displayed summaries) reproducible under
# Restart & Run All, without touching numpy's global random state.
SHUFFLE_SEED = 42
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.RandomState(SHUFFLE_SEED).permutation(
        california_housing_dataframe.index))

  from ._conv import register_converters as _register_converters


In [2]:
def preprocess_features(california_housing_dataframe):
  """Builds the model input features from the raw housing data.

  Args:
    california_housing_dataframe: A pandas `DataFrame` containing at least the
      eight raw feature columns listed below.

  Returns:
    A new `DataFrame` holding a copy of the eight raw feature columns plus the
    synthetic `rooms_per_person` column. The input frame is not modified.
  """
  feature_columns = [
      "latitude",
      "longitude",
      "housing_median_age",
      "total_rooms",
      "total_bedrooms",
      "population",
      "households",
      "median_income",
  ]
  # Copy so that adding the synthetic column never writes into the caller's
  # frame.
  processed_features = california_housing_dataframe[feature_columns].copy()

  # Synthetic feature: rooms available per person in each row.
  rooms = california_housing_dataframe["total_rooms"]
  people = california_housing_dataframe["population"]
  processed_features["rooms_per_person"] = rooms / people
  return processed_features

def preprocess_targets(california_housing_dataframe):
  """Builds the label frame from the raw housing data.

  Args:
    california_housing_dataframe: A pandas `DataFrame` containing a
      `median_house_value` column.

  Returns:
    A new single-column `DataFrame` named `median_house_value`, holding the
    input values divided by 1000.0 and sharing the input's index.
  """
  # Dividing by 1000 expresses the target in thousands of dollars.
  value_in_thousands = california_housing_dataframe["median_house_value"] / 1000.0
  return pd.DataFrame({"median_house_value": value_in_thousands})

In [3]:
# Training split: the first 12000 of the 17000 examples.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_targets = preprocess_targets(california_housing_dataframe.head(12000))

# Validation split: the last 5000 of the 17000 examples.
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))

# Sanity check: print a heading followed by summary statistics for each of
# the four frames, examples first and then targets.
for heading, frame in (
    ("Training examples summary:", training_examples),
    ("Validation examples summary:", validation_examples),
    ("Training targets summary:", training_targets),
    ("Validation targets summary:", validation_targets)):
  print(heading)
  display.display(frame.describe())

Training examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.6,-119.6,28.5,2651.7,541.4,1432.9,502.8,3.9,2.0
std,2.1,2.0,12.6,2204.6,425.8,1168.6,388.2,1.9,1.3
min,32.5,-124.3,1.0,11.0,3.0,3.0,2.0,0.5,0.1
25%,33.9,-121.8,18.0,1462.0,297.0,786.8,281.0,2.6,1.5
50%,34.2,-118.5,28.0,2131.5,437.0,1171.0,410.0,3.5,1.9
75%,37.7,-118.0,37.0,3167.0,652.0,1737.0,607.0,4.8,2.3
max,42.0,-114.3,52.0,37937.0,6445.0,35682.0,6082.0,15.0,55.2


Validation examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.6,-119.6,28.9,2624.3,534.6,1421.6,497.5,3.9,2.0
std,2.1,2.0,12.5,2119.6,410.9,1096.5,375.4,1.9,0.9
min,32.5,-124.2,1.0,2.0,1.0,6.0,1.0,0.5,0.0
25%,33.9,-121.8,19.0,1462.0,296.0,796.0,282.0,2.6,1.5
50%,34.2,-118.5,29.0,2121.0,429.0,1159.0,406.0,3.6,1.9
75%,37.7,-118.0,37.0,3107.0,641.0,1685.2,598.2,4.8,2.3
max,41.9,-114.6,52.0,32054.0,5290.0,15507.0,5050.0,15.0,26.5


Training targets summary:


Unnamed: 0,median_house_value
count,12000.0
mean,207.1
std,116.1
min,15.0
25%,118.8
50%,180.0
75%,264.5
max,500.0


Validation targets summary:


Unnamed: 0,median_house_value
count,5000.0
mean,207.9
std,115.6
min,15.0
25%,121.2
50%,180.9
75%,266.2
max,500.0


In [4]:
# Show the summary statistics of the training features again; this repeats
# the first describe() call of the previous cell so the table can be inspected
# on its own.
display.display(training_examples.describe())


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.6,-119.6,28.5,2651.7,541.4,1432.9,502.8,3.9,2.0
std,2.1,2.0,12.6,2204.6,425.8,1168.6,388.2,1.9,1.3
min,32.5,-124.3,1.0,11.0,3.0,3.0,2.0,0.5,0.1
25%,33.9,-121.8,18.0,1462.0,297.0,786.8,281.0,2.6,1.5
50%,34.2,-118.5,28.0,2131.5,437.0,1171.0,410.0,3.5,1.9
75%,37.7,-118.0,37.0,3167.0,652.0,1737.0,607.0,4.8,2.3
max,42.0,-114.3,52.0,37937.0,6445.0,35682.0,6082.0,15.0,55.2
