## Importing libraries

In [48]:
import tensorflow as tf
import tarfile
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Loading dataset

In [3]:
# Open the compressed housing archive. The returned TarFile handle stays open
# until it is closed explicitly.
housing_tgz = tarfile.open('./datasets/unprocessed_housing/housing.tgz')

In [4]:
# Unpack the archive contents (housing.csv) next to the archive, then release
# the file handle.
# NOTE(review): extractall() trusts the member paths stored inside the archive,
# so only run it on archives from a trusted source. Recent Python versions also
# accept an extraction `filter` argument for this purpose - confirm the kernel's
# Python version before relying on it.
try:
    housing_tgz.extractall('./datasets/unprocessed_housing')
finally:
    # Close the archive even when extraction fails so the handle is not leaked.
    housing_tgz.close()

In [7]:
# Sanity check: list the dataset directory to confirm the CSV was extracted.
! ls ./datasets/unprocessed_housing/

housing.csv  housing.tgz


In [12]:
# Load the extracted CSV into a DataFrame.
df_housing = pd.read_csv(
    filepath_or_buffer='./datasets/unprocessed_housing/housing.csv',
)

In [13]:
# Preview the first rows to check the column names and value types.
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [15]:
# Plain (un-normalized) numeric feature column for the 'housing_median_age' key.
# This name is redefined further down with a standardizing normalizer_fn.
housing_median_age = tf.feature_column.numeric_column('housing_median_age')

In [19]:
# Fetch scikit-learn's California housing dataset; it is used below to compute
# the scaling statistics for the feature columns.
housing = fetch_california_housing()

In [20]:
# Feature matrix of the scikit-learn dataset (columns follow housing.feature_names).
x = housing.data

In [21]:
# Target values reshaped into a single-column 2-D array (one row per sample).
y = housing.target.reshape(-1, 1)

In [22]:
# Hold out a test set; the fixed seed keeps the split reproducible.
(x_train_full, x_test,
 y_train_full, y_test) = train_test_split(
    x,
    y,
    random_state=42,
)

In [23]:
# Split the remaining data again into training and validation sets, using the
# same fixed seed for reproducibility.
(x_train, x_valid,
 y_train, y_valid) = train_test_split(
    x_train_full,
    y_train_full,
    random_state=42,
)

In [24]:
# Scaler used only to obtain per-feature mean and scale statistics.
scaler = StandardScaler()

In [25]:
# Fit on the training split only, so the statistics do not include validation
# or test samples.
scaler.fit(x_train)

StandardScaler()

In [26]:
# Per-feature mean and scale learned from the training split.
x_mean, x_std = scaler.mean_, scaler.scale_

In [29]:
# Show the feature order; positions in x_mean / x_std follow this list.
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [30]:
# Look the HouseAge column up by name instead of hard-coding its position, so a
# change in feature order raises ValueError here rather than silently selecting
# the statistics of a different feature.
house_age_idx = housing.feature_names.index('HouseAge')
age_mean, age_std = x_mean[house_age_idx], x_std[house_age_idx]

In [31]:
def _standardize_age(age, mean=age_mean, std=age_std):
    """Standardize a house-age value or tensor: (age - mean) / std.

    The statistics are bound as default arguments when this cell runs, so the
    normalizer does not change if `age_mean` / `age_std` are reassigned later.
    The parameter is named `age` to avoid shadowing the notebook-level feature
    matrix `x`.
    """
    return (age - mean) / std


# Numeric feature column for the median house age, standardized with the
# training-set mean and scale.
housing_median_age = tf.feature_column.numeric_column(
    'housing_median_age',
    normalizer_fn=_standardize_age,
)

In [33]:
# Numeric feature column for the median income.
median_income = tf.feature_column.numeric_column(key='median_income')

In [34]:
# Turn median income into a categorical feature: four boundaries give five
# income buckets.
bucketized_income = tf.feature_column.bucketized_column(
    source_column=median_income,
    boundaries=[1.5, 3., 4.5, 6.],
)

In [35]:
# Display the column definition to verify the source column and boundaries.
bucketized_income

BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3.0, 4.5, 6.0))

In [42]:
# Vocabulary of the classes present in the ocean_proximity feature.
ocean_proximity_cats = [
    '<1H OCEAN',
    'INLAND',
    'ISLAND',
    'NEAR BAY',
    'NEAR OCEAN',
]

In [43]:
# Categorical feature column for ocean_proximity, backed by the explicit
# vocabulary list.
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    key='ocean_proximity',
    vocabulary_list=ocean_proximity_cats,
)

In [44]:
# Display the column definition to verify the key and vocabulary.
ocean_proximity

VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [45]:
# Bucketize the age column into a categorical feature. The boundaries are
# applied to housing_median_age as defined above.
bucketized_age = tf.feature_column.bucketized_column(
    source_column=housing_median_age,
    boundaries=[-1, -0.5, 0.5, 1.],
)

In [46]:
# Cross the bucketized age with ocean proximity; the combinations are hashed
# into 100 buckets.
age_and_ocean_proximity = tf.feature_column.crossed_column(
    keys=[bucketized_age, ocean_proximity],
    hash_bucket_size=100,
)

In [47]:
# Numeric feature columns for the geographic coordinates (latitude and longitude).
latitude = tf.feature_column.numeric_column(key='latitude')
longitude = tf.feature_column.numeric_column(key='longitude')

In [49]:
# Bucketize each coordinate with 19 evenly spaced boundaries (20 buckets)
# spanning the given latitude / longitude range.
bucketized_latitude = tf.feature_column.bucketized_column(
    source_column=latitude,
    boundaries=list(np.linspace(start=32., stop=42., num=20 - 1)),
)
bucketized_longitude = tf.feature_column.bucketized_column(
    source_column=longitude,
    boundaries=list(np.linspace(start=-125., stop=-114., num=20 - 1)),
)

In [53]:
# Cross the bucketized latitude and longitude into a single location feature;
# the grid cells are hashed into 1000 buckets.
location = tf.feature_column.crossed_column(
    keys=[bucketized_latitude, bucketized_longitude],
    hash_bucket_size=1000,
)

In [54]:
# Wrap the categorical column in an indicator column to one-hot encode the
# ocean proximity categories.
ocean_proximity_onehot = tf.feature_column.indicator_column(
    categorical_column=ocean_proximity,
)

In [55]:
# Embedding the ocean_proximity categories
ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity, dimension = 2)