In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [18,10]

### Importing data for use in TensorFlow
- Data can be imported using `tensorflow`
    - Useful for managing complex pipelines
    - Not necessary for this chapter
- Simpler option used in this chapter 
    - Import data using `pandas` 
    - Convert data to `numpy` array
    - Use in `tensorflow` without modification  

In [3]:
# Load data from csv
housing = pd.read_csv('data/kc_house_data.csv')

# Convert to numpy array
housing = np.array(housing)

### Setting the data type 

In [4]:
# Load data from csv
housing = pd.read_csv('data/kc_house_data.csv')

# Convert price to column to float32
price = np.array(housing['price'], np.float32)

# Convert waterfront column to Boolean
waterfront = np.array(housing['waterfront'], np.bool)

In [5]:
import tensorflow as tf

# Load data from csv
housing = pd.read_csv('data/kc_house_data.csv')
 
# Convert price to column to float32
price = tf.cast(housing['price'], tf.float32)
 
# Convert waterfront column to Boolean
waterfront = tf.cast(housing['waterfront'], tf.bool)

In [6]:
# Import pandas under the alias pd
import pandas as pd

# Assign the path to a string variable named data_path
data_path = 'data/kc_house_data.csv'

# Load the dataset as a dataframe named housing
housing = pd.read_csv(data_path)

# Print the price column of housing
print(housing['price'])

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21608    360000.0
21609    400000.0
21610    402101.0
21611    400000.0
21612    325000.0
Name: price, Length: 21613, dtype: float64


In [7]:
# Use a numpy array to define price as a 32-bit float
price = np.array(housing['price'], np.float32)

# Define waterfront as a Boolean using cast
waterfront = tf.cast(housing['waterfront'], tf.bool)

# Print price and waterfront
print(price)
print(waterfront)

[221900. 538000. 180000. ... 402101. 400000. 325000.]
tf.Tensor([False False False ... False False False], shape=(21613,), dtype=bool)


### Loss functions 
- Fundamental `tensorflow` operation
    - Used to train a model
    - Measure of model fit
    
- Higher value -> worse fit
    - Minimize the loss function 
    
- TensorFlow has operations for common loss functions
    - Mean squared error (MSE)
    - Mean absolute error (MAE)
    - Huber loss
    
- Loss functions are available from `tf.keras.losses()`
    - `tf.keras.losses.mse()`
    - `tf.keras.losses.mae()`
    - `tf.keras.losses.Huber()`
    
### Why do we care about loss functions?

- MSE
    - Strongly penalizes outliers
    - High (gradient) sensitivity near minimum
   
- MAE 
    - Scales linearly with size of error
    - Low sensitivity near minimum

- Huber 
    - Similar to MSE near minimum
    - Similar to MAE away from minimum  

In [18]:
targets = np.random.randint(5, size=500)
predictions = np.random.randint(5, size=500)

loss = tf.keras.losses.mse(targets, predictions)
loss.numpy()

3

In [23]:
from sklearn.linear_model import LinearRegression

X = housing.drop(['id', 'price', 'date'], axis=1)
y = housing['price']

X_train, X_test, y_train, y_test = train_test_split(X,y)

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
predictions = lr.predict(X_test)
price = y_test

In [29]:
# Import the keras module from tensorflow
from tensorflow import keras

# Compute the mean squared error (mse)
loss = keras.losses.mse(price, predictions)

# Print the mean squared error (mse)
print(loss.numpy())

46390041478.96243


In [21]:
!../gitbsh > /dev/null 2>&1