In this challenge, you will predict healthcare costs using a regression algorithm.

You are given a dataset that contains information about different people including their healthcare costs. Use the data to predict healthcare costs based on new data.

The first two cells of this notebook import libraries and the data.

Make sure to convert categorical data to numbers. Use 80% of the data as the `train_dataset` and 20% of the data as the `test_dataset`.

`pop` off the "expenses" column from these datasets to create new datasets called `train_labels` and `test_labels`. Use these labels when training your model.

Create a model and train it with the `train_dataset`. Run the final cell in this notebook to check your model. The final cell will use the unseen `test_dataset` to check how well the model generalizes.

To pass the challenge, `model.evaluate` must return a Mean Absolute Error of under 3500. This means that, on average, the model's predictions are within $3500 of the actual healthcare costs.

The final cell will also predict expenses using the `test_dataset` and graph the results.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

from sklearn.model_selection import train_test_split

2.6.0


# Get the data | Preprocess | Train-test-split

In [2]:
# Parse the insurance records from a relative path and leave the parsed frame
# untouched; the following cells operate on an independent deep copy.
raw_dataset = pd.read_csv("insurance.csv")
dataset = raw_dataset.copy(deep=True)
dataset.tail()


Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95
1337,61,female,29.1,0,yes,northwest,29141.36


In [3]:
# Count the missing entries in each column.
dataset.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [4]:
# One-hot encode the categorical data.
# Columns that hold string categories and are replaced by indicator columns.
categorical_features = [
    "sex",
    "smoker",
    "region",
]

def encode_and_bind(original_dataframe, feature_to_encode):
    """Swap one column of a frame for its one-hot indicator columns.

    Args:
        original_dataframe: Frame containing ``feature_to_encode``; it is not
            modified.
        feature_to_encode: Name of the column to encode.

    Returns:
        A new frame with the indicator columns (named ``<feature>_<value>``)
        appended on the right and the source column removed.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]])
    return pd.concat(
        [original_dataframe, indicator_columns], axis=1
    ).drop(columns=[feature_to_encode])

# Rebuild `dataset` from the untouched raw frame so this cell can be re-run
# safely: after one pass the categorical columns no longer exist in `dataset`,
# and encoding that already-encoded frame again would raise a KeyError.
dataset = raw_dataset.copy()
for feature in categorical_features:
    dataset = encode_and_bind(dataset, feature)

dataset.head()


Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,18,33.8,1,1725.55,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,33,22.7,0,21984.47,0,1,1,0,0,1,0,0
4,32,28.9,0,3866.86,0,1,1,0,0,1,0,0


In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
dftrain, dftest = train_test_split(dataset, test_size=0.2, random_state=42)

# Detach the regression target from each feature frame (pop removes the column).
y_train, y_test = dftrain.pop('expenses'), dftest.pop('expenses')

In [6]:
# Summary statistics of the training features (the target was popped off above).
dftrain.describe()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,39.357009,30.56215,1.107477,0.48785,0.51215,0.794393,0.205607,0.249533,0.239252,0.264486,0.246729
std,14.07396,6.043266,1.215983,0.500086,0.500086,0.404334,0.404334,0.432945,0.426827,0.441265,0.431309
min,18.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,39.5,30.2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,34.5,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,64.0,53.1,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Build and train the linear model

In [7]:
# Single input feature, materialised as a NumPy array.
# NOTE(review): `age` is not referenced again in the cells visible here
# (fit() is given dftrain['age'] directly) -- confirm before removing it.
age = np.array(dftrain['age'])

# Linear regression written as a network: one Dense unit fed one scalar input.
model = tf.keras.Sequential()
model.add(layers.Dense(units=1, input_shape=(1,)))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 2         
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Configure training: Adam with a 0.1 step size, optimising mean absolute error.
linear_optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=linear_optimizer, loss='mean_absolute_error')

In [9]:
%%time
history = model.fit(
    x=dftrain['age'],
    y=y_train,
    epochs=100,
    verbose=0,             # keep the per-epoch log out of the notebook
    validation_split=0.2,  # validate on 20% of the training data
)

Wall time: 3.98 s


In [10]:
# Tabulate the per-epoch metrics recorded by fit() and tag each row with its epoch.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

Unnamed: 0,loss,val_loss,epoch
95,7115.461426,6572.131836,95
96,7114.039551,6572.140625,96
97,7112.62793,6572.338867,97
98,7111.560059,6572.831543,98
99,7110.640137,6573.212402,99


In [11]:
def plot_loss(history):
    """Plot the per-epoch training loss and, when recorded, validation loss.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` dict must
            contain ``'loss'`` and may contain ``'val_loss'``.
    """
    plt.plot(history.history['loss'], label='loss')
    # 'val_loss' is only present when fit() was run with validation data.
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='val_loss')
    # Anchor the axis at zero but let the top autoscale: the errors here are in
    # the thousands, so a fixed [0, 10] window would hide both curves.
    plt.ylim(bottom=0)
    plt.xlabel('Epoch')
    # The loss is measured in the units of the target column (expenses).
    plt.ylabel('Error [expenses]')
    plt.legend()
    plt.grid(True)

In [None]:
# Draw the loss curves recorded while fitting the single-feature linear model.
plot_loss(history)