# It's time to test ourselves on a more realistic example with a larger dataset

In [None]:
# Import required libraries
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the insurance dataset from its hosted CSV file
INSURANCE_CSV_URL = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
insurance = pd.read_csv(INSURANCE_CSV_URL)
insurance

In [None]:
# One-hot encode the non-numeric columns of the dataframe.
# dtype=int keeps the dummy columns numeric (0/1): pandas >= 2.0 emits bool
# dummies by default, and a frame that mixes bool with int/float columns
# converts to an object array that Keras cannot turn into a tensor.
insurance_one_hot = pd.get_dummies(insurance, dtype=int)
insurance_one_hot.head()

In [None]:
# Separate the label column ("charges") from the remaining feature columns
y = insurance_one_hot["charges"]
X = insurance_one_hot.drop(columns=["charges"])

In [None]:
# Peek at the first five rows of the features
X.head(n=5)

In [None]:
# Peek at the first five labels
y.head(n=5)

In [None]:
# Hold out part of the data for testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set, the remaining 80% to training
    random_state=42,  # fixed seed, so the same split is produced on every run
)

# Compare the sizes of the three feature sets:
#   len(X)       -> all the data we have
#   len(X_train) -> the data used for training
#   len(X_test)  -> the data used for testing (20% in this case)
tuple(len(part) for part in (X, X_train, X_test))

In [None]:
# Build a Neural Network

# Set random seed
tf.random.set_seed(42)

# 1. Create the model: one hidden Dense layer (10 units) feeding a single output unit
inscurance_model = tf.keras.Sequential([
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
])

# 2. Compile the model
# The MAE loss function lives in tf.keras.losses; tf.keras.layers has no `mae` attribute
inscurance_model.compile(loss = tf.keras.losses.mae,
                optimizer = tf.keras.optimizers.SGD(),
                metrics = ["mae"])

# 3. Fit the model
inscurance_model.fit(X_train, y_train, epochs = 100)

In [None]:
# Score the first insurance model on the held-out test data
inscurance_model.evaluate(x=X_test, y=y_test)

Right now it looks like our model isn't performing too well, so let's start to improve it.

To (try to) improve the model, we could:
1. Add an extra layer with more hidden units and use the Adam optimizer
2. Train for longer (200 epochs)

In [None]:
# More Hidden units Experiment

# Set random seed
tf.random.set_seed(42)

# 1. Create the model: an extra, wider hidden layer (100 units) in front of the 10-unit layer
inscurance_model_2 = tf.keras.Sequential([
        tf.keras.layers.Dense(100),
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
])

# 2. Compile the model
# The MAE loss function lives in tf.keras.losses; tf.keras.layers has no `mae` attribute
inscurance_model_2.compile(loss = tf.keras.losses.mae,
                optimizer = tf.keras.optimizers.Adam(), # switched to Adam
                metrics = ["mae"])

# 3. Fit the model (verbose = 0 suppresses the per-epoch log output)
inscurance_model_2.fit(X_train, y_train, epochs = 100, verbose = 0)

In [None]:
# Score insurance model 2 on the held-out test data
inscurance_model_2.evaluate(x=X_test, y=y_test)

Experiment with more epochs

In [None]:
# More Epochs 

# Set random seed
tf.random.set_seed(42)

# 1. Create the model
inscurance_model_3 = tf.keras.Sequential([
        tf.keras.layers.Dense(100),
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
])

# 2. Compile the model
inscurance_model_3.compile(loss = tf.keras.layers.mae,
                optimizer = tf.keras.optimizers.Adam(), # switched to Adam
                metrics = ["mae"])

# 3. Fit the model
inscurance_model_3.fit(X_train, y_train, epochs = 200)

In [None]:
# Check the results of the insurance model 3 on the test data
history = inscurance_model_3.evaluate(X_test, y_test)

### Visualize the Loss of the training model

#### Plot History
Also known as a loss curve or training curve

In [None]:
# Plot the values stored in `history.history`, one line per recorded series
ax = pd.DataFrame(history.history).plot()
ax.set_ylabel("loss")
ax.set_xlabel("epochs")

### Preprocessing data (normalization and standardization)

In terms of scaling values, neural networks tend to prefer normalization.

If you are not sure which one to use, you can try both and compare the results.

In [None]:
# Display the feature DataFrame X
X

In [None]:
# Histogram of the "age" feature
X["age"].plot.hist()

In [None]:
# Histogram of the "bmi" feature
X["bmi"].plot.hist()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Reload the raw insurance dataset from its hosted CSV file
INSURANCE_CSV_URL = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
insurance = pd.read_csv(INSURANCE_CSV_URL)
insurance

To prepare the data, we can borrow a few classes from the Scikit-Learn library.

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Create a column transformer
ct = make_column_transformer(
    (MinMaxScaler(), ["age", "bmi", "children"]), # turn all values in these columns between 0 and 1
    # keyword is `handle_unknown` (the misspelled `handle_uknown` raises a TypeError)
    (OneHotEncoder(handle_unknown="ignore"), ["sex", "smoker", "region"]),
)

# Create the X and y 
X = insurance.drop("charges", axis=1)
y = insurance["charges"]

# Build our train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the column transformer to our training data only, so the scaling ranges
# and categories are learned without looking at the test set
ct.fit(X_train)

# Transform training and test data with normalization (MinMaxScaler) and OneHotEncoder
X_train_normal = ct.transform(X_train)
X_test_normal = ct.transform(X_test)

In [None]:
# What does our data look like
# (shows the first row of the transformed training features)
X_train_normal[0]

Our data has been normalized and one-hot encoded. Now let's build our neural network model on it and see how it goes!

In [None]:
# Set random seed
tf.random.set_seed(42)

# 1. Create the model (same layer stack as models 2 and 3)
inscurance_model_4 = tf.keras.Sequential([
        tf.keras.layers.Dense(100),
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
])

# 2. Compile the model
# The MAE loss function lives in tf.keras.losses; tf.keras.layers has no `mae` attribute
inscurance_model_4.compile(loss = tf.keras.losses.mae,
                optimizer = tf.keras.optimizers.Adam(), # switched to Adam
                metrics = ["mae"])

# 3. Fit the model on the normalized / one-hot encoded training features.
#    The raw X_train still holds the string columns ("sex", "smoker", "region"),
#    and the model is evaluated on X_test_normal, so training must use X_train_normal.
inscurance_model_4.fit(X_train_normal, y_train, epochs = 100)

In [None]:
# Score insurance model 4 on the normalized test data
# NOTE(review): despite its name, `history` here holds evaluate()'s return value
# (loss/metric values per the Keras docs), not a training History - confirm intent
history = inscurance_model_4.evaluate(x=X_test_normal, y=y_test)