In [2]:
# Import the dependencies:
import pandas as pd
from pathlib import Path
import tensorflow.python as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split




In [3]:
# INTRODUCTION TO OPTIMIZING NEURAL NETWORKS
# The technology behind neural networks is relatively new, and techniques for optimizing neural network performance continue to evolve as data scientists conduct further research into this area.
# No infallible approach to designing an optimal neural network exists.
# But, we can apply some guidelines and best practices to find the most suitable model for a particular problem.
# In this lesson, we'll explore some common techniques for optimizing neural network models.
# These techniques span all elements of building a neural network, from data preprocessing to model design.

In [4]:
# OPTIMIZING TECHNIQUES FOR DATA PREPROCESSING
# In this module, we have learned that - as for all machine learning algorithms - in order to use a neural network effectively, we must preprocess our data.
# Neural networks cannot process categorical variables in their raw forms.
# And it is often challenging to train neural networks on numerical variables that are represented in different units or that have different scales of magnitude.
# We have previously used the `StandardScaler` module from scikit-learn to mitigate this problem for numerical variables.
# This module scales the values of different variables by standardizing each one to have a mean of 0 and a standard deviation of 1.
# We have also previously used the `get_dummies` function to solve this problem for categorical variables.
# However, `get_dummies` can run into trouble if the testing dataset contains categories that are not present in the training dataset.
# For example, consider the following scenario:
    # 1. Our X_train dataset has a column named 'country' that contains the following categorical levels: 'canada', 'usa', 'mexico', 'colombia'.
    # 2. Our X_test dataset has the following levels: 'canada', 'usa', 'mexico', 'colombia', 'spain'.
    # 3. Using the `get_dummies` function, X_test ends up with a column called 'country_spain' that X_train doesn't have.
        # This inconsistency may break our model, which was trained using X_train.
# To account for this potential `get_dummies` issue, we will instead use `OneHotEncoder`.
# `OneHotEncoder` is a scikit-learn class that allows us to specify, through its `handle_unknown` parameter, what happens when a new category appears in testing data.

# DEEP DIVE
# To learn more about how `OneHotEncoder` works, you can read the following link:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# The development community regularly debates whether `get_dummies` or `OneHotEncoder` better handles categorical data.
# If you are curious about the technical discussion surrounding these two approaches, check out the following links:
    # https://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn
    # https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

In [5]:
# Let's look at an example to see how `OneHotEncoder`'s coding technique works.
# We'll use a dataset that has been adapted from the following Kaggle link:
    # https://www.kaggle.com/zaurbegiev/my-dataset#credit_train.csv
# This dataset has more than 36,000 rows and 16 feature columns.
# Using this dataset, we want to build a deep neural classifier model that can predict whether a loan will or will not be paid.
# The model will make this prediction based on the loan's current status and other metrics provided in the dataset.
# To start, we'll load the CSV data into a DataFrame.

# Load the loan data from the CSV file into a DataFrame.
loan_csv_path = Path('loan_status.csv')
loan_status_df = pd.read_csv(loan_csv_path)

# Preview both ends of the DataFrame (default head/tail size of five rows each).
display(loan_status_df.head(), loan_status_df.tail())

Unnamed: 0,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
0,Fully_Paid,99999999,Short_Term,741.0,2231892.0,8_years,Own_Home,Debt_Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
1,Fully_Paid,217646,Short_Term,730.0,1184194.0,<_1_year,Home_Mortgage,Debt_Consolidation,10855.08,19.6,10.0,13,1,122170,272052.0,1.0,0.0
2,Fully_Paid,548746,Short_Term,678.0,2559110.0,2_years,Rent,Debt_Consolidation,18660.28,22.6,33.0,4,0,437171,555038.0,0.0,0.0
3,Fully_Paid,99999999,Short_Term,728.0,714628.0,3_years,Rent,Debt_Consolidation,11851.06,16.0,76.0,16,0,203965,289784.0,0.0,0.0
4,Fully_Paid,99999999,Short_Term,740.0,776188.0,<_1_year,Own_Home,Debt_Consolidation,11578.22,8.5,25.0,6,0,134083,220220.0,0.0,0.0


Unnamed: 0,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
36418,Fully_Paid,99999999,Short_Term,742.0,1190046.0,<_1_year,Rent,Other,11969.81,20.1,16.0,9,0,37392,134442.0,0.0,0.0
36419,Fully_Paid,44484,Short_Term,717.0,1152426.0,10+_years,Home_Mortgage,Other,6280.64,21.0,12.0,6,0,961932,0.0,0.0,0.0
36420,Fully_Paid,210584,Short_Term,719.0,783389.0,1_year,Home_Mortgage,Other,3727.61,17.4,18.0,6,0,456,259160.0,0.0,0.0
36421,Fully_Paid,99999999,Short_Term,732.0,1289416.0,1_year,Rent,Debt_Consolidation,13109.05,9.4,21.0,22,0,153045,509234.0,0.0,0.0
36422,Fully_Paid,103136,Short_Term,742.0,1150545.0,6_years,Rent,Debt_Consolidation,7315.57,18.8,18.0,12,1,109554,537548.0,1.0,0.0


In [6]:
# Note that we have both numerical and categorical ('Loan_Status', 'Term', 'Home_Ownership', etc.) variables.
# In this case, we want our model to predict whether a loan will be paid.
# So, the target is the "Loan_Status" column.
# However, this column is categorical.
# So, we will need to encode this column's entries as numerical values.
# There are several other categorical variables in this dataset too.
# So before creating features (X) and target (y) datasets, we'll use the `OneHotEncoder` module to numerically encode all of the dataset's categorical data.
# First, we create an instance of `OneHotEncoder`.
# We set the parameter `sparse=False` (named `sparse_output` in scikit-learn 1.2 and later) to fetch a NumPy array.
# We will use the resulting array to create a DataFrame.
# Later, we'll use this DataFrame to fit the neural network model.
# The following code creates the `OneHotEncoder` instance:

# Create a OneHotEncoder instance that returns a dense NumPy array.
# scikit-learn 1.2 renamed the `sparse` parameter to `sparse_output`, and later
# releases removed `sparse` entirely. Try the current name first and fall back
# to the legacy name so the cell runs on both old and new installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not a recognized keyword.
    enc = OneHotEncoder(sparse=False)

In [8]:
# Next, we want to list all of the columns in our original dataset that have categorical variables.
# Later, we'll use this list to filter the DataFrame so that it will contain only categorical data.

# NOTE
# You can use the `dtypes` attribute to review the data type of each column in the DataFrame.
# Columns that have categorical variables will be listed with a data type of `object`.
# You can create a list of columns with categorical data by filtering the DataFrame's `dtypes` attribute with the condition `loan_status_df.dtypes == 'object'`.

# Keep only the columns whose dtype is `object` and show their names as a list.
column_dtypes = loan_status_df.dtypes
column_dtypes[column_dtypes == 'object'].index.tolist()

['Loan_Status', 'Term', 'Years_in_current_job', 'Home_Ownership', 'Purpose']

In [9]:
# Names of the columns that hold categorical values, one per line:
categorical_variables = [
    'Loan_Status',
    'Term',
    'Years_in_current_job',
    'Home_Ownership',
    'Purpose',
]

In [11]:
# Next, we need to train our encoder to appropriately transform our categorical data.
# We use the encoder's `fit_transform()` function to train it with those variables included in the categorical_variables list.
# This same function transforms the data once the encoder has been trained.
# We pass the original DataFrame, filtered using the `categorical_variables` list, to the function.
# We code all of this in one step:

# Select the categorical columns, then fit the encoder on them and encode
# them in a single `fit_transform` call:
categorical_df = loan_status_df[categorical_variables]
encoded_data = enc.fit_transform(categorical_df)

In [12]:
# Next, we create a new DataFrame.
# This DataFrame will contain the encoded data returned by OneHotEncoder.
# We can use the encoder's `get_feature_names()` function (renamed `get_feature_names_out()` in scikit-learn 1.0 and later) to set the DataFrame's column names.
# We use our list of categorical variables as the function's parameter.
# This way, in addition to the encoded values for the categorical variables, the encoder will also fetch the correct name for each column.

# Look up the names of the encoded columns.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favor of `get_feature_names_out`, so prefer the new method when the encoder
# provides it and fall back to the legacy method on older installations.
if hasattr(enc, 'get_feature_names_out'):
    encoded_columns = enc.get_feature_names_out(categorical_variables)
else:
    encoded_columns = enc.get_feature_names(categorical_variables)

# Create a DataFrame with the encoded variables:
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoded_columns
)

# Display sample data:
encoded_df.head()



Unnamed: 0,Loan_Status_Fully_Paid,Loan_Status_Not_Paid,Term_Long_Term,Term_Short_Term,Years_in_current_job_10+_years,Years_in_current_job_1_year,Years_in_current_job_2_years,Years_in_current_job_3_years,Years_in_current_job_4_years,Years_in_current_job_5_years,...,Home_Ownership_Home_Mortgage,Home_Ownership_Own_Home,Home_Ownership_Rent,Purpose_Business_Loan,Purpose_Buy_House,Purpose_Buy_a_Car,Purpose_Debt_Consolidation,Purpose_Home_Improvements,Purpose_Medical_Bills,Purpose_Other
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
# Now that we've seen the resulting DataFrame, let's examine the process `OneHotEncoder` uses to encode categorical variables.
# Take the 'Loan_Status' column, for example:
# Originally, the "Loan_Status" column held two values: Fully_Paid and Not_Paid.
# To encode this column, `OneHotEncoder` creates two new columns, one for each CATEGORY LEVEL - that is, one for each possible value in the original column.
# So, the encoder creates a new column called 'Loan_Status_Fully_Paid', and a new column called 'Loan_Status_Not_Paid'.
# In each of these new columns, the value 1.0 represents the presence of a value, and 0.0 represents the absence of a value.
# So, a data point with an original 'Loan_Status' value of Fully_Paid is now represented by the value 1.0 in the new 'Loan_Status_Fully_Paid' column.
# It's represented by the value 0.0 in the new 'Loan_Status_Not_Paid' column.
# The rest of the categorical variables are encoded in a similar manner.
# Once we've encoded the categorical variables, we can use the `StandardScaler` to scale the numerical variables to a similar range.
# Finally, the DataFrame with the encoded information needs to be concatenated with a version of the original DataFrame that has the categorical variable columns dropped.

In [14]:
# OPTIMIZATION TECHNIQUES FOR NEURAL NETWORK DESIGN
# As you've learned, the number of layers in a neural network, and the number of neurons (aka nodes) within each layer, can greatly impact a neural network's performance.
# Deciding how many nodes and layers to use can be challenging. 
# However, the formal literature on neural networks and the experiences of machine learning experts and practitioners offer us some help.
# Next, we'll explore a few general rules of thumb for optimizing neural network design.
# The sections that follow will provide guidance on:
    # 1. Defining the number of layers in a neural network.
    # 2. Defining the number of nodes in each hidden layer.
    # 3. Selecting activation and loss functions.
    # 4. Determining the optimal number of epochs.

# DEEP DIVE:
# For a more in-depth look at each of these topics, check out resources by machine learning experts and practitioners via the following links:
    # https://datascience.stackexchange.com/
    # https://stats.stackexchange.com/

In [16]:
# NUMBER OF LAYERS
# Deep neural networks are robust tools, but sometimes we don't need a deep learning model to solve a business problem.
# For example, when dealing with data that is linearly separable, you don't typically need any hidden layers.
# Recall that linearly separable data can be separated by a straight line when it's plotted in two dimensions.
# When dealing with nonlinear data, you may need more than one hidden layer.
# When designing a deep neural network, it's best practice to start with two hidden layers.
# Then, continue adding additional layers until the model's performance no longer improves over the same number of epochs.

In [17]:
# NUMBER OF NODES IN EACH HIDDEN LAYER
# The number of neurons that you include in each hidden layer can impact a deep learning model's final output.
# So, this is an important decision. 
# Using too few neurons could lead to underfitting the model - meaning you will not achieve the model's best possible performance.
# On the other hand, including too many neurons could result in overfitting - meaning the model may not generalize well to other datasets.
# Devs use a few methods to determine the optimal number of neurons for a hidden layer:
    # 1. Find the mean of the number of input features and the number of neurons in the output layer:
        # [(Number of input features + number of neurons in output layer) / 2]
        # Use a number close to this mean for the number of neurons in the first hidden layer.
        # Repeat this pattern for subsequent hidden layers:
        # [(number of neurons in prior hidden layer + number of neurons in output layer) / 2]
        # This rule normally works well for the first approximation.
    # 2. The total number of neurons across all hidden layers should be 2/3 the size of the input layer, plus the size of the output layer:
        # (size of input layer = number of features; size of output layer = number of neurons in the output layer).
    # 3. Alternatively, the total number of neurons across all hidden layers should be less than twice the size of the number of features in the input layer.
# These rules provide a starting point for designing a neural network.
# Ultimately, however, we can only discover the best architectures for our neural network models by trial and error testing.

In [18]:
# ACTIVATION AND LOSS FUNCTIONS
# Selecting the best activation and loss functions is, once again, part experience and part trial-and-error testing.
# As you gain more experience using neural networks, and machine learning in general, you will develop preferences for certain functions in certain use cases.
# In the meantime, you can use the following recommendations as starting points:
    # 1. For the activation function on a hidden layer, use `relu`.
    # 2. For the activation function on an output layer, use `sigmoid` for binary classification, `softmax` for multi-class classification, and `linear` for regression.
    # 3. Choose the loss function based on the type of problem you're solving. 
        # For binary classification, use `binary_crossentropy`.
        # For multi-class classification, use `categorical_crossentropy` if you encode the variables using `OneHotEncoder`.
        # Or use `sparse_categorical_crossentropy` if the labels are integers.
        # Finally, use `mse` for regression.        

In [None]:
# NUMBER OF EPOCHS
# The number of epochs that we can run can have an important effect on the model.
# Achieving the optimal number of epochs helps improve the model's evaluation metrics.
# Usually, we vary the number of epochs as we test a model, seeking to minimize the model's loss value.
# A good rule of thumb is to start training a model with 20 epochs.
# Then, plot the model's loss and evaluation metric over those epochs.
# With these plots, we can verify the following:
    # 1. Whether the model's training loss decreases over the epochs
    # 2. Whether its accuracy (for classification) increases
    # 3. Or its mean squared error (for regression) moves toward zero.
# We can then continue training the model, increasing the number of epochs by 20 on each trial.
# After each new trial, we'll again verify that the model's loss and evaluation metric move in the directions we want.