In [1]:
import warnings
warnings.simplefilter("ignore")

import json
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential

In [2]:
#  Import and read the charity_data.csv.
application_df = pd.read_csv("Resources/charity_data.csv")
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


### Adjust the input data to ensure that there are no variables or outliers that are causing confusion in the model:

* Drop more columns.
* Create more bins for rare occurrences in columns.
* Increase or decrease the number of values for each bin.

#### Create bins for INCOME_AMT and ASK_AMT

In [3]:
# Look at INCOME_AMT value counts for binning
incms = application_df['INCOME_AMT'].value_counts()
incms

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

In [4]:
# create a list of INCOME_AMT values to be replaced
cut = 500
income_to_replace = incms[incms < cut].index.values.tolist()

for inc in income_to_replace:
    application_df['INCOME_AMT'] = application_df['INCOME_AMT'].replace(inc,"5M+")
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [5]:
# Look at ASK_AMT values
am = application_df['ASK_AMT'].value_counts()
am

5000        25398
10478           3
15583           3
63981           3
6725            3
            ...  
5371754         1
30060           1
43091152        1
18683           1
36500179        1
Name: ASK_AMT, Length: 8747, dtype: int64

In [6]:
# create bins for ASK_AMT values
application_df['bins'] = np.select(
    [
        application_df['ASK_AMT']== 5000, 
        application_df['ASK_AMT'].between(5000, 10000, inclusive=False),
        application_df['ASK_AMT'].between(10000, 100000, inclusive=True), 
        application_df['ASK_AMT'].between(100000, 500000, inclusive=False),
        application_df['ASK_AMT'].between(500000, 999999, inclusive=True), 
        application_df['ASK_AMT'].between(1000000, 1000000000, inclusive=True)
    ], 
    [
        '5000', 
        '5-10k',
        '10-100k',
        '100-500k',
        '500k-1M',
        '1M+'
    ], 
    default='Unknown'
)
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,bins
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1,5000
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1,100-500k
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,5000
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1,5-10k
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1,100-500k
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0,5000
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,5000
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0,5000
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1,5000


#### Drop columns that are not considered valuable for model

In [None]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(['EIN', 'NAME'], axis = 1, inplace = True)

In [7]:
# values of SPECIAL_CONSIDERATIONS are always N
# status does not seem to influence the result
# ASK_AMT we substitude with a new column bins that contains grouped values from ASK_AMT
application_df.drop(['STATUS', 'SPECIAL_CONSIDERATIONS', 'ASK_AMT'], axis = 1, inplace = True)

#### Increase or decrease the number of values for each bin.

In [8]:
# Create a list of application types to be replaced
type_series = application_df['APPLICATION_TYPE'].value_counts()

cutoff = 1000
application_types_to_replace = type_series[type_series < cutoff].index.values.tolist()

for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

In [9]:
# create a list of classification types to be replaced
class_series = application_df['CLASSIFICATION'].value_counts()

cutoffc = 200
classifications_to_replace = class_series[class_series < cutoffc].index.values.tolist()

for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

In [10]:
application_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Other     1197
C7000      777
C1700      287
Name: CLASSIFICATION, dtype: int64

In [11]:
# Convert categorical values to numeric
X_dummies = pd.get_dummies(application_df)
X_dummies

Unnamed: 0,EIN,IS_SUCCESSFUL,NAME_1 DAY RANCH RESCUE AND RURAL OKLAHOMA ANIMAL RESOURCE INC,NAME_100 BLACK MEN OF AMERICA,NAME_100 BLACK MEN OF MEMPHIS INC,NAME_100 BLACK MEN OF WEST GEORGIA INC,NAME_1150 WEBSTER STREET INC,NAME_116TH CAVALRY REGIMENT CHAPTER OF THE US CAVALRY & ARMOR ASSOCIATION,NAME_13TH BOMB SQUADRON ASSOCIATION,NAME_146TH ALUMNI ASSOCIATION,...,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_5M+,bins_10-100k,bins_100-500k,bins_1M+,bins_5-10k,bins_5000,bins_500k-1M,bins_Unknown
0,10520599,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,10531628,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,10547893,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,10553066,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,10556103,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34295,996010315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34296,996012607,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34297,996015768,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# Split our preprocessed data into our features and target arrays
X = X_dummies.drop(columns=["IS_SUCCESSFUL"])
y = X_dummies["IS_SUCCESSFUL"]
X

Unnamed: 0,EIN,NAME_1 DAY RANCH RESCUE AND RURAL OKLAHOMA ANIMAL RESOURCE INC,NAME_100 BLACK MEN OF AMERICA,NAME_100 BLACK MEN OF MEMPHIS INC,NAME_100 BLACK MEN OF WEST GEORGIA INC,NAME_1150 WEBSTER STREET INC,NAME_116TH CAVALRY REGIMENT CHAPTER OF THE US CAVALRY & ARMOR ASSOCIATION,NAME_13TH BOMB SQUADRON ASSOCIATION,NAME_146TH ALUMNI ASSOCIATION,NAME_14TH ARMORED DIVISION,...,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_5M+,bins_10-100k,bins_100-500k,bins_1M+,bins_5-10k,bins_5000,bins_500k-1M,bins_Unknown
0,10520599,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,10531628,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,10547893,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,10553066,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,10556103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34295,996010315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34296,996012607,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
34297,996015768,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
number_input_features = len(X_train_scaled[0])

### Adjust model parameters
* Add more neurons to a hidden layer.
* Add more hidden layers.
* Use different activation functions for the hidden layers.
* Add or reduce the number of epochs to the training regimen.

In [15]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=50,
        max_value=100,
        step=5), activation=activation, input_dim=number_input_features))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=10,
            max_value=50,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [17]:
# Import the kerastuner library
import kerastuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=5,
    hyperband_iterations=2,
    directory='a3'
)

In [18]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=5,validation_data=(X_test_scaled,y_test))

Epoch 1/2
Epoch 2/2


Epoch 1/2
Epoch 2/2




Epoch 1/2




Epoch 2/2






Epoch 1/2
Epoch 2/2






Epoch 1/2
Epoch 2/2






Epoch 3/5
Epoch 4/5




Epoch 5/5






Epoch 3/5
Epoch 4/5




Epoch 5/5






Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






Epoch 1/5
Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






Epoch 1/2
Epoch 2/2






Epoch 1/2
Epoch 2/2




Epoch 1/2




Epoch 2/2






Epoch 1/2
Epoch 2/2




Epoch 1/2
Epoch 2/2






Epoch 3/5
Epoch 4/5


Epoch 5/5






Epoch 3/5
Epoch 4/5




Epoch 5/5






Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5


Epoch 5/5


Epoch 1/5
Epoch 2/5


Epoch 3/5


Epoch 4/5


Epoch 5/5




Epoch 1/5
Epoch 2/5




Epoch 3/5




Epoch 4/5


Epoch 5/5






INFO:tensorflow:Oracle triggered exit


In [19]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'tanh', 'first_units': 80, 'num_layers': 1, 'units_0': 30, 'units_1': 35, 'units_2': 15, 'units_3': 30, 'units_4': 40, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 1, 'tuner/round': 0}
{'activation': 'tanh', 'first_units': 80, 'num_layers': 1, 'units_0': 30, 'units_1': 35, 'units_2': 15, 'units_3': 30, 'units_4': 40, 'tuner/epochs': 5, 'tuner/initial_epoch': 2, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': 'dd711750a09584adc1f43e52271a451d'}
{'activation': 'tanh', 'first_units': 75, 'num_layers': 3, 'units_0': 45, 'units_1': 40, 'units_2': 30, 'units_3': 50, 'units_4': 45, 'tuner/epochs': 5, 'tuner/initial_epoch': 2, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0123fdbf9d109dde9d690c9796410d12'}


In [20]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5417 - accuracy: 0.7977
Loss: 0.5417230129241943, Accuracy: 0.7976676225662231
268/268 - 1s - loss: 0.5942 - accuracy: 0.7958
Loss: 0.5941565632820129, Accuracy: 0.7958017587661743
268/268 - 1s - loss: 0.4773 - accuracy: 0.7956
Loss: 0.47731533646583557, Accuracy: 0.7955685257911682


In [26]:
# Export our model to HDF5 file
model = tuner.get_best_models(1)[0]
model.save('AlphabetSoupCharity_Optimization.h5')

