<a href="https://colab.research.google.com/github/Jonathan-Gilbert/deep-learning-challenge/blob/main/deep_learning_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing

In [32]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import keras_tuner as kt

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [33]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN', 'NAME'])

# Check the dataframe to confirm the columns have been dropped
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [34]:
# Determine the number of unique values in each column.
unique_values = application_df.nunique()

unique_values

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [35]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
# Check the value counts for 'APPLICATION_TYPE'
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Set a threshold for the minimum count (e.g., if the category appears less than 100 times)
threshold = 0.5

# Replace less frequent categories with 'Other'
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].apply(
    lambda x: x if application_type_counts[x] >= threshold else 'Other'
)

# Check the updated value counts for 'APPLICATION_TYPE'
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [36]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = application_type_counts[application_type_counts < threshold].index.tolist()

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [37]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Set a threshold for the minimum count (e.g., if the category appears less than 10 times)
threshold = 0.5

# Create a list of classification types to be replaced (those with counts less than the threshold)
classification_types_to_replace = classification_counts[classification_counts < threshold].index.tolist()

# Replace the less frequent classification types with "Other"
for classification in classification_types_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classification, "Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [38]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Filter to get classification types that appear more than once
classification_more_than_one = classification_counts[classification_counts > 1]

# Display the result
classification_more_than_one

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [39]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
# set threshold
threshold = 0.5

# Create a list of classification types to be replaced (those with counts less than the threshold)
classification_counts = application_df['CLASSIFICATION'].value_counts()
classifications_to_replace = classification_counts[classification_counts < threshold].index.tolist()


# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [40]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df_encoded = pd.get_dummies(application_df)

# Check the first few rows of the encoded dataframe
application_df_encoded.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,108590,1,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,1,5000,0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,1,6692,1,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,1,142590,1,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False


In [41]:
# Split our preprocessed data into our features and target arrays
# Define the target column (dependent variable)
target_column = 'IS_SUCCESSFUL'

# Split the preprocessed data into a training and testing dataset
X = application_df_encoded.drop(columns=[target_column])  # Features
y = application_df_encoded[target_column]  # Target

# Split the data into training and testing datasets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the split datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((27439, 116), (6860, 116), (27439,), (6860,))

In [42]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [44]:
# Optional - Define a Hyperparameter Tuning Function:

def model_builder(hp):
       model = tf.keras.models.Sequential()
       model.add(Dense(units=hp.Int('units_1', min_value=32, max_value=512, step=32),
                       activation='relu', input_dim=X_train_scaled.shape[1]))
       model.add(Dense(units=hp.Int('units_2', min_value=32, max_value=512, step=32), activation='relu'))
       model.add(Dense(units=hp.Int('units_3', min_value=32, max_value=512, step=32), activation='relu'))
       model.add(Dense(units=1, activation='sigmoid'))

       # Tune the learning rate for the optimizer
       # Choose an optimal value from 0.01, 0.001, or 0.0001
       hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

       model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

       return model

In [45]:
# Optional Continued - Initialize and Run the Tuner:
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=5,
    factor=3,
    directory='my_dir',
    project_name='intro_to_kt'
)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)  # Reduced patience for earlier stopping

# Reduce the validation split for faster epochs (but potentially less reliable estimates)
tuner.search(X_train_scaled, y_train, epochs=50, validation_split=0.15, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units_1')}, the second is {best_hps.get('units_2')}, the third is {best_hps.get('units_3')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 30 Complete [00h 01m 30s]
val_accuracy: 0.7397959232330322

Best val_accuracy So Far: 0.7428935766220093
Total elapsed time: 00h 21m 03s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 128, the second is 352, the third is 96 and the optimal learning rate for the optimizer
is 0.0001.



In [47]:
# Optional Continued - Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train_scaled, y_train, epochs=5, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.6865 - loss: 0.6150 - val_accuracy: 0.7400 - val_loss: 0.5599
Epoch 2/5
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7253 - loss: 0.5587 - val_accuracy: 0.7389 - val_loss: 0.5540
Epoch 3/5
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7305 - loss: 0.5481 - val_accuracy: 0.7394 - val_loss: 0.5514
Epoch 4/5
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7269 - loss: 0.5519 - val_accuracy: 0.7405 - val_loss: 0.5473
Epoch 5/5
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7318 - loss: 0.5457 - val_accuracy: 0.7376 - val_loss: 0.5479
Best epoch: 4


In [49]:
# Optional Continued - Evaluate the Model:
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train_scaled, y_train, epochs=best_epoch, validation_split=0.2)

eval_result = hypermodel.evaluate(X_test_scaled, y_test)
print("[test loss, test accuracy]:", eval_result) # Fixed the print statement

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/4
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6925 - loss: 0.6132 - val_accuracy: 0.7358 - val_loss: 0.5629
Epoch 2/4
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7298 - loss: 0.5571 - val_accuracy: 0.7372 - val_loss: 0.5591
Epoch 3/4
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7300 - loss: 0.5561 - val_accuracy: 0.7403 - val_loss: 0.5542
Epoch 4/4
[1m686/686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7381 - loss: 0.5420 - val_accuracy: 0.7405 - val_loss: 0.5587
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7328 - loss: 0.5534
[test loss, test accuracy]: [0.5562865138053894, 0.7284256815910339]


In [50]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = Sequential()

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(Dense(units=128, input_dim=X_train_scaled.shape[1], activation='relu'))

# Second hidden layer
nn.add(Dense(units=64, activation='relu'))

# Third hidden layer (optional for accuracy)
nn.add(Dense(units=32, activation='relu'))

# Output layer
nn.add(Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

In [51]:
# Compile the model
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=5, batch_size=32, validation_data=(X_test_scaled, y_test))

Epoch 1/5
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7038 - loss: 0.5864 - val_accuracy: 0.7258 - val_loss: 0.5610
Epoch 2/5
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7306 - loss: 0.5522 - val_accuracy: 0.7280 - val_loss: 0.5559
Epoch 3/5
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7321 - loss: 0.5500 - val_accuracy: 0.7310 - val_loss: 0.5565
Epoch 4/5
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7351 - loss: 0.5412 - val_accuracy: 0.7259 - val_loss: 0.5561
Epoch 5/5
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7348 - loss: 0.5418 - val_accuracy: 0.7324 - val_loss: 0.5587


In [52]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - 1ms/step - accuracy: 0.7324 - loss: 0.5587
Loss: 0.5586560368537903, Accuracy: 0.7323614954948425


In [19]:
# Export our model to HDF5 file
nn.save("charity_model.h5")

