## Preprocessing

In [19]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

import numpy as np
from sklearn.decomposition import PCA
from keras.optimizers import legacy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [20]:
def examine(df):

    inf = df.dtypes
    ct = df.count()
    unique_values = df.nunique()
    missing_values = df.isnull().sum()
    mem = df.memory_usage(deep=True)
    formatted_mem = (mem / (1024 * 1024)).apply(lambda x: "{:,.1f}".format(x))

    summary_df = pd.DataFrame({
        'Type': inf,
        'Entries': ct,
        'Unique Values': unique_values,
        'Null/Missing Values': missing_values,
        'Memory Usage MB': formatted_mem
    }).fillna('0')

    summary_df[['Unique Values', 'Null/Missing Values', 'Entries']] = summary_df[['Unique Values', 'Null/Missing Values', 'Entries']].astype(int)

    summary_df.index.name = f'Row: {df.shape[0]} Col: {df.shape[1]}'

    summary_df = summary_df.iloc[1:]

    return summary_df

examine(application_df)

Unnamed: 0_level_0,Type,Entries,Unique Values,Null/Missing Values,Memory Usage MB
Row: 34299 Col: 12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
APPLICATION_TYPE,object,34299,17,0,1.9
ASK_AMT,int64,34299,8747,0,0.3
CLASSIFICATION,object,34299,71,0,2.0
EIN,int64,34299,34299,0,0.3
INCOME_AMT,object,34299,9,0,2.0
IS_SUCCESSFUL,int64,34299,2,0,0.3
Index,0,0,0,0,0.0
NAME,object,34299,19568,0,2.9
ORGANIZATION,object,34299,4,0,2.1
SPECIAL_CONSIDERATIONS,object,34299,2,0,1.9


In [21]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN', 'NAME'], axis=1)

In [22]:
# Look at APPLICATION_TYPE value counts for binning
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64

In [23]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = ['T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17']

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64

#### CLASSIFICATION produces too many sparce features when encoded, so we need to bin those in 'Other'

In [24]:
# Look at CLASSIFICATION value counts for binning
class_value_counts = application_df['CLASSIFICATION'].value_counts()

# Calculate the cumulative percentage from the smallest to the largest
cumulative_percentage = class_value_counts.sort_values(ascending=True).cumsum() / class_value_counts.sum()

# Find the classifications that make up the bottom 3%
bottom_classifications = cumulative_percentage[cumulative_percentage <= 0.03]

# Now, create a set of classifications that are in the bottom 3%
bottom_classifications_set = set(bottom_classifications.index)

# Bin these classifications as 'Other' in the dataframe
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].apply(lambda x: 'Other' if x in bottom_classifications_set else x)

# Check the updated value counts
print(application_df['CLASSIFICATION'].value_counts())

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Other     1003
C7000      777
C1700      287
C4000      194
Name: count, dtype: int64


In [25]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = ['AFFILIATION', 'APPLICATION_TYPE', 'CLASSIFICATION', 'INCOME_AMT', 'ORGANIZATION', 'SPECIAL_CONSIDERATIONS', 'USE_CASE']

app_df_encoded = pd.get_dummies(application_df, columns=categorical_columns)
app_df_encoded.head(3)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,APPLICATION_TYPE_Other,...,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev
0,1,5000,1,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1,1,108590,1,False,False,True,False,False,False,False,...,True,False,False,True,False,False,False,False,True,False
2,1,5000,0,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [26]:
# Split our preprocessed data into our features and target arrays
y = app_df_encoded['IS_SUCCESSFUL']
X = app_df_encoded.drop('IS_SUCCESSFUL', axis=1)

#### In order to use Early Stopping for tuning, the data is split into Train, Validation, and Test sets.

In [27]:
# Split the dataset into train, validation, and test sets (60% train, 20% validate, 20% test)
# (using stratify to maintain proportion of classes in splits)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [28]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler (to the training data only)
X_scaler = scaler.fit(X_train)

# Scale the training, validation, and test data
X_train_scaled = X_scaler.transform(X_train)
X_val_scaled = X_scaler.transform(X_val)  
X_test_scaled = X_scaler.transform(X_test)  

#### After removing the most obviously bad features, there were still a higher number of sparse features than useful features. So I will apply PCA to determine the most relevant features

In [29]:
# Initialize PCA and fit to the scaled training data
pca = PCA(n_components=0.95)  # keep 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)

# Apply the transformation to the validation and test sets
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Number of components chosen by PCA
print(f"PCA chose {pca.n_components_} components")

PCA chose 34 components


## Compile, Train and Evaluate the Model

In [30]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=34, activation='relu', input_dim=34))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=17, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 34)                1190      
                                                                 
 dense_4 (Dense)             (None, 17)                595       
                                                                 
 dense_5 (Dense)             (None, 1)                 18        
                                                                 
Total params: 1803 (7.04 KB)
Trainable params: 1803 (7.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
# Compile the model
nn.compile(optimizer=legacy.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
# Define a callback to save the model's weights every 100 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='checkpoints/weights.{epoch:02d}.h5',
    verbose=0,
    save_weights_only=True,
    save_freq=100
)

# Define early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',   
    patience=10,         
    restore_best_weights=True 
)

In [33]:
# Train the model
nn.fit(
    X_train_pca,
    y_train,
    epochs=1000,
    batch_size=128,
    callbacks=[cp_callback, early_stopping],
    validation_data=(X_val_pca, y_val)
)

Epoch 1/1000


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


<keras.src.callbacks.History at 0x17b831ee0>

In [34]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_pca, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - loss: 0.5514 - accuracy: 0.7258 - 114ms/epoch - 529us/step
Loss: 0.5514485836029053, Accuracy: 0.7258017659187317


In [35]:
# Save the entire model as a SavedModel.
nn.save('alphabet_soup_model')

INFO:tensorflow:Assets written to: alphabet_soup_model/assets


INFO:tensorflow:Assets written to: alphabet_soup_model/assets


### Examining the model to assess dimensionality

In [36]:
# Find the percentage of zeros in each feature
sparsity = ((app_df_encoded == 0).sum() / len(app_df_encoded)) * 100

# Display features with more than 90% zeros
sparse_features = sparsity[sparsity > 95]
sparse_features

AFFILIATION_Family/Parent    99.813406
AFFILIATION_National         99.903787
AFFILIATION_Other            99.988338
AFFILIATION_Regional         99.962098
APPLICATION_TYPE_Other       99.195312
APPLICATION_TYPE_T10         98.460597
APPLICATION_TYPE_T19         96.894953
APPLICATION_TYPE_T4          95.504242
APPLICATION_TYPE_T5          96.580075
APPLICATION_TYPE_T6          96.454707
APPLICATION_TYPE_T7          97.886236
APPLICATION_TYPE_T8          97.851249
CLASSIFICATION_C1700         99.163241
CLASSIFICATION_C4000         99.434386
CLASSIFICATION_C7000         97.734628
CLASSIFICATION_Other         97.075716
INCOME_AMT_1-9999            97.877489
INCOME_AMT_10000-24999       98.416863
INCOME_AMT_10M-50M           99.300271
INCOME_AMT_1M-5M             97.215662
INCOME_AMT_50M+              99.594740
INCOME_AMT_5M-10M            99.460626
ORGANIZATION_Co-operative    98.583049
ORGANIZATION_Corporation     99.874632
SPECIAL_CONSIDERATIONS_Y     99.921281
USE_CASE_CommunityServ   

Feature CLASSIFICATION created too many columns! only 3 out of 71 had less than 90% sparcity.