## Preprocessing. Read data file 'charity_data.csv', and create dataframe 'application_df'

In [26]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")
application_df.head(5)

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


## Output characteristics and properties of dataframe.

In [27]:
application_df.shape

(34299, 12)

In [28]:
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EIN                     34299 non-null  int64 
 1   NAME                    34299 non-null  object
 2   APPLICATION_TYPE        34299 non-null  object
 3   AFFILIATION             34299 non-null  object
 4   CLASSIFICATION          34299 non-null  object
 5   USE_CASE                34299 non-null  object
 6   ORGANIZATION            34299 non-null  object
 7   STATUS                  34299 non-null  int64 
 8   INCOME_AMT              34299 non-null  object
 9   SPECIAL_CONSIDERATIONS  34299 non-null  object
 10  ASK_AMT                 34299 non-null  int64 
 11  IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.1+ MB


### The Dataframe has 12 Columns. They comprise the Target for th emodel "IS_SUCCESSFUL", while the other 11variables are potential Features for the model.

In [29]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(['EIN', 'NAME'], axis=1, inplace=True)

# Confirm fields have been dropped.
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   APPLICATION_TYPE        34299 non-null  object
 1   AFFILIATION             34299 non-null  object
 2   CLASSIFICATION          34299 non-null  object
 3   USE_CASE                34299 non-null  object
 4   ORGANIZATION            34299 non-null  object
 5   STATUS                  34299 non-null  int64 
 6   INCOME_AMT              34299 non-null  object
 7   SPECIAL_CONSIDERATIONS  34299 non-null  object
 8   ASK_AMT                 34299 non-null  int64 
 9   IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.6+ MB


### Confirmed that the fields 'EIN' and 'NAME' have been dropped. Now 9 Variables as lists are Features for the model.

## Determine the number of Unique values in each column.

In [30]:
# Determine the number of unique values in each column.
unique_values = application_df.nunique()
print(unique_values)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


## ## Determine the number of data points for each unique value for columns with more that 10 unique values, in this case APPLICATION_TYPE.

In [31]:
# Look at APPLICATION_TYPE value counts for binning
# Get value counts for the 'APPLICATION_TYPE' column
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Print the value counts
print(application_type_counts)

APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64


## Selected 250 as the cut off point for APPLICATION_TYPES. All counts below 250 are binned as 'other'

In [32]:
# Get value counts for the 'APPLICATION_TYPE' column
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Choose a cutoff value to determine which types to replace
cutoff = 500

# Create a list of application types to be replaced (those below the cutoff)
application_types_to_replace = application_type_counts[application_type_counts < cutoff].index.tolist()

# Print the list of application types to replace
print(application_types_to_replace)

['T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17']


In [33]:
# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64

### Binning is successful

## Selected 1000 as the cut off point for CLASSIFICATION. First counted the numbers of each classification, then removed those with a classification, then binned those below 1000 as 'other'. 

In [34]:
# Look at CLASSIFICATION value counts for binning
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Print the value counts
print(classification_counts)

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64


In [35]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classifications_greater_than_one = classification_counts[classification_counts > 1]

# Print the filtered classifications and their counts
print(classifications_greater_than_one)

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64


In [36]:
# Choose a cutoff value and create a list of classifications to be replaced
cutoff = 1000

# Create a list of classifications to be replaced (those below the cutoff)
classifications_to_replace = classification_counts[classification_counts < cutoff].index.tolist()

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")
    
# Check to make sure binning was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64

### Encode Categorical variables using pd.get_dummies()

In [37]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df_numeric = pd.get_dummies(application_df)

# Print the first first of the new DataFrame to verify the transformation
print(application_df_numeric.head(5))

   STATUS  ASK_AMT  IS_SUCCESSFUL  APPLICATION_TYPE_Other  \
0       1     5000              1                   False   
1       1   108590              1                   False   
2       1     5000              0                   False   
3       1     6692              1                   False   
4       1   142590              1                   False   

   APPLICATION_TYPE_T10  APPLICATION_TYPE_T19  APPLICATION_TYPE_T3  \
0                  True                 False                False   
1                 False                 False                 True   
2                 False                 False                False   
3                 False                 False                 True   
4                 False                 False                 True   

   APPLICATION_TYPE_T4  APPLICATION_TYPE_T5  APPLICATION_TYPE_T6  ...  \
0                False                False                False  ...   
1                False                False                False  

## Split this preproccessed data into a Target array (IS_SUCCESSFUL), and Features array.

In [38]:
# Extract the target variable into its own array
y = application_df_numeric['IS_SUCCESSFUL'].values

# Drop the target column from the DataFrame and use the rest as the features array
X = application_df_numeric.drop(['IS_SUCCESSFUL'], axis=1).values

# Print the shapes of X and y to confirm the split
print("Shape of Features Matrix:", X.shape)
print("Shape of Target Array:", y.shape)

Shape of Features Matrix: (34299, 43)
Shape of Target Array: (34299,)


### Split the data set into train and test

In [39]:
# Split the data set into train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the train/test datasets to verify the split
print("Training set shape (features):", X_train.shape)
print("Test set shape (features):", X_test.shape)
print("Training set shape (target):", y_train.shape)
print("Test set shape (target):", y_test.shape)
#  YOUR CODE GOES HERE

Training set shape (features): (27439, 43)
Test set shape (features): (6860, 43)
Training set shape (target): (27439,)
Test set shape (target): (6860,)


## Create a StandardScaler instances

In [40]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Define, Compile, Train and Evaluate the Two Hidden Layer Model

In [41]:
#### Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
n_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(Dense(32, input_dim=n_features, activation='tanh'))

# Second hidden layer
nn.add(Dense(16, activation='tanh')) 

# Output layer
nn.add(Dense(1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Compile the 2 Hidden Layer Model

In [42]:
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [43]:
# Model summary
nn.summary()

### Confirm the data type for X_train, and y_train.

In [44]:
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of y_train_:", y_train.shape)

# Convert X_train and y_train to float32 if they are not already
X_train_scaled = X_train_scaled.astype('float32')
y_train = y_train.astype('float32')

Shape of X_train_scaled: (27439, 43)
Shape of y_train_: (27439,)


### Fit the Two Hidden Layer Model over 50 Epochs.

In [45]:
# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=1)

Epoch 1/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 15ms/step - accuracy: 0.6968 - loss: 0.5974 - val_accuracy: 0.7378 - val_loss: 0.5566
Epoch 2/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - accuracy: 0.7274 - loss: 0.5601 - val_accuracy: 0.7392 - val_loss: 0.5496
Epoch 3/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.7258 - loss: 0.5561 - val_accuracy: 0.7341 - val_loss: 0.5507
Epoch 4/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.7309 - loss: 0.5489 - val_accuracy: 0.7409 - val_loss: 0.5462
Epoch 5/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.7299 - loss: 0.5525 - val_accuracy: 0.7321 - val_loss: 0.5512
Epoch 6/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.7319 - loss: 0.5448 - val_accuracy: 0.7378 - val_loss: 0.5476
Ep

[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.7349 - loss: 0.5381 - val_accuracy: 0.7391 - val_loss: 0.5456
Epoch 100/100
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7381 - loss: 0.5378 - val_accuracy: 0.7378 - val_loss: 0.5462


In [None]:
#### Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
n_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(Dense(128, input_dim=n_features, activation='relu'))

# Second hidden layer
nn.add(Dense(64, activation='tanh')) 

# Third hidden layer
nn.add(Dense(32, activation = 'tanh'))

# fourth hidden layer
nn.add(Dense(16, activation = 'tanh'))

# Output layer
nn.add(Dense(1, activation='sigmoid'))
# Compile the model

nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
nn.summary()

### Confirm the data type for X_train, and y_train.

print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of y_train_:", y_train.shape)

# Convert X_train and y_train to float32 if they are not already
X_train_scaled = X_train_scaled.astype('float32')
y_train = y_train.astype('float32')

In [None]:
# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=1000, batch_size=10, validation_split=0.2, verbose=1)

### Output the 2 Hidden Layer Model Final Metrics

In [46]:
# Print out the final metrics
history_dict = history.history

# Retrieve the final metrics from the last epoch
final_train_loss = history_dict['loss'][-1]
final_train_accuracy = history_dict['accuracy'][-1]
final_validation_loss = history_dict['val_loss'][-1]
final_validation_accuracy = history_dict['val_accuracy'][-1]

# Print out the final metrics
print(f"Final training loss: {final_train_loss:.4f}")
print(f"Final training accuracy: {final_train_accuracy:.4f}")
print(f"Final validation loss: {final_validation_loss:.4f}")
print(f"Final validation accuracy: {final_validation_accuracy:.4f}")

Final training loss: 0.5344
Final training accuracy: 0.7390
Final validation loss: 0.5462
Final validation accuracy: 0.7378


## Build and Compile the 4 Hidden Layer Model

In [47]:
#### Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
n_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(Dense(128, input_dim=n_features, activation='relu'))

# Second hidden layer
nn.add(Dense(64, activation='tanh')) 

# Third hidden layer
nn.add(Dense(32, activation = 'tanh'))

# fourth hidden layer
nn.add(Dense(16, activation = 'tanh'))

# Output layer
nn.add(Dense(1, activation='sigmoid'))
# Compile the model

nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
nn.summary()

### Confirm the data type for X_train, and y_train.

print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of y_train_:", y_train.shape)

# Convert X_train and y_train to float32 if they are not already
X_train_scaled = X_train_scaled.astype('float32')
y_train = y_train.astype('float32')


Shape of X_train_scaled: (27439, 43)
Shape of y_train_: (27439,)


### Fit the 4 Hidden Layer Model over 50 Epochs.

In [None]:
# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=1000, batch_size=10, validation_split=0.2, verbose=1)


Epoch 1/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.7166 - loss: 0.5769 - val_accuracy: 0.7378 - val_loss: 0.5539
Epoch 2/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.7313 - loss: 0.5537 - val_accuracy: 0.7341 - val_loss: 0.5513
Epoch 3/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.7234 - loss: 0.5593 - val_accuracy: 0.7360 - val_loss: 0.5521
Epoch 4/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.7292 - loss: 0.5545 - val_accuracy: 0.7354 - val_loss: 0.5488
Epoch 5/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.7218 - loss: 0.5593 - val_accuracy: 0.7371 - val_loss: 0.5463
Epoch 6/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.7310 - loss: 0.5522 - val_accuracy: 0.7360 - val_loss: 0.546

Epoch 50/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.7402 - loss: 0.5364 - val_accuracy: 0.7362 - val_loss: 0.5458
Epoch 51/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.7384 - loss: 0.5372 - val_accuracy: 0.7391 - val_loss: 0.5465
Epoch 52/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.7324 - loss: 0.5446 - val_accuracy: 0.7321 - val_loss: 0.5492
Epoch 53/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.7320 - loss: 0.5439 - val_accuracy: 0.7354 - val_loss: 0.5465
Epoch 54/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.7388 - loss: 0.5393 - val_accuracy: 0.7380 - val_loss: 0.5448
Epoch 55/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.7340 - loss: 0.5388 - val_accuracy: 0.7363 - val_loss:

Epoch 99/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.7396 - loss: 0.5363 - val_accuracy: 0.7351 - val_loss: 0.5461
Epoch 100/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7370 - loss: 0.5361 - val_accuracy: 0.7362 - val_loss: 0.5464
Epoch 101/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7375 - loss: 0.5367 - val_accuracy: 0.7376 - val_loss: 0.5473
Epoch 102/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7415 - loss: 0.5352 - val_accuracy: 0.7347 - val_loss: 0.5476
Epoch 103/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7397 - loss: 0.5384 - val_accuracy: 0.7378 - val_loss: 0.5446
Epoch 104/1000
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7368 - loss: 0.5384 - val_accuracy: 0.7392 - val_loss: 

### Output 4 Hidden Layer Model Final Metrics

In [None]:
# Print out the final metrics
history_dict = history.history

# Retrieve the final metrics from the last epoch
final_train_loss = history_dict['loss'][-1]
final_train_accuracy = history_dict['accuracy'][-1]
final_validation_loss = history_dict['val_loss'][-1]
final_validation_accuracy = history_dict['val_accuracy'][-1]

# Print out the final metrics
print(f"Final training loss: {final_train_loss:.4f}")
print(f"Final training accuracy: {final_train_accuracy:.4f}")
print(f"Final validation loss: {final_validation_loss:.4f}")
print(f"Final validation accuracy: {final_validation_accuracy:.4f}")


## Export the Model to an HDFS File called 'AlphabetSoup.Charity.h5'

In [None]:
# Export our model to HDF5 file
from tensorflow.keras.models import load_model

# Load the model from the file
nn.save('AlphabetSoup.Charity.h5')