## Optimization Attempt 1: Preprocessing

In [233]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [234]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(['EIN', 'NAME'], axis=1)


In [235]:
# Determine the number of unique values in each column.
print(application_df.nunique())

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [236]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
app_type_counts = application_df["APPLICATION_TYPE"].value_counts()
print(app_type_counts)

APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64


In [237]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(app_type_counts[app_type_counts < 1000].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
print(application_df['APPLICATION_TYPE'].value_counts())

APPLICATION_TYPE
T3       27037
Other     2266
T4        1542
T6        1216
T5        1173
T19       1065
Name: count, dtype: int64


In [238]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
class_counts = application_df['CLASSIFICATION'].value_counts()
print(class_counts)

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64


In [239]:
# You may find it helpful to look at CLASSIFICATION value counts >1
class_counts_gt_1 = class_counts[class_counts > 1]
print(class_counts_gt_1)


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64


In [240]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(class_counts[class_counts < 1000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
print(application_df['CLASSIFICATION'].value_counts())

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64


In [241]:
# Investigating status column
print(application_df['STATUS'].value_counts())

STATUS
1    34294
0        5
Name: count, dtype: int64


In [242]:
# Investigating special_considerations column
print(application_df['SPECIAL_CONSIDERATIONS'].value_counts())

SPECIAL_CONSIDERATIONS
N    34272
Y       27
Name: count, dtype: int64


In [243]:
# Removing both status and special_considerations columns
application_df = application_df.drop(['STATUS', 'SPECIAL_CONSIDERATIONS'], axis=1)

application_df.head()


Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,Other,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


In [244]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_df = pd.get_dummies(application_df)
encoded_df.head()

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,108590,1,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,5000,0,False,False,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,False
3,6692,1,False,False,True,False,False,False,True,False,...,True,False,False,True,False,False,False,False,False,False
4,142590,1,False,False,True,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


In [245]:
# Split our preprocessed data into our features and target arrays
y = encoded_df['IS_SUCCESSFUL']
X = encoded_df.drop('IS_SUCCESSFUL', axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)


In [246]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Optimization Attempt 1: Compile, Train and Evaluate the Model

In [247]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(10, activation='relu', input_dim=len(X_train_scaled[0])))

# Second hidden layer
nn.add(tf.keras.layers.Dense(10, activation='tanh'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [248]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [249]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)


Epoch 1/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6937 - loss: 0.6022
Epoch 2/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7200 - loss: 0.5665
Epoch 3/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7249 - loss: 0.5612
Epoch 4/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7281 - loss: 0.5565
Epoch 5/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7253 - loss: 0.5627
Epoch 6/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7246 - loss: 0.5580
Epoch 7/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7301 - loss: 0.5511
Epoch 8/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7225 - loss: 0.5586
Epoch 9/50
[1m804/804[0m [32m━━━━━━━━

In [250]:
# Evaluate the model using the test data
print(f"This model was created as an attempt to further optimize the model found in the file named \n'Alphabet_Soup_Charity.ipynb'\n")
print(f"Changes made to original model:\n1. Application_Type cutoff is now 1000.\n2. Dropped 'Status' and 'Special_Considerations' columns.\n3. Hidden layers now have 10 nodes.\n4. A hidden layer now using 'tanh' activation\n5. Number of epochs reduced to 50.\n")
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

This model was created as an attempt to further optimize the model found in the file named 
'Alphabet_Soup_Charity.ipynb'

Changes made to original model:
1. Application_Type cutoff is now 1000.
2. Dropped 'Status' and 'Special_Considerations' columns.
3. Hidden layers now have 10 nodes.
4. A hidden layer now using 'tanh' activation
5. Number of epochs reduced to 50.

268/268 - 1s - 2ms/step - accuracy: 0.7285 - loss: 0.5544
Loss: 0.5544366240501404, Accuracy: 0.7285131216049194


## Optimization Attempt 2: Preprocessing

In [251]:
# Read in the charity_data.csv.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [252]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(['EIN', 'NAME'], axis=1)

# Determine the number of unique values in each column.
print(application_df.nunique())

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [253]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
app_type_counts = application_df["APPLICATION_TYPE"].value_counts()

# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(app_type_counts[app_type_counts < 27000].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Look at CLASSIFICATION value counts to identify and replace with "Other"
class_counts = application_df['CLASSIFICATION'].value_counts()

# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(class_counts[class_counts < 17000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
print(application_df['APPLICATION_TYPE'].value_counts(), application_df['CLASSIFICATION'].value_counts())

APPLICATION_TYPE
T3       27037
Other     7262
Name: count, dtype: int64 CLASSIFICATION
C1000    17326
Other    16973
Name: count, dtype: int64


In [254]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_df = pd.get_dummies(application_df)

# Split our preprocessed data into our features and target arrays
y = encoded_df['IS_SUCCESSFUL']
X = encoded_df.drop('IS_SUCCESSFUL', axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Optimization Attempt 2: Compile, Train and Evaluate the Model

In [255]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(10, activation='relu', input_dim=len(X_train_scaled[0])))

# Second hidden layer
nn.add(tf.keras.layers.Dense(10, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(10, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="softmax"))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="SGD", metrics=["accuracy"])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [256]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)


Epoch 1/50




[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.5293 - loss: 0.6812
Epoch 2/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5307 - loss: 0.6391
Epoch 3/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5336 - loss: 0.6140
Epoch 4/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5301 - loss: 0.6046
Epoch 5/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5362 - loss: 0.6051
Epoch 6/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5359 - loss: 0.6017
Epoch 7/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5303 - loss: 0.5963
Epoch 8/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.5359 - loss: 0.5966
Epoch 9/50
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━

In [257]:
# Evaluate the model using the test data
print(f"This model was created as a second attempt to further optimize the model found in the file named \n'Alphabet_Soup_Charity.ipynb'\n")
print(f"Changes made to original model:\n1. Application_Type cutoff is now 27000.\n2. Classification cutoff is now 17000.\n3. Hidden layers now have 10 nodes.\n4. Model now has 4 layers.\n5. Output layer now using 'softmax' activation.\n6. Optimizer changed to 'SGD'.\n7. Number of epochs reduced to 50.\n")
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

This model was created as a second attempt to further optimize the model found in the file named 
'Alphabet_Soup_Charity.ipynb'

Changes made to original model:
1. Application_Type cutoff is now 27000.
2. Classification cutoff is now 17000.
3. Hidden layers now have 10 nodes.
4. Model now has 4 layers.
5. Output layer now using 'softmax' activation.
6. Optimizer changed to 'SGD'.
7. Number of epochs reduced to 50.

268/268 - 1s - 2ms/step - accuracy: 0.5359 - loss: 0.5926
Loss: 0.5925544500350952, Accuracy: 0.5358600616455078


## Optimization Attempt 3: Preprocessing

In [258]:
# Read in the charity_data.csv.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [259]:
# Drop the non-beneficial ID column 'EIN'
application_df = application_df.drop('EIN', axis=1)

# Determine the number of unique values in each column.
print(application_df.nunique())

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64


In [261]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
app_type_counts = application_df["APPLICATION_TYPE"].value_counts()

# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(app_type_counts[app_type_counts < 500].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Look at CLASSIFICATION value counts to identify and replace with "Other"
class_counts = application_df['CLASSIFICATION'].value_counts()

# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(class_counts[class_counts < 1000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
print(application_df['APPLICATION_TYPE'].value_counts(), application_df['CLASSIFICATION'].value_counts())

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64 CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64


In [262]:
# Looking for columns with more than 10 unique values
columns_with_many_uniques = [col for col in application_df.columns if application_df[col].nunique() > 10]

# Iterate over these columns and get the count of each unique value
for col in columns_with_many_uniques:
    print(f"Column: {col}")
    print(application_df[col].value_counts())
    print("-----------")

Column: NAME
NAME
PARENT BOOSTER USA INC                                                  1260
TOPS CLUB INC                                                            765
UNITED STATES BOWLING CONGRESS INC                                       700
WASHINGTON STATE UNIVERSITY                                              492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC                          408
                                                                        ... 
ST LOUIS SLAM WOMENS FOOTBALL                                              1
AIESEC ALUMNI IBEROAMERICA CORP                                            1
WEALLBLEEDRED ORG INC                                                      1
AMERICAN SOCIETY FOR STANDARDS IN MEDIUMSHIP & PSYCHICAL INVESTIGATI       1
WATERHOUSE CHARITABLE TR                                                   1
Name: count, Length: 19568, dtype: int64
-----------
Column: ASK_AMT
ASK_AMT
5000        25398
10478           3
15583           3
6398

In [263]:
# Create cutoff of 4 for name_counts and replace values with 'Other'
name_counts = application_df['NAME'].value_counts()

names_to_replace = list(name_counts[name_counts < 4].index)

for name in names_to_replace:
  application_df['NAME'] = application_df['NAME'].replace(name, 'Other')

# Check to make sure replacement was successful
print(application_df['NAME'].value_counts())

NAME
Other                                 19515
PARENT BOOSTER USA INC                 1260
TOPS CLUB INC                           765
UNITED STATES BOWLING CONGRESS INC      700
WASHINGTON STATE UNIVERSITY             492
                                      ...  
OMEGA PSI PHI FRATERNITY                  4
AMERICAN COACH ASSOCIATION                4
PTA NEW MEXICO CONGRESS                   4
INDUSTRIAL WORKERS OF THE WORLD           4
PHI DELTA KAPP INTERNATIONAL INC          4
Name: count, Length: 475, dtype: int64


In [264]:
# Create cutoff of 4 for ask_counts and replace values with 'Other'
ask_counts = application_df['ASK_AMT'].value_counts()

asks_to_replace = list(ask_counts[ask_counts < 4].index)

for ask in asks_to_replace:
  application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(ask, 'Other')

# Check to make sure replacement was successful
print(application_df['ASK_AMT'].value_counts())


ASK_AMT
5000     25398
Other     8901
Name: count, dtype: int64


In [205]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_df = pd.get_dummies(application_df)

# Split our preprocessed data into our features and target arrays
y = encoded_df['IS_SUCCESSFUL']
X = encoded_df.drop('IS_SUCCESSFUL', axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Optimization Attempt 3: Compile, Train and Evaluate the Model

In [209]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_dim=len(X_train_scaled[0])),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [216]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, validation_split=0.15, epochs=50)

Epoch 1/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8144 - loss: 0.3848 - val_accuracy: 0.7940 - val_loss: 0.5147
Epoch 2/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8099 - loss: 0.3923 - val_accuracy: 0.7942 - val_loss: 0.5059
Epoch 3/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8140 - loss: 0.3879 - val_accuracy: 0.7942 - val_loss: 0.5135
Epoch 4/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8139 - loss: 0.3865 - val_accuracy: 0.7942 - val_loss: 0.5250
Epoch 5/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8169 - loss: 0.3804 - val_accuracy: 0.7930 - val_loss: 0.5236
Epoch 6/50
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8170 - loss: 0.3850 - val_accuracy: 0.7948 - val_loss: 0.5106
Epoch 7/50
[1m684/684[0m 

In [232]:
# Evaluate the model using the test data
print(f"This model was created as a third attempt to further optimize the model found in the file named \n'Alphabet_Soup_Charity.ipynb'\n")
print(f"Changes made to original model:\n1. Name column was not dropped. Binned all names with value counts less than 4.\n2. Created a bin for Ask_Amt containing values with a value count less than 4.\n3. There are 5 layers now. Neurons for each are 64, 32, 16, 8, 1.\n4. Validation split of 0.15 added to .fit().\n5. Number of epochs reduced to 50.\n")
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

This model was created as a third attempt to further optimize the model found in the file named 
'Alphabet_Soup_Charity.ipynb'

Changes made to original model:
1. Name column was not dropped. Binned all names with value counts less than 4.
2. Created a bin for Ask_Amt containing values with a value count less than 4.
3. There are 5 layers now. Neurons for each are 64, 32, 16, 8, 1.
4. Validation split of 0.15 added to .fit().
5. Number of epochs reduced to 50.

268/268 - 0s - 1ms/step - accuracy: 0.8010 - loss: 0.5224
Loss: 0.5223952531814575, Accuracy: 0.8010495901107788


In [265]:
# Export our model to HDF5 file
nn.save('AlphabetSoupCharity_Optimization.h5')

