In [4]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd


In [5]:
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df = application_df.drop(columns=["EIN", "NAME"])
application_df.nunique()
application_type_counts = application_df["APPLICATION_TYPE"].value_counts().to_dict()
cutoff_value = 527
application_types_to_replace = []
for atype in application_type_counts:
    if application_type_counts[atype] < cutoff_value:
        application_types_to_replace.append(atype)
# Replace in dataframe
# After creating the application_types_to_replace list,
# another for loop is used to replace the application types in the "APPLICATION_TYPE" column of the DataFrame with "Other".
# This ensures that the application types identified in the application_types_to_replace list are replaced accordingly.
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()
classifications_counts = application_df["CLASSIFICATION"].value_counts().to_dict()
cutoff_value_c = 1882
classifications_to_replace = []
for classf in classifications_counts:
    if classifications_counts[classf] < cutoff_value_c:
        classifications_to_replace.append(classf)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")


categorical_columns = ['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS']


dummy_df = pd.get_dummies(application_df[categorical_columns], columns=categorical_columns)



application_df.drop(columns=categorical_columns, inplace=True)

application_df = pd.concat([application_df, dummy_df], axis=1)



Attempt 1:

In [7]:
def detect_columns_with_outliers_iqr(df, lower_percentile=25, upper_percentile=75, multiplier=1.5):
    # Initialize an empty list to store column names with outliers
    columns_with_outliers = []

    # Loop through each column in the DataFrame
    for column_name in df.columns:
        # Calculate the IQR for the column
        Q1 = df[column_name].quantile(lower_percentile / 100)
        Q3 = df[column_name].quantile(upper_percentile / 100)
        IQR = Q3 - Q1

        # Define the lower and upper bounds to identify outliers
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR

        # Check if any value is outside the bounds
        if any((df[column_name] < lower_bound) | (df[column_name] > upper_bound)):
            columns_with_outliers.append(column_name)

    return columns_with_outliers


def drop_outliers_iqr(df, column_name, lower_percentile=25, upper_percentile=75, multiplier=1.5):
    # Calculate the IQR for the specified column
    Q1 = df[column_name].quantile(lower_percentile / 100)
    Q3 = df[column_name].quantile(upper_percentile / 100)
    IQR = Q3 - Q1

    # Define the lower and upper bounds to identify outliers
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Drop rows with values outside the bounds
    df_cleaned = df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

    return df_cleaned



In [11]:
y=application_df['IS_SUCCESSFUL']
x = application_df.drop(columns=['IS_SUCCESSFUL'])
print(x.shape)
columns_with_outliers = detect_columns_with_outliers_iqr(x, lower_percentile=25, upper_percentile=75, multiplier=1.5)
print("Columns with outliers based on IQR:", columns_with_outliers)
for c in columns_with_outliers:
  x=drop_outliers_iqr(x,c,lower_percentile=25, upper_percentile=75, multiplier=1.5)
print(x.shape)

(34299, 43)
Columns with outliers based on IQR: ['STATUS', 'ASK_AMT', 'APPLICATION_TYPE_Other', 'APPLICATION_TYPE_T10', 'APPLICATION_TYPE_T19', 'APPLICATION_TYPE_T3', 'APPLICATION_TYPE_T4', 'APPLICATION_TYPE_T5', 'APPLICATION_TYPE_T6', 'APPLICATION_TYPE_T7', 'APPLICATION_TYPE_T8', 'AFFILIATION_Family/Parent', 'AFFILIATION_National', 'AFFILIATION_Other', 'AFFILIATION_Regional', 'CLASSIFICATION_C1200', 'CLASSIFICATION_C2000', 'CLASSIFICATION_C2100', 'CLASSIFICATION_C3000', 'CLASSIFICATION_Other', 'USE_CASE_CommunityServ', 'USE_CASE_Heathcare', 'USE_CASE_Other', 'USE_CASE_Preservation', 'USE_CASE_ProductDev', 'ORGANIZATION_Co-operative', 'ORGANIZATION_Corporation', 'INCOME_AMT_1-9999', 'INCOME_AMT_10000-24999', 'INCOME_AMT_100000-499999', 'INCOME_AMT_10M-50M', 'INCOME_AMT_1M-5M', 'INCOME_AMT_25000-99999', 'INCOME_AMT_50M+', 'INCOME_AMT_5M-10M', 'SPECIAL_CONSIDERATIONS_N', 'SPECIAL_CONSIDERATIONS_Y']
(11478, 43)


In [12]:

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
new_nn = tf.keras.models.Sequential()

# Input layer
new_nn.add(tf.keras.layers.InputLayer(input_shape=(X_train_scaled.shape[1],)))

# First hidden layer with 64 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Dropout layer to reduce overfitting
new_nn.add(tf.keras.layers.Dropout(0.2))

# Second hidden layer with 32 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Third hidden layer with 16 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Fourth hidden layer with 8 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=8, activation='relu'))

# Output layer with 1 neuron and sigmoid activation for binary classification
new_nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the new model
new_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Check the summary of the new model
new_nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                2816      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 8)                 136       
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                 
Total params: 5,569
Trainable params: 5,569
Non-trainabl

In [15]:
num_epochs=150
batch_size=32
new_nn.fit(X_train_scaled, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_scaled, y_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ac96e153af0>

In [17]:
model_loss, model_accuracy = new_nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 1s - loss: 0.5640 - accuracy: 0.7245 - 576ms/epoch - 3ms/step
Loss: 0.5640274286270142, Accuracy: 0.7244898080825806


Attemp 2

In [18]:
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df = application_df.drop(columns=["EIN", "NAME"])
application_df.nunique()
application_type_counts = application_df["APPLICATION_TYPE"].value_counts().to_dict()
cutoff_value = 100
application_types_to_replace = []
for atype in application_type_counts:
    if application_type_counts[atype] < cutoff_value:
        application_types_to_replace.append(atype)
# Replace in dataframe
# After creating the application_types_to_replace list,
# another for loop is used to replace the application types in the "APPLICATION_TYPE" column of the DataFrame with "Other".
# This ensures that the application types identified in the application_types_to_replace list are replaced accordingly.
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()
classifications_counts = application_df["CLASSIFICATION"].value_counts().to_dict()
cutoff_value_c = 100
classifications_to_replace = []
for classf in classifications_counts:
    if classifications_counts[classf] < cutoff_value_c:
        classifications_to_replace.append(classf)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")


categorical_columns = ['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS']


dummy_df = pd.get_dummies(application_df[categorical_columns], columns=categorical_columns)



application_df.drop(columns=categorical_columns, inplace=True)

application_df = pd.concat([application_df, dummy_df], axis=1)


In [19]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
new_nn = tf.keras.models.Sequential()

# Input layer
new_nn.add(tf.keras.layers.InputLayer(input_shape=(X_train_scaled.shape[1],)))

# First hidden layer with 64 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Dropout layer to reduce overfitting
new_nn.add(tf.keras.layers.Dropout(0.2))

# Second hidden layer with 32 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Third hidden layer with 16 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Fourth hidden layer with 8 neurons and ReLU activation
new_nn.add(tf.keras.layers.Dense(units=8, activation='relu'))

# Output layer with 1 neuron and sigmoid activation for binary classification
new_nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the new model
new_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Check the summary of the new model
new_nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 64)                2816      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 16)                528       
                                                                 
 dense_8 (Dense)             (None, 8)                 136       
                                                                 
 dense_9 (Dense)             (None, 1)                 9         
                                                                 
Total params: 5,569
Trainable params: 5,569
Non-traina

In [21]:
num_epochs=150
batch_size=32
new_nn.fit(X_train_scaled, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_scaled, y_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ac95a63ae60>

In [22]:
model_loss, model_accuracy = new_nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - loss: 0.5622 - accuracy: 0.7265 - 309ms/epoch - 1ms/step
Loss: 0.5622490644454956, Accuracy: 0.7265306115150452


Attempt 3:

In [24]:
sophisticated_nn = tf.keras.models.Sequential()

sophisticated_nn.add(tf.keras.layers.InputLayer(input_shape=(X_train_scaled.shape[1],)))


sophisticated_nn.add(tf.keras.layers.Dense(units=128, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dropout(0.2))


sophisticated_nn.add(tf.keras.layers.Dense(units=64, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=32, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=16, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


sophisticated_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [25]:
num_epochs=50
batch_size=32
sophisticated_nn.fit(X_train_scaled, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_scaled, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ac95a632fe0>

In [26]:
model_loss, model_accuracy = sophisticated_nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - loss: 0.5615 - accuracy: 0.7252 - 467ms/epoch - 2ms/step
Loss: 0.5614736676216125, Accuracy: 0.725218653678894


Attempt 4:

In [28]:
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df = application_df.drop(columns=["EIN", "NAME"])
application_df.nunique()
application_type_counts = application_df["APPLICATION_TYPE"].value_counts().to_dict()
cutoff_value = 527
application_types_to_replace = []
for atype in application_type_counts:
    if application_type_counts[atype] < cutoff_value:
        application_types_to_replace.append(atype)
# Replace in dataframe
# After creating the application_types_to_replace list,
# another for loop is used to replace the application types in the "APPLICATION_TYPE" column of the DataFrame with "Other".
# This ensures that the application types identified in the application_types_to_replace list are replaced accordingly.
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()
classifications_counts = application_df["CLASSIFICATION"].value_counts().to_dict()
cutoff_value_c = 1882
classifications_to_replace = []
for classf in classifications_counts:
    if classifications_counts[classf] < cutoff_value_c:
        classifications_to_replace.append(classf)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")


categorical_columns = ['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS']


dummy_df = pd.get_dummies(application_df[categorical_columns], columns=categorical_columns)



application_df.drop(columns=categorical_columns, inplace=True)

application_df = pd.concat([application_df, dummy_df], axis=1)


In [29]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
sophisticated_nn = tf.keras.models.Sequential()

sophisticated_nn.add(tf.keras.layers.InputLayer(input_shape=(X_train_scaled.shape[1],)))


sophisticated_nn.add(tf.keras.layers.Dense(units=128, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dropout(0.2))


sophisticated_nn.add(tf.keras.layers.Dense(units=64, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=32, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=16, activation='relu'))


sophisticated_nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


sophisticated_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [31]:
num_epochs=80
batch_size=32
sophisticated_nn.fit(X_train_scaled, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_scaled, y_test))

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7ac95a27a0e0>

In [32]:
model_loss, model_accuracy = sophisticated_nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - loss: 0.5636 - accuracy: 0.7259 - 454ms/epoch - 2ms/step
Loss: 0.5636117458343506, Accuracy: 0.7259474992752075
