## Preprocessing

In [5]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [6]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
#  YOUR CODE GOES HERE
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [7]:
# Determine the number of unique values in each column.
application_df.nunique()

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [8]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [9]:
# Determine the number of data points for each unique value in columns with more than 10 unique values
columns_to_analyze = application_df.columns[application_df.nunique() > 10]
for column in columns_to_analyze:
    print(f"\nValue counts for {column}:")
    print(application_df[column].value_counts())

# Based on value counts, determine cutoff points and replace rare values with "Other"
# For APPLICATION_TYPE, cutoff of 500 data points
application_types_to_replace = application_df['APPLICATION_TYPE'].value_counts()[application_df['APPLICATION_TYPE'].value_counts() < 500].index
for app_type in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app_type, 'Other')

# For CLASSIFICATION, cutoff of 1000 data points
classifications_to_replace = application_df['CLASSIFICATION'].value_counts()[application_df['CLASSIFICATION'].value_counts() < 1000].index
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, 'Other')

# Convert categorical variables to numeric using pd.get_dummies
application_df_encoded = pd.get_dummies(application_df)

# Split the preprocessed data into features (X) and target (y)
y = application_df_encoded['IS_SUCCESSFUL']
X = application_df_encoded.drop(columns=['IS_SUCCESSFUL'])

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



Value counts for APPLICATION_TYPE:
APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64

Value counts for CLASSIFICATION:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64

Value counts for ASK_AMT:
ASK_AMT
5000        25398
10478           3
15583           3
63981           3
6725            3
            ...  
5371754         1
30060           1
43091152        1
18683           1
36500179        1
Name: count, Length: 8747, dtype: int64


In [10]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
# Cutoff of > 500 data points for APPLICATION_TYPE
application_types_to_replace = ['T9', 'T13', 'T12', 'T2', 'T14', 'T25', 'T15', 'T29', 'T17']

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [11]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Create a list of classifications to replace (counts < 100)
classifications_to_replace = classification_counts[classification_counts < 100].index

# Replace classifications with "Other" where count < 100
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, 'Other')

# Check the updated value counts
print(application_df['CLASSIFICATION'].value_counts())


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64


In [12]:
# Display value counts for CLASSIFICATION 'Other'
other_classifications_counts = application_df[application_df['CLASSIFICATION'] == 'Other']['CLASSIFICATION'].value_counts()
print(other_classifications_counts)


CLASSIFICATION
Other    2261
Name: count, dtype: int64


In [13]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
#  YOUR CODE GOES HERE

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df_encoded = pd.get_dummies(application_df)

In [15]:
# Split our preprocessed data into our features and target arrays
y = application_df_encoded['IS_SUCCESSFUL']
X = application_df_encoded.drop(columns=['IS_SUCCESSFUL'])

# Split the preprocessed data into training and testing datasets
# train_test_split from sklearn automatically shuffles and splits the data
# - X contains our feature variables (all columns except IS_SUCCESSFUL)
# - y contains our target variable (IS_SUCCESSFUL column)
# - random_state=42 ensures reproducibility of the split
# - Default test_size=0.25 means 75% training data, 25% testing data
# This split allows us to:
# 1. Train the model on one dataset (X_train, y_train)
# 2. Evaluate performance on separate holdout data (X_test, y_test)
# 3. Avoid overfitting by testing on unseen data
# 4. Random state for reproducibility

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [17]:
# Compile, Train, and Evaluate the Model

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=80, activation='relu', input_dim=43))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Model Parameters:
- Input dimension: 43 features (input_dim=43)
- First hidden layer: 80 neurons with ReLU activation
- Second hidden layer: 30 neurons with ReLU activation  
- Output layer: 1 neuron with sigmoid activation
- Loss function: binary_crossentropy
- Optimizer: adam
- Metrics: accuracy
- Epochs: 100


In [18]:
# Check the shape of X_train_scaled to see number of features/parameters
print(f"Number of features in X_train_scaled: {X_train_scaled.shape[1]}")
print(f"Number of samples in X_train_scaled: {X_train_scaled.shape[0]}")
print("\nX_train_scaled shape:", X_train_scaled.shape)


Number of features in X_train_scaled: 43
Number of samples in X_train_scaled: 25724

X_train_scaled shape: (25724, 43)


In [19]:
# Get the column names from application_df_encoded
print("\nColumns in application_df_encoded:")
for column in application_df_encoded.columns:
    print(column)



Columns in application_df_encoded:
STATUS
ASK_AMT
IS_SUCCESSFUL
APPLICATION_TYPE_Other
APPLICATION_TYPE_T10
APPLICATION_TYPE_T19
APPLICATION_TYPE_T3
APPLICATION_TYPE_T4
APPLICATION_TYPE_T5
APPLICATION_TYPE_T6
APPLICATION_TYPE_T7
APPLICATION_TYPE_T8
AFFILIATION_CompanySponsored
AFFILIATION_Family/Parent
AFFILIATION_Independent
AFFILIATION_National
AFFILIATION_Other
AFFILIATION_Regional
CLASSIFICATION_C1000
CLASSIFICATION_C1200
CLASSIFICATION_C2000
CLASSIFICATION_C2100
CLASSIFICATION_C3000
CLASSIFICATION_Other
USE_CASE_CommunityServ
USE_CASE_Heathcare
USE_CASE_Other
USE_CASE_Preservation
USE_CASE_ProductDev
ORGANIZATION_Association
ORGANIZATION_Co-operative
ORGANIZATION_Corporation
ORGANIZATION_Trust
INCOME_AMT_0
INCOME_AMT_1-9999
INCOME_AMT_10000-24999
INCOME_AMT_100000-499999
INCOME_AMT_10M-50M
INCOME_AMT_1M-5M
INCOME_AMT_25000-99999
INCOME_AMT_50M+
INCOME_AMT_5M-10M
SPECIAL_CONSIDERATIONS_N
SPECIAL_CONSIDERATIONS_Y


In [20]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the model
# Update input_dim in first layer to match input shape of 43 features
nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=80, activation='relu', input_dim=43))
nn.add(tf.keras.layers.Dense(units=30, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6904 - loss: 0.6004
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7327 - loss: 0.5511
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7317 - loss: 0.5513
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7319 - loss: 0.5507
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7336 - loss: 0.5441
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7341 - loss: 0.5468
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7275 - loss: 0.5506
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7357 - loss: 0.5422
Epoch 9/100
[1m804/804[0m [32

In [22]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Get the actual columns from X_train_scaled
num_features = X_train_scaled.shape[1]
feature_names = application_df_encoded.columns[:num_features]

# Convert numpy arrays back to dataframes using only the matching columns
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names, index=None)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names, index=None)

# Define columns to drop (only include columns that exist in our dataframe)
columns_to_drop = [col for col in ['SPECIAL_CONSIDERATIONS_Y', 'AFFILIATION_Regional',
                  'CLASSIFICATION_C7000', 'USE_CASE_Other',
                  'INCOME_AMT_50M+', 'APPLICATION_TYPE_T7']
                  if col in feature_names]

# Drop columns using drop() method
X_train_scaled_reduced = X_train_scaled_df.drop(columns=columns_to_drop).values
X_test_scaled_reduced = X_test_scaled_df.drop(columns=columns_to_drop).values

# Show number of columns in reduced datasets for comparison
print("\nAfter dropping columns:")
print("Number of columns in X_train_scaled_reduced:", X_train_scaled_reduced.shape[1])
print("Number of columns in X_test_scaled_reduced:", X_test_scaled_reduced.shape[1])


After dropping columns:
Number of columns in X_train_scaled_reduced: 39
Number of columns in X_test_scaled_reduced: 39


In [23]:
# Show the number of columns in both datasets
print("Number of columns in X_train_scaled:", X_train_scaled.shape[1])
print("Number of columns in X_test_scaled:", X_test_scaled.shape[1])

# Also show number of columns in reduced datasets for comparison
print("\nAfter dropping columns:")
print("Number of columns in X_train_scaled_reduced:", X_train_scaled_reduced.shape[1])
print("Number of columns in X_test_scaled_reduced:", X_test_scaled_reduced.shape[1])

Number of columns in X_train_scaled: 43
Number of columns in X_test_scaled: 43

After dropping columns:
Number of columns in X_train_scaled_reduced: 39
Number of columns in X_test_scaled_reduced: 39


In [24]:
# Create and compile model with updated input dimensions
nn_reduced = tf.keras.models.Sequential()
nn_reduced.add(tf.keras.layers.Dense(units=80, activation='relu', input_dim=39))
nn_reduced.add(tf.keras.layers.Dense(units=30, activation='relu'))
nn_reduced.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
nn_reduced.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the reduced model
fit_model_reduced = nn_reduced.fit(X_train_scaled_reduced, y_train, epochs=100)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7053 - loss: 0.5901
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7339 - loss: 0.5480
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7262 - loss: 0.5591
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7354 - loss: 0.5496
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7318 - loss: 0.5503
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7369 - loss: 0.5428
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7344 - loss: 0.5444
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7354 - loss: 0.5444
Epoch 9/100
[1m804/804[0m [32m━━━━━━━━━━━

In [25]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7290 - loss: 0.5602
Loss: 0.560157835483551, Accuracy: 0.7289795875549316


In [26]:
# Export our model to HDF5 file
nn_reduced.save('AlphabetSoupCharity.h5', save_format='h5')

