## Preprocessing. Read data file 'charity_data.csv', and create dataframe 'application_df'

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read charity_data.csv from the course's static host into a DataFrame.
# pandas is already imported as `pd` at the top of this cell, so it is not
# imported a second time here.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)

# Preview the first five rows.
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


## Output characteristics and properties of dataframe.

In [2]:
# (rows, columns) of the DataFrame as loaded from the CSV
application_df.shape

(34299, 12)

In [3]:
# Summarize the column names, non-null counts and dtypes
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EIN                     34299 non-null  int64 
 1   NAME                    34299 non-null  object
 2   APPLICATION_TYPE        34299 non-null  object
 3   AFFILIATION             34299 non-null  object
 4   CLASSIFICATION          34299 non-null  object
 5   USE_CASE                34299 non-null  object
 6   ORGANIZATION            34299 non-null  object
 7   STATUS                  34299 non-null  int64 
 8   INCOME_AMT              34299 non-null  object
 9   SPECIAL_CONSIDERATIONS  34299 non-null  object
 10  ASK_AMT                 34299 non-null  int64 
 11  IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.1+ MB


### Drop the non-beneficial columns 'EIN', 'NAME', 'STATUS', and 'SPECIAL_CONSIDERATIONS'

In [4]:
# Drop the columns that are not used as model inputs: the identifier columns
# 'EIN' and 'NAME', plus 'STATUS' and 'SPECIAL_CONSIDERATIONS'.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run once the columns are gone.
columns_to_drop = ['EIN', 'NAME', 'STATUS', 'SPECIAL_CONSIDERATIONS']
application_df = application_df.drop(columns=columns_to_drop, errors="ignore")

# Confirm fields have been dropped.
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   APPLICATION_TYPE  34299 non-null  object
 1   AFFILIATION       34299 non-null  object
 2   CLASSIFICATION    34299 non-null  object
 3   USE_CASE          34299 non-null  object
 4   ORGANIZATION      34299 non-null  object
 5   INCOME_AMT        34299 non-null  object
 6   ASK_AMT           34299 non-null  int64 
 7   IS_SUCCESSFUL     34299 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 2.1+ MB


### Confirmed that the fields 'EIN', 'NAME', 'STATUS', and 'SPECIAL_CONSIDERATIONS' have been dropped. 8 columns remain: 7 features plus the IS_SUCCESSFUL target.

## Determine the number of Unique values in each column.

In [5]:
# Determine the number of unique values in each column.
# The result is stored rather than displayed; uncomment the print to inspect it.
unique_values = application_df.nunique()
# print(unique_values)

## Determine the number of data points for each unique value for columns with more than 10 unique values, in this case APPLICATION_TYPE.

In [6]:
# Look at APPLICATION_TYPE value counts for binning
# Count how many rows carry each distinct 'APPLICATION_TYPE' value
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Print the value counts (disabled; uncomment to inspect)
# print(application_type_counts)

## Selected 500 as the cutoff point for APPLICATION_TYPE. All application types with fewer than 500 rows are binned as 'Other'.

In [7]:
# Tally how many rows fall under each 'APPLICATION_TYPE' value
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Types with fewer rows than this threshold will be binned together
cutoff = 500

# Collect the labels of every application type that falls below the threshold
is_rare_type = application_type_counts < cutoff
application_types_to_replace = list(application_type_counts[is_rare_type].index)

# Print the list of application types to replace
# print(application_types_to_replace)

In [8]:
# Replace in dataframe: relabel every rare application type as "Other".
# One vectorized isin/mask call handles all rare types at once, instead of
# rewriting the whole column once per rare type in a Python loop.
is_rare_application_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(
    is_rare_application_type, "Other"
)

# Check to make sure binning was successful
# application_df['APPLICATION_TYPE'].value_counts()

### Binning is successful

## Selected 1000 as the cutoff point for CLASSIFICATION. First counted the number of rows in each classification, then looked at the classifications that occur more than once, then binned those with fewer than 1000 rows as 'Other'.

In [9]:
# Look at CLASSIFICATION value counts for binning:
# count how many rows carry each distinct 'CLASSIFICATION' value
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Print the value counts (disabled; uncomment to inspect)
# print(classification_counts)

In [10]:
# Keep only the classifications that occur more than once, to get a shorter
# list to inspect when choosing a binning cutoff
classifications_greater_than_one = classification_counts[classification_counts > 1]

# Print the filtered classifications and their counts (disabled; uncomment to inspect)
# print(classifications_greater_than_one)

In [11]:
# Choose a cutoff value and create a list of classifications to be replaced
cutoff = 1000

# Create a list of classifications to be replaced (those below the cutoff)
classifications_to_replace = classification_counts[classification_counts < cutoff].index.tolist()

# Replace in dataframe: relabel every rare classification as "Other".
# One vectorized isin/mask call handles all rare classifications at once,
# instead of rewriting the whole column once per rare value in a Python loop.
is_rare_classification = application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(
    is_rare_classification, "Other"
)

# Check to make sure binning was successful
# application_df['CLASSIFICATION'].value_counts()

### Encode Categorical variables using pd.get_dummies()

In [12]:
# Convert categorical data to numeric with `pd.get_dummies`.
# dtype=int forces 0/1 integer indicator columns. pandas >= 2.0 otherwise
# emits booleans, and a frame mixing int64 and bool columns becomes an
# object-dtype array when converted to NumPy.
application_df_numeric = pd.get_dummies(application_df, dtype=int)

# Print the first five rows of the new DataFrame to verify the transformation
# print(application_df_numeric.head(5))

## Split this preprocessed data into a Target array (IS_SUCCESSFUL), and Features array.

In [13]:
# Target: the IS_SUCCESSFUL column as a NumPy array
target_column = 'IS_SUCCESSFUL'
y = application_df_numeric[target_column].values

# Features: every remaining column, also as a NumPy array
feature_frame = application_df_numeric.drop(columns=[target_column])
X = feature_frame.values

# Report both shapes to confirm the split
print("Shape of Features Matrix:", X.shape)
print("Shape of Target Array:", y.shape)

Shape of Features Matrix: (34299, 40)
Shape of Target Array: (34299,)


### Split the data set into train and test

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece to verify the split
for label, part in (
    ("Training set shape (features):", X_train),
    ("Test set shape (features):", X_test),
    ("Training set shape (target):", y_train),
    ("Test set shape (target):", y_test),
):
    print(label, part.shape)

Training set shape (features): (27439, 40)
Test set shape (features): (6860, 40)
Training set shape (target): (27439,)
Test set shape (target): (6860,)


## Create a StandardScaler instance, fit it on the training features, and scale the train and test features

In [15]:
# Build the scaler and fit it on the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_scaler = scaler  # fit() returns the same estimator object

# Apply the fitted scaler to both the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Define, Compile, Train and Evaluate the Model

In [16]:
#### Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
n_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object. Passing `input_dim`
# to the first Dense layer makes recent Keras versions emit a UserWarning that
# recommends an Input(shape) object as the first layer of a Sequential model.
nn.add(tf.keras.Input(shape=(n_features,)))

# First hidden layer
nn.add(Dense(128, activation='relu'))

# Second hidden layer
nn.add(Dense(64, activation='relu'))

# Third hidden layer
nn.add(Dense(32, activation='relu'))

# Fourth hidden layer
nn.add(Dense(16, activation='relu'))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn.add(Dense(1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
# Compile for binary classification: binary cross-entropy loss to pair with the
# single sigmoid output unit, the Adam optimizer, and accuracy as the reported metric.
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
# Model summary: print an overview of the network's layers
nn.summary()

### Confirm the shapes of X_train_scaled and y_train, and cast both to float32.

In [19]:
# Show the shapes of the training inputs and labels
for label, arr in (
    ("Shape of X_train_scaled:", X_train_scaled),
    ("Shape of y_train_:", y_train),
):
    print(label, arr.shape)

# Cast both training arrays to float32
X_train_scaled, y_train = (
    arr.astype('float32') for arr in (X_train_scaled, y_train)
)

Shape of X_train_scaled: (27439, 40)
Shape of y_train_: (27439,)


## Fit the Model over 50 Epochs.

In [20]:
# Train the model for 50 epochs in mini-batches of 10 samples.
# validation_split=0.2 holds back 20% of the training data to report
# val_loss / val_accuracy each epoch; verbose=1 prints per-epoch progress.
# The returned History object is kept so the metrics can be read afterwards.
history = nn.fit(X_train_scaled, y_train, epochs=50, batch_size=10, validation_split=0.2, verbose=1)

Epoch 1/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7111 - loss: 0.5820 - val_accuracy: 0.7349 - val_loss: 0.5599
Epoch 2/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7228 - loss: 0.5601 - val_accuracy: 0.7367 - val_loss: 0.5502
Epoch 3/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7281 - loss: 0.5556 - val_accuracy: 0.7334 - val_loss: 0.5461
Epoch 4/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7312 - loss: 0.5525 - val_accuracy: 0.7356 - val_loss: 0.5479
Epoch 5/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7295 - loss: 0.5535 - val_accuracy: 0.7369 - val_loss: 0.5467
Epoch 6/50
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7306 - loss: 0.5509 - val_accuracy: 0.7392 - val_loss: 0.5524
Epoch 7/50
[1m2

### Final metrics for the 4-hidden-layer model after dropping the STATUS and SPECIAL_CONSIDERATIONS features

In [21]:
# Per-epoch metric lists recorded during training
history_dict = history.history

# Take the last-epoch value of each tracked metric
(
    final_train_loss,
    final_train_accuracy,
    final_validation_loss,
    final_validation_accuracy,
) = (
    history_dict[metric][-1]
    for metric in ('loss', 'accuracy', 'val_loss', 'val_accuracy')
)

# Report each final metric to four decimal places
report_lines = [
    f"Final training loss: {final_train_loss:.4f}",
    f"Final training accuracy: {final_train_accuracy:.4f}",
    f"Final validation loss: {final_validation_loss:.4f}",
    f"Final validation accuracy: {final_validation_accuracy:.4f}",
]
for line in report_lines:
    print(line)

Final training loss: 0.5374
Final training accuracy: 0.7374
Final validation loss: 0.5546
Final validation accuracy: 0.7371


### Export the Model to an HDF5 file called 'AlphabetSoup-drop_features.Charity.h5'

In [22]:
# Export our model to HDF5 file
# NOTE(review): load_model is imported here but not used in this cell.
from tensorflow.keras.models import load_model

# Save the trained model to disk (this cell writes the file; it does not load one).
# Saving with the .h5 extension requires the h5py package to be importable in
# the kernel's environment; without it Keras raises ImportError.
nn.save('AlphabetSoup-drop_features.Charity.h5')



ImportError: `save_model()` using h5 format requires h5py. Could not import h5py.