# Tweaking the Features, Part 2

Our first round of feature tweaks brought us a certain set of features, but I also want to test the influence of the number of values collapsed into the Other value for certain features.

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [2]:
application_df = pd.read_csv("Data/charity_data.csv")[["APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION", "ORGANIZATION", "INCOME_AMT", "IS_SUCCESSFUL"]]

In [3]:
application_df['APPLICATION_TYPE'].value_counts()

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [4]:
application_df['AFFILIATION'].value_counts()

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

In [5]:
temp = application_df['CLASSIFICATION'].value_counts()
temp[temp > 10]

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
Name: CLASSIFICATION, dtype: int64

In [6]:
application_df['ORGANIZATION'].value_counts()

Trust           23515
Association     10255
Co-operative      486
Corporation        43
Name: ORGANIZATION, dtype: int64

In [7]:
application_df['INCOME_AMT'].value_counts()

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

In [8]:
# Only generate for APPLICATION_TYPE, CLASSIFICATION, and INCOME_AMT
# because the others were rather obvious where the outliers start
def get_column_counts():
    for app_type_count in range(5,9):
        for classify_count in range(5,8):
            for inc_count in range(3,5):
                yield (app_type_count, classify_count, inc_count)

In [9]:
def create_app_df(col_counts):
    application_df = pd.read_csv("Data/charity_data.csv")[["APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION", "ORGANIZATION", "INCOME_AMT", "IS_SUCCESSFUL"]]
    # Choose a cutoff value and create a list of application types to be replaced
    for app in application_df['APPLICATION_TYPE'].value_counts().index[col_counts[0]:]:
        application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

    # Choose a cutoff value and create a list of affiliations to be replaced
    for app in application_df['AFFILIATION'].value_counts().index[2:]:
        application_df['AFFILIATION'] = application_df['AFFILIATION'].replace(app,"Other")

    # Choose a cutoff value and create a list of classifications to be replaced
    for cls in application_df['CLASSIFICATION'].value_counts().index[col_counts[1]:]:
        application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

    # Choose a cutoff value and create a list of organizations to be replaced
    for app in application_df['ORGANIZATION'].value_counts().index[2:]:
        application_df['ORGANIZATION'] = application_df['ORGANIZATION'].replace(app,"Other")

    # Choose a cutoff value and create a list of income amounts to be replaced
    for inc in application_df['INCOME_AMT'].value_counts().index[col_counts[2]:]:
        application_df['INCOME_AMT'] = application_df['INCOME_AMT'].replace(inc,"Other")

    return pd.get_dummies(application_df)

In [10]:
def create_model(num_cols):
    # Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
    # Use most efficient neural network from tuning notebook. Won't always be most efficient, but it gives us a chance
    # 0 - selu 6, 1 - selu 13, 2 - tanh 7, output - sigmoid 1
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(units=6, activation="selu", input_dim=num_cols))
    model.add(tf.keras.layers.Dense(units=13, activation="selu"))
    model.add(tf.keras.layers.Dense(units=7, activation="tanh"))
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    
    return model

In [11]:
# Beware, this can take almost 6 hours to run!
all_model_stats = []

for col_counts in get_column_counts():
    application_df = create_app_df(col_counts)

    model_summary = { 'counts': col_counts }

    # Split our preprocessed data into our features and target arrays
    features = application_df.drop("IS_SUCCESSFUL", axis=1)
    target = application_df["IS_SUCCESSFUL"]

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)

    # Create a StandardScaler instance
    # Fit the StandardScaler
    X_scaler = StandardScaler().fit(X_train)

    # Scale the data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    model = create_model(len(features.columns))

    # Train the model
    history = model.fit(
        X_train_scaled,
        y_train,
        epochs=34,
        verbose=0
        )

    print(col_counts)
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    model_summary["accuracy"] = model_accuracy
    model_summary["loss"] = model_loss

    all_model_stats.append(model_summary)

(5, 5, 3)
268/268 - 0s - loss: 0.5618 - accuracy: 0.7221 - 409ms/epoch - 2ms/step
(5, 5, 4)
268/268 - 0s - loss: 0.5628 - accuracy: 0.7234 - 375ms/epoch - 1ms/step
(5, 6, 3)
268/268 - 0s - loss: 0.5629 - accuracy: 0.7186 - 377ms/epoch - 1ms/step
(5, 6, 4)
268/268 - 0s - loss: 0.5621 - accuracy: 0.7237 - 378ms/epoch - 1ms/step
(5, 7, 3)
268/268 - 0s - loss: 0.5642 - accuracy: 0.7212 - 376ms/epoch - 1ms/step
(5, 7, 4)
268/268 - 0s - loss: 0.5641 - accuracy: 0.7178 - 400ms/epoch - 1ms/step
(6, 5, 3)
268/268 - 0s - loss: 0.5634 - accuracy: 0.7226 - 394ms/epoch - 1ms/step
(6, 5, 4)
268/268 - 0s - loss: 0.5606 - accuracy: 0.7256 - 391ms/epoch - 1ms/step
(6, 6, 3)
268/268 - 0s - loss: 0.5623 - accuracy: 0.7237 - 380ms/epoch - 1ms/step
(6, 6, 4)
268/268 - 0s - loss: 0.5620 - accuracy: 0.7215 - 378ms/epoch - 1ms/step
(6, 7, 3)
268/268 - 0s - loss: 0.5626 - accuracy: 0.7224 - 376ms/epoch - 1ms/step
(6, 7, 4)
268/268 - 0s - loss: 0.5602 - accuracy: 0.7265 - 383ms/epoch - 1ms/step
(7, 5, 3)
268/26

In [12]:
# Print all summaries, in order of descending accuracy
for i, model_stats in enumerate(sorted(all_model_stats, key=lambda x: x["accuracy"], reverse=True)):
    print(f'Number {i+1} most effective\nCounts: {model_stats["counts"]}\n{model_stats["accuracy"]}\n{model_stats["loss"]}\n--------------')

Number 1 most effective
Counts: (8, 5, 3)
0.72967928647995
0.5594186186790466
--------------
Number 2 most effective
Counts: (7, 6, 4)
0.728396475315094
0.5572439432144165
--------------
Number 3 most effective
Counts: (7, 6, 3)
0.7280466556549072
0.5593292713165283
--------------
Number 4 most effective
Counts: (8, 7, 4)
0.7280466556549072
0.5587282776832581
--------------
Number 5 most effective
Counts: (7, 5, 3)
0.7278134226799011
0.5592970252037048
--------------
Number 6 most effective
Counts: (7, 7, 3)
0.7276967763900757
0.558948278427124
--------------
Number 7 most effective
Counts: (7, 5, 4)
0.727580189704895
0.5584208965301514
--------------
Number 8 most effective
Counts: (8, 7, 3)
0.727580189704895
0.5576977729797363
--------------
Number 9 most effective
Counts: (8, 6, 4)
0.7272303104400635
0.5575675368309021
--------------
Number 10 most effective
Counts: (7, 7, 4)
0.7269970774650574
0.5587436556816101
--------------
Number 11 most effective
Counts: (6, 7, 4)
0.7265306115

# Analysis

Given that there was very little appreciable difference between any of these permutations, I think we can safely say that where the line is drawn for outliers doesn't matter much for these features. When you include the results of FeatureTweaks done prior to this, and the fact that its most efficient accuracies were within random-chance tolerances of these results, I don't think it that most of the outliers really affect the training chances at all.

Still, I'll use the counts from the number 1 most effective here, if only because they were the cutoffs that I had expected would produce the best results.

We're still sitting below our target accuracy, though, so in absence of other ideas, I'll perform a round of more intensive tuning in AlphabetSoupCharity_TuningPt2