# Imports

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy

In [None]:
#  Import and read the encounter data.
import pandas as pd

# Published Google Sheets CSV export holding the encounter records.
path = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRlIMLpo7omYmZ-GtDVeu7eXivg1QF2DRr-jFG_sCt4YIQ1V9UJvq1ryg-3pF8sl0zXiE1cF4XSdH4W/pub?gid=678314375&single=true&output=csv'

# Read the sheet into a DataFrame and preview its first five rows.
sheriff_df = pd.read_csv(filepath_or_buffer=path)
sheriff_df.head(n=5)

Unnamed: 0,Age,Limited or No English Fluency,Perception Made,Homeless,Gender,Race,Disability,Arrested
0,28,False,After Contact,False,Male,Hispanic/Latino/Latina,No Disabilities,False
1,40,False,After Contact,False,Male,White,No Disabilities,True
2,45,False,After Contact,False,Male,Black/African American,No Disabilities,True
3,30,False,Before Contact,True,Female,Black/African American,No Disabilities,False
4,40,False,After Contact,False,Female,White,No Disabilities,True


# Data Cleaning

In [None]:
# Any Race value that contains a comma is collapsed into a single 'Mixed'
# category; all other values are kept as they are. Then count rows per category.
sheriff_df['Race'] = sheriff_df['Race'].map(lambda race: race if ',' not in race else 'Mixed')
sheriff_df['Race'].value_counts()

Black/African American        1926
Hispanic/Latino/Latina         988
White                          757
Mixed                          135
Middle Eastern/South Asian      21
Asian                           16
Pacific Islander                 8
Native American                  3
Name: Race, dtype: int64

In [None]:
# Count the distinct values held by each column.
sheriff_df.nunique(axis=0)

Age                              71
Limited or No English Fluency     2
Perception Made                   2
Homeless                          2
Gender                            5
Race                              8
Disability                       10
Arrested                          2
dtype: int64

In [None]:
# Keep only the 'Male' / 'Female' labels. Every other Gender value becomes NaN;
# the rows themselves stay in the frame.
is_male_or_female = sheriff_df['Gender'].isin(['Male', 'Female'])
sheriff_df['Gender'] = sheriff_df['Gender'].where(is_male_or_female)
sheriff_df['Gender'].value_counts()

Male      2824
Female    1025
Name: Gender, dtype: int64

In [None]:
# Remove the Disability column from the frame (in place).
sheriff_df.drop(labels='Disability', axis='columns', inplace=True)

In [None]:
# Bucket Age into labelled groups and replace the raw Age column with 'Age_Bins'.
#
# right=False makes every bin closed on the left and open on the right
# ([1, 10), [10, 15), ...), so the edges agree with the labels: age 10 is
# '10-14' rather than '1-9', age 15 is '15-19', and so on. With pandas'
# default (right-closed bins) each boundary age fell into the group below
# the one its label describes.
# The last edge is 121 so that age 120 still lands in '60+'.
# Ages outside [1, 121) get NaN.
age_bin_edges = [1, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 121]
age_bin_labels = ['1-9', '10-14', '15-19', '20-24', '25-29', '30-34',
                  '35-39', '40-44', '45-49', '50-54', '55-59', '60+']
sheriff_df = sheriff_df.join(
    pd.cut(
        sheriff_df.pop('Age'),  # pop() removes 'Age' from the frame before the join
        bins=age_bin_edges,
        labels=age_bin_labels,
        right=False,
    ).to_frame('Age_Bins')
)

In [None]:
# Drop the rows in the '1-9' age group, then turn the categorical bin labels
# into plain strings so the column is picked up as an object column.
# Done as one chain ending in .assign(), which returns a new frame, so no
# column is assigned on a filtered slice (avoids SettingWithCopyWarning).
# NOTE(review): rows whose Age_Bins is NaN are kept here and presumably become
# the string 'nan' after astype(str) — confirm that is intended.
sheriff_df = (
    sheriff_df
    .loc[sheriff_df['Age_Bins'] != '1-9']
    .assign(Age_Bins=lambda frame: frame['Age_Bins'].astype(str))
)

# Encoding and Scaling

In [None]:
# One-hot encode every object-dtype column; the remaining columns pass through unchanged.
categories = list(sheriff_df.select_dtypes(include='object').columns)
sheriff_df_encoded = pd.get_dummies(data=sheriff_df, columns=categories)

In [None]:
# Display the one-hot encoded frame to check the generated columns.
sheriff_df_encoded

Unnamed: 0,Limited or No English Fluency,Homeless,Arrested,Perception Made_After Contact,Perception Made_Before Contact,Gender_Female,Gender_Male,Race_Asian,Race_Black/African American,Race_Hispanic/Latino/Latina,...,Age_Bins_15-19,Age_Bins_20-24,Age_Bins_25-29,Age_Bins_30-34,Age_Bins_35-39,Age_Bins_40-44,Age_Bins_45-49,Age_Bins_50-54,Age_Bins_55-59,Age_Bins_60+
0,False,False,False,1,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,False,False,True,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,False,False,True,1,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,False,True,False,0,1,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,False,False,True,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3849,False,False,False,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3850,False,False,True,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3851,False,False,True,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3852,False,False,True,1,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Target: the 'Arrested' column. Features: every other encoded column.
y = sheriff_df_encoded['Arrested']
X = sheriff_df_encoded.drop(columns='Arrested')

# Hold out a test set (library-default split size); the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69)

In [None]:
# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the lbfgs solver; the seed is fixed for repeatability.
classifier = LogisticRegression(random_state=69, solver='lbfgs')
classifier

In [None]:
# Train the classifier on the training split.
# NOTE(review): this fits on the unscaled X_train even though X_train_scaled
# was computed above — confirm that is intentional.
classifier.fit(X_train, y_train)

In [None]:
# Compute the classifier's score on each split, then report both
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.6131386861313869
Testing Data Score: 0.6145833333333334


In [None]:
# Predict class probabilities (not hard labels) for the test data set.
# The result is a 2-D array: one row per test sample, one column per class.
predictions = classifier.predict_proba(X_test)
# pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Display the predicted probability array.
predictions

array([[0.46065374, 0.53934626],
       [0.47853976, 0.52146024],
       [0.37132379, 0.62867621],
       ...,
       [0.49897589, 0.50102411],
       [0.46964904, 0.53035096],
       [0.52713206, 0.47286794]])

In [None]:
# First column (label 0) of the probability array, as a Series.
pred = pd.DataFrame(predictions)[0]

In [None]:
# Label a sample 0 when its column-0 probability exceeds 0.50, otherwise 1.
threshold_pred = [int(not (prob > 0.50)) for prob in pred]

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the thresholded predictions against the test labels.
accuracy_score(y_true=y_test, y_pred=threshold_pred)

0.6145833333333334

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the thresholded predictions.
target_names = ["Not Arrested", "Arrested"]
report = classification_report(y_test, threshold_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

Not Arrested       0.63      0.45      0.52       456
    Arrested       0.60      0.77      0.68       504

    accuracy                           0.61       960
   macro avg       0.62      0.61      0.60       960
weighted avg       0.62      0.61      0.60       960



# Random Forest

In [None]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Random forest of 500 trees; the seed is fixed for repeatability
rf_model = RandomForestClassifier(random_state=78, n_estimators=500)

In [None]:
# Fit the model on the scaled training features.
# The target Series is passed as a NumPy array via .to_numpy(); a Series is
# already one-dimensional, and Series.ravel() is deprecated in pandas >= 2.2.
rf_model = rf_model.fit(X_train_scaled, y_train.to_numpy())

In [None]:
# Making predictions using the (scaled) testing data.
# NOTE: this rebinds `predictions`, which previously held the
# logistic-regression probability array.
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Confusion matrix wrapped in a labelled DataFrame
# (rows are labelled "Actual …", columns "Predicted …")
cm = confusion_matrix(y_test, predictions)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

In [None]:
# Accuracy of the random-forest predictions on the test labels
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Show the confusion matrix, the accuracy and the per-class metrics
rf_report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(rf_report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,129,264
Actual 1,107,383


Accuracy Score : 0.579841449603624
Classification Report
              precision    recall  f1-score   support

       False       0.55      0.33      0.41       393
        True       0.59      0.78      0.67       490

    accuracy                           0.58       883
   macro avg       0.57      0.55      0.54       883
weighted avg       0.57      0.58      0.56       883



In [None]:
# Per-feature importances reported by the fitted forest
importances = rf_model.feature_importances_
# Pair each importance with its feature column and order from largest to smallest
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# Show the ten highest-ranked features
importances_sorted[:10]

[(0.10041954776456463, 'Age_Bins_15-16'),
 (0.06326957603136102, 'Gender_Female'),
 (0.06233128782601429, 'Gender_Male'),
 (0.05895997634626736, 'Race_White'),
 (0.057409448411902646, 'Race_Black/African American'),
 (0.053675468471363216, 'Race_Hispanic/Latino/Latina'),
 (0.039975000855952555, 'Perception Made_After Contact'),
 (0.0385590825040817, 'Perception Made_Before Contact'),
 (0.035035223405787345, 'Age_Bins_23-24'),
 (0.03355171552622279, 'Age_Bins_33-34')]

# Keras Model

In [None]:
# Deep neural net definition. The input width is the number of columns in the
# scaled training matrix.
number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(units=200, input_dim=number_input_features, activation="tanh"),
        # Further hidden layers, with dropout after the 100-unit layer
        tf.keras.layers.Dense(units=100, activation="sigmoid"),
        tf.keras.layers.Dropout(rate=.5),
        tf.keras.layers.Dense(units=200, activation="tanh"),
        tf.keras.layers.Dense(units=200, activation="leaky_relu"),
        # Output layer: a single sigmoid unit
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               5200      
                                                                 
 dense_1 (Dense)             (None, 100)               20100     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 200)               20200     
                                                                 
 dense_3 (Dense)             (None, 200)               40200     
                                                                 
 dense_4 (Dense)             (None, 1)                 201       
                                                                 
Total params: 85,901
Trainable params: 85,901
Non-traina

In [None]:
# Compile with the Adam optimizer (learning rate 0.001), binary cross-entropy
# loss, and accuracy as the tracked metric
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
nn.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train the network on the scaled training data for 100 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Score the trained network on the scaled test data and report loss / accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

30/30 - 0s - loss: 0.6591 - accuracy: 0.6240 - 148ms/epoch - 5ms/step
Loss: 0.6590672731399536, Accuracy: 0.6239583492279053


# Keras Tuner

In [None]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    """Build and compile a Sequential binary classifier from tuner hyperparameters.

    Hyperparameters requested from ``hp``:
        activation  -- one of 'relu', 'tanh', 'sigmoid'; shared by all hidden layers.
        first_units -- width of the first layer (1 to 10, step 2).
        num_layers  -- number of additional hidden layers (1 to 6).
        units_<i>   -- width of additional hidden layer ``i`` (1 to 10, step 2).

    Args:
        hp: hyperparameter object passed in by the tuner; ``hp.Choice`` and
            ``hp.Int`` are called on it.

    Returns:
        The compiled ``tf.keras`` Sequential model (binary cross-entropy loss,
        'adam' optimizer, accuracy metric) ending in a single sigmoid unit.
    """
    # Take the input width from the training matrix instead of hard-coding it,
    # so the model keeps matching the data if the encoded columns change.
    n_features = X_train_scaled.shape[1]

    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    first_units = hp.Int('first_units', min_value=1, max_value=10, step=2)
    nn_model.add(tf.keras.layers.Dense(units=first_units,
                                       activation=activation,
                                       input_dim=n_features))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        layer_units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)
        nn_model.add(tf.keras.layers.Dense(units=layer_units, activation=activation))

    # Output layer: one sigmoid unit for the binary target
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [None]:
# Import the kerastuner library
import keras_tuner as kt

# Hyperband tuner that maximises validation accuracy over models built by
# create_model; overwrite=True discards results of any earlier search.
tuner = kt.Hyperband(
    hypermodel=create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    overwrite=True,
)

In [None]:
# Run the kerastuner search for best hyperparameters.
# Candidate models are scored on a slice held out of the TRAINING data
# (validation_split), so the test set stays unseen until the final evaluation.
# Validating on X_test_scaled / y_test here would leak test data into model
# selection and inflate the reported accuracy.
# y_train is passed as a NumPy array because validation_split needs array
# (not pandas) inputs to slice.
tuner.search(X_train_scaled, y_train.to_numpy(), epochs=20, validation_split=0.2)

Trial 17 Complete [00h 00m 07s]
val_accuracy: 0.6188992857933044

Best val_accuracy So Far: 0.6188992857933044
Total elapsed time: 00h 01m 08s

Search: Running Trial #18

Value             |Best Value So Far |Hyperparameter
tanh              |tanh              |activation
1                 |7                 |first_units
3                 |4                 |num_layers
7                 |7                 |units_0
3                 |7                 |units_1
1                 |9                 |units_2
1                 |3                 |units_3
3                 |5                 |units_4
7                 |9                 |units_5
20                |20                |tuner/epochs
7                 |7                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
2                 |2                 |tuner/round
0012              |0015              |tuner/trial_id

Epoch 8/20
Epoch 9/20
Epoch 10/20

KeyboardInterrupt: ignored

In [None]:
# Retrieve the top-ranked model from the search and show its layer structure
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 78        
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 9)                 36        
                                                                 
 dense_3 (Dense)             (None, 7)                 70        
                                                                 
 dense_4 (Dense)             (None, 3)                 24        
                                                                 
 dense_5 (Dense)             (None, 1)                 4         
                                                                 
Total params: 224
Trainable params: 224
Non-trainable pa

In [None]:
# Print the tuner's summary of its best trials (hyperparameters and score per trial).
tuner.results_summary()

Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0016 summary
Hyperparameters:
activation: tanh
first_units: 3
num_layers: 4
units_0: 3
units_1: 9
units_2: 7
units_3: 3
units_4: 3
units_5: 1
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0014
Score: 0.6375908851623535

Trial 0014 summary
Hyperparameters:
activation: tanh
first_units: 3
num_layers: 4
units_0: 3
units_1: 9
units_2: 7
units_3: 3
units_4: 3
units_5: 1
tuner/epochs: 7
tuner/initial_epoch: 3
tuner/bracket: 2
tuner/round: 1
tuner/trial_id: 0008
Score: 0.6334371566772461

Trial 0054 summary
Hyperparameters:
activation: tanh
first_units: 5
num_layers: 5
units_0: 1
units_1: 7
units_2: 7
units_3: 5
units_4: 5
units_5: 3
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 0053
Score: 0.6303219199180603

Trial 0024 summary
Hyperparameters:
activation: tanh
first_units: 5
num_layers:

# Save Model

In [None]:
# Export the trained `nn` model to disk
from keras.models import save_model  # NOTE(review): imported but not used in this cell

# Save the model to "nn.keras" (a .keras file, not an HDF5 .h5 file)
nn.save("nn.keras")