In [1]:
#changing up data encoding
#importing required dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

In [2]:
# Read the prepared classifier dataset (the path is relative to this notebook's folder)
new_df = pd.read_csv(
    filepath_or_buffer="../../SQL_And_CSV/BinaryClassifier.csv",
)
# Preview the first five rows
new_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,pub_agency_name,agency_type_name,state_name,division_name,county_name,region_name,population_group_code,offense_code,offender_race,offender_ethnicity,offender_age,offender_sex,victim_type_code,location_code,weapon_code,prop_desc_code,stolen_value,recovered_flag
0,0,Angelina,County,Texas,West South Central,ANGELINA,South,8B,26B,Unknown,Unknown,0.0,U,I,25,95,20,375.0,False
1,1,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,95,77,1.0,False
2,2,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,95,65,0.0,False
3,3,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,95,13,320.0,False
4,4,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,23F,Unknown,Unknown,0.0,U,I,20,95,77,1.0,False


In [3]:
# Drop the leftover index column that was written into the CSV.
# `columns=` already names the axis, so no `axis` argument is needed.
# errors="ignore" keeps this cell re-runnable: it reassigns `new_df`, so on a second run
# the column is already gone and a plain drop would raise KeyError.
new_df = new_df.drop(columns="Unnamed: 0", errors="ignore")
new_df.head(1)

Unnamed: 0,pub_agency_name,agency_type_name,state_name,division_name,county_name,region_name,population_group_code,offense_code,offender_race,offender_ethnicity,offender_age,offender_sex,victim_type_code,location_code,weapon_code,prop_desc_code,stolen_value,recovered_flag
0,Angelina,County,Texas,West South Central,ANGELINA,South,8B,26B,Unknown,Unknown,0.0,U,I,25,95,20,375.0,False


In [4]:
# Columns removed before modelling:
#   * pub_agency_name, county_name, division_name -- large and may confuse the model; state_name
#     and agency_type_name should carry the same general information (COUNTY v CITY, and REGION).
#   * offender_age, offender_ethnicity -- likely superfluous as well; an age of "0" looks odd anyway.
to_drop = [
    "pub_agency_name",
    "division_name",
    "county_name",
    "offender_age",
    "offender_ethnicity",
]
fbi_new = new_df.drop(columns=to_drop)
fbi_new.head()

Unnamed: 0,agency_type_name,state_name,region_name,population_group_code,offense_code,offender_race,offender_sex,victim_type_code,location_code,weapon_code,prop_desc_code,stolen_value,recovered_flag
0,County,Texas,South,8B,26B,Unknown,U,I,25,95,20,375.0,False
1,County,Alabama,South,9A,23H,Unknown,U,I,20,95,77,1.0,False
2,County,Alabama,South,9A,23H,Unknown,U,I,20,95,65,0.0,False
3,County,Alabama,South,9A,23H,Unknown,U,I,20,95,13,320.0,False
4,County,Alabama,South,9A,23F,Unknown,U,I,20,95,77,1.0,False


In [5]:
# Dense (non-sparse) one-hot encoder, used later for the categorical columns.
# scikit-learn renamed the `sparse` argument to `sparse_output` (added in 1.2, old name
# removed in 1.4), so try the new keyword first and fall back for older installs.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Count rows per state so that rare states can be re-bucketed in the next step
states_counts = fbi_new.state_name.value_counts()
states_counts

North Carolina          26784
Ohio                    11927
Massachusetts           10788
Texas                   10233
Georgia                  9850
Tennessee                4891
South Carolina           4596
Virginia                 4468
Michigan                 3772
Alabama                  3393
West Virginia            3383
Maryland                 2419
Nevada                   2308
Pennsylvania             2263
Missouri                 2145
Indiana                  1581
New Mexico               1518
Oregon                   1437
California               1375
Washington               1372
Colorado                 1367
New Jersey               1359
Illinois                 1350
Arkansas                 1292
Florida                  1283
Mississippi              1232
Kentucky                  804
Iowa                      531
Connecticut               388
Nebraska                  343
Rhode Island              305
Wisconsin                 283
Arizona                   278
Montana   

In [6]:
# Cut off at 1,000 rows: every state below that is folded into "Other US States/Territories"
replacements = list(states_counts[states_counts < 1000].index)

# Relabel all rare states in a single vectorised pass
# (rather than running a full-column replace once per rare state)
fbi_new["state_name"] = fbi_new["state_name"].mask(
    fbi_new["state_name"].isin(replacements),
    "Other_US_States/Territories",
)

# Check that the rare states were successfully binned
fbi_new.state_name.value_counts()

North Carolina                 26784
Ohio                           11927
Massachusetts                  10788
Texas                          10233
Georgia                         9850
Tennessee                       4891
South Carolina                  4596
Other_US_States/Territories     4570
Virginia                        4468
Michigan                        3772
Alabama                         3393
West Virginia                   3383
Maryland                        2419
Nevada                          2308
Pennsylvania                    2263
Missouri                        2145
Indiana                         1581
New Mexico                      1518
Oregon                          1437
California                      1375
Washington                      1372
Colorado                        1367
New Jersey                      1359
Illinois                        1350
Arkansas                        1292
Florida                         1283
Mississippi                     1232
N

In [7]:
# On to the encoding: gather the names of every object-dtype column
object_columns = [
    column_name
    for column_name, column_dtype in fbi_new.dtypes.items()
    if column_dtype == "object"
]
print(object_columns)

['agency_type_name', 'state_name', 'region_name', 'population_group_code', 'offense_code', 'offender_race', 'offender_sex', 'victim_type_code', 'weapon_code']


In [8]:
# Fit the OneHotEncoder on the categorical columns and wrap the result in a DataFrame.
# The data argument is evaluated first, so the encoder is already fitted by the time
# get_feature_names_out() is called for the column labels.
# Reusing fbi_new's index keeps the encoded rows aligned with their source rows, so a
# later index-based merge stays correct even if fbi_new does not have a default index.
encode_df = pd.DataFrame(
    enc.fit_transform(fbi_new[object_columns]),
    columns=enc.get_feature_names_out(object_columns),
    index=fbi_new.index,
)
encode_df.head()

Unnamed: 0,agency_type_name_City,agency_type_name_County,agency_type_name_Federal,agency_type_name_Other,agency_type_name_Other State Agency,agency_type_name_State Police,agency_type_name_Tribal,agency_type_name_University or College,state_name_Alabama,state_name_Arkansas,...,weapon_code_20,weapon_code_30,weapon_code_35,weapon_code_40,weapon_code_50,weapon_code_70,weapon_code_85,weapon_code_90,weapon_code_95,weapon_code_99
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Swap the original categorical columns for their one-hot encoded counterparts:
# remove the raw columns first, then attach the encoded frame row-by-row on the index.
fbi_final = fbi_new.drop(columns=object_columns).merge(
    encode_df,
    left_index=True,
    right_index=True,
)
fbi_final.head()

Unnamed: 0,location_code,prop_desc_code,stolen_value,recovered_flag,agency_type_name_City,agency_type_name_County,agency_type_name_Federal,agency_type_name_Other,agency_type_name_Other State Agency,agency_type_name_State Police,...,weapon_code_20,weapon_code_30,weapon_code_35,weapon_code_40,weapon_code_50,weapon_code_70,weapon_code_85,weapon_code_90,weapon_code_95,weapon_code_99
0,25,20,375.0,False,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20,77,1.0,False,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20,65,0.0,False,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,20,13,320.0,False,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,20,77,1.0,False,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Make the target column numeric (True -> 1, False -> 0) for the model.
# An explicit cast fixes the resulting dtype as int64 instead of relying on `replace`
# to infer an integer dtype from the replacement values.
fbi_final["recovered_flag"] = fbi_final["recovered_flag"].astype("int64")

# Check that it worked: every column should now be numeric
fbi_final.dtypes.value_counts()

float64    117
int64        3
dtype: int64

In [11]:
#export this CSV for group and to save time in future
#fbi_final.to_csv("FBI_object_encoded_modified.csv", index=False)

In [12]:
# Time to fit some data: separate the label (recovered_flag) from the feature matrix
y = fbi_final["recovered_flag"].to_numpy()
X = fbi_final.drop(columns="recovered_flag").to_numpy()

# Split the preprocessed data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# Standardise the features using statistics learned from the training split only
scaler = StandardScaler()

# Fit on the training data and scale it in one step
X_train_scaled = scaler.fit_transform(X_train)

# Fitting returns the estimator itself, so X_scaler is the same fitted scaler
X_scaler = scaler

# Apply the training-set scaling to the test data
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Model definition: reuse the earlier architecture that performed better, now on the new data.
# Two hidden ReLU layers of 3 units each, followed by a single sigmoid output unit.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 3
hidden_nodes_layer2 = 3

nn6 = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer structure of the model
nn6.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 360       
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 376
Trainable params: 376
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn6.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [16]:
# Train nn6 on the scaled training split for 50 epochs; keep the value fit() returns
fit_model = nn6.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
# Score nn6 on the held-out (scaled) test split and report loss and accuracy
model_loss, model_accuracy = nn6.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

961/961 - 1s - loss: 0.3339 - accuracy: 0.8640 - 887ms/epoch - 923us/step
Loss: 0.3339027762413025, Accuracy: 0.8639838695526123


In [18]:
#worse results

In [19]:
# Try again and let it run longer: call fit() on the same nn6 object for 100 epochs.
# Note that this reassigns fit_model, replacing the value returned by the 50-epoch run.
fit_model = nn6.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [20]:
# Re-score nn6 on the test split after the longer training run
model_loss, model_accuracy = nn6.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
# Observation: barely any improvement after running for twice as long

961/961 - 1s - loss: 0.3311 - accuracy: 0.8662 - 855ms/epoch - 889us/step
Loss: 0.33110910654067993, Accuracy: 0.8661634922027588


In [21]:
# Try a tiny model on the same data: one hidden ReLU layer of 6 units, then a sigmoid output unit
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 6

nn7 = tf.keras.models.Sequential(
    [
        # Single hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer structure of the model
nn7.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 6)                 720       
                                                                 
 dense_4 (Dense)             (None, 1)                 7         
                                                                 
Total params: 727
Trainable params: 727
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn7.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [23]:
# Train the small model on the scaled training split for 100 epochs
fit_model2 = nn7.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [24]:
# Score the small model on the held-out (scaled) test split
model_loss2, model_accuracy2 = nn7.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss2}, Accuracy: {model_accuracy2}")
# Observation: same score despite using the changed data and a tiny model

961/961 - 1s - loss: 0.3144 - accuracy: 0.8725 - 810ms/epoch - 843us/step
Loss: 0.31438153982162476, Accuracy: 0.872474730014801
