In [1]:
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
import tensorflow as tf

In [2]:
# Loading data
file_path = Path("Resources/sample_crime_data.csv")
df = pd.read_csv(file_path)
df = df.set_index("ID")
print(df.shape)
df.head()

(206997, 18)


Unnamed: 0_level_0,ID.1,Block,IUCR,Primary Type,Violence Status,Description,Location Description,Arrest,Domestic,District,Ward,Community Area,FBI Code,Year,Latitude,Longitude,Date,Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6909918,12260346,070XX S EGGLESTON AVE,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,7.0,6.0,68.0,08B,2021,41.766435,-87.635964,2021-01-03,13:23:00
6927718,12263464,080XX S YALE AVE,820,THEFT,NON-VIOLENT,$500 AND UNDER,RESIDENCE,False,False,6.0,17.0,44.0,06,2021,41.748474,-87.630607,2021-01-03,06:59:00
6927807,12259990,056XX W WASHINGTON BLVD,486,BATTERY,VIOLENT,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,15.0,29.0,25.0,08B,2021,41.882224,-87.766076,2021-01-03,00:20:00
6931849,12260669,057XX S RACINE AVE,2022,NARCOTICS,NON-VIOLENT,POSSESS - COCAINE,STREET,True,False,7.0,16.0,67.0,18,2021,41.790069,-87.654769,2021-01-03,20:47:00
6931854,25702,068XX S STONY ISLAND AVE,110,HOMICIDE,VIOLENT,FIRST DEGREE MURDER,STREET,False,False,3.0,5.0,43.0,01A,2021,41.771062,-87.586271,2021-01-03,20:09:00


In [3]:
df.dtypes

ID.1                      int64
Block                    object
IUCR                     object
Primary Type             object
Violence Status          object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
District                float64
Ward                    float64
Community Area          float64
FBI Code                 object
Year                      int64
Latitude                float64
Longitude               float64
Date                     object
Time                     object
dtype: object

In [4]:
# Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()


# Check the number of unique values in each column
df[df_cat].nunique()

Block                   26955
IUCR                      300
Primary Type               31
Violence Status             2
Description               278
Location Description      126
FBI Code                   26
Date                      370
Time                     1450
dtype: int64

In [5]:
df['Violence Status'].value_counts()

NON-VIOLENT    135435
VIOLENT         71562
Name: Violence Status, dtype: int64

In [6]:
df = df[['Date','Violence Status']]
df.head()

Unnamed: 0_level_0,Date,Violence Status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6909918,2021-01-03,VIOLENT
6927718,2021-01-03,NON-VIOLENT
6927807,2021-01-03,VIOLENT
6931849,2021-01-03,NON-VIOLENT
6931854,2021-01-03,VIOLENT


In [7]:
# create sample weather data
import numpy as np

Chicago_Weather_df = pd.DataFrame({'Temp':np.random.uniform(-10,70,206997),'Date':df.Date}).set_index('Date').reset_index()

In [8]:
df = df.set_index('Date').reset_index()

In [9]:
merged_df = df.merge(Chicago_Weather_df,on='Date').drop(columns='Date')

In [10]:
merged_df.head()

Unnamed: 0,Violence Status,Temp
0,VIOLENT,38.202938
1,VIOLENT,59.132091
2,VIOLENT,49.818465
3,VIOLENT,-3.623086
4,VIOLENT,30.943212


In [15]:
sample_df = merged_df.sample(10000).reset_index()
sample_df = sample_df.drop(columns='index')
sample_df

Unnamed: 0,Violence Status,Temp
0,NON-VIOLENT,44.425220
1,VIOLENT,-8.674899
2,VIOLENT,22.898026
3,VIOLENT,27.636420
4,NON-VIOLENT,27.661422
...,...,...
9995,VIOLENT,36.494386
9996,NON-VIOLENT,64.575210
9997,NON-VIOLENT,34.729582
9998,NON-VIOLENT,68.479947


In [29]:
# Generate our categorical variable lists
sample_cat = sample_df.dtypes[sample_df.dtypes == "object"].index.tolist()

In [30]:
# Check the number of unique values in each column
sample_df[sample_cat].nunique()

Violence Status    2
dtype: int64

In [32]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(sample_df[sample_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(sample_cat)
encode_df.head()

Unnamed: 0,Violence Status_NON-VIOLENT,Violence Status_VIOLENT
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


In [33]:
# Merge one-hot encoded features and drop the originals
sample_df = sample_df.merge(encode_df,left_index=True, right_index=True)
sample_df = sample_df.drop(sample_cat,1)
sample_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp,Violence Status_NON-VIOLENT,Violence Status_VIOLENT
0,44.42522,1.0,0.0
1,-8.674899,0.0,1.0
2,22.898026,0.0,1.0
3,27.63642,0.0,1.0
4,27.661422,1.0,0.0


In [34]:
y = sample_df['Violence Status_VIOLENT'].values
X = sample_df[['Temp']].values

In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=42)

In [36]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
import tensorflow as tf

In [39]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 16        
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 67
Trainable params: 67
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [41]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [42]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

79/79 - 0s - loss: 0.6433 - accuracy: 0.6568 - 188ms/epoch - 2ms/step
Loss: 0.6432539820671082, Accuracy: 0.6567999720573425
