In [47]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
# One seed value reused for every random generator seeded in this notebook.
# NOTE(review): Python's built-in `random` module is not seeded here.
seed = 42
np.random.seed(seed)  # seed NumPy's global random generator
tf.random.set_seed(seed)  # set TensorFlow's global random seed

import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dropout


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [48]:
# Load the dataset from merged.csv into a DataFrame.
df = pd.read_csv('merged.csv')

# Print the column summary first and leave the preview as the cell's final
# expression: a notebook only renders the last bare expression of a cell, so
# `df.head()` placed before `df.info()` was computed and then thrown away.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8044 entries, 0 to 8043
Data columns (total 15 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Type                                      8044 non-null   object 
 1   Date                                      8044 non-null   object 
 2   Part of a policing operation              0 non-null      float64
 3   Policing operation                        0 non-null      float64
 4   Latitude                                  6299 non-null   float64
 5   Longitude                                 6299 non-null   float64
 6   Gender                                    8037 non-null   object 
 7   Age range                                 7410 non-null   object 
 8   Self-defined ethnicity                    7589 non-null   object 
 9   Officer-defined ethnicity                 7586 non-null   object 
 10  Legislation                         

In [49]:
# Remove the columns that are not used further on. Only the two
# policing-operation columns are completely empty (0 non-null values in
# df.info()); the others are removed as well even though they are not
# necessarily empty (e.g. 'Self-defined ethnicity' holds values).
df = df.drop(['Part of a policing operation', 'Policing operation','Self-defined ethnicity', 'Legislation', 'Removal of more than just outer clothing','Outcome'], axis=1)

# Show the Date column. It is kept as the last expression of the cell because
# a bare expression in the middle of a cell is never rendered; the column is
# untouched by the drop above, so the displayed values are the same.
df['Date']

In [50]:
# Check which columns remain in the dataset after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8044 entries, 0 to 8043
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                8044 non-null   object 
 1   Date                                8044 non-null   object 
 2   Latitude                            6299 non-null   float64
 3   Longitude                           6299 non-null   float64
 4   Gender                              8037 non-null   object 
 5   Age range                           7410 non-null   object 
 6   Officer-defined ethnicity           7586 non-null   object 
 7   Object of search                    7979 non-null   object 
 8   Outcome linked to object of search  8044 non-null   bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 510.7+ KB


The rows without latitude and longitude information are not helpful for our purpose. Removing those rows and keeping the data with latitude and longitude information would be best. This will result in a cleaner dataset that is more relevant for location prediction. I could consider imputing missing latitude and longitude values, but that would require additional data or assumptions about the relationship between the other columns and the location information.

In [51]:
# Keep a row only when both coordinates are present; a row missing either
# Latitude or Longitude is removed.
has_coordinates = df[["Latitude", "Longitude"]].notna().all(axis=1)
df = df[has_coordinates]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6299 entries, 0 to 8040
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                6299 non-null   object 
 1   Date                                6299 non-null   object 
 2   Latitude                            6299 non-null   float64
 3   Longitude                           6299 non-null   float64
 4   Gender                              6293 non-null   object 
 5   Age range                           5799 non-null   object 
 6   Officer-defined ethnicity           5935 non-null   object 
 7   Object of search                    6247 non-null   object 
 8   Outcome linked to object of search  6299 non-null   bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 449.0+ KB


The output above shows that some rows have no value in the "Object of search" column. Since the police must have had a justification for conducting each search, I will reduce the dataset to the rows where the reason for the search is specified, removing every row where the "Object of search" column is missing or null.

In [52]:
# Keep only the rows where the reason for the search is recorded.
has_search_object = df["Object of search"].notna()
df = df[has_search_object]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6247 entries, 0 to 8040
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                6247 non-null   object 
 1   Date                                6247 non-null   object 
 2   Latitude                            6247 non-null   float64
 3   Longitude                           6247 non-null   float64
 4   Gender                              6241 non-null   object 
 5   Age range                           5756 non-null   object 
 6   Officer-defined ethnicity           5884 non-null   object 
 7   Object of search                    6247 non-null   object 
 8   Outcome linked to object of search  6247 non-null   bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 445.3+ KB


In [53]:
import pandas as pd

# Count how often each distinct value occurs in every column, skipping the
# date and coordinate columns.
skipped_columns = ('Date', 'Latitude', 'Longitude')
value_counts = {
    column: df[column].value_counts().to_dict()
    for column in df.columns
    if column not in skipped_columns
}
# print the value_counts
print(value_counts)

# Turn the nested dict into a table (one column per dataset column, one row
# per distinct value) and write it to a csv file.
df_value_counts = pd.DataFrame(value_counts)
df_value_counts.to_csv('value_counts.csv')

{'Type': {'Person search': 4802, 'Person and Vehicle search': 1445}, 'Gender': {'Male': 5651, 'Female': 590}, 'Age range': {'18-24': 2052, '25-34': 1920, 'over 34': 1333, '10-17': 448, 'under 10': 3}, 'Officer-defined ethnicity': {'White': 2954, 'Black': 1318, 'Asian': 1142, 'Other': 470}, 'Object of search': {'Controlled drugs': 3642, 'Article for use in theft': 1140, 'Stolen goods': 833, 'Offensive weapons': 384, 'Evidence of offences under the Act': 105, 'Articles for use in criminal damage': 100, 'Firearms': 40, 'Fireworks': 2, 'Crossbows': 1}, 'Outcome linked to object of search': {False: 4151, True: 2096}}


In [54]:
import folium
from IPython.display import display
# Rows that carry both coordinates (rows without them are filtered out here).
loc = df.dropna(subset=["Latitude", "Longitude"])

# Centre the map on the mean coordinate; the zoom level is locked at 7.
map_centre = [loc["Latitude"].mean(), loc["Longitude"].mean()]
map_ = folium.Map(location=map_centre, zoom_start=7, min_zoom=7, max_zoom=7)

# Every record is drawn with the same filled blue circle.
marker_style = {
    "radius": 5,
    "color": 'blue',
    "fill": True,
    "fill_color": 'blue',
    "fill_opacity": 0.6,
}
for latitude, longitude in zip(loc["Latitude"], loc["Longitude"]):
    folium.CircleMarker([latitude, longitude], **marker_style).add_to(map_)

# Write the map to disk and render it in the notebook.
map_.save("map.html")
display(map_)

The first inspection of the result we can see that 6 location report were o

In [55]:
import folium
from IPython.display import display
# Rows that carry both coordinates (rows without them are filtered out here).
loc = df.dropna(subset=["Latitude", "Longitude"])

# Keep only the points at or below latitude 52.
mask = loc["Latitude"].le(52)
loc2 = loc[mask]

# Centre the map on the mean of the remaining points.
map_centre = [loc2["Latitude"].mean(), loc2["Longitude"].mean()]
map_ = folium.Map(location=map_centre, zoom_start=7)

# Every record is drawn with the same filled blue circle.
marker_style = {
    "radius": 5,
    "color": 'blue',
    "fill": True,
    "fill_color": 'blue',
    "fill_opacity": 0.6,
}
for latitude, longitude in zip(loc2["Latitude"], loc2["Longitude"]):
    folium.CircleMarker([latitude, longitude], **marker_style).add_to(map_)

# Write the map to disk and render it in the notebook.
map_.save("map2.html")
display(map_)

In [56]:
# Keep only the rows of df whose latitude is at most 52; rows with a
# greater latitude are discarded.
within_latitude = df["Latitude"].le(52)
df = df[within_latitude]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6240 entries, 0 to 8040
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                6240 non-null   object 
 1   Date                                6240 non-null   object 
 2   Latitude                            6240 non-null   float64
 3   Longitude                           6240 non-null   float64
 4   Gender                              6234 non-null   object 
 5   Age range                           5751 non-null   object 
 6   Officer-defined ethnicity           5879 non-null   object 
 7   Object of search                    6240 non-null   object 
 8   Outcome linked to object of search  6240 non-null   bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 444.8+ KB


After careful consideration, to avoid ethical issues and to stay with my original goal of predicting the location and time of a crime, I will remove features such as officer-defined ethnicity, gender and age range.

In [57]:
# Drop the three demographic columns: officer-defined ethnicity, age range
# and gender.
demographic_columns = ['Officer-defined ethnicity', 'Age range', 'Gender']
df = df.drop(demographic_columns, axis="columns")

# Show the remaining columns of the updated dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6240 entries, 0 to 8040
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                6240 non-null   object 
 1   Date                                6240 non-null   object 
 2   Latitude                            6240 non-null   float64
 3   Longitude                           6240 non-null   float64
 4   Object of search                    6240 non-null   object 
 5   Outcome linked to object of search  6240 non-null   bool   
dtypes: bool(1), float64(2), object(3)
memory usage: 298.6+ KB


In [58]:
# Cast the boolean outcome flag to an integer (True -> 1, False -> 0).
# This is a plain dtype cast of a single column, not a one-hot encoding.
outcome_column = 'Outcome linked to object of search'
df[outcome_column] = df[outcome_column].astype(int)

In [59]:
# Keep only the searches whose outcome was linked to the object of search
# (flag equal to 1); all other rows are discarded.
outcome_is_linked = df['Outcome linked to object of search'].eq(1)
df = df[outcome_is_linked]
df.info()

# Save the filtered data as a csv file (the row index is written too,
# because to_csv is called with its default arguments).
df.to_csv('firstClean.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 1 to 8031
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Type                                2094 non-null   object 
 1   Date                                2094 non-null   object 
 2   Latitude                            2094 non-null   float64
 3   Longitude                           2094 non-null   float64
 4   Object of search                    2094 non-null   object 
 5   Outcome linked to object of search  2094 non-null   int32  
dtypes: float64(2), int32(1), object(3)
memory usage: 106.3+ KB
