In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import requests
import json

In [52]:
# stop and searches: ethnic gender age -> outcome
def request(url):
    response_API = requests.get(url)
    return json.loads(response_API.text)
    
data = []
for force in request("https://data.police.uk/api/forces"):
    url = "https://data.police.uk/api/stops-force?force={}&date=2020-10".format(force["id"])
    res = request(url)
    data = data + res
# to get all data: aditionaly loop through /crimes-street-dates and use index ["date"] for "date"-query-param

# small data set:
#data = request("https://data.police.uk/api/stops-force?force=avon-and-somerset&date=2019-01")

df = pd.DataFrame(data)
print(df)

    age_range                       outcome  involved_person  \
0     over 34  A no further action disposal             True   
1       18-24  A no further action disposal             True   
2       18-24                        Arrest             True   
3     over 34  A no further action disposal             True   
4       18-24          Community resolution             True   
..        ...                           ...              ...   
514     10-17                        Arrest             True   
515     18-24  A no further action disposal             True   
516     10-17                        Arrest             True   
517     10-17                        Arrest             True   
518     10-17  A no further action disposal             True   

                                self_defined_ethnicity gender  \
0    White - English/Welsh/Scottish/Northern Irish/...   Male   
1      Black/African/Caribbean/Black British - African   Male   
2                                   

In [37]:
#subset:
df = df.iloc[:,[0,1,3,4]]
print(df)

      age_range                       outcome  \
0       over 34  A no further action disposal   
1         18-24  A no further action disposal   
2         25-34                        Arrest   
3         10-17  A no further action disposal   
4         10-17  A no further action disposal   
...         ...                           ...   
58766   over 34  A no further action disposal   
58767   over 34  A no further action disposal   
58768     10-17  A no further action disposal   
58769     10-17  A no further action disposal   
58770   over 34  A no further action disposal   

                                  self_defined_ethnicity  gender  
0      White - English/Welsh/Scottish/Northern Irish/...  Female  
1      White - English/Welsh/Scottish/Northern Irish/...    Male  
2      Mixed/Multiple ethnic groups - White and Black...    Male  
3      Mixed/Multiple ethnic groups - White and Black...    Male  
4                        Other ethnic group - Not stated    Male  
...      

In [38]:
# remove missing values
df = df.drop(df[df.outcome == "" ].index)
df = df.dropna()
print(df)

      age_range                       outcome  \
0       over 34  A no further action disposal   
1         18-24  A no further action disposal   
2         25-34                        Arrest   
3         10-17  A no further action disposal   
4         10-17  A no further action disposal   
...         ...                           ...   
58766   over 34  A no further action disposal   
58767   over 34  A no further action disposal   
58768     10-17  A no further action disposal   
58769     10-17  A no further action disposal   
58770   over 34  A no further action disposal   

                                  self_defined_ethnicity  gender  
0      White - English/Welsh/Scottish/Northern Irish/...  Female  
1      White - English/Welsh/Scottish/Northern Irish/...    Male  
2      Mixed/Multiple ethnic groups - White and Black...    Male  
3      Mixed/Multiple ethnic groups - White and Black...    Male  
4                        Other ethnic group - Not stated    Male  
...      

In [39]:
# inspect features values
df["age_range"].value_counts()

18-24       16744
25-34       11226
10-17        9904
over 34      9708
under 10       20
Name: age_range, dtype: int64

In [40]:
df["outcome"].value_counts()

A no further action disposal       35549
Arrest                              5830
Community resolution                3697
Summons / charged by post            896
Penalty Notice for Disorder          725
Caution (simple or conditional)      190
Name: outcome, dtype: int64

In [41]:
# inspect features values
df["self_defined_ethnicity"].value_counts()

White - English/Welsh/Scottish/Northern Irish/British                                   22477
Other ethnic group - Not stated                                                          7580
White - Any other White background                                                       3712
Black/African/Caribbean/Black British - Any other Black/African/Caribbean background     2894
Asian/Asian British - Any other Asian background                                         2185
Black/African/Caribbean/Black British - African                                          2103
Black/African/Caribbean/Black British - Caribbean                                        1331
Asian/Asian British - Pakistani                                                          1136
Asian/Asian British - Bangladeshi                                                         961
Other ethnic group - Any other ethnic group                                               847
Mixed/Multiple ethnic groups - Any other Mixed/Multiple ethn

In [42]:
# group items in outcome column
df["further_action"] = df["outcome"].isin([
    "Arrest","Community resolution","Khat or Cannabis warning",
    "Summons / charged by post","Caution (simple or conditional)"
    ]).astype(int)
del df["outcome"]
print(df)

      age_range                             self_defined_ethnicity  gender  \
0       over 34  White - English/Welsh/Scottish/Northern Irish/...  Female   
1         18-24  White - English/Welsh/Scottish/Northern Irish/...    Male   
2         25-34  Mixed/Multiple ethnic groups - White and Black...    Male   
3         10-17  Mixed/Multiple ethnic groups - White and Black...    Male   
4         10-17                    Other ethnic group - Not stated    Male   
...         ...                                                ...     ...   
58766   over 34  White - English/Welsh/Scottish/Northern Irish/...    Male   
58767   over 34  White - English/Welsh/Scottish/Northern Irish/...    Male   
58768     10-17  White - English/Welsh/Scottish/Northern Irish/...    Male   
58769     10-17  White - English/Welsh/Scottish/Northern Irish/...    Male   
58770   over 34  White - English/Welsh/Scottish/Northern Irish/...    Male   

       further_action  
0                   0  
1              

In [43]:
# group items in self_defined_ethnicity column
df["ethnicity_uk_and_white"] = df["self_defined_ethnicity"].isin(["White - English/Welsh/Scottish/Northern Irish/British"]).astype(int)
del df["self_defined_ethnicity"]
print(df)

      age_range  gender  further_action  ethnicity_uk_and_white
0       over 34  Female               0                       1
1         18-24    Male               0                       1
2         25-34    Male               1                       0
3         10-17    Male               0                       0
4         10-17    Male               0                       0
...         ...     ...             ...                     ...
58766   over 34    Male               0                       1
58767   over 34    Male               0                       1
58768     10-17    Male               0                       1
58769     10-17    Male               0                       1
58770   over 34    Male               0                       1

[47602 rows x 4 columns]


In [44]:
# splitting dataset in input and outout
x = df.iloc[:,[0,1,3]].values
print(x)  
y = df.iloc[:, 2].values    
print(y)

[['over 34' 'Female' 1]
 ['18-24' 'Male' 1]
 ['25-34' 'Male' 0]
 ...
 ['10-17' 'Male' 1]
 ['10-17' 'Male' 1]
 ['over 34' 'Male' 1]]
[0 0 1 ... 0 0 0]


In [45]:
# label encoding: gender 
le = LabelEncoder()
x[:, 1] = le.fit_transform(x[:, 1])
print(x)

[['over 34' 0 1]
 ['18-24' 1 1]
 ['25-34' 1 0]
 ...
 ['10-17' 1 1]
 ['10-17' 1 1]
 ['over 34' 1 1]]


In [46]:
# one hot encoding: age
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)
print(y)

[[0.0 0.0 0.0 ... 0.0 0 1]
 [0.0 1.0 0.0 ... 0.0 1 1]
 [0.0 0.0 1.0 ... 0.0 1 0]
 ...
 [1.0 0.0 0.0 ... 0.0 1 1]
 [1.0 0.0 0.0 ... 0.0 1 1]
 [0.0 0.0 0.0 ... 0.0 1 1]]
[0 0 1 ... 0 0 0]


In [47]:
# split into test- und trainings-data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [48]:
# feature scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
print(x_train)
print("-----")
print(x_test)

[[-0.51223187 -0.74091336 -0.55218071 ... -0.02174627  0.32806832
  -0.94442658]
 [ 1.95224088 -0.74091336 -0.55218071 ... -0.02174627  0.32806832
  -0.94442658]
 [-0.51223187  1.34968547 -0.55218071 ... -0.02174627  0.32806832
  -0.94442658]
 ...
 [-0.51223187 -0.74091336  1.81100132 ... -0.02174627  0.32806832
   1.05884355]
 [-0.51223187 -0.74091336 -0.55218071 ... -0.02174627  0.32806832
  -0.94442658]
 [-0.51223187 -0.74091336  1.81100132 ... -0.02174627  0.32806832
   1.05884355]]
-----
[[-0.51223187 -0.74091336 -0.55218071 ... -0.02174627 -2.98489101
  -0.94442658]
 [ 1.95224088 -0.74091336 -0.55218071 ... -0.02174627  0.32806832
   1.05884355]
 [-0.51223187  1.34968547 -0.55218071 ... -0.02174627  0.32806832
   1.05884355]
 ...
 [-0.51223187 -0.74091336 -0.55218071 ... -0.02174627 -2.98489101
   1.05884355]
 [-0.51223187 -0.74091336  1.81100132 ... -0.02174627  0.32806832
   1.05884355]
 [-0.51223187  1.34968547 -0.55218071 ... -0.02174627  0.32806832
   1.05884355]]


In [49]:
# create model
ann = tf.keras.models.Sequential()

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

# train model
ann.fit(x_train, y_train, batch_size = 32, epochs = 100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2589de95cd0>

In [51]:
""" # use model with small dataset 500 rows
# args: age(1 on the right -> old), gender (male = 1), ethnicity   
# result: the higher the more likely it is that further actions happen
print(ann.predict(sc.transform([[1, 0, 0, 0, 1, 0]])))
print(ann.predict(sc.transform([[0, 1, 0, 0, 1, 0]]))) # no further action: 0.19: 18-24, male, not white uk
print(ann.predict(sc.transform([[0, 0, 1, 0, 1, 0]])))
print(ann.predict(sc.transform([[0, 0, 0, 1, 1, 0]])))

print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 0]]))) 
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 0]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 0]]))) # further action: 0.65: 25-34, female, not white uk
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 0]])))

print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 1]]))) 
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 1]]))) # no further action: 0.26: 18-24, female, white uk
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 1]]))) # further action: 0.51: 25-34, female, white uk
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 1]])))

print(ann.predict(sc.transform([[1, 0, 0, 0, 1, 1]]))) 
print(ann.predict(sc.transform([[0, 1, 0, 0, 1, 1]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 1, 1]])))
print(ann.predict(sc.transform([[0, 0, 0, 1, 1, 1]]))) """


# use model with big dataset 47000 rows
# args: age(1 on the right -> old, except all the way to the right is under 10), gender (male = 1), ethnicity (white uk = 1)  
# result: the higher the more likely it is that further actions happen
print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 1, 0]]))) 
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 1, 0]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 1, 0]])))
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 1, 0]])))
print(ann.predict(sc.transform([[0, 0, 0, 0, 1, 1, 0]])))

print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 0, 0]]))) 
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 0, 0]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 0, 0]])))
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 0, 0]])))
print(ann.predict(sc.transform([[0, 0, 0, 0, 1, 0, 0]]))) # further action: 0.30: under 10, female, no white uk

print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 0, 1]]))) # no further action: 0.14: 10-17, female, white uk
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 0, 1]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 0, 1]])))
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 0, 1]])))
print(ann.predict(sc.transform([[0, 0, 0, 0, 1, 0, 1]])))

print(ann.predict(sc.transform([[1, 0, 0, 0, 0, 1, 1]]))) # no further action: 0.13: 10-17, male, white uk
print(ann.predict(sc.transform([[0, 1, 0, 0, 0, 1, 1]])))
print(ann.predict(sc.transform([[0, 0, 1, 0, 0, 1, 1]]))) # further action: 0.31: 25-34, male, white uk
print(ann.predict(sc.transform([[0, 0, 0, 1, 0, 1, 1]])))
print(ann.predict(sc.transform([[0, 0, 0, 0, 1, 1, 1]])))

"""
result big dataset:

no further action: 
1. 10-17, female, white uk
2. 10-17, male, white uk

further action:
1. 25-34, male, white uk
2. under 10, female, no white uk    ?! (val_count of under 10: only 20 of 47000)
"""

[[0.15010828]]
[[0.239236]]
[[0.2823944]]
[[0.2538321]]
[[0.18109727]]
[[0.17543966]]
[[0.27573705]]
[[0.28369296]]
[[0.24314407]]
[[0.3071898]]
[[0.14283404]]
[[0.22989327]]
[[0.25778398]]
[[0.22895762]]
[[0.29629555]]
[[0.13655385]]
[[0.2977122]]
[[0.31415886]]
[[0.24845904]]
[[0.21776724]]
