In [1]:
import json, pandas
from math import sqrt

In [29]:
import json
with open("incidents.json", 'r') as incident_json:
    incidents = json.load(incident_json)
    
with open("featuredata.json", 'r') as feature_json:
    featuredata = json.load(feature_json)
    
with open("reducedwaydata.json", "r") as waydata_json:
    waydata = json.load(waydata_json)
    
with open("node_data.json", "r") as nodedata_json:
    nodedata = json.load(nodedata_json)

In [19]:
def in_bounding_box(polyline, point, fuzz = 0.0001):
    """
    Check the first and last points in the polyline and check if the point lies within this box.
    Note: this will not work properly for curvy streets.
    """
    originating_point = polyline[0]
    last_point = polyline[-1]
    lat = float(point['lat'])
    lon = float(point['lon'])
    
    min_lat = min(float(originating_point['lat']), float(last_point['lat']))
    max_lat = max(float(originating_point['lat']), float(last_point['lat']))
    min_lon = min(float(originating_point['lon']), float(last_point['lon']))
    max_lon = max(float(originating_point['lon']), float(last_point['lon']))
    
    # 0.0001 degree is approx 10 metres.
    if (lat < min_lat - fuzz) or (lat > max_lat + fuzz):
        return False
    if (lon < min_lon - fuzz) or (lon > max_lon + fuzz):
        return False
    
    # If we are here it is in a reasonable box around the two points!
    return True

polyline = [
        {
            "lat": 0.0,
            "lon": 0.0,
            },
        {
            "lat": 0.1,
            "lon": 0.1,
            }
    ]

point = {"lat": 0.00005,
        "lon": 0.0}
true_point = {"lat": 0.05,
             "lon":0.05}
false_point = {"lat": -0.005,
        "lon": 0.0}
print(in_bounding_box(polyline, point))
print(in_bounding_box(polyline, true_point))
print(in_bounding_box(polyline, false_point))

True
True
False


In [20]:
def point_is_on_line(point_0, point_1, point_2, fuzz = 0.0001):
    """
    Calculate if a point is within 5m of perpendicular distance of a line.
    based on http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
    """
    # We have three points: point 0 is the one we want to know the distance of.
    # Points 1 and 2 are the line that we'll compare to.
    # Ignore the fact that the Earth is a sphere.
    x0 = float(point_0["lon"])
    y0 = float(point_0["lat"])
    x1 = float(point_1["lon"])
    y1 = float(point_1["lat"])
    x2 = float(point_2["lon"])
    y2 = float(point_2["lat"])
    
    numerator = abs((x2 - x1)*(y1 - y0) - (x1 - x0)*(y2 - y1))
    denominator = sqrt((x2 - x1)**2 + (y2 - y1)**2)
    try:
        d = numerator/denominator
    except ZeroDivisionError:
        print("Zero Division error.")
        return False
    
    if d > fuzz:
        return False
    return True

point_1 = {
    "lat": 0.0,
    "lon": 0.0
}
point_2 = {
    "lat": 0.1,
    "lon": 0.1
}

point_0_true = {
    "lat": 0.05005,
    "lon": 0.05
    }
point_0_false = {
    "lat": 0.0505,
    "lon": 0.05
    }

print(point_is_on_line(point_0_true, point_1, point_2))
print(point_is_on_line(point_0_false, point_1, point_2))

True
False


In [21]:
def point_is_on_polyline(polyline, point, fuzz = 0.0001):
    """
    Function to run through a polyline and tell if a point is on the polyline.
    """
    # First step - establish is point is in bounding range of polyline
    # Note that this might result in fuckups for curvy streets!
    if not in_bounding_box(polyline, point, fuzz):
        return False
    
    # If point is in box, then check point pairs and see if point is within 5 metres
    number_of_point_pairs = len(polyline) - 1
    for i in range(number_of_point_pairs):
        point_1 = polyline[i]
        point_2 = polyline[i+1]
        if point_is_on_line(point, point_1, point_2, fuzz):
            return True
        
    # If we got here, it wasn't close enough to the street.
    return False

polyline = {"points": [
        {
            "lat": 0.0,
            "lon": 0.0,
        },
        {
            "lat": 0.075,
            "lon": 0.075
        },
        {
            "lat": 0.1,
            "lon": 0.1
        }
    ]}

true_point = {
    "lat": 0.05,
    "lon": 0.05
}
false_point = {
    "lat": 0.2,
    "lon": 0.075
}
print(point_is_on_polyline(polyline['points'], true_point))
print(point_is_on_polyline(polyline['points'], false_point))

True
False


# Mapping assaults onto streets

In [22]:
counter = 0
for incident in incidents:
    counter +=1
    print(incident)
    if counter == 20:
        break

{'lon': 151.21479, 'lat': -33.89239}
{'lon': 151.20984, 'lat': -33.8677}
{'lon': 151.2191, 'lat': -33.872671}
{'lon': 151.22019, 'lat': -33.87026}
{'lon': 151.215001, 'lat': -33.88007}
{'lon': 151.206701, 'lat': -33.882432}
{'lon': 151.22451, 'lat': -33.87519}
{'lon': 151.213056, 'lat': -33.863096}
{'lon': 151.212542, 'lat': -33.87454}
{'lon': 151.207925, 'lat': -33.856479}
{'lon': 151.17502, 'lat': -33.87567}
{'lon': 151.22088, 'lat': -33.87502}
{'lon': 151.20743, 'lat': -33.87809}
{'lon': 151.20444, 'lat': -33.88283}
{'lon': 151.21036, 'lat': -33.863495}
{'lon': 151.20849, 'lat': -33.86097}
{'lon': 151.207695, 'lat': -33.896275}
{'lon': 151.20849, 'lat': -33.86097}
{'lon': 151.20849, 'lat': -33.86097}
{'lon': 151.202749, 'lat': -33.877137}


In [23]:
counter = 0
for way in waydata.keys():
    counter += 1
    print(waydata[way]['points'])
    if counter == 3:
        break

[{'lat': '-33.87224', 'lon': '151.19845'}, {'lat': '-33.87234', 'lon': '151.19852'}, {'lat': '-33.87266', 'lon': '151.19877'}, {'lat': '-33.87272', 'lon': '151.19881'}]
[{'lat': '-33.87147', 'lon': '151.23026'}, {'lat': '-33.87139', 'lon': '151.23071'}, {'lat': '-33.87124', 'lon': '151.23092'}]
[{'lat': '-33.89995', 'lon': '151.18582'}, {'lat': '-33.89997', 'lon': '151.18591'}, {'lat': '-33.90003', 'lon': '151.18589'}, {'lat': '-33.90012', 'lon': '151.18587'}, {'lat': '-33.9002', 'lon': '151.1858'}, {'lat': '-33.90023', 'lon': '151.18573'}]


In [24]:
# Our goal is to loop through all the offence locations.
# If they lie on a way, mark that way with an offence.
# If not go through them all.
counter = 0
for incident in incidents:
    counter += 1
    if counter == 10:
        break
        
    for way in waydata.keys():
        if point_is_on_polyline(waydata[way]['points'], incident):
            if "incidents" not in waydata[way].keys():
                waydata[way]['incidents'] = 1
            else:
                waydata[way]['incidents'] += 1
            break

Zero Division error.


In [25]:
# quick test...
some_dict = dict()
if "yep" not in some_dict.keys():
    some_dict["yep"] = 1
some_dict["yep"] += 1
print(some_dict)

{'yep': 2}


In [26]:
for way in waydata.keys():
    if "incidents" in waydata[way].keys():
        print(waydata[way]['incidents'])

1
1
1
1
1
1
1
1
1


In [27]:
with open("reducedwaydata.json", "r") as waydata_json:
    waydata = json.load(waydata_json)
    
# Our goal is to loop through all the offence locations.
# If they lie on a way, mark that way with an offence.
# If not go through them all.
for incident in incidents:
        
    for way in waydata.keys():
        if point_is_on_polyline(waydata[way]['points'], incident):
            if "incidents" not in waydata[way].keys():
                waydata[way]['incidents'] = 1
            else:
                waydata[way]['incidents'] += 1
            break

Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.


In [28]:
import pandas as pd

crime_streets = list()
for way in waydata.keys():
    if "incidents" in waydata[way].keys():
        
        crime_dict = dict()
        try:
            crime_dict['name'] = waydata[way]['name']
        except:
            crime_dict['name'] = 'path'
        crime_dict['incidents'] = waydata[way]['incidents']
        crime_streets.append(crime_dict)
        
df_crimestreets = pd.DataFrame(crime_streets)
print(df_crimestreets.sort_values('incidents', ascending = False))
print(df_crimestreets['incidents'].sum())

     incidents                        name
154        130                 Cahill Walk
130        122           Darlinghurst Road
379        108              Kellett Street
56          89                 Park Street
444         64                        path
681         51                        path
198         49               Bourke Street
52          47               Druitt Street
128         44       Liverpool St Cycleway
257         43                Mount Street
753         43               George Street
192         41               Oxford Street
816         38                        path
205         38              College Street
880         36               Bourke Street
72          34                        path
688         34  Western Distributor Onramp
182         33                        path
550         32             Victoria Street
189         31               Oxford Street
227         30            Glebe Point Road
605         30                Martin Place
930        

# Non-assault feature allocation

In [30]:
type(featuredata)

dict

In [35]:
featureset = set()

for key in featuredata.keys():
    counter += 1
    featureset.add(featuredata[key]["feature"])

print(featureset)

{'bar', 'shop', 'place_of_worship', 'restaurant', 'taxi', 'drinking_water', 'fast_food', 'hostel', 'atm', 'hospital', 'parking', 'pub', 'police', 'nightclub', 'bus_stop'}


In [38]:
# 0.0001 approx 10 m
# 0.001 approx 100 m
# 0.01 approx 1km

# We're assuming that these features has a range at which they're important... 
# This is not a great way to approach this, but it'll work as a first run...
featuresetfuzz = {
    "bar": 0.001,
    "shop": 0.001,
    "place_of_worship": 0.0005,
    "restaurant": 0.0005,
    "taxi": 0.0001,
    "drinking_water": 0.0001,
    "fast_food": 0.0005,
    "hostel": 0.005,
    "atm": 0.005,
    "hospital": 0.001,
    "parking": 0.001,
    "pub": 0.001,
    "police": 0.001,
    "nightclub": 0.005,
    "bus_stop": 0.0001
}

In [45]:
counter = 0
with open("reducedwaydata.json", "r") as waydata_json:
    waydata = json.load(waydata_json)

for feature in featuredata.keys():
    feature_type = featuredata[feature]["feature"]
    counter += 1    
    for way in waydata.keys():
        if point_is_on_polyline(waydata[way]['points'], featuredata[feature], featuresetfuzz[feature_type]):
            if feature_type not in waydata[way].keys():
                waydata[way][feature_type] = 1
            else:
                waydata[way][feature_type] += 1

Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division

In [46]:
import json
with open("feature_laden_ways_data.json", 'w') as f:
    json.dump(waydata, f, indent = 1)

In [49]:
# Ascribe hourly incident data
with open("feature_laden_ways_data.json", "r") as waydata_json:
    waydata = json.load(waydata_json)
    
# Our goal is to loop through all the offence locations.
# If they lie on a way, mark that way with an offence.
# If not go through them all.
for i in range(24):
    with open("{0}_incident.json".format(i), 'r') as incident_json:
        incidents = json.load(incident_json)
    for incident in incidents:
        for way in waydata.keys():
            if point_is_on_polyline(waydata[way]['points'], incident):
                if "incident_{0}".format(i) not in waydata[way].keys():
                    waydata[way]["incident_{0}".format(i)] = 1
                else:
                    waydata[way]["incident_{0}".format(i)] += 1
                break
                
with open("hourly_assaults_feature_laden_ways_data.json", 'w') as f:
    json.dump(waydata, f, indent = 1)

Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.
Zero Division error.


In [56]:
df_final_dataset = pd.DataFrame(waydata).transpose().fillna(0)
print(df_final_dataset.head())
df_final_dataset.to_csv("final_predictive_datset.csv")

           atm  bar  bus_stop  drinking_water  fast_food  hospital  hostel  \
100183506    1    0         0               0          0         0       3   
100183507    1    0         0               0          1         0       8   
100183508    1    0         0               0          0         0       1   
100183509    0    0         0               0          0         0       0   
100183510    0    1         0               0          0         0       0   

           incident_0  incident_1  incident_10  ...                     name  \
100183506           0           0            0  ...                        0   
100183507           0           0            0  ...                        0   
100183508           0           0            0  ...                        0   
100183509           0           0            0  ...   Bourke Street Cycleway   
100183510           0           0            0  ...                        0   

           nightclub  parking  place_of_worship  \

In [100]:
# Generate predictions for the safety of any particular street
from sklearn import tree

for i in range(24):
    X = df_final_dataset[list(featureset)]
    y = df_final_dataset["incident_{0}".format(i)]
    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(X, y)
    df_final_dataset["incident_{0}_predict".format(i)] = clf.predict(df_final_dataset[list(featureset)])
    low_threshold = df_final_dataset[df_final_dataset["incident_{0}_predict".format(i)] > 0]["incident_{0}_predict".format(i)].median()
    high_threshold = df_final_dataset[df_final_dataset["incident_{0}_predict".format(i)] > 0]["incident_{0}_predict".format(i)].mean()
    df_final_dataset["{0}".format(i)] = df_final_dataset["incident_{0}_predict".format(i)].apply(lambda x: 1 if x > low_threshold else 0)
    df_final_dataset["{0}".format(i)] = df_final_dataset[df_final_dataset["incident_{0}_predict".format(i)] > low_threshold]["incident_{0}_predict".format(i)].apply(lambda x: 2 if x > high_threshold else 1)
df_final_dataset.to_csv("prediction_datasets.csv")

In [101]:
# This is the final dataset that will be displayed on the website.
# Note that this is a horrible way to present this data for a web platform, and it will make any 
# web developer cry.

import json

relevant_columns = ["points","name"]
for i in range(24):
    relevant_columns.append("{0}".format(i))
df_json_output = df_final_dataset[(relevant_columns)]
print(df_json_output.head())
df_json_output.to_json("street_predictions.json", orient="records")
with open("street_predictions.json", 'r') as predictions_json:
    street_predictions = json.load(predictions_json)
with open("street_predictions.json", 'w') as predictions_json:
    json.dump(street_predictions, predictions_json, indent = 1)

                                                      points  \
100183506  [{'lon': '151.21764', 'lat': '-33.87626'}, {'l...   
100183507  [{'lon': '151.21792', 'lat': '-33.8749'}, {'lo...   
100183508  [{'lon': '151.21693', 'lat': '-33.87987'}, {'l...   
100183509  [{'lon': '151.2132', 'lat': '-33.89682'}, {'lo...   
100183510  [{'lon': '151.2166', 'lat': '-33.88118'}, {'lo...   

                             name    0    1    2    3    4    5    6    7  \
100183506                       0  2.0  2.0  1.0  1.0  2.0  2.0  NaN  NaN   
100183507                       0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
100183508                       0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
100183509  Bourke Street Cycleway  NaN  NaN  1.0  NaN  NaN  2.0  NaN  NaN   
100183510                       0  2.0  2.0  2.0  2.0  NaN  2.0  2.0  2.0   

          ...   14  15   16   17   18   19   20   21   22   23  
100183506 ...  NaN NaN  NaN  2.0  NaN  NaN  2.0  2.0  2.0  1.0  
100183507 ...  NaN NaN