In [15]:
import json

import folium
import requests
import pandas as pd

Read the raw data from a CSV file which has been downloaded from https://data.cityofnewyork.us/Education/Brooklyn-Schools/bkjd-kr4k

In [16]:
# Load the raw school data. The separator is passed by keyword because
# pandas >= 2.0 only accepts the path as a positional argument of read_csv.
df = pd.read_csv('brooklyn_schools_original.csv', sep=',')


Drop superfluous columns

In [17]:
# Remove the identifier / descriptive columns that the analysis does not use.
df = df.drop(
    columns=[
        'DBN',
        'Location Code',
        'Location Name',
        'Building Name',
        'Borough',
        'Geographical District Code',
        'Schools in Building',
        'ENGroupA',
    ]
)
df.head()

Unnamed: 0,Building Code,Address,Register,# Schools,Major N,Oth N,NoCrim N,Prop N,Vio N,RangeA,AvgOfMajor N,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N
0,K001,309 47 STREET,1277.0,1,0.0,2.0,1.0,1.0,0.0,1251-1500,0.86,3.26,5.55,2.17,1.29
1,K002,655 PARKSIDE AVENUE,479.0,3,,,,,,251-500,,,,,
2,K002,655 PARKSIDE AVENUE,397.0,3,,,,,,251-500,,,,,
3,K002,655 PARKSIDE AVENUE,,3,,,,,,,,,,,
4,K002,655 PARKSIDE AVENUE,876.0,3,1.0,5.0,2.0,2.0,4.0,751-1000,0.52,1.71,2.49,1.16,0.75


Delete all rows that have NaN values in either the 'Major N' or 'RangeA' column

In [18]:
# Keep only rows where both the major-crime count and the population range are present.
df = df.dropna(subset=['Major N', 'RangeA'])
df.head()

Unnamed: 0,Building Code,Address,Register,# Schools,Major N,Oth N,NoCrim N,Prop N,Vio N,RangeA,AvgOfMajor N,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N
0,K001,309 47 STREET,1277,1,0.0,2.0,1.0,1.0,0.0,1251-1500,0.86,3.26,5.55,2.17,1.29
4,K002,655 PARKSIDE AVENUE,876,3,1.0,5.0,2.0,2.0,4.0,751-1000,0.52,1.71,2.49,1.16,0.75
5,K003,50 JEFFERSON AVENUE,513,1,2.0,0.0,0.0,2.0,0.0,501-750,0.33,1.32,1.76,0.83,0.59
6,K005,820 HANCOCK STREET,312,1,1.0,1.0,0.0,2.0,0.0,251-500,0.35,1.06,1.09,0.73,0.5
7,K006,43 SNYDER AVENUE,714,1,0.0,1.0,2.0,0.0,0.0,501-750,0.33,1.32,1.76,0.83,0.59


Parse the 'RangeA' (population of building) column.
Each value is a range like 751-1000. Try to split the range and use the second (larger) value. There are also values of the form 4000+; for those, split by '+' and use the first value.
Write all the values back to the 'RangeA' column.

In [19]:
def map_range_a(value):
    """Convert a 'RangeA' population bucket into a single integer.

    Parameters
    ----------
    value : str or int
        Either a closed range such as ``'751-1000'`` (the upper bound is
        returned), an open range such as ``'4000+'`` (the number before the
        ``'+'`` is returned), or an integer that has already been parsed
        (returned unchanged).

    Returns
    -------
    int
        The parsed population figure.

    Raises
    ------
    ValueError
        If the selected part of the string is not a valid integer.
    """
    import numbers

    # Already-parsed integers pass through, so re-running the notebook cell
    # on a converted column does not fail on ``int.split``.
    if isinstance(value, numbers.Integral):
        return int(value)

    try:
        # 'low-high' -> take the part after the dash.
        return int(value.split('-')[1].strip())
    except IndexError:
        # No dash present: expect the open-ended 'N+' form.
        return int(value.split('+')[0])

# Replace every textual population range with the integer produced by map_range_a.
df['RangeA'] = df['RangeA'].map(map_range_a)
df.head()

Unnamed: 0,Building Code,Address,Register,# Schools,Major N,Oth N,NoCrim N,Prop N,Vio N,RangeA,AvgOfMajor N,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N
0,K001,309 47 STREET,1277,1,0.0,2.0,1.0,1.0,0.0,1500,0.86,3.26,5.55,2.17,1.29
4,K002,655 PARKSIDE AVENUE,876,3,1.0,5.0,2.0,2.0,4.0,1000,0.52,1.71,2.49,1.16,0.75
5,K003,50 JEFFERSON AVENUE,513,1,2.0,0.0,0.0,2.0,0.0,750,0.33,1.32,1.76,0.83,0.59
6,K005,820 HANCOCK STREET,312,1,1.0,1.0,0.0,2.0,0.0,500,0.35,1.06,1.09,0.73,0.5
7,K006,43 SNYDER AVENUE,714,1,0.0,1.0,2.0,0.0,0.0,750,0.33,1.32,1.76,0.83,0.59


Many schools are located in the same building at the same address. We combine these rows and take the average of all numerical values in each column.

In [20]:
# Collapse all rows that share a building into one row holding the per-column mean.
# numeric_only=True is stated explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises a TypeError instead.
df = (
    df
    .groupby(['Building Code', 'Address'], as_index=False)
    .mean(numeric_only=True)
)
df.head()

Unnamed: 0,Building Code,Address,# Schools,Major N,Oth N,NoCrim N,Prop N,Vio N,RangeA,AvgOfMajor N,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N
0,K001,309 47 STREET,1.0,0.0,1.0,0.333333,0.666667,0.0,1500.0,0.796667,3.166667,5.463333,2.023333,1.49
1,K002,655 PARKSIDE AVENUE,3.0,0.666667,3.333333,1.333333,1.333333,2.0,1000.0,0.453333,1.69,2.546667,1.103333,0.773333
2,K003,50 JEFFERSON AVENUE,1.0,0.666667,0.333333,1.0,0.666667,0.333333,666.666667,0.303333,1.113333,1.75,0.75,0.513333
3,K005,820 HANCOCK STREET,1.0,0.333333,0.666667,0.666667,0.666667,0.333333,416.666667,0.286667,0.96,1.393333,0.68,0.433333
4,K006,43 SNYDER AVENUE,1.0,0.0,0.666667,1.666667,0.333333,0.0,750.0,0.306667,1.183333,1.773333,0.79,0.543333


Get the longitude and latitude of each address by querying an OpenStreetMap service. To speed up subsequent runs, we store the results in a JSON file and use this file as a cache for later runs.

In [21]:
import importlib
import address_tools

# Re-execute the address_tools module so edits to it are picked up without restarting the kernel.
importlib.reload(address_tools)

def map_row(row):
    """Add geocoding columns to one school row.

    Looks up the coordinates for ``row['Address']`` through ``address_tools``
    and stores four new entries on the row: ``lon``, ``lat``,
    ``valid_coordinates`` and ``in_brooklyn``.

    NOTE(review): ``resolve=False`` presumably restricts the lookup to the
    local cache instead of calling the remote service - confirm in
    address_tools.

    Parameters
    ----------
    row : mapping-like (a pandas Series when used with ``DataFrame.apply``)
        Must contain an ``'Address'`` entry.

    Returns
    -------
    The same ``row`` object with the four entries added.
    """
    location = address_tools.get_coordinates_from_address(row['Address'], resolve=False)

    geo_fields = {
        'lon': location.lon,
        'lat': location.lat,
        'valid_coordinates': location.is_valid(),
        'in_brooklyn': address_tools.brooklyn.contains(location),
    }
    for column, value in geo_fields.items():
        row[column] = value

    return row

In [22]:
# Geocode every building (one map_row call per row).
df = df.apply(map_row, axis=1)

# Persist the prepared data. The separator is passed by keyword because
# positional arguments after the path are deprecated in DataFrame.to_csv.
df.to_csv('prepared_brooklyn_schools.csv', sep=',')
df.head()

# 


Unnamed: 0,Building Code,Address,# Schools,Major N,Oth N,NoCrim N,Prop N,Vio N,RangeA,AvgOfMajor N,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N,lon,lat,valid_coordinates,in_brooklyn
0,K001,309 47 STREET,1.0,0.0,1.0,0.333333,0.666667,0.0,1500.0,0.796667,3.166667,5.463333,2.023333,1.49,-73.999662,40.637904,True,True
1,K002,655 PARKSIDE AVENUE,3.0,0.666667,3.333333,1.333333,1.333333,2.0,1000.0,0.453333,1.69,2.546667,1.103333,0.773333,-73.951524,40.656441,True,True
2,K003,50 JEFFERSON AVENUE,1.0,0.666667,0.333333,1.0,0.666667,0.333333,666.666667,0.303333,1.113333,1.75,0.75,0.513333,-73.955544,40.682302,True,True
3,K005,820 HANCOCK STREET,1.0,0.333333,0.666667,0.666667,0.666667,0.333333,416.666667,0.286667,0.96,1.393333,0.68,0.433333,-73.922603,40.685546,True,True
4,K006,43 SNYDER AVENUE,1.0,0.0,0.666667,1.666667,0.333333,0.0,750.0,0.306667,1.183333,1.773333,0.79,0.543333,-73.956111,40.648889,True,True


In [23]:
# unresolved_addresses = df[df['in_brooklyn'] == False]
# unresolved_addresses.to_csv('unresolved_brooklyn_schools.csv', ',')
# unresolved_addresses.head()

In [24]:
# Keep only the rows flagged as lying inside Brooklyn.
df = df.loc[df['in_brooklyn'].eq(True)]

Calculate a crime factor by summing the averages of each crime type. Major crimes and violent crimes are weighted more heavily.

In [25]:
# def calculate_crime(row):
#     row['crime'] = int(row['AvgOfMajor N'] * 10 + row['AvgOfOth N'] + row['AvgOfNoCrim N'] + row['AvgOfProp N'] * 2 + row['AvgOfVio N'] * 7.5)
#     return row

# df = df.apply(calculate_crime, axis=1)
# df.head()

In [26]:
import math

import folium
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

Find the center of Brooklyn

In [27]:
# Centre of the Brooklyn shape provided by address_tools.
# NOTE(review): assumes the centre exposes x = longitude and y = latitude - confirm in address_tools.
center_of_brookyn = address_tools.brooklyn.center

# folium takes the location as (latitude, longitude).
# 'Stamen Terrain' is no longer a built-in tile set in folium >= 0.15 and
# raises "Custom tiles must have an attribution", so use the built-in
# OpenStreetMap tiles instead.
m = folium.Map(
    location=(center_of_brookyn.y, center_of_brookyn.x),
    zoom_start=12,
    tiles='OpenStreetMap',
)

def colorcode(x):
    """Map a numeric crime score to a marker colour.

    Parameters
    ----------
    x : int or float
        The score to classify.

    Returns
    -------
    str
        ``'green'`` for 0 <= x < 10, ``'orange'`` for 10 <= x < 20,
        ``'red'`` for 20 <= x < 30, and ``'darkred'`` for anything else
        (30 and above, negative values, NaN).
    """
    # Interval comparisons instead of ``x in range(...)``: range membership
    # only matches whole numbers, so a score such as 9.5 used to fall through
    # to 'darkred'.
    if 0 <= x < 10:
        return 'green'
    if 10 <= x < 20:
        return 'orange'
    if 20 <= x < 30:
        return 'red'
    return 'darkred'

# Draw one circle marker per school building on the map.
for _, row in df.iterrows():
    # Rows without a longitude (NaN) cannot be placed on the map.
    if math.isnan(row['lon']):
        continue

    # Fixed marker size; the population-based size is kept for reference.
    # radius = int(row['RangeA'] / 100)
    radius = 5

    # Fixed marker colour; the crime-based colour is kept for reference.
    # color = colorcode(row['crime'])
    color = 'green'

    folium.CircleMarker(
        location=(row['lat'], row['lon']),
        radius=radius,
        color=color,
        fill_color=color,
        fill_opacity=0.6,
        popup='School',
    ).add_to(m)

# Last expression of the cell: render the map inline.
m

In [28]:
# Write the map to an HTML file in the working directory.
m.save('schools.html')