In [5]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium as fol
import pandas as pd
pd.options.display.max_columns= None
pd.set_option('display.max_colwidth', -1)
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import requests

def foursq_search(lats, lngs, query, limit=50, radius=3000 ):
    '''Search Foursquare at the interable coordinates [lats/lngs] given for the given [query]. 
    Return a list of jsons containing the results'''
    res=[]
    CLIENT_ID = 'CJZWMC5IOUH4IG4KC0KORHSJUQNEQFHS0Y5XRPOO0S1OXMCW' # your Foursquare ID
    CLIENT_SECRET = 'VYWF405QELBRA3113JSAJMODMM4ZYIPWRFSVVZRJ1SIQ311L' # your Foursquare Secret
    VERSION = '20180605' # Foursquare API version

    base_url= 'https://api.foursquare.com/v2/venues/search?'
    
    for lat, lng in zip( lats, lngs):
        url= base_url + '&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&query={}&limit={}'.format(
                CLIENT_ID,
                CLIENT_SECRET,
                VERSION,
                lat,
                lng,
                radius,
                query,
                limit)
        try:
            result= requests.get(url).json()
        except:
            print('Error searching: {},{}. Assigning 0 venues.'.format(lat, lng))
            result= { 'response':{} }
        res.append(result)
    return res

def extract_results( results, amt=[], venues=[], unique_venues= [], specs= [] , excls= []):
    '''Extract/update from each json in the list of [results] the [amt] of venues and the [unique_venues]. 
    Optionally return only venues containing a string in the [specs] list and not one in the [exclude] list.
    Return a list with the number of venues in each result and a dataframe with the lat/lng/name of each unique venue'''

    for i, result in enumerate( results ):
        
        if len( amt ) < len( results ): 
            amt.append( 0 )
        
        # non-empty result
        if result['response'] != {}:
                
            # iterate through the venues in the response
            for venue in result['response']['venues']:

                # make a tuple of the lat/lng/name of each venue
                temp_venue= ( venue['location']['lat'], venue['location']['lng'], venue['name'] )

                # if the specifics list is nonempty check that at least one of the strings is in the venue name
                # if there are specifics and they aren't in the venue name move on to the next venue
                # similar process if a string in the inclusion list is present
                if ( specs != [] ) & ( np.array([spec.lower() in temp_venue[2].lower() for spec in specs] ).sum() == 0): 
                    continue
                if ( excls != [] ) & ( np.array([excl.lower() in temp_venue[2].lower() for excl in excls] ).sum() != 0): 
                    continue

                amt[i]+= 1 
                
                # if we haven't encountered this venue, add it to our unique venue list
                if not temp_venue in unique_venues: 
                    unique_venues+= [ temp_venue ]

                    
    return amt, unique_venues



def plot_points( lats, lngs , radii=[], colors=[], labels=[], opacities=[], toner=False,zoom=12, prev_map= None):
    '''Plot/add coordinates [lats/lngs] with optional [radii],[colors],[labels]. Optinally update a [prev_map].
    Return a map.'''
    
    pt_amt= len( lats )
    
    # check initial conditions    
    make_popups= lambda labels: [ fol.Popup( l, parse_html=True) for l in labels ] \
                                if len( labels ) == pt_amt \
                                else [None] * pt_amt
    check_radii= lambda radii: radii if len( radii ) == pt_amt else [1] * pt_amt
    check_colors= lambda colors: colors if len( colors ) == pt_amt else ['black'] * pt_amt
    check_opacities= lambda opacities: opacities if len( opacities ) == pt_amt else [1] * pt_amt
    
    popups= make_popups( labels )
    radii= check_radii( radii )
    colors= check_colors( colors )
    opacities= check_opacities( opacities )
    
    # if there was no previous map make a new one
    if prev_map == None:
        center= [ lats.mean(), lngs.mean() ]
        tiles= 'Stamen Toner' if toner else 'OpenStreetMap'
        prev_map= fol.Map( location=center, zoom_start=zoom, control_scale=True, tiles=tiles)
        
    for lat, lng, r, color, op, popup in zip(lats, lngs, radii, colors, opacities, popups):
        fol.Circle(
            location=[lat,lng],
            radius=r,
            color=color,
            popup= popup,
            fill=True,
            fill_color=color,
            fill_opacity=op
        ).add_to(prev_map)

    return prev_map

# Import raw data from CSV
## Also remove some unnecessary rows

In [6]:
# ontario has free data
data= pd.read_excel('https://files.ontario.ca/opendata/sif_data_table_2015_2016_en.xlsx')

# drop what seem to be keys as well as irrelevant/redundant columns
data.drop(['Board Number','Board Type','School Number', 'Province', 'Municipality','School Website','Board Website','Building Suite','P.O. Box'],axis=1,inplace=True)

# title case the city column for ease
data['City'] = data['City'].apply(lambda x: x.title())

# take only english speaking elementary and secondary schools into account
data= data[ data['School Language'] == 'English' ]
data.drop('School Language', axis=1, inplace=True)

data= data[ (data['School Level'] == 'Elementary') | (data['School Level'] == 'Secondary') ]
data.drop('School Level', axis=1, inplace=True)

print('The data has {} rows & {} cols.'.format(data.shape[0],data.shape[1]))

data.head(2)

The data has 4449 rows & 42 cols.


Unnamed: 0,Board Name,School Name,School Type,School Special Condition Code,Grade Range,Street,City,Postal Code,Phone Number,Fax Number,Enrolment,Latitude,Longitude,Percentage of Students Whose First Language Is Not English,Percentage of Students Whose First Language Is Not French,Percentage of Students Who Are New to Canada from a Non-English Speaking Country,Percentage of Students Who Are New to Canada from a Non-French Speaking Country,Percentage of Students Receiving Special Education Services,Percentage of Students Identified as Gifted,Percentage of Grade 3 Students Achieving the Provincial Standard in Reading,Change in Grade 3 Reading Achievement Over Three Years,Percentage of Grade 3 Students Achieving the Provincial Standard in Writing,Change in Grade 3 Writing Acheivement Over Three Years,Percentage of Grade 3 Students Achieving the Provincial Standard in Mathematics,Change in Grade 3 Mathematics Achievement Over Three Years,Percentage of Grade 6 Students Achieving the Provincial Standard in Reading,Change in Grade 6 Reading Achievement Over Three Years,Percentage of Grade 6 Students Achieving the Provincial Standard in Writing,Change in Grade 6 Writing Acheivement Over Three Years,Percentage of Grade 6 Students Achieving the Provincial Standard in Mathematics,Change in Grade 6 Mathematics Achievement Over Three Years,Percentage of Grade 9 Students Achieving the Provincial Standard in Academic Mathematics,Change in Grade 9 Academic Mathematics Acheivement Over Three Years,Percentage of Grade 9 Students Achieving the Provincial Standard in Applied Mathematics,Change in Grade 9 Applied Mathematics Achievement Over Three Years,Percentage of Students That Passed the Grade 10 OSSLT on Their First Attempt,Change in Grade 10 OSSLT Literacy Achievement Over Three Years,Percentage of Children Who Live in Low-Income Households,Percentage of Students Whose Parents Have Some Unviersity Education,Percentage of JK-Grade 3 Classes With 20 Students or Fewer,Percentage of JK-Grade 3 Classes With 23 Students or Fewer,Extract Date
0,Algoma DSB,Algoma Education Connection Secondary School,Public,Alternative,9-12,550 NORTHERN AVENUE,Sault Ste. Marie,P6B4J4,,,236.0,46.53477,-84.30772,,100,,,18.6,,,,,,,,,,,,,,N/D,,N/R,,N/R,,33.88,SP,,,Dec-04-17
1,Algoma DSB,Anna McCrea Public School,Public,Not applicable,JK-8,250 Mark,Sault Ste Marie,P6A3M7,705-945-7106,705-945-7221,168.0,46.50593,-84.28732,SP,100,SP,SP,15.5,,0.77,,0.58,,0.81,,0.8,,0.67,,0.53,,,,,,,,8.1,20.97,1.0,1.0,Dec-04-17


## Clean up our data into the dataframe we will use
Create a dataframe including the most useful columns from the original data, we also get rid of null values and replace them with the average for the column.

In [7]:
# extract only the columns we want
cols= ['School Name','Enrolment','Latitude','Longitude','City']
school_df= data[cols].copy()

#these are the numerical portions of the original data
pct_df= data.iloc[:,-5:-3]

school_df= pd.concat( [school_df, pct_df], axis=1 , sort=True )

# change the column names to make them easier to work with
school_df.columns= ['school','enrol','lat','lng','city','pct_low_income', 'pct_uni_parents']

# drop all entries with null in any of the specified columns
school_df.dropna(subset= ['school','enrol','lat','lng','city'], inplace=True)

# make null entries the average for the numerical data
for col in school_df:
    if not col in ['school', 'enrol', 'lat', 'lng' ,'city']:
        avg= 0
        num_entries= 0
        for val in school_df[col].values:
            if (not val in ['SP','N/R','N/D']) & (val == val):
                avg+= val
                num_entries+= 1
        avg= avg / num_entries
        school_df[col].replace( ['SP','N/R','N/D', np.nan], avg, inplace=True )

print('The schools dataframe has {} rows & {} cols.'.format(school_df.shape[0],school_df.shape[1]))
school_df.head(2)

The schools dataframe has 4357 rows & 7 cols.


Unnamed: 0,school,enrol,lat,lng,city,pct_low_income,pct_uni_parents
0,Algoma Education Connection Secondary School,236.0,46.53477,-84.30772,Sault Ste. Marie,33.88,24.372899
1,Anna McCrea Public School,168.0,46.50593,-84.28732,Sault Ste Marie,8.1,20.97


## Narrow our scope to Mississauga
Some of the resulting schools were very removed from the rest so we ignore them

In [8]:
missis_df= school_df[ school_df.city == 'Mississauga' ].copy()

# there were some schools listed which werent _really_ in mississauga so we omit them, the school listed is the highest latitude of schools within mis
cutoff_lat= missis_df.loc[ missis_df.school == 'Derry West Village Public School'].lat.values[0]
missis_df= missis_df[ missis_df['lat'] <=  cutoff_lat ]

# drop columns we dont need
missis_df.drop('city', axis=1,inplace=True)
missis_df.reset_index(drop=True, inplace=True)

print('There are {} schools in Mississauga'.format(missis_df.shape[0]))
missis_df.head(2)

There are 190 schools in Mississauga


Unnamed: 0,school,enrol,lat,lng,pct_low_income,pct_uni_parents
0,All Saints Catholic School,346.0,43.53156,-79.71057,12.92,25.89
1,Bishop Scalabrini School,557.0,43.5839,-79.63587,32.55,54.6


## Search FourSquare for tutoring services near schools
We make three searches: tutors, math and learn. We also filter the results to ignore irrelevant venues found

In [9]:
# get the foursquare results for searches 'tutor', 'math' and 'learn'

amt= []
unique_tutors= []

print('Working.. 1/3')
results1= foursq_search(missis_df.lat, missis_df.lng, query='tutor')
amt, unique_tutors= extract_results(results1)

print('Working.. 2/3')
results2= foursq_search(missis_df.lat, missis_df.lng, query='math')
amt, unique_tutors= extract_results(results2, amt=amt, unique_venues=unique_tutors, specs=['math ', 'mathematics', 'mathnasium'])

print('Working.. 3/3')
results3= foursq_search(missis_df.lat, missis_df.lng, query='learn')
amt, unique_tutors= extract_results(results3, amt=amt, unique_venues=unique_tutors , specs=[ 'oxford', 'sylvan'])
print('Done!')

# make a column for the number of services near each school
missis_df['tutor_services']= amt

# this is a measure of how good the school is based on how many students are in it and the number of services near it
missis_df['enrol_tutors_ratio']= missis_df.enrol / (missis_df.tutor_services + 1 )

print('Results collected.')
missis_df.head(2)

Working.. 1/3
Working.. 2/3
Working.. 3/3
Done!
Results collected.


Unnamed: 0,school,enrol,lat,lng,pct_low_income,pct_uni_parents,tutor_services,enrol_tutors_ratio
0,All Saints Catholic School,346.0,43.53156,-79.71057,12.92,25.89,1,173.0
1,Bishop Scalabrini School,557.0,43.5839,-79.63587,32.55,54.6,7,69.625


## Create a dataframe for the unique tutoring services¶
These are the unique venues the search gave up. There are 18 in total.

In [10]:
#make a dataframe with the information for each unique service found
unique_tutors_df= pd.DataFrame.from_records(unique_tutors, columns=['lat','lng','name'])
unique_tutors_df

Unnamed: 0,lat,lng,name
0,43.58091,-79.63718,Tutor
1,43.600502,-79.639748,Mississauga Tutor
2,43.60192,-79.62533,Logan's Tutor
3,43.5934,-79.6348,Kip Tutoring
4,43.596947,-79.662345,Ican Education Mississauga Tutoring
5,43.620395,-79.645786,Tutoring
6,43.566586,-79.71257,Mind Over Matter Tutoring
7,43.584335,-79.568275,Calculus Tutor
8,43.601764,-79.624681,Kumon Math & Reading Centre
9,43.497479,-79.707507,Kumon Math &Reading Center


# Mapping tutors and schools

In [12]:
tut_amt= unique_tutors_df.shape[0]

# make yellow circles signifying the effective radius of each tutoring services
area_map= plot_points( unique_tutors_df.lat, 
                         unique_tutors_df.lng,
                         [3000] * tut_amt,
                         ['yellow'] * tut_amt,
                         opacities= [0.1] * tut_amt )

# add the tutoring services to the map
tut_map= plot_points( unique_tutors_df.lat, 
                         unique_tutors_df.lng,
                         [100] * tut_amt,
                         ['red'] * tut_amt, 
                         unique_tutors_df.name, 
                         prev_map=area_map )


# add the schools to the map
sch_amt= missis_df.shape[0]
labels= [ name + ' : {} tutoring services'.format(tut) for name, tut in zip( missis_df.school, missis_df.tutor_services ) ]

full_map= plot_points( missis_df.lat, 
                         missis_df.lng,
                         [80] * sch_amt,
                         ['blue'] * sch_amt, 
                         labels,
                         missis_df.tutor_services / missis_df.tutor_services.max(),
                         prev_map=tut_map )

full_map

# Using K-Means Algorithm to Cluster Schools

In [13]:
# make a temporary dataframe to extract the data to feed the K-Means algorithm
cols= ['school', 'lat', 'lng']
kmeans_tempdf= missis_df.drop(cols, axis=1)
# ensure there are no null values
kmeans_tempdf.head()

Unnamed: 0,enrol,pct_low_income,pct_uni_parents,tutor_services,enrol_tutors_ratio
0,346.0,12.92,25.89,1,173.0
1,557.0,32.55,54.6,7,69.625
2,494.0,27.96,48.35,7,61.75
3,259.0,25.81,29.19,2,86.333333
4,238.0,24.29,33.92,7,29.75


# Normalize our data

In [14]:
# fit our data to emulate a standard normal distribution to make sure all factors are equal
X= np.nan_to_num( kmeans_tempdf.values )
X= StandardScaler().fit_transform(X)
print(X[:5])
print('Data Standardized.')

[[-0.57572535 -1.12218321 -0.88442529 -0.64695138 -0.25975603]
 [ 0.02745848  2.11923356  1.20944271  2.03495616 -0.7383676 ]
 [-0.15263906  1.36130676  0.75361982  2.03495616 -0.77482773]
 [-0.82443147  1.00628659 -0.64375081 -0.19996679 -0.66101039]
 [-0.88446399  0.75529558 -0.29878405  2.03495616 -0.92298319]]
Data Standardized.


# Create the ML model and fit it with our data

In [15]:
clusters= 9

# run k-means on the data separated
kmeans= KMeans(init='k-means++', n_clusters=clusters, n_init= 12)
kmeans.fit(X)
print('Model fit with data.')

Model fit with data.


In [16]:
# make a column for the clusters given to each school
missis_df['cluster']= kmeans.labels_
missis_df[['school','cluster']].head()

Unnamed: 0,school,cluster
0,All Saints Catholic School,2
1,Bishop Scalabrini School,6
2,Canadian Martyrs School,6
3,Christ The King Catholic School,1
4,Corpus Christi School,8


# Analyzing the Clusters

In [17]:
color_map= [ 'red','blue','orange','black','lime','green','pink','purple','brown' ]

# show the number of schools in each cluster as well as the mean ratio for each 
view= missis_df.groupby('cluster').mean().reset_index()
view['color']= view.cluster.apply( lambda c: color_map[c].title() )
view['count'] = missis_df.cluster.value_counts(sort=False)

cols= view.columns.tolist()
cols= cols[-2:] + [cols[-3]] + [cols[1]] + cols[4:-3]
view= view[cols]

view.columns= [ s.replace('_', ' ').title() for s in view.columns ]
view.set_index('Color', inplace=True)
view.index.name= None
view.sort_values('Enrol Tutors Ratio', ascending=False ).apply( lambda x: round(x, 2), axis=1)

Unnamed: 0,Count,Enrol Tutors Ratio,Enrol,Pct Low Income,Pct Uni Parents,Tutor Services
Black,4.0,1127.25,1357.0,17.52,19.87,0.25
Green,17.0,497.09,639.24,15.64,40.76,0.35
Lime,20.0,462.86,1239.25,22.52,27.42,2.1
Red,26.0,214.97,624.38,17.84,57.09,2.08
Purple,34.0,172.62,361.29,13.73,42.38,1.35
Blue,26.0,167.01,391.23,24.62,36.64,1.54
Orange,24.0,125.36,265.5,15.92,21.08,1.29
Pink,17.0,82.81,596.29,31.0,46.87,6.35
Brown,22.0,49.82,351.18,21.8,32.8,6.23


# Map the Clustered Schools

In [18]:
avgs= missis_df.enrol_tutors_ratio
sch_amt= missis_df.shape[0]
labels= [ name + ' : {} naive-expected students'.format( ratio ) for name, ratio in zip( missis_df.school, avgs.apply(int) ) ]
color_map= [ 'red','blue','orange','black','lime','green','deeppink','purple','brown' ]

full_map= plot_points( missis_df.lat, 
                         missis_df.lng,
                         50 + 200*(( avgs - avgs.min() ) / (avgs.max() - avgs.min() )),
                         [ color_map[ cluster ] for cluster in missis_df.cluster ], 
                         labels,
                         [0.5] * sch_amt )#, toner=True)

full_map

# Process Generalization
Here are functions generalizing the steps we took earlier, now we can repeat the process for any Ontario city we desire.

In [19]:
def get_city_df(city):
    if isinstance(city, str):
        city_df= school_df[ school_df.city == city ].copy()
    elif isinstance(city, list):
        city_df= school_df[ school_df.city == city[0] ].copy()
        for c in city[1:]:
            city_df= pd.concat( [ city_df, school_df[ school_df.city == c ] ], axis=0 )
    else:
        return None
    # drop columns we dont need
    city_df.drop('city', axis=1,inplace=True)
    city_df.reset_index(drop=True, inplace=True)
    
    return city_df

def find_tutors(city_df, queries= ['tutor'] ):
    # get the foursquare results for searches 'tutor', 'math' and 'learn'
    ttl= len(queries)
    results= []
    for i, query in enumerate(queries):
        print('Working.. {}/{}'.format(i + 1, ttl))
        results.append(foursq_search(city_df.lat, city_df.lng, query=query ))
        
    print('Done!')
    
    return results
    
def parse_results(city_df, results, specs=[], excls=[] ):
    ttl= len( results )
    amt=[]
    unique_tutors=[]
    
    make_empties= lambda lst: lst if len(lst) == ttl else [[]] * ttl
        
    specs= make_empties(specs)
    excls= make_empties(excls)
    
    for i, result in enumerate(results):
        amt, unique_tutors= extract_results(result, amt=amt, unique_venues=unique_tutors, specs=specs[i], excls= excls[i] )
    # make a column for the number of services near each school
    city_df['tutor_services']= amt

    # this is a measure of how good the school is based on how many students are in it and the number of services near it
    city_df['enrol_tutors_ratio']= city_df.enrol / (city_df.tutor_services + 1 )
    
    unique_tutors_df= pd.DataFrame.from_records(unique_tutors, columns=['lat','lng','name'])

    return city_df, unique_tutors_df

def cluster_schools(city_df, clus= 9):
    cols= ['school', 'lat', 'lng']
    
    X= np.nan_to_num( city_df.drop(cols, axis=1).values )
    X= StandardScaler().fit_transform(X)

    # run k-means on the data separated
    kmeans= KMeans(init='k-means++', n_clusters=clus, n_init= 12)
    kmeans.fit(X)

    # make a column for the clusters given to each school
    city_df['cluster']= kmeans.labels_
    
    return city_df

def map_sch_tut(city_df, unique_tutors_df, clustered=False, zoom=12, prev_map=None):
    tut_amt= unique_tutors_df.shape[0]

    area_map= plot_points( unique_tutors_df.lat, 
                             unique_tutors_df.lng,
                             [3000] * tut_amt,
                             ['yellow'] * tut_amt,
                             opacities= [0.09] * tut_amt,
                             zoom=zoom,
                             prev_map=prev_map )

    #add the tutoring services to the map
    tut_map= plot_points( unique_tutors_df.lat, 
                             unique_tutors_df.lng,
                             [100] * tut_amt,
                             ['red'] * tut_amt, 
                             unique_tutors_df.name, 
                             prev_map=area_map )



    sch_amt= city_df.shape[0]
    labels= [ name + ' : {} Nearby Services'.format(tut) for name, tut in zip( city_df.school, city_df.tutor_services ) ]

    if not clustered:
        full_map= plot_points( city_df.lat, 
                                 city_df.lng,
                                 [80] * sch_amt,
                                 ['blue'] * sch_amt, 
                                 labels,
                                 city_df.tutor_services / city_df.tutor_services.max(),
                                 prev_map=tut_map )

        return full_map
    else:
        return map_clusters(city_df, prev_map= tut_map)

def map_clusters(city_df, prev_map=None):
    avgs= city_df.enrol_tutors_ratio
    sch_amt= city_df.shape[0]
    labels= [ name + ' : {} Naive-expected Students : {} Nearby Services'.format( ratio, tut ) for name, ratio, tut in zip( city_df.school, avgs.apply(int), city_df.tutor_services ) ]
    color_map= [ 'brown','blue','pink','red','purple','black','yellow','orange','green' ]

    full_map= plot_points( city_df.lat, 
                             city_df.lng,
                             50 + 200*(( avgs - avgs.min() ) / (avgs.max() - avgs.min() )),
                             [ color_map[ cluster ] for cluster in city_df.cluster ], 
                             labels,
                             [0.5] * sch_amt,
                             prev_map=prev_map )

    return full_map

# Mississauga, Oakville, Brampton, Etobicoke and North York
We will now do the quick version of our process to the combination of these cities.
Get the schools:

In [20]:
cities= ['Mississauga','Oakville','Brampton','Etobicoke','North York']
city_df= get_city_df(cities)
print('There are {} schools in '.format(city_df.shape[0]), end='')
for i, cit in enumerate(cities):
    print(cit, end= ' ') if cit != cities[-1] else print('& {}.'.format(cit))
city_df.head(2)

There are 610 schools in Mississauga Oakville Brampton Etobicoke & North York.


Unnamed: 0,school,enrol,lat,lng,pct_low_income,pct_uni_parents
0,All Saints Catholic School,346.0,43.53156,-79.71057,12.92,25.89
1,Archbishop Romero Catholic Secondary School,232.0,43.6867,-79.7605,21.48,14.27


# Search for tutoring services:

In [21]:
queries= ['tutors', 'math', 'learning' ]
results= find_tutors(city_df, queries)

Working.. 1/3
Working.. 2/3
Working.. 3/3
Done!


# Extract the venues:¶
These contain irrelevant results, you can see 'Dr. Mathew Dentist Office' on row 2.

In [22]:
city_sch_tut, services= parse_results( city_df, results )
print('There are {} unique relevant results.'.format( len( services ) ))
print(services.name[:5])
city_sch_tut.head(2)

There are 166 unique relevant results.
0    Impel Tutors                     
1    Spirit of Math                   
2    Dr. Mathew Dentist Office        
3    Academy for Mathematics & Science
4    Kumon Math & Reading Centre      
Name: name, dtype: object


Unnamed: 0,school,enrol,lat,lng,pct_low_income,pct_uni_parents,tutor_services,enrol_tutors_ratio
0,All Saints Catholic School,346.0,43.53156,-79.71057,12.92,25.89,6,49.428571
1,Archbishop Romero Catholic Secondary School,232.0,43.6867,-79.7605,21.48,14.27,4,46.4


# Filter our results:
We look through the results and create specifications and exclusions to re-parse the results. These are done manually and could be automated.

In [23]:
specs= [[], ['math ', 'mathstat', 'mathematics'], []]
excls= [    [],
           [ 'copy room', 'humber', 'library', 'class', 'department' ],
           [ 'acend', 'elearning', 'e-learning' ,'playground','pavilion','disabilities',
               'early', 'teksource', 'build','rider','network','adult',
               'enabled', 'york','library','solutions','scotiabank',
               'tykes','child','bmo','international','agincourt','code','engage'
               'e-learning','music','ocadu','rbc','research','smw','ryerson',
               'reiki','employee', 'path' ,'otf','thornhill', 'day care', 'golf', 
                'humber', 'finance','gems'] ]

city_sch_tut, services= parse_results( city_df, results , specs=specs, excls=excls)
print('There are {} unique relevant results.'.format( len( services ) ))
print(services.name[:5])
city_sch_tut.head()

There are 79 unique relevant results.
0    Impel Tutors                     
1    Academy for Mathematics & Science
2    Kumon Math & Reading Centre      
3    Kumon Math &Reading Center       
4    academy for mathematics & english
Name: name, dtype: object


Unnamed: 0,school,enrol,lat,lng,pct_low_income,pct_uni_parents,tutor_services,enrol_tutors_ratio
0,All Saints Catholic School,346.0,43.53156,-79.71057,12.92,25.89,2,115.333333
1,Archbishop Romero Catholic Secondary School,232.0,43.6867,-79.7605,21.48,14.27,4,46.4
2,Ascension of Our Lord Secondary School,841.0,43.72338,-79.65349,26.93,9.88,0,841.0
3,Bishop Scalabrini School,557.0,43.5839,-79.63587,32.55,54.6,3,139.25
4,Canadian Martyrs School,494.0,43.59904,-79.62194,27.96,48.35,4,98.8


# Here are the schools and tutoring services in Toronto:
Each blue marker represents a school.
Each red marker represents a tutoring service.
The yellow circles denote a 3km radius from each service. A 3km radius was used in our searches.

In [24]:
map_sch_tut(city_sch_tut, services, zoom=11 )

# Now we use KMeans to cluster the schools

In [25]:
city_sch_tut= cluster_schools(city_sch_tut)
color_map= [ 'red','blue','orange','black','lime','green','pink','purple','brown' ]

# show the number of schools in each cluster as well as the mean ratio for each 
view= city_sch_tut.groupby('cluster').mean().reset_index()
view['color']= view.cluster.apply( lambda c: color_map[c].title() )
view['count'] = missis_df.cluster.value_counts(sort=False)

cols= view.columns.tolist()
cols= cols[-2:] + [cols[-3]] + [cols[1]] + cols[4:-3]
view= view[cols]

view.columns= [ s.replace('_', ' ').title() for s in view.columns ]
view.set_index('Color', inplace=True)
view.index.name= None
view.sort_values('Enrol Tutors Ratio', ascending=False ).apply( lambda x: round(x, 2), axis=1)

Unnamed: 0,Count,Enrol Tutors Ratio,Enrol,Pct Low Income,Pct Uni Parents,Tutor Services
Lime,20.0,1161.54,1294.69,20.93,22.72,0.15
Black,4.0,437.13,713.93,14.35,52.79,0.87
Blue,26.0,415.99,1271.68,19.85,23.02,2.34
Red,26.0,197.91,354.49,20.96,25.51,1.02
Purple,34.0,162.38,495.72,13.87,46.27,2.15
Pink,17.0,99.79,418.39,28.54,41.1,3.42
Brown,22.0,97.26,398.16,17.12,22.98,3.14
Orange,24.0,96.18,361.11,32.72,16.24,3.22
Green,17.0,47.82,343.07,18.96,39.53,6.4


## Finally here are the clustered schools along with the nearby services.
Here the colours to look out for are orange and green. These schools have the least nearby services as well as the most expected students

In [26]:
map_sch_tut(city_sch_tut, services, clustered=True, zoom=11)