# Segmenting and Clustering Neighborhoods in Toronto


## Web scraping the Wikipedia page

In [94]:
import pandas as pd

# Wikipedia page listing the Canadian postal codes that begin with "M".
my_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames; the first one is used here.
df = pd.read_html(my_url)[0]

# df.shape is a (rows, columns) tuple, so it can feed both %i placeholders directly.
print("Dataframe successfully retreived... \nDataframe shape: [%i,%i]" % df.shape)


Dataframe successfully retreived... 
Dataframe shape: [288,3]


## Removing rows with a "Not assigned" borough

In [95]:
print('Removing "not assigned" boroughs...')

# Keep only the rows whose Borough is something other than the placeholder value.
df = df.loc[df['Borough'] != 'Not assigned']

print("Dataframe shape: [%i,%i]" % df.shape)


Removing "not assigned" boroughs...
Dataframe shape: [211,3]


## Grouping by Postcode

In [96]:
# One row per Postcode: the distinct boroughs and all neighbourhoods of the
# group are each collapsed into a single comma-separated string.
df1 = (
    df.groupby('Postcode')
    .agg({
        'Borough': lambda boroughs: ', '.join(set(boroughs)),
        'Neighbourhood': lambda names: ', '.join(list(names)),
    })
    .reset_index()
)
df1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Replacing "Not assigned" neighbourhoods with the corresponding borough

In [97]:
# Passing a Series as the replacement value to Series.replace() together with
# a scalar `to_replace` is not a documented combination and is rejected by
# newer pandas releases. Series.mask() expresses the intent directly: on every
# row where the neighbourhood is 'Not assigned', take the Borough value from
# that same row (the two Series are aligned on the index).
not_assigned = df1['Neighbourhood'] == 'Not assigned'
df1['Neighbourhood'] = df1['Neighbourhood'].mask(not_assigned, df1['Borough'])
print('See the Postcode = M7A row. The Neighbourhood has changed from "Not assigned" to "Queens Park"')
df1

See the Postcode = M7A row. The Neighbourhood has changed from "Not assigned" to "Queens Park"


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Checking the shape of the dataframe

In [98]:
# df1.shape is a (rows, columns) tuple and fills both placeholders in order.
print('The dataframe has %i rows and %i columns.' % df1.shape)

The dataframe has 103 rows and 3 columns.


## Installing geocoder package for the coordinate extraction

In [99]:
!pip install geocoder
import geocoder # import geocoder




## Adding latitude and longitude columns to the dataframe

In [100]:
# Create both coordinate columns up front, filled with empty-string placeholders.
for column_name in ("Latitude", "Longitude"):
    df1[column_name] = ""

df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",,
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,


## Looping through the dataframe and filling in lat/long pairs using geocoder (with the HERE provider)
I registered at HERE.com to obtain the APP_ID and APP_CODE credentials.

In [111]:
import os
import time

# Credentials are read from the environment so they are never stored in the
# notebook itself. Set HERE_APP_ID and HERE_APP_CODE before running this cell.
HERE_APP_ID = os.environ['HERE_APP_ID']
HERE_APP_CODE = os.environ['HERE_APP_CODE']

# A lookup can come back without coordinates; retry a bounded number of times
# with a short pause instead of crashing on the first miss or looping forever.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 1

for index, row in df1.iterrows():
    postal_code = row['Postcode']
    print('Checking Postcode %s' % postal_code)
    search_string = postal_code + ',Toronto'

    lat_lng_coords = None
    for attempt in range(MAX_ATTEMPTS):
        g = geocoder.here(search_string, app_id=HERE_APP_ID, app_code=HERE_APP_CODE)
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
        if attempt < MAX_ATTEMPTS - 1:
            time.sleep(RETRY_DELAY_SECONDS)

    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
        print("lat/long = %f/%f" % (latitude, longitude))
    else:
        # No usable answer after all attempts: record NaN so the row is
        # visibly missing rather than aborting the whole run.
        latitude = float('nan')
        longitude = float('nan')
        print('No coordinates found for Postcode %s' % postal_code)

    # Write through a single .loc call keyed by the row label. The chained
    # `df1.Latitude.iloc[index] = ...` form treats the label as a position and
    # may assign to a temporary object instead of df1.
    df1.loc[index, 'Latitude'] = latitude
    df1.loc[index, 'Longitude'] = longitude
    

Checking Postcode M1B
lat/long = 43.811530/-79.195520
Checking Postcode M1C
lat/long = 43.785670/-79.158720
Checking Postcode M1E
lat/long = 43.765820/-79.175190
Checking Postcode M1G
lat/long = 43.768370/-79.217590
Checking Postcode M1H
lat/long = 43.769690/-79.239440
Checking Postcode M1J
lat/long = 43.743130/-79.231750
Checking Postcode M1K
lat/long = 43.726280/-79.263630
Checking Postcode M1L
lat/long = 43.713050/-79.285050
Checking Postcode M1M
lat/long = 43.724230/-79.227920
Checking Postcode M1N
lat/long = 43.696770/-79.259970
Checking Postcode M1P
lat/long = 43.759980/-79.268970
Checking Postcode M1R
lat/long = 43.750710/-79.300560
Checking Postcode M1S
lat/long = 43.793940/-79.267980
Checking Postcode M1T
lat/long = 43.784730/-79.299070
Checking Postcode M1V
lat/long = 43.817690/-79.280190
Checking Postcode M1W
lat/long = 43.800880/-79.320740
Checking Postcode M1X
lat/long = 43.834220/-79.216700
Checking Postcode M2H
lat/long = 43.802850/-79.356210
Checking Postcode M2J
lat/lo

KeyboardInterrupt: 

In [None]:
# Display df1, which at this point carries the Latitude and Longitude columns.
df1