### Exploring and Clustering of Neighborhoods in Toronto.

# OBTAINING DATAFRAME FROM WIKIPEDIA URL

In [1]:
import numpy as np 
import pandas as pd

In [2]:
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs =  pd.read_html(wikipedia_url)
len(dfs)

3

In [3]:
dfs[0].shape

(180, 3)

### The table we need is the first table from the wikipedia url, hence dfs[0]

In [4]:
toronto_neighborhood = dfs[0]
toronto_neighborhood.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


In [5]:
toronto_neighborhood.columns

Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')

In [6]:
# get names of indexes for which 
# column Borough has Not assigned
index_names = toronto_neighborhood[toronto_neighborhood['Borough'] == 'Not assigned' ].index 

# dropiing every entry in the Borough column that has 'Not assigned' as entry
toronto_neighborhood.drop(index_names, inplace = True)   
toronto_neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# lET'S RESET THE INDEX OF OUR DATARFRAME
toronto_neighborhood = toronto_neighborhood.reset_index(drop=True)
toronto_neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
missing = toronto_neighborhood[toronto_neighborhood['Neighbourhood'] == 'Not assigned']

# checking for Not assigned data is still not present in the Neighborhood column
missing

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [9]:
toronto_neighborhood.shape
# toronto_neighborhood[['Postal Code']]

(103, 3)

# UPDATING DATAFRAME WITH LATITUDE AND LONGITUDE COLUMNS

In [10]:
# import the geocoder library to find the lat and lng of the postal codes
!pip install geocoder
import geocoder



You should consider upgrading via the 'c:\users\mail4\appdata\local\programs\python\python38-32\python.exe -m pip install --upgrade pip' command.


## The geocoder library wasn't working and the csv file wasn't loading either.
## So I decided to use pgeocode library instead which worked perfectly 

In [11]:
!pip install pgeocode
import pgeocode



You should consider upgrading via the 'c:\users\mail4\appdata\local\programs\python\python38-32\python.exe -m pip install --upgrade pip' command.


In [11]:
nomi = pgeocode.Nominatim('ca') # intializing the country to canada
lat_lng = []
code_lat_lng = []
lat = None
lng = None

# for column in toronto_neighborhood[['Postal Code']]:
# postal_codes = toronto_neighborhood['Postal Code'].values
# postal_codes

# postal_codes should contain a list of the codes after this loop
for column in toronto_neighborhood[['Postal Code']]:
    postal_codes = toronto_neighborhood[column].values

#extracting lat and lng for all postal codes    
for code in postal_codes:
    #let's loop until we find both lat and lng incase we encounter None
    while((lat == None) or (lng == None)):
        df_lat_long = nomi.query_postal_code(code) # grabs the lat and lng for a postal code
        lat = df_lat_long['latitude']
        lng = df_lat_long['longitude']
    
#     print("Done with {}, Toronto, Ontario".format(code))
    lat_lng.append([lat, lng])
    code_lat_lng.append([code, lat, lng])
    lat = None
    lng = None

# lat_lng
code_lat_lng

[['M3A', 43.7545, -79.33],
 ['M4A', 43.7276, -79.3148],
 ['M5A', 43.6555, -79.3626],
 ['M6A', 43.7223, -79.4504],
 ['M7A', 43.6641, -79.3889],
 ['M9A', 43.6662, -79.5282],
 ['M1B', 43.8113, -79.193],
 ['M3B', 43.745, -79.359],
 ['M4B', 43.7063, -79.3094],
 ['M5B', 43.6572, -79.3783],
 ['M6B', 43.7081, -79.4479],
 ['M9B', 43.6505, -79.5517],
 ['M1C', 43.7878, -79.1564],
 ['M3C', 43.7334, -79.3329],
 ['M4C', 43.6913, -79.3116],
 ['M5C', 43.6513, -79.3756],
 ['M6C', 43.6915, -79.4307],
 ['M9C', 43.6437, -79.5767],
 ['M1E', 43.7678, -79.1866],
 ['M4E', 43.6784, -79.2941],
 ['M5E', 43.6456, -79.3754],
 ['M6E', 43.6889, -79.4507],
 ['M1G', 43.7712, -79.2144],
 ['M4G', 43.7124, -79.3644],
 ['M5G', 43.6564, -79.38600000000002],
 ['M6G', 43.6683, -79.4205],
 ['M1H', 43.7686, -79.2389],
 ['M2H', 43.8015, -79.3577],
 ['M3H', 43.7535, -79.4472],
 ['M4H', 43.7059, -79.3464],
 ['M5H', 43.6496, -79.3833],
 ['M6H', 43.6655, -79.4378],
 ['M1J', 43.7464, -79.2323],
 ['M2J', 43.7801, -79.3479],
 ['M3J', 

## ['M7R', nan, nan] 
#### Latitude and Long for M7R wasn't returned so we'll update the dataframe by 
#### dropping the row(s) corresponding to the postal code

In [13]:
len(lat_lng), len(postal_codes), toronto_neighborhood.shape
# toronto_neighborhood

NameError: name 'lat_lng' is not defined

In [13]:
# let's find out the index of the entry for the code "M7R"
# The postal code in the mississauga area is L4something and not M7R,
# so it only makes sense to get rid of this outlier from our data
toronto_neighborhood[toronto_neighborhood['Postal Code'] == 'M7R']

Unnamed: 0,Postal Code,Borough,Neighbourhood
76,M7R,Mississauga,Canada Post Gateway Processing Centre


In [18]:
toronto_neighborhood = toronto_neighborhood.drop([76])

In [21]:
#Now we can merge the drop the nan values from out lat and long befor adding to 
# our dataframe
lat_lng[76]

[nan, nan]

In [22]:
del lat_lng[76]
lat_lng

[[43.7545, -79.33],
 [43.7276, -79.3148],
 [43.6555, -79.3626],
 [43.7223, -79.4504],
 [43.6641, -79.3889],
 [43.6662, -79.5282],
 [43.8113, -79.193],
 [43.745, -79.359],
 [43.7063, -79.3094],
 [43.6572, -79.3783],
 [43.7081, -79.4479],
 [43.6505, -79.5517],
 [43.7878, -79.1564],
 [43.7334, -79.3329],
 [43.6913, -79.3116],
 [43.6513, -79.3756],
 [43.6915, -79.4307],
 [43.6437, -79.5767],
 [43.7678, -79.1866],
 [43.6784, -79.2941],
 [43.6456, -79.3754],
 [43.6889, -79.4507],
 [43.7712, -79.2144],
 [43.7124, -79.3644],
 [43.6564, -79.38600000000002],
 [43.6683, -79.4205],
 [43.7686, -79.2389],
 [43.8015, -79.3577],
 [43.7535, -79.4472],
 [43.7059, -79.3464],
 [43.6496, -79.3833],
 [43.6655, -79.4378],
 [43.7464, -79.2323],
 [43.7801, -79.3479],
 [43.7694, -79.4921],
 [43.6872, -79.3368],
 [43.62300000000001, -79.3936],
 [43.648, -79.4177],
 [43.7298, -79.2639],
 [43.7797, -79.3813],
 [43.739, -79.4692],
 [43.6803, -79.3538],
 [43.6469, -79.3823],
 [43.6383, -79.4301],
 [43.7122, -79.2843

In [23]:
# making sure we have consistency with dataframe and lat_lng list
len(lat_lng), toronto_neighborhood.shape

(102, (102, 3))

In [24]:
toronto_neighborhood = toronto_neighborhood.reset_index(drop=True)
toronto_neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
97,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
98,M4Y,Downtown Toronto,Church and Wellesley
99,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
100,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [38]:
# lat_lng now contains a list of lists containing the lat and long for each codes
# We need to prepare a lat_lng list so that it can be inserted into the dataframe called toronto_neighborhood


Latitude = []
Longitude = []
for i in range(len(lat_lng)):
    Latitude.append(lat_lng[i][0])
    Longitude.append(lat_lng[i][1])

# print(Latitude[101])
# print(Longitude[101])

# Add Longitude and Latitude lits as columns into the dataframe
toronto_neighborhood = toronto_neighborhood.assign(**{'Latitude' : Latitude, 'Longitude' : Longitude})
toronto_neighborhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
