## Segmenting and clustering neighborhoods in Toronto

In [20]:
# importing dependencies
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
import io
import geocoder

## Start the DataFrame

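Fetch the Wikipedia page with requests, locate its first table with BeautifulSoup, and parse that table into a DataFrame with the pd.read_html() method.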

In [3]:
# Download the Wikipedia page and parse its first <table> into a DataFrame.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang forever
req.raise_for_status()  # fail fast on an HTTP error instead of parsing an error page
# Name the parser explicitly; otherwise bs4 picks whichever one is installed and warns.
page = bs(req.content, 'html.parser')
table = page.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))
# read_html expects a file-like object; passing literal HTML text is deprecated.
df = pd.read_html(io.StringIO(str(table)), header = 0)[0]
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [5]:
# Clean the scraped table:
#   1. drop rows whose Borough is "Not assigned";
#   2. collapse rows sharing a (Postal Code, Borough) pair into one row,
#      joining their neighborhoods with ", ";
#   3. where the Neighborhood is "Not assigned", fall back to the Borough name.

df = df[df.Borough != 'Not assigned']
df = (
    df.groupby(['Postal Code', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
# Use a single .loc assignment: chained indexing (df.Neighborhood[mask] = ...)
# may write to a temporary copy and leave df unchanged.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
# Spot check: select the row(s) whose postal code is M5A and print them.
m5a_rows = df.loc[df['Postal Code'] == 'M5A']
message = 'Verify that the \'M5A\'-case is correct : \n{}\n '
print(message.format(m5a_rows))


Verify that the 'M5A'-case is correct : 
   Postal Code           Borough               Neighborhood
53         M5A  Downtown Toronto  Regent Park, Harbourfront
 


## Last requirement: 

Use the .shape attribute to print the number of rows of your dataframe.

In [9]:
# df.shape is (rows, columns); the first entry is the row count.
row_count = df.shape[0]
print('The number of rows of the dataframe is {}'.format(row_count))

The number of rows of the dataframe is 103


## Use the csv

Geocoder was taking too long to return the coordinates, so I used the CSV file at the URL made available on Coursera instead.

In [23]:
# Download the postal-code coordinates CSV and load it into a DataFrame.
url = "http://cocl.us/Geospatial_data"
response = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang forever
response.raise_for_status()  # do not try to parse an HTTP error page as CSV
req = response.content  # raw response bytes, kept under the original variable name
coord = pd.read_csv(io.StringIO(req.decode('utf-8')))
coord.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## Merging the coordinates into the DataFrame of Toronto boroughs and neighborhoods

In [25]:
# Left join on the shared 'Postal Code' column: every row of df is kept and
# gains the Latitude/Longitude columns from coord.
df_merged = df.merge(coord, how='left', on='Postal Code')
df_merged.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
