# Explore and cluster the neighborhoods in Toronto.

Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [47]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim 
import folium 
import json
from pandas.io.json import json_normalize 

#### We will ping a website using: "requests.get(url).text" and  this will return us the HTML of the website

In [48]:
web_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
webpage = requests.get(web_url).text

#### We will use the BeautifulSoup Function to get the details of HTML into a Beautiful Object and we will use prettify function in BeautifulSoup to view the tags from parsed tree.

#### BeautifulSoup is a python package for parsing HTML and XML documents. This process of creating parse trees from HTML is useful for webscraping


In [49]:
 soup = BeautifulSoup(webpage, 'lxml')

#### By inspecting the HTML script carefully, we have identified the neighborhood and postalcode information for Toronto is available under class Wikitable Sortable

In [None]:
print(soup.prettify())

#### Get the table details using the findall method from Beautifulsoap

In [51]:
table = soup.find_all('table')[0]

#### Let's write a loop to get all the table details into our Pandas Dataframe


In [52]:
n_columns = 0
n_rows=0
column_names = []

# Find number of rows and columns
# we also find the column titles if we can
for row in table.find_all('tr'):

    # Determine the number of rows in the table
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        n_rows+=1
        if n_columns == 0:
            # Set the number of columns for our table
            n_columns = len(td_tags)

    # Handle column names if we find them
    th_tags = row.find_all('th') 
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())

# Safeguard on Column Titles
if len(column_names) > 0 and len(column_names) != n_columns:
    raise Exception("Column titles do not match the number of columns")

columns = column_names if len(column_names) > 0 else range(0,n_columns)
df = pd.DataFrame(columns = columns,
                  index= range(0,n_rows))
row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        df.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1
    if len(columns) > 0:
        row_marker += 1

#### Lets check our dataframe header

In [53]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


#### We see that there are newline characters existing our df, let's remove them

In [54]:
df = df.replace('\n','', regex=True)

In [55]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Let's create a new Dataframe to drop all the rows that are having the "not assigned" values in the Borough column

In [76]:
T_df = df[df.Borough != 'Not assigned'].reset_index()

In [77]:
T_df.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


#### Delete the index column from old dataframe

In [78]:
del T_df['index']

In [79]:
T_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [80]:
T_df.columns = ['Postcode', 'Borough', 'Neighbourhood']

#### Rename the columns to remove the hidden new line characters in the header

In [81]:
T_df = T_df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

#### aggregate the Neighbourhoods by Boroughs with a comma separated values and print the dataframe

In [82]:
T_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [83]:
T_df.shape

(103, 3)

In [84]:
T_df.loc[new_df['Neighbourhood']=='Not assgined', 'Neighbourhood'] = T_df['Borough']

#### Copy the Borough information to Neighborhood where Neighborhood column has values as "Not Assigned"

In [85]:
T_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Let's see the shape of dataframe

In [86]:
T_df.shape

(103, 3)

#### Now that we have the required details in dataframe Let's move on to get longitude and latitude details for the locations in the data frame

#### We have downloaded the CSV file from the link given in the task and we will proceed further by using pandas read_csv method to import the data into data frame

#### CSV File Link: http://cocl.us/Geospatial_data

In [88]:
geos_df = pd.read_csv("Geospatial_Coordinates.csv")

In [68]:
geospat_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Let's look at the shape of geospatial info Dataframe and assign the column headers to make dataframe eligible for join


In [71]:
geospat_df.shape

(103, 3)

In [74]:
geo_df.columns = ['Postcode', 'Latitude', 'Longitude']

#### Combine the dataframe using the Pandas merge function

In [75]:
final_df = pd.merge(new_df, geospat_df, how='inner', on='Postcode')
combi_df.head(30)

KeyError: 'Postcode'