Capstone Project

Imports all packages

In [17]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [22]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

Create a class with functions to parse data from the website into a dataframe

In [32]:
class TabelParser:

    def parse_url(self,url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text,'lxml')
        return [(self.parse_table(table))\
                    for table in soup.find_all('table', class_="wikitable sortable")]
    def parse_table(self,table):
        n_rows = 0 
        n_columns = 0
        column_name = []
        
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            
            if len(td_tags) > 0:
                n_rows += 1
                
                if n_columns == 0:
                    n_columns = len(td_tags)
                    
            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_name) == 0:
                for th in th_tags:
                    column_name.append(th.get_text())
        
        if len(column_name) > 0 and len(column_name) != n_columns:
            raise Exception("Headers do not match the number of columns")
            
        columns = column_name if len(column_name) > 0 else range(0,n_columns)
        df = pd.DataFrame(columns=columns,index = range(0,n_rows))
        
        row_index = 0
        for row in table.find_all('tr'):
            column_index = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_index,column_index] = column.get_text()
                column_index += 1
            if len(columns) > 0:
                row_index += 1
                
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
            
        return df

In [33]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tp = TabelParser()
df = tp.parse_url(url)[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
9,M8A,Not assigned,Not assigned\n


In [34]:
a = df.copy()

In [35]:
a = a[a.Borough != 'Not assigned']

In [36]:
a.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


In [None]:
Clean data, remove all the 'Not assigned'

In [45]:
a = a.reset_index().drop(['index'],axis=1)
a = a.replace('\n','',regex=True)

In [46]:
a.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [52]:
a.rename(columns={'Neighbourhood\n':'Neighbourhood'},inplace=True)

In [67]:
a.Neighbourhood[a.Neighbourhood=='Not assigned '] = a.Borough

In [69]:
a.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [80]:
df_full = a.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df_full = df_full.sample(frac=1).reset_index(drop=True)
df_full.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4A,North York,Victoria Village
1,M4Y,Downtown Toronto,Church and Wellesley
2,M2J,North York,"Fairview , Henry Farm , Oriole"
3,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
5,M1L,Scarborough,"Clairlea , Golden Mile , Oakridge"
6,M5N,Central Toronto,Roselawn
7,M6K,West Toronto,"Brockton , Exhibition Place , Parkdale Village"
8,M6G,Downtown Toronto,Christie
9,M6N,York,"The Junction North , Runnymede"


In [81]:
df_full.shape

(103, 3)

In [72]:
url = 'http://cocl.us/Geospatial_data'
latlong = pd.read_csv(url)

In [73]:
latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [75]:
latlong.rename(columns={'Postal Code':'Postcode'},inplace=True)

In [82]:
latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [83]:
df_full = df_full.merge(latlong,on='Postcode')

In [84]:
df_full

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.725882,-79.315572
1,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
2,M2J,North York,"Fairview , Henry Farm , Oriole",43.778517,-79.346556
3,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558
5,M1L,Scarborough,"Clairlea , Golden Mile , Oakridge",43.711112,-79.284577
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M6K,West Toronto,"Brockton , Exhibition Place , Parkdale Village",43.636847,-79.428191
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M6N,York,"The Junction North , Runnymede",43.673185,-79.487262
