# Coursera capstone - Neighborhoods of Toronto

In [6]:
# Import required modules
from bs4 import BeautifulSoup # library to scrape information from web pages
import urllib.request # library to open URLs
import pandas as pd # library for data analysis

## 1. Gathering and collecting data from wikipedia table

The Beautiful Soup package is used to scrape the web page, and urllib is used to open the URL. The information in each table row is collected and stored in a Python list.

In [7]:
# Wikipedia page holding the table of Canadian postal codes that begin with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the URL inside a context manager so the HTTP response is closed once
# Beautiful Soup has read and parsed it, rather than being left open for the
# lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'lxml')

In [8]:
# Rows scraped from the wiki page; each entry is a list of cell strings
wiki_data = []

# Walk every table row (<tr>) found anywhere on the page.
# `find_all` is Beautiful Soup's current method name; `findAll` is a legacy
# alias of the same method.
for row in soup.find_all('tr'):

    # Whitespace-stripped text of every data cell (<td>) in this row
    cells = [col.text.strip() for col in row.find_all('td')]

    # Avoid adding unwanted information: keep only rows with exactly 3 cells
    if len(cells) == 3:
        wiki_data.append(cells)

## 2. Creating pandas dataframe out of wiki data



In [9]:
# Do not truncate long cell contents when a dataframe is displayed
pd.options.display.max_colwidth = None

# One dataframe row per scraped table row; columns get default integer labels
df = pd.DataFrame(data=wiki_data)

df.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 2.1. Rename dataframe columns

In [10]:
# Replace the default integer column labels with descriptive names
column_names = ['Postalcode', 'Borough', 'Neighborhood']
df.columns = column_names
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 2.2  Ignore cells with a borough that is " *Not assigned* ".

In [11]:
# Keep only the rows whose borough is not the "Not assigned" placeholder,
# then renumber the remaining rows from 0
has_borough = df['Borough'] != "Not assigned"
df_filterd = df[has_borough].reset_index(drop=True)
df_filterd.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### 2.3 Combine multiple neighborhoods that share one postal code into a comma-separated list

In [12]:
# One row per (postal code, borough) pair, with that pair's neighborhood
# strings concatenated using "," as the separator
by_postalcode = df_filterd.groupby(['Postalcode', 'Borough'])['Neighborhood']
df_filterd_grp = by_postalcode.agg(",".join).reset_index()
df_filterd_grp.tail()

Unnamed: 0,Postalcode,Borough,Neighborhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove Gardens, Richview Gardens"
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens"
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


### 2.4  If a neighborhood is "Not assigned" but its borough is defined, make the neighborhood the same as the borough

In [13]:
# Rows whose neighborhood is still the "Not assigned" placeholder
unassigned = df_filterd_grp['Neighborhood'] == 'Not assigned'

# Fill those rows from the Borough column of the SAME frame so the values are
# row-aligned. The unfiltered `df` must not be used here: it still contains the
# rows with an unassigned borough and is not grouped, so its index does not
# line up with `df_filterd_grp`. Assigning the column back explicitly also
# avoids a chained in-place replace, which may not write through to the frame.
df_filterd_grp['Neighborhood'] = df_filterd_grp['Neighborhood'].mask(
    unassigned, df_filterd_grp['Borough']
)
df_filterd_grp.tail()

Unnamed: 0,Postalcode,Borough,Neighborhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove Gardens, Richview Gardens"
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens"
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


### 2.5 Dataframe shape

In [23]:
# (number of rows, number of columns) of the cleaned, grouped dataframe
df_filterd_grp.shape

(103, 3)

## Completed scraping and cleaning the wiki data.