#### Exploring neighbourhoods in Toronto

Import the libraries used for scraping and data analysis.

In [2]:
from bs4 import BeautifulSoup #beautifulSoup package for scraping data

import requests #library to handle requests

import pandas as pd #library for data analysis

import numpy as np # to handle data in vectorized forms

print('libraries imported')

libraries imported


#### Define url containing data to be scraped


In [11]:
# Address of the Wikipedia article listing Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Bare expression so the notebook echoes the address back
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

ERROR! Session/line number was not unique in database. History logging moved to new session 372


### Data Scraping

#### Send GET request and examine results 

In [48]:
# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection (requests applies no timeout by default), and
# raise_for_status() stops here on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the returned HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')

#### Create variable to grab relevant data needed for this project



In [49]:
# Find the table tagged with class "wikitable sortable" and step into its <tbody>
table = soup.find('table', attrs={'class': 'wikitable sortable'}).tbody

# Every <tr> element found under that body
rows = table.find_all('tr')

# Column labels are the <th> texts of the first row, with newlines stripped
header_cells = rows[0].find_all('th')
columns = []
for header_cell in header_cells:
    columns.append(header_cell.text.replace('\n', ''))

print(columns)

['Postal Code', 'Borough', 'Neighbourhood']


#### Create pandas dataframe

In [50]:
# Labels for the frame's columns (replaces the header text scraped from the page)
columns = [
    'PostalCode',
    'Borough',
    'Neighbourhood',
]

# Empty frame that carries only the column labels
df = pd.DataFrame(data=None, columns=columns)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood


#### Loop through the data and fill the dataframe

In [51]:
# Collect one record per data row, then build the frame in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration. Building
# from a list also makes this cell safe to re-run: df is rebuilt from `rows`
# rather than extended with a second copy of the same data.
records = []
for position, row in enumerate(rows[1:], start=1):  # rows[0] holds the <th> header cells
    cells = [td.text.replace('\n', '') for td in row.find_all('td')]
    if len(cells) != len(columns):
        # A row whose cell count does not match the columns cannot be placed
        # in the frame; fail with a message that identifies the row.
        raise ValueError(
            f"table row {position} has {len(cells)} <td> cells, "
            f"expected {len(columns)}"
        )
    records.append(cells)

df = pd.DataFrame(records, columns=columns)

In [52]:
# Summary of the frame: index range, per-column non-null counts, dtypes, memory usage
df.info()

# Preview the first five rows
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
PostalCode       180 non-null object
Borough          180 non-null object
Neighbourhood    180 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Data Cleaning

In [53]:
# Flag rows that repeat an earlier row, then keep just those rows for inspection
repeated_mask = df.duplicated()
duplicate_rows_df = df.loc[repeated_mask]

# The printed value is the (rows, columns) shape of the duplicate subset
print("number of duplicated rows: ", duplicate_rows_df.shape)

number of duplicated rows:  (0, 3)


In [54]:
# Keep the first occurrence of each row and discard later repeats
df = df.drop_duplicates(keep='first')

#### Find and drop rows where the neighbourhood is 'Not assigned'

In [55]:
# Row labels whose Neighbourhood holds the literal placeholder 'Not assigned'
is_unassigned = df['Neighbourhood'] == 'Not assigned'
indexNames = df.loc[is_unassigned].index

# Remove those rows from df itself (in place; df is not rebound)
df.drop(labels=indexNames, axis=0, inplace=True)

#### Resetting index of dataframe after deleting rows


In [56]:
# Renumber the rows from 0 now that some rows have been removed. drop=True
# discards the old labels instead of adding them as a column, which yields
# the same frame as moving every column into the index and back out again,
# without building a throwaway MultiIndex.
df = df.reset_index(drop=True)

#### Display rows to confirm index reset

In [57]:
# Echo the frame to confirm the index now starts at 0; the notebook display
# elides the middle rows of a long frame
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


#### Get the dimension of cleaned data

In [58]:
# (rows, columns) of the cleaned frame
df.shape

(103, 3)