# Scraping a Wikipedia Page
## Scraping Wikipedia Data w/ BeautifulSoup

In [1]:
!pip install beautifulsoup4



In [2]:
!pip install lxml



In [3]:
!pip install requests



In [4]:
from bs4 import BeautifulSoup
import requests 

I couldn't figure out how to extract the table with BeautifulSoup, so I used pandas' `read_html` instead.

## Scraping Wikipedia Data w/ Pandas

In [5]:
import pandas as pd

# Wikipedia page named "List of postal codes of Canada: M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list with one DataFrame per HTML table it parses.
tables = pd.read_html(io=url)

In [6]:
# Work on an independent copy of the first parsed table, so cleaning steps
# applied to `df` cannot alter `tables[0]` and re-running this cell always
# yields the untouched data again.
df = tables[0].copy()
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## Cleaning Data
### Drop rows with "Not assigned" in the "Borough" column

In [7]:
# Row labels (not boolean masks, despite the names) of the rows whose
# Borough / Neighbourhood cell holds the placeholder 'Not assigned'.
borough_mask = df.index[df['Borough'] == 'Not assigned']
neighborhood_mask = df.index[df['Neighbourhood'] == 'Not assigned']

# Labels present in both sets. Index.intersection states the set operation
# explicitly; `&` between two Index objects is an element-wise logical
# operation in pandas >= 2.0, not a set intersection.
neighborhood_and_borough_mask = borough_mask.intersection(neighborhood_mask)

In [8]:
# Keep only rows whose Borough is assigned. The filter is computed from the
# current contents of `df` rather than from row labels captured in an earlier
# cell, so re-running this cell is a no-op instead of dropping unrelated rows
# (stale labels no longer match positions once the index has been reset).
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
print(df.shape)
df.head(10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### Replace "Not assigned" in the "Neighbourhood" column with the "Borough" name of that row

In [9]:
# Labels of the rows whose Neighbourhood is still the 'Not assigned' placeholder.
neighborhood_mask = df.index[df['Neighbourhood'] == 'Not assigned']

# Copy the Borough name into those rows with one .loc assignment. Per-row
# df[col][idx] = ... is chained indexing: pandas warns about it, and it does
# not update `df` at all once Copy-on-Write is enabled.
df.loc[neighborhood_mask, 'Neighbourhood'] = df.loc[neighborhood_mask, 'Borough']
print(df.shape)
df.head(10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Combine rows that share a Postcode, joining their neighbourhoods with commas

In [14]:
# Collapse rows that share a (Postcode, Borough) pair into one row, keeping
# the groups in order of first appearance (sort=False) and joining the
# remaining string column(s) with ', '.
df2 = (
    df
    .groupby(by=['Postcode', 'Borough'], sort=False)
    .aggregate(', '.join)
)

# Turn the group keys back into ordinary columns.
df_grouped = df2.reset_index(drop=False)
df_grouped.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


## Shape of Final Table

In [16]:
# Report the (rows, columns) dimensions of the grouped table.
n_rows, n_cols = df_grouped.shape
print('Shape of Final Table:', (n_rows, n_cols))

Shape of Final Table: (103, 3)


### Group by "Postcode" column and consolidate content in "Neighbourhood" cells

In [10]:
def f_neighborhoods(x):
    """Join the neighbourhood names of one Postcode group into 'a, b, c'."""
    return ', '.join(x)


def f_boroughs(x):
    """Return the first Borough value of one Postcode group.

    Taking the first element is deterministic; popping from a set returns an
    arbitrary element whenever a group holds more than one distinct borough.
    """
    return next(iter(x))


# One row per Postcode, ordered by Postcode (groupby sorts its keys by
# default). as_index=False keeps Postcode as a regular column, which yields
# the column order Postcode, Borough, Neighbourhood directly and also works
# when `df` has no rows.
df_grouped = (
    df.groupby('Postcode', as_index=False)
      .agg({'Borough': f_boroughs, 'Neighbourhood': f_neighborhoods})
)

df_grouped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Final Shape of Table

In [11]:
# Report the (rows, columns) dimensions of the grouped table.
n_rows, n_cols = df_grouped.shape
print('Final Shape of Table:', (n_rows, n_cols))

Final Shape of Table: (103, 3)
