# Week 3
## Scraping data from Wikipedia

!pip install -q beautifulsoup4

!pip install -q lxml

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page. The timeout bounds how long a stalled connection can
# block the notebook, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the raw response bytes with the lxml backend; `content` holds the
# parsed document used by the following cells.
content = BeautifulSoup(response.content, 'lxml')

In [4]:
# Every <tr> inside a .wikitable body, then the text of each header/data cell
# in that row with newline characters removed.
table_html = content.select('.wikitable tbody tr')
table = [
    [cell.get_text().replace("\n", "") for cell in row.select('td,th')]
    for row in table_html
]

In [5]:
# Preview the first five parsed rows (the first one is the header row).
table[:5]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

## Data cleaning

In [6]:
import pandas as pd

In [7]:
# Build the frame from the parsed rows: the first row supplies the column
# names, the remaining rows are the records.
data = pd.DataFrame(table[1:], columns=table[0])

In [8]:
# Show the first ten rows of the raw frame.
data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Displaying repeated postal codes

In [9]:
# Rows per postcode, largest first, keeping only postcodes that occur on
# more than one row.
l = (
    data.groupby('Postcode')
    .count()
    .sort_values('Borough', ascending=False)
    .loc[lambda counts: counts['Neighbourhood'] > 1]
)

In [10]:
# Show the postcodes with the most repeated rows.
l.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M8Y,8,8
M9V,8,8
M5V,7,7
M4V,5,5
M9B,5,5


In [11]:
# (rows, columns); the row count is the number of postcodes that appear on
# more than one row.
l.shape

(56, 2)

### Dropping repeated postal codes and creating a single new row per code with the names of its neighbourhoods concatenated

In [12]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Columns are addressed by label and the replacement is vectorised: the
# previous row-wise lambda indexed each row by integer position (row[1],
# row[2]), which is deprecated for label-indexed Series in pandas 2.x.
data['Neighbourhood'] = data['Neighbourhood'].mask(
    data['Neighbourhood'] == 'Not assigned',
    data['Borough'],
)

In [13]:
# Collapse every repeated postcode into a single row: keep the last listed
# borough and join all of its neighbourhoods with ', '.
# The merged rows are collected first and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and rebuilding the frame on
# every iteration copies it each time.
merged_rows = []
for code in l.index:
    rows = data[data['Postcode'] == code]
    merged_rows.append({
        'Postcode': code,
        'Borough': rows['Borough'].iloc[-1],
        'Neighbourhood': ', '.join(rows['Neighbourhood']),
    })

if merged_rows:
    # Same layout as the row-by-row version: untouched rows first (original
    # order), merged rows after them in the order of `l`, fresh 0..n-1 index.
    data = pd.concat(
        [data[~data['Postcode'].isin(l.index)], pd.DataFrame(merged_rows)],
        ignore_index=True,
    )

### Result

In [14]:
# Recompute the repeated-postcode table on the merged frame; zero rows in
# the resulting shape means every postcode is now unique.
l_a = (
    data.groupby('Postcode')
    .count()
    .sort_values('Borough', ascending=False)
    .loc[lambda counts: counts['Neighbourhood'] > 1]
)
l_a.shape

(0, 2)

In [15]:
# Order the rows by postcode and renumber them. reset_index() without
# drop=True keeps the previous index as an extra 'index' column.
# NOTE(review): pass drop=True if that column is not wanted; as written,
# re-running this cell adds yet another index column.
data = data.sort_values('Postcode').reset_index()
data.head(10)

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,157,M1B,Scarborough,"Rouge, Malvern"
2,143,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,144,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,27,M1G,Scarborough,Woburn
5,36,M1H,Scarborough,Cedarbrae
6,42,M1J,Scarborough,Scarborough Village
7,147,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
8,149,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
9,150,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"


In [16]:
# Final (rows, columns) of the cleaned frame.
data.shape

(180, 4)