### 1. Use the Notebook to build the code to scrape the following Wikipedia page

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch a pinned revision (oldid) of the Wikipedia page so the scrape is reproducible.
req = requests.get(
    "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050",
    timeout=30,  # requests has no default timeout; do not let the cell hang forever
)
req.raise_for_status()  # stop here on an HTTP error instead of parsing an error page

soup = BeautifulSoup(req.content, 'lxml')
table = soup.find_all('table')[0]  # the postal-code table is the first <table> on the page

# read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because passing a literal HTML string is deprecated in recent pandas releases.
data = pd.read_html(StringIO(str(table)))

# data[0] is already a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df = data[0]

In [2]:
# preview the first rows of the scraped table to confirm the columns were parsed as expected
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 2. The dataframe will consist of three columns: Postal Code, Borough, and Neighborhood

In [3]:
# Rename the scraped headers to the column names used in the rest of the notebook.
df = df.rename(
    columns={
        'Postcode': 'Postal Code',
        'Neighbourhood': 'Neighborhood',
    }
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# (rows, columns) of df before any rows are filtered out
df.shape

(287, 3)

### 3. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
# Ignore rows with a 'Borough' that is 'Not assigned'.
# A boolean mask plus rebinding replaces the in-place, label-based drop:
# the mask selects rows by position, so it stays correct even if index labels
# were ever duplicated, and no frame is mutated behind the reader's back.
not_assigned = df['Borough'] == "Not assigned"
# .copy() gives an independent frame, so later assignments never warn about
# writing to a slice of the original.
df = df[~not_assigned].copy()

In [6]:
# (rows, columns) of df after removing the rows whose borough is 'Not assigned'
df.shape

(210, 3)

### 4. Rows that share the same postal code will be combined into one row, with the neighborhoods separated by commas.

In [7]:
# non-null 'Borough' / 'Neighborhood' entries per 'Postal Code';
# a count above 1 means that postal code appears on several rows
df.groupby('Postal Code').count()

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,2,2
M1C,3,3
M1E,3,3
M1G,1,1
M1H,1,1
M1J,1,1
M1K,3,3
M1L,3,3
M1M,3,3
M1N,2,2


In [8]:
# Collapse the rows that share a postal code and borough into a single row whose
# 'Neighborhood' value is the comma-separated list of the original entries.
df = (
    df.groupby(['Postal Code', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

In [9]:
# (rows, columns) of df after grouping: one row per ('Postal Code', 'Borough') pair
df.shape

(103, 3)

### 5. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [10]:
# Where the neighborhood is exactly 'Not assigned', fall back to the borough of the same row.
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'] == 'Not assigned',
    df['Borough'],
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 6. Print the number of rows of this dataframe.

In [11]:
# shape is (rows, columns); the first entry is the number of rows
df.shape

(103, 3)

### 7. Read the 'Geospatial_data.csv' and merge it with df

In [12]:
# read the geospatial CSV: a table with 'Postal Code', 'Latitude' and 'Longitude' columns
# NOTE(review): fetched over plain HTTP through a short link - consider pinning a
# local copy of the file so the notebook stays reproducible if the link changes.
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach the coordinates to every postal code. Both frames name the key column
# 'Postal Code', and the default inner join keeps only codes present in both.
main_df = pd.merge(df, df_geo, on='Postal Code')

In [14]:
# preview the merged frame: the df columns plus 'Latitude' and 'Longitude'
main_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### 8. Generate maps to visualize neighborhoods and how they cluster together

In [15]:
# install folium
! pip install folium

