## Week 3 Part 2
#### Build a dataframe of the postal code of each neighborhood along with borough name and neighborhood name in Toronto
#### Get the geographical coordinates of the neighborhoods in Toronto

In [None]:
!pip install folium
!pip install geopy

In [None]:
import numpy as np
import pandas as pd 
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### Scraping data from Wikipedia and storing it in a dataframe

In [None]:
# Send a GET request for the Wikipedia page listing Canadian postal codes starting with "M".
# The timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of handing an error page to the parser.
response=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data=response.text  # raw HTML of the page as a string

In [3]:
# Build a searchable document tree from the downloaded HTML using Python's built-in parser.
bs = BeautifulSoup(data, features='html.parser')

In [4]:
# Three independent, initially empty accumulators: one per column of the scraped table.
PostalCodeList, BoroughList, NeighborhoodList = [], [], []

In [5]:
# Empty the accumulators first so that re-running this cell does not append
# every row a second time.
PostalCodeList.clear()
BoroughList.clear()
NeighborhoodList.clear()

# The data is read from the first <table> element of the parsed page.
postal_table=bs.find('table')
if postal_table is None:
    raise ValueError("No <table> element found in the downloaded page")

for row in postal_table.find_all('tr'):
    cells=row.find_all('td')
    # Rows without <td> cells (e.g. header rows) and rows with fewer than three
    # cells cannot be split into postal code / borough / neighborhood, so skip
    # them instead of failing with an IndexError.
    if len(cells)<3:
        continue
    # Strip surrounding whitespace (including the trailing newline) from every
    # cell so later equality checks and merges compare clean values.
    PostalCodeList.append(cells[0].text.strip())
    BoroughList.append(cells[1].text.strip())
    NeighborhoodList.append(cells[2].text.strip())

In [6]:
# Assemble the three scraped lists into one dataframe, one column per list.
Toronto_df = pd.DataFrame(
    {
        "PostalCode": PostalCodeList,
        "Borough": BoroughList,
        "Neighborhood": NeighborhoodList,
    }
)
Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Dropping the rows whose Borough is "Not assigned"

In [3]:
# Keep only the rows that have a real borough, then renumber the index from 0.
has_borough = Toronto_df["Borough"] != "Not assigned"
Toronto_df_dropna = Toronto_df.loc[has_borough].reset_index(drop=True)
Toronto_df_dropna.head()

NameError: name 'Toronto_df' is not defined

### Combining neighborhoods that share the same postal code and borough

In [8]:
# Collapse the rows that share a PostalCode/Borough pair into a single row whose
# remaining column holds the original values joined with ", ".
postal_groups = Toronto_df_dropna.groupby(["PostalCode", "Borough"], as_index=False)
Toronto_df_grouped = postal_groups.agg(", ".join)
Toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Where Neighborhood is "Not assigned", use the Borough name as the Neighborhood

In [9]:
# Boolean mask selecting the rows whose Neighborhood is still "Not assigned".
unassigned = Toronto_df_grouped["Neighborhood"] == "Not assigned"

# Assign through .loc so the dataframe itself is updated. Writing to the `row`
# objects yielded by iterrows() is not guaranteed to reach the dataframe, so the
# replacement could silently have no effect.
Toronto_df_grouped.loc[unassigned, "Neighborhood"] = Toronto_df_grouped.loc[unassigned, "Borough"]
Toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Checking whether the dataframe is the same as the one shown in the question

In [11]:
# Spot-check: pull the rows for a fixed list of postal codes, in the order listed.
column=["PostalCode", "Borough", "Neighborhood"]
test_list=["M5G","M2H","M4B","M1J","M4G","M4M","M1R","M9V","M9L","M5V","M1B","M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration; collect the per-code slices and concatenate them once instead.
# A postal code that is absent simply contributes an empty slice.
test_df=pd.concat(
    [Toronto_df_grouped.loc[Toronto_df_grouped["PostalCode"]==postcode, column] for postcode in test_list],
    ignore_index=True,
)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens, Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale, Wexford"
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


### Print the number of rows of the cleaned dataframe

In [12]:
# Display the (rows, columns) tuple of the grouped dataframe; the first number is the row count.
Toronto_df_grouped.shape

(103, 3)

### Load the coordinates from the CSV file

In [16]:
# Read the postal-code coordinate table straight from its URL into a dataframe.
coordinates = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Rename "Postal Code" to "PostalCode" so the column can serve as the merge key.
# Rebinding the result instead of using inplace=True avoids mutating a frame that
# an earlier cell has already displayed, and the cell stays safe to re-run
# (a column name that is no longer present is simply ignored by rename).
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the neighborhood table with the coordinates table on PostalCode

In [19]:
# Left-join the coordinates onto the grouped neighborhoods by postal code, so every
# neighborhood row is kept even if no coordinate row matches it.
Toronto_df_new = pd.merge(
    Toronto_df_grouped,
    coordinates,
    how="left",
    on="PostalCode",
)
Toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Check whether the coordinates were added

In [22]:
# Spot-check: pull the merged rows (including coordinates) for a fixed list of
# postal codes, in the order listed.
column_names=["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_list=["M5G","M2H","M4B","M1J","M4G","M4M","M1R","M9V","M9L","M5V","M1B","M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration; collect the per-code slices and concatenate them once instead.
# A postal code that is absent simply contributes an empty slice.
test_df=pd.concat(
    [Toronto_df_new.loc[Toronto_df_new["PostalCode"]==postcode, column_names] for postcode in test_list],
    ignore_index=True,
)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
