# Toronto Data First Question
### by Konstantinos Georgopoulos

In [1]:
import pandas as pd
import numpy as np

In [2]:
from bs4 import BeautifulSoup
import requests
import csv

In [3]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M"
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
res.raise_for_status()  # stop here on an HTTP error instead of trying to parse an error page below
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]  # the first <table> element on the page

# Create the dataframe from that table's HTML
df = pd.read_html(str(table))[0]

# Use the first row as the column names, then drop that row from the data.
# NOTE(review): assumes read_html returned the header as the first data row -- confirm for the pandas version in use.
df = df.rename(columns=df.iloc[0]).drop(df.index[0])

# Rename some columns as asked (index=str also converts the index labels to strings)
df = df.rename(index=str, columns={"Postcode": "PostalCode", "Neighbourhood": "Neighborhood"})

# Drop the rows whose Borough is "Not assigned"
df = df.drop(df[df.Borough == "Not assigned"].index)

# Reset the index to start at 0; drop=True avoids the old index being added as a column
df = df.reset_index(drop=True)

# If a row has a borough but a "Not assigned" neighborhood, the neighborhood becomes the borough.
# A single .loc assignment is used instead of chained indexing (df.Neighborhood[mask] = ...),
# which raises SettingWithCopyWarning and is not guaranteed to modify df itself.
no_neighborhood = df["Neighborhood"] == "Not assigned"
df.loc[no_neighborhood, "Neighborhood"] = df.loc[no_neighborhood, "Borough"]

# One row per postal code: keep the first borough and join the neighborhoods with ", "
df = df.groupby(['PostalCode']).agg({'Borough': 'first', 'Neighborhood': ', '.join})

# Turn the PostalCode group key back into a regular column
df = df.reset_index()

# Fix the column order explicitly
df = df[['PostalCode', 'Borough', 'Neighborhood']]

df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim #a tool to search OSM data by name and address and to generate synthetic addresses of OSM points

url="http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"
coordinates=pd.read_csv(url)
coordinates.columns = ['PostalCode', 'Latitude', 'Longitude']
df2 = pd.merge(df,coordinates, on="PostalCode")

df2 = df2[df2['Borough'].str.contains('Toronto')].reset_index(drop=True)
df2.head(10)

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.17.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00   5.00 MB/s
geopy-1.17.0-p 100% |################################| Time: 0:00:00   5.71 MB/s


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
