# Capstone Project for Coursera IBM Data Science Professional Certificate

### Install the geocoder library

In [6]:
!conda install -c conda-forge geocoder 

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  57.12 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00   9.11 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  39.28 MB/s


### import the libraries that are needed for the assignment

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Scrape the Wikipedia webpage and parse the data using BeautifulSoup. Records where the borough was not assigned were omitted from the final dataframe.
### For records where the neighborhood was not assigned, the borough name was used as the neighborhood name, per the assignment instructions.

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
source = response.text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page")

# define the dataframe columns
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Collect one dict per kept table row and build the dataframe once at the end;
# growing a dataframe row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for tr in table.find_all('tr'):
    # Cell text is stripped up front (it can carry a trailing newline) so the
    # "Not assigned" comparisons below do not depend on whitespace.
    cells = [td.text.strip() for td in tr.find_all('td')]

    # Header rows (no <td> cells) and malformed rows are skipped; each row is
    # unpacked on its own, so one odd row cannot shift the columns of later rows.
    if len(cells) != len(column_names):
        continue
    m_postcode, m_borough, m_neighborhood = cells

    # ignore records that have no postcode or whose borough is not assigned
    if not m_postcode or m_borough == "Not assigned":
        continue

    # replace neighborhood name with borough name if neighborhood name is not assigned
    if m_neighborhood == "Not assigned":
        m_neighborhood = m_borough

    records.append({'Postcode': m_postcode,
                    'Borough': m_borough,
                    'Neighborhood': m_neighborhood})

# instantiate the dataframe (passing the columns keeps the schema even if no rows were kept)
Canada_neighborhoods = pd.DataFrame(records, columns=column_names)
Canada_neighborhoods.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### I assumed that each postal code maps to a single borough, so I grouped by postal code and borough, and then concatenated the neighborhood names

In [3]:
# Collapse the scraped rows to one row per (Postcode, Borough) pair, joining
# the neighborhood names of each pair into a single ", "-separated string.
neighborhood_groups = Canada_neighborhoods.groupby(['Postcode', 'Borough'])['Neighborhood']
New_Canada_neighborhoods = neighborhood_groups.apply(', '.join).reset_index()
New_Canada_neighborhoods.head(n=20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
# Show the (rows, columns) shape of the grouped dataframe
New_Canada_neighborhoods.shape

(103, 3)

## The geocoder was unable to get coordinates from Google. It gets stuck in the while loop because no coordinates are ever returned, so I opted to use the CSV file instead.
import geocoder # import geocoder

#### initialize your variable to None

# Upper bound on geocoding attempts per postal code, so a service that never
# returns coordinates cannot hang the notebook in an endless loop.
max_geocode_attempts = 5

lat_lng_coords = None
latitude_lst = []
longitude_lst = []

postcode_lst = New_Canada_neighborhoods['Postcode']
for postal_code in postcode_lst:
    # Reset for every postal code; without this the coordinates found for the
    # first postal code would be reused for all the following ones.
    lat_lng_coords = None

    # retry a limited number of times until coordinates come back
    for _ in range(max_geocode_attempts):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:  # None or an empty result counts as "no coordinates"
            break

    if lat_lng_coords:
        latitude_lst.append(lat_lng_coords[0])
        longitude_lst.append(lat_lng_coords[1])
    else:
        # Keep both lists aligned with postcode_lst even when geocoding failed.
        latitude_lst.append(None)
        longitude_lst.append(None)

#print(latitude_lst)

### Using the csv file instead

In [9]:
# URL of the CSV file used instead of the geocoder
file_source = "http://cocl.us/Geospatial_data"
# Load the CSV straight from the URL into a dataframe
df_geospatial = pd.read_csv(file_source)
# Preview the first rows of the loaded data
df_geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### merge our original data from wikipedia with the geospatial data from the csv file

In [20]:
# Left-join the coordinates onto the grouped neighborhoods by postal code, then
# discard the key column contributed by the geospatial frame, which repeats
# the 'Postcode' values.
Canada_neighborhoods_final = (
    New_Canada_neighborhoods
    .merge(df_geospatial, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns=['Postal Code'])
)
Canada_neighborhoods_final.head(n=10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### check the final size of new table

In [19]:
# Show the (rows, columns) shape of the merged dataframe
Canada_neighborhoods_final.shape

(103, 5)