<a id="Top"></a>

# Week 3, Capstone Course, Peer-graded Assignment

1. [Part I](#PartI) 
2. [Part II](#PartII) 
3. [Part III](#PartIII) 

In [1]:
import numpy as np
import pandas as pd
import folium
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

<a id="PartI"></a>

# Part I

[Top](#Top)

### Import BeautifulSoup and load the saved Wikipedia HTML page

In [2]:
from bs4 import BeautifulSoup

# Read the locally saved Wikipedia page.  The file is opened in binary mode so
# that BeautifulSoup works out the document encoding itself instead of the
# text being decoded with whatever the platform's default encoding happens to be.
with open("List of postal codes of Canada  M - Wikipedia.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')


### Parsing

In [3]:
# Turn every <tr> element into a list of its text fields (one field per line
# of the row's text).  Collection stops at the first row that yields only a
# single field.
PostalList = []
for table_row in soup.find_all('tr'):
    row_text = table_row.text.strip('\r').lstrip('\n')
    row_text = row_text.rstrip('\n').rstrip('\r')
    fields = row_text.split('\n')
    if len(fields) == 1:
        break
    PostalList.append(fields)
    

In [4]:
# The first parsed row holds the column names; the remaining rows are data.
header_row = PostalList[0]
data_rows = PostalList[1:]
PostalList = pd.DataFrame(data_rows, columns=header_row)
print('Size of Table:', len(PostalList))
PostalList.head()

Size of Table: 288


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Unique Postal Codes

In [5]:
# Distinct values of the two key columns, in order of first appearance.
UniquePostalCodes = PostalList['Postcode'].unique()
UniquePostalBoroughs = PostalList['Borough'].unique()

# Report how many distinct values each column has.
for label, distinct in (('Unique Postal Codes:', UniquePostalCodes),
                        ('Unique Boroughs:', UniquePostalBoroughs)):
    print(label, len(distinct))

Unique Postal Codes: 180
Unique Boroughs: 12


## Cleaning up Table of Postal Codes

Please see comments in cell for details

In [6]:
NOT_ASSIGNED = 'Not assigned'   # placeholder text used in the parsed table


def _summarise_postcode(rows):
    """Collapse all table rows of one postal code into a single record.

    Parameters
    ----------
    rows : pandas.DataFrame
        The rows sharing one postal code; needs 'Borough' and
        'Neighbourhood' columns.

    Returns
    -------
    tuple of (str, str)
        ``(borough, neighbourhoods)`` where

        * ``neighbourhoods`` is the comma-separated list of every assigned
          neighbourhood (whitespace-stripped), or 'Not assigned' if none is;
        * ``borough`` is the first assigned borough; if no borough is
          assigned it falls back to the first assigned neighbourhood, and to
          'Not assigned' when that is missing as well.
    """
    neighbourhoods = rows['Neighbourhood']
    boroughs = rows['Borough']
    assigned_nh = neighbourhoods[neighbourhoods != NOT_ASSIGNED]
    assigned_b = boroughs[boroughs != NOT_ASSIGNED]

    # Join only the assigned neighbourhoods, so a placeholder entry can never
    # end up inside the combined string.
    if assigned_nh.empty:
        nh_str = NOT_ASSIGNED
    else:
        nh_str = ', '.join(name.strip() for name in assigned_nh)

    if not assigned_b.empty:
        borough = assigned_b.iloc[0]
    elif not assigned_nh.empty:
        borough = assigned_nh.iloc[0]
    else:
        borough = NOT_ASSIGNED
    return borough, nh_str


# One pass over the table; sort=False keeps the postal codes in the order in
# which they first appear.
Table = [
    [postcode, *_summarise_postcode(rows)]
    for postcode, rows in PostalList.groupby('Postcode', sort=False)
]

df = pd.DataFrame(Table, columns=PostalList.columns)   # list of records -> dataframe
# Postal codes that still have no borough are dropped, then the index is renumbered.
df = df[df['Borough'] != NOT_ASSIGNED].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [7]:
# Report the cleaned frame's dimensions as a (rows, columns) tuple.
print('Shape of Dataframe:', (len(df.index), len(df.columns)))

Shape of Dataframe: (103, 3)


<a id="PartII"></a>

# Part II

[Top](#Top)

In [8]:
import geocoder

### Give geocoder a try . . . 

In [9]:
# Ask the geocoder for the coordinates of one postal code, retrying a bounded
# number of times.  A dot is printed per request as a progress indicator.
MAX_ATTEMPTS = 101   # upper bound on requests before giving up

postal_code = 'M5G'
lat_lng_coords = None

for counter in range(1, MAX_ATTEMPTS + 1):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    print('.', sep='', end='')
    if lat_lng_coords is not None:
        break
else:
    # Reached only when every attempt came back without coordinates, so a
    # success on the very last attempt is not reported as a failure.
    print(' geocoder not responding')

..................................................................................................... geocoder not responding


The geocoder did not return any coordinates, so the latitude/longitude values are loaded from a CSV file instead.

In [10]:
# Load the postal-code coordinate table from the CSV file and preview it.
coords_csv = 'PostalCodesLL.csv'
df_PC = pd.read_csv(coords_csv)
df_PC.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Get coordinates

In [11]:
# For every postal code in df, take the coordinates of the first matching row
# of df_PC.  Codes without a match get (0, 0) and their row position in df is
# recorded in NoCoord.
Latitude, Longitude, NoCoord = [], [], []

print('No coordinates found for: ', end ='')
for PC in df['Postcode']:
    match = df_PC[df_PC['Postal Code'] == PC]
    if not match.empty:
        Latitude.append(match['Latitude'].values[0])
        Longitude.append(match['Longitude'].values[0])
        continue
    # No coordinates available for this postal code.
    print(PC, end =' ')
    first_label = df.index[df['Postcode'] == PC][0]
    NoCoord.append(df.index.get_loc(first_label))
    Latitude.append(0)
    Longitude.append(0)
print('\n\nA total of', len(NoCoord))

No coordinates found for: 

A total of 0


## Insert coordinates into dataframe

In [12]:
# Attach the coordinate lists as new trailing columns.  assign() returns a new
# frame and simply overwrites the columns if they already exist, so this cell
# can be re-run safely.
df = df.assign(Latitude=Latitude, Longitude=Longitude)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662301,-79.389494


<a id="PartIII"></a>

# Part III

[Top](#Top)

### Boroughs whose name contains 'Toronto'

In [13]:
# Keep only the rows whose borough name contains 'Toronto' and renumber them
# from zero.
is_toronto = df['Borough'].str.contains('Toronto')
dfdt = df.loc[is_toronto].reset_index(drop=True)
print("Number of boroughs that contain 'Toronto':", dfdt.shape[0])

Number of boroughs that contain 'Toronto': 38


In [18]:
# Draw one blue circle marker per row of dfdt on a map with fixed centre
# coordinates.
point_style = dict(
    radius=5,
    popup='',
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.5,
    parse_html=False,
)

map_toronto = folium.Map(location=[43.651070,-79.411079], zoom_start=12)
for lat, lng in zip(dfdt['Latitude'], dfdt['Longitude']):
    marker = folium.CircleMarker([lat, lng], **point_style)
    marker.add_to(map_toronto)
map_toronto

## Cluster Analysis: Where is it most dense?

### Cluster analysis

In [15]:
# Cluster the points on their raw latitude/longitude values; eps is therefore
# expressed in the same units as those two columns.
coords = dfdt[['Latitude', 'Longitude']]
X = coords.to_numpy()
clust = DBSCAN(eps=0.01)
clust.fit(X)
idx = clust.core_sample_indices_   # row positions of the core samples in X

### Visualize

In [16]:
# Redraw the map: every row of dfdt as a blue marker, then the rows listed in
# idx drawn on top as solid red markers.
all_points_style = dict(
    radius=5,
    popup='',
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.5,
    parse_html=False,
)
highlight_style = dict(
    radius=5,
    popup='',
    color='',
    fill=True,
    fill_color='red',
    fill_opacity=1,
    parse_html=False,
)

map_toronto = folium.Map(location=[43.651070,-79.411079], zoom_start=12)

for lat, lng in zip(dfdt['Latitude'], dfdt['Longitude']):
    folium.CircleMarker([lat, lng], **all_points_style).add_to(map_toronto)

for i in idx:
    highlighted = [dfdt.loc[i,'Latitude'], dfdt.loc[i,'Longitude']]
    folium.CircleMarker(highlighted, **highlight_style).add_to(map_toronto)

map_toronto