# This notebook will be mainly used for the capstone project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sanity check: print a greeting so the cell output shows the notebook executes.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Week 3

## Scrape data

In [3]:
# Parse every HTML table on the Wikipedia list of Canadian postal codes that
# start with "M", and keep the first table as the working data frame.
POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_tables = pd.read_html(POSTAL_CODES_URL)
df = wiki_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Clean the data frame
# 1. Drop rows whose borough is missing or marked "Not assigned".
#    dropna has to be applied to the frame itself: calling it in place on the
#    df["Borough"] column does not remove any rows from df.
df = df.dropna(subset=["Borough"])
df = df[df["Borough"] != "Not assigned"]
# 2. Combine neighbourhoods that share a postal code into a single string,
#    using the same ", " separator that the scraped table already uses.
df = df.groupby(["Postal Code", "Borough"], as_index=False).agg(
    {"Neighbourhood": lambda names: ", ".join(names)}
)
# 3. A neighbourhood that is "Not assigned" takes the name of its borough.
#    A single .loc assignment is required here: chained indexing
#    (df[mask]["Neighbourhood"] = ...) writes to a temporary copy and leaves
#    df unchanged.
unnamed = df["Neighbourhood"] == "Not assigned"
df.loc[unnamed, "Neighbourhood"] = df.loc[unnamed, "Borough"]
df.head(100)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
95,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Humberlea, Emery"
98,M9N,York,Weston


In [5]:
# Number of distinct postal codes; dropna=False counts a missing value (if
# any) as one entry, so the result matches len(df["Postal Code"].unique()).
df["Postal Code"].nunique(dropna=False)

103

## Load spatial coordinates

In [6]:
import os

# Location of the coordinates CSV. Set the GEOSPATIAL_COORDINATES_CSV
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used.
geo_csv_path = os.environ.get(
    "GEOSPATIAL_COORDINATES_CSV",
    "/home/lars/Downloads/Geospatial_Coordinates.csv",
)
df_new = pd.read_csv(geo_csv_path)

# Left join: every cleaned postal code is kept, even if the CSV has no row
# for it. Latitude/Longitude columns left over from a previous run of this
# cell are dropped first, otherwise a second merge would produce
# Latitude_x / Latitude_y and break the cells below.
df = df.drop(columns=["Latitude", "Longitude"], errors="ignore").merge(
    df_new, on=["Postal Code"], how="left"
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
import folium

# Create a map of Toronto centred on the first postal code in the frame.
# The map used to be called `map_manhattan`; that name is kept as an alias at
# the end of the cell so anything still referring to it keeps working.
map_toronto = folium.Map(
    location=[df.loc[0, "Latitude"], df.loc[0, "Longitude"]],
    zoom_start=11,
)

# Add one circle marker per postal code; the popup shows its neighbourhoods.
for lat, lng, neighbourhood in zip(df["Latitude"], df["Longitude"], df["Neighbourhood"]):
    # parse_html belongs to Popup; it is not a CircleMarker option, so it is
    # passed only here and no longer forwarded to the marker.
    label = folium.Popup(neighbourhood, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_manhattan = map_toronto  # backward-compatible alias for the old name
map_toronto

In [59]:
from sklearn.cluster import DBSCAN

# Grid of neighbourhood radii in degrees of arc (0.001 degrees is roughly
# 0.1 km on the ground). Converted to radians when handed to DBSCAN.
eps = np.linspace(0.001, 0.1, 100)
print(eps)
# min_samples has to be an integer; np.linspace would yield floats, which
# recent scikit-learn versions reject during parameter validation.
min_samples = np.arange(1, 11)
print(min_samples)

# Coordinates as [latitude, longitude]. The haversine metric works on
# radians, so convert once here instead of rebuilding the array in the loop.
coord = df[["Latitude", "Longitude"]].values
coord_rad = np.radians(coord)

max_clusters = 0
# Fallback if no parameter pair produces a cluster: every point is noise (-1).
best_labels = np.full(len(coord_rad), -1)

# NOTE(review): the search keeps the labelling with the largest number of
# clusters. With min_samples=1 every point is a core point, so this objective
# favours one-point clusters; consider scoring cluster quality instead.
for e in eps:
    for min_s in min_samples:
        dbscan = DBSCAN(
            eps=np.radians(e),  # same unit as the radian coordinates
            min_samples=int(min_s),
            algorithm='ball_tree',
            metric='haversine',
        )
        clustering = dbscan.fit(coord_rad)
        labels = clustering.labels_
        # Cluster labels start at 0 and noise is -1, so max + 1 is the count.
        kclusters = labels.max() + 1
        if kclusters > max_clusters:
            max_clusters = kclusters
            best_labels = labels
print(max_clusters)
print(best_labels)

[0.001 0.002 0.003 0.004 0.005 0.006 0.007 0.008 0.009 0.01  0.011 0.012
 0.013 0.014 0.015 0.016 0.017 0.018 0.019 0.02  0.021 0.022 0.023 0.024
 0.025 0.026 0.027 0.028 0.029 0.03  0.031 0.032 0.033 0.034 0.035 0.036
 0.037 0.038 0.039 0.04  0.041 0.042 0.043 0.044 0.045 0.046 0.047 0.048
 0.049 0.05  0.051 0.052 0.053 0.054 0.055 0.056 0.057 0.058 0.059 0.06
 0.061 0.062 0.063 0.064 0.065 0.066 0.067 0.068 0.069 0.07  0.071 0.072
 0.073 0.074 0.075 0.076 0.077 0.078 0.079 0.08  0.081 0.082 0.083 0.084
 0.085 0.086 0.087 0.088 0.089 0.09  0.091 0.092 0.093 0.094 0.095 0.096
 0.097 0.098 0.099 0.1  ]
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
103
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84 

In [56]:
# `cm` and `colors` are used below but were not imported anywhere in the
# notebook, so the cell failed with NameError on a fresh kernel.
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[df.loc[0, "Latitude"], df.loc[0, "Longitude"]], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, max_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
noise_color = "#808080"  # grey for points DBSCAN labelled as noise (-1)

# add markers to the map
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'], best_labels):
    # Index the palette by the label itself. The previous rainbow[cluster-1]
    # wrapped label 0 around to the last colour and gave noise points (-1)
    # the same colour as a real cluster.
    marker_color = rainbow[cluster] if cluster >= 0 else noise_color
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters