# Clustering the neighborhood

## Objectives:
1. Get the neighborhood data
2. Get the geographical data for each postal code
3. Explore and cluster the neighborhoods

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Scrape the postal-code table from Wikipedia into a DataFrame with the
# columns PostalCode, Borough and Neighborhood.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'lxml')  # Soup object

table_contents = []
table = soup.find('table')  # first <table> element on the page
for row in table.find_all('td'):
    # Ignore the cells marked 'Not assigned'.
    if row.span.text == 'Not assigned':
        continue

    # The span text has the form "Borough(Neighborhood / Neighborhood)":
    # everything before the first '(' is the borough, the piece after it
    # holds the neighborhoods.
    parts = row.span.text.split('(')
    cell = {
        'PostalCode': row.p.text[:3],  # first three characters of the <p> text
        'Borough': parts[0],
        # Strip the closing parenthesis and turn " /" separators into commas.
        'Neighborhood': parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' '),
    }
    table_contents.append(cell)

# Build the data frame from the collected cells.
df = pd.DataFrame(table_contents)

# A few cells carry extra text fused onto the borough name; map those exact
# strings to clean labels.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})
df.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North


Load the geospatial data

In [5]:
# Read the coordinates table from the CSV file in the working directory.
geo_data = pd.read_csv('Geospatial_Coordinates.csv')
geo_data.head()  # preview the first rows

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Let's rename the column <code>Postal Code</code> to <code>PostalCode</code> in <code>geo_data</code>, so that both dataframes share a key column we can merge on.

In [6]:
# Rename 'Postal Code' to 'PostalCode' so the column name matches the one in df.
# Rebinding the result (instead of inplace=True) keeps the step chainable;
# re-running the cell is harmless because rename ignores labels that are absent.
geo_data = geo_data.rename(columns={'Postal Code': 'PostalCode'})
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the two dataframes with <code>.merge</code>

In [7]:
# Join the coordinates onto the scraped table, matching rows on PostalCode
# (inner join, which is the merge default).
new_df = df.merge(geo_data, how='inner', on='PostalCode')
new_df.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188


**Getting all the rows of our DataFrame whose Borough contains *Toronto***

In [8]:
# Boolean mask: True where the borough name contains the literal text 'Toronto'.
is_toronto = new_df['Borough'].str.contains('Toronto', regex=False)

# Keep only the matching rows (original index labels are preserved).
df_Tor = new_df.loc[is_toronto]
df_Tor.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
35,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


In [9]:
import folium
import numpy as np
from sklearn.cluster import KMeans

### Let's visualize the Neighborhoods using Folium

In [11]:
# Base map centred on the hard-coded coordinates below.
map_tor = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

# Add one blue circle marker per row of df_Tor, with a
# "neighbourhood, borough" popup.
marker_rows = zip(df_Tor['Latitude'], df_Tor['Longitude'], df_Tor['Borough'], df_Tor['Neighborhood'])
for lat, lng, borough, neighbourhood in marker_rows:
    popup = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    # NOTE(review): parse_html is passed to CircleMarker as in the original
    # cell; it looks like a Popup option rather than a marker option - confirm.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_tor)
map_tor

### To cluster the neighborhoods we use K-Means clustering

In [15]:
k = 4  # number of clusters

# Cluster on the coordinates only. Selecting the two feature columns by name
# (rather than dropping the others with a positional axis argument) works on
# current pandas and keeps the features unchanged if this cell is re-run after
# the 'ClusterLabels' column has been added to new_df.
# NOTE(review): this uses every row of new_df, not the df_Tor subset.
cluster_toronto = new_df[['Latitude', 'Longitude']]

# Instantiate the KMeans model. n_init is pinned to 10 so the result does not
# depend on which scikit-learn version supplies the default.
K_means = KMeans(n_clusters=k, random_state=0, n_init=10)
K_means.fit(cluster_toronto)  # fit the model

K_means.labels_  # cluster label assigned to each row

array([2, 2, 0, 3, 0, 1, 2, 3, 0, 0, 3, 1, 2, 0, 0, 0, 3, 1, 2, 0, 0, 3,
       2, 0, 0, 0, 2, 3, 3, 0, 0, 0, 2, 3, 3, 0, 0, 0, 2, 3, 3, 0, 0, 0,
       2, 3, 1, 0, 0, 1, 1, 2, 3, 1, 0, 3, 1, 1, 2, 3, 1, 3, 3, 1, 1, 2,
       3, 3, 3, 1, 1, 2, 3, 3, 0, 1, 1, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 0,
       1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0, 1, 1])

In [18]:
# DataFrame.insert raises ValueError when the column already exists, so remove
# any labels left by a previous run of this cell before inserting the new ones.
new_df = new_df.drop(columns='ClusterLabels', errors='ignore')
new_df.insert(0, 'ClusterLabels', K_means.labels_)  # labels become the first column
new_df.head(8)

Unnamed: 0,ClusterLabels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.753259,-79.329656
1,2,M4A,North York,Victoria Village,43.725882,-79.315572
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,0,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,1,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,2,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,3,M3B,North York,Don Mills North,43.745906,-79.352188


In [22]:
import matplotlib.cm as cm
import matplotlib.colors as colors


In [23]:
map_cluster = folium.Map(location=[43.651070, -79.347015], zoom_start=10)  # create map

# Colour scheme: one hex colour per cluster, evenly spaced over the rainbow
# colormap, so rainbow[i] is the colour of cluster i.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One circle per row of new_df, coloured by its cluster label.
for lat, lon, cluster in zip(new_df['Latitude'], new_df['Longitude'], new_df['ClusterLabels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so index the palette directly; the former
    # `cluster-1` only worked for cluster 0 through negative-index wraparound.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_cluster)

map_cluster