# Obtaining the final dataframe with latitude & longitude

In [24]:
import pandas as pd

# Read the neighborhood table from the local CSV and the coordinate table
# from the remote geospatial CSV.
df = pd.read_csv('Dataframe.csv')
lat_long_data = pd.read_csv('http://cocl.us/Geospatial_data')

# Rename the coordinate table's 'Postal Code' column to 'PostalCode' so it can
# serve as the join key, then merge on it (pandas' default inner join).
df2 = lat_long_data.rename(columns={'Postal Code': 'PostalCode'})
final_df = df.merge(df2, on='PostalCode')
final_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Selecting the neighborhoods whose borough name contains "Toronto"

In [25]:
# Boolean mask: True where the Borough text contains the literal substring
# 'Toronto' (regex=False, so the pattern is not treated as a regex).
toronto_mask = final_df['Borough'].str.contains('Toronto', regex=False)
clust_df = final_df[toronto_mask]
clust_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Plotting the obtained neighborhoods

In [26]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [27]:
# Base map centred on fixed coordinates; one circle marker is added per row of clust_df.
map_1 = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

marker_rows = zip(
    clust_df['Latitude'],
    clust_df['Longitude'],
    clust_df['Borough'],
    clust_df['Neighborhood'],
)
for latitude, longitude, borough, neighborhood in marker_rows:
    # Popup text reads "<neighborhood>, <borough>".
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.75,
        parse_html=False,
    )
    marker.add_to(map_1)
map_1

### Clustering the neighborhoods into 5 clusters using k-means algorithm

In [28]:
from sklearn.cluster import KMeans

tot_clust = 5

# Cluster on the coordinates only. Selecting the feature columns by name
# (instead of dropping the non-numeric ones with a positional axis argument)
# works on pandas >= 2.0, where `axis` is keyword-only, and keeps this cell
# re-runnable after a 'cluster labels' column has been added to clust_df.
toronto_clust = clust_df[['Latitude', 'Longitude']]

# n_init is pinned to 10, the historical scikit-learn default, so the fit does
# not silently change with the newer n_init='auto' default.
k_means = KMeans(n_clusters=tot_clust, n_init=10, random_state=0).fit(toronto_clust)

# Cluster label assigned to each of the first ten rows of clust_df
k_means.labels_[0:10]

array([4, 4, 4, 4, 2, 2, 2, 2, 2, 2])

Adding the cluster labels to the `clust_df` dataframe

In [29]:
# Attach the k-means label of each row as the first column of clust_df.
# Any 'cluster labels' column left over from a previous run of this cell is
# dropped first, because DataFrame.insert raises ValueError when the column
# already exists. drop() also returns a new frame, so the insert below does
# not write into a slice of final_df.
clust_df = clust_df.drop(columns='cluster labels', errors='ignore')
clust_df.insert(0, 'cluster labels', k_means.labels_)
clust_df.head()

Unnamed: 0,cluster labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,4,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Plotting the map after clustering showing the 5 different clusters

In [30]:
import numpy as np

# Base map centred on the same fixed coordinates as the unclustered map.
map_2 = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Colour scheme: one evenly spaced rainbow colour per cluster, converted to
# hex strings so folium can use them.
color_arr = cm.rainbow(np.linspace(0, 1, tot_clust))
rainbow = [colors.rgb2hex(i) for i in color_arr]

# k-means labels run from 0 to tot_clust - 1, so each label indexes its colour
# directly; no `- 1` offset (which relied on negative-index wraparound for
# label 0) is needed.
for lat, lng, clust in zip(clust_df['Latitude'], clust_df['Longitude'], clust_df['cluster labels']):
    label = folium.Popup('cluster' + str(clust), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[clust],
        fill=True,
        fill_color=rainbow[clust],
        fill_opacity=0.75,
        parse_html=False).add_to(map_2)
map_2