# Segmenting and Clustering Neighborhoods in Toronto

## Is the existing Borough distribution geographically appropriate?<br>
#### Neighborhoods belong to Boroughs, but the assignment does not follow their geographic distribution.
#### Based on the Postal Codes of Neighborhoods, some Boroughs contain more Neighborhoods than others.
#### How can we distribute Neighborhoods to Boroughs?
#### So, in this section, we will compare the existing Borough distribution with the result of k-means clustering on geographical data (latitude, longitude).

### K-Means Clustering 

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the neighbourhood table; it is joined to the coordinates on 'Postal Code' below.
# NOTE(review): presumably supplies the 'Borough' and 'Neighborhood' columns used later — confirm against the CSV.
pcode_df = pd.read_csv('Neighborhood_data.csv')

In [4]:
# Load the coordinates table; it is joined to the neighbourhood table on 'Postal Code' below.
# NOTE(review): presumably supplies the 'Latitude' and 'Longitude' columns used later — confirm against the CSV.
latlong_df = pd.read_csv('Geospatial_Coordinates.csv')

In [5]:
# Combine both tables into one frame, matching rows on the shared
# 'Postal Code' column (pandas' default inner join).
all_df = pcode_df.merge(latlong_df, on='Postal Code')

In [6]:
# Standardise the coordinates for k-means: latitude and longitude are each
# scaled with their own StandardScaler, fitted on that single column.
latitude_scale, longitude_scale = all_df[['Latitude']], all_df[['Longitude']]
X, Y = (
    preprocessing.StandardScaler().fit_transform(column)
    for column in (latitude_scale, longitude_scale)
)

In [7]:
# Feature frame for clustering: the postal code of every row plus its
# standardised latitude and longitude (X and Y are single-column arrays).
scale_df = all_df[['Postal Code']].assign(
    latitude_scale=X[:, 0],
    longitude_scale=Y[:, 0],
)

In [8]:
# Cluster the standardised coordinates into 10 groups using k-means++
# initialisation and 12 restarts.
# NOTE: random_state is not set, so cluster numbering (and possibly the
# assignments themselves) can differ from one run to the next.
cluster_features = scale_df[['latitude_scale', 'longitude_scale']]
k_means = KMeans(n_clusters=10, init="k-means++", n_init=12)
k_means.fit(cluster_features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=12, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
# Cluster index assigned to each row that was passed to k_means.fit()
k_means_labels = k_means.labels_
# Bare expression so the notebook displays the label array
k_means_labels

array([5, 5, 5, 5, 5, 1, 1, 1, 1, 9, 1, 1, 8, 8, 8, 8, 5, 8, 8, 3, 3, 3,
       3, 3, 3, 1, 7, 7, 3, 3, 4, 4, 4, 4, 1, 9, 9, 9, 7, 7, 9, 9, 9, 6,
       7, 7, 7, 7, 7, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 2, 2,
       6, 6, 6, 6, 6, 4, 2, 2, 2, 2, 2, 6, 6, 4, 2, 2, 2, 2, 0, 6, 0, 9,
       0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4])

In [10]:
# Attach each row's cluster label to the merged frame; scale_df was built
# from all_df, so the labels are in the same row order.
all_df["Clus_km"] = k_means_labels

## Mapping the result and comparing it with the existing Boroughs

In [11]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

### Distribution of Neighborhoods with Postal Code

In [12]:
# Average coordinates of all rows; used as the centre point of the maps below.
mean_lat = all_df['Latitude'].mean()
mean_long = all_df['Longitude'].mean()

In [13]:
# Base map centred on the mean coordinates of the data set.
map_center = [mean_lat, mean_long]
toronto_map = folium.Map(location=map_center, zoom_start=10)

In [14]:
# Draw every neighbourhood as a uniformly coloured circle on toronto_map.
# Each popup shows "<neighbourhood>, <borough>".
for lat, lng, borough, neighborhood in zip(all_df['Latitude'], all_df['Longitude'], all_df['Borough'], all_df['Neighborhood']):
    popup_text = '{}, {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of Popup, not of CircleMarker, so it is
        # only passed here.
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)
# Bare expression so the notebook renders the map
toronto_map

### Before Clustering

In [15]:
# Show the distinct borough names, in order of first appearance.
all_df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [16]:
# Give every borough a 0-based integer code, numbered in order of first
# appearance, so it can be used as a colour index on the map.
Borough_list = all_df['Borough'].unique().tolist()
Borough_dict = {name: code for code, name in enumerate(Borough_list)}

In [17]:
# Translate each row's borough name into its integer code.
# A direct dict lookup replaces the scan over every dict key per row, and a
# name missing from Borough_dict now raises KeyError instead of being skipped
# (which would leave this list shorter than all_df).
borough_code = [Borough_dict[name] for name in all_df['Borough']]

In [18]:
# Store each row's numeric borough code; the borough map uses it as a colour index.
all_df['Borough Code']=borough_code

In [19]:
# create map centred on the mean coordinates of the data set
toronto_map = folium.Map(location=[mean_lat, mean_long], zoom_start=10)

# set color scheme: one rainbow colour per borough code.
# The codes are 0-based and consecutive, so the palette is sized from the
# data and indexed directly by the code.
n_borough_colors = all_df['Borough Code'].nunique()
colors_array = cm.rainbow(np.linspace(0, 1, n_borough_colors))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one marker per neighbourhood, coloured by its existing borough.
# The loop variable is called `code` so that it does not overwrite the
# module-level `borough_code` list built earlier.
for lat, lon, code, borough_name in zip(all_df['Latitude'], all_df['Longitude'], all_df['Borough Code'], all_df['Borough']):
    marker_color = rainbow[code]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(' Borough ' + str(borough_name), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(toronto_map)

# Bare expression so the notebook renders the map
toronto_map

### After Clustering

In [20]:
# create map centred on the mean coordinates of the data set
cluster_map = folium.Map(location=[mean_lat, mean_long], zoom_start=10)

# set color scheme for the clusters: one rainbow colour per cluster label.
# Labels are 0-based, so the palette is sized to the largest label + 1 and
# indexed directly by the label.
n_cluster_colors = int(all_df['Clus_km'].max()) + 1
colors_array = cm.rainbow(np.linspace(0, 1, n_cluster_colors))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one marker per neighbourhood, coloured by its k-means cluster
for lat, lon, cluster in zip(all_df['Latitude'], all_df['Longitude'], all_df['Clus_km']):
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(' Borough ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(cluster_map)

# Bare expression so the notebook renders the map
cluster_map