## Segmenting and Clustering Neighborhoods in Toronto
### Capstone, April 2020

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
# import folium # map rendering library  # install folium using '!pip -q install folium'

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [3]:
!pip -q install folium

### Import Toronto data from Wikipedia page to IBM Watson Studio Python Notebook

In [67]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M8A,Not assigned,
9,M9A,Etobicoke,Islington Avenue


In [68]:
df_data_0.shape

(184, 3)

### Clean the data via adding column names, and dropping rows in which Borough = "Not assigned"

In [69]:
headers =['Postal code', 'Borough', 'Neighborhood']
df_data_0.columns=headers
df_data_0.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [70]:
df_data_1=df_data_0.drop([0])
df_data_1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


In [71]:
# Get names of indexes for which column Borough is Not Assigned
indexNames = df_data_1[ df_data_1['Borough'] == 'Not assigned' ].index
 
# Delete these row indexes from dataFrame
df_data_2= df_data_1.drop(indexNames)
df_data_2.head()

Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [29]:
df_data_2.shape

(103, 3)

### In the Neighborhood column, there is no 'Not assigned' item

In [72]:
#df[df.education == '9th'].count()
df_data_2[df_data_2.Neighborhood == 'Not assigned'].count()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

### Separate items in Neighborhood column by "," in the dataset

In [73]:
df_data_2['Neighborhood']=df_data_2['Neighborhood'].str.split(pat='/')
df_data_2['Neighborhood'] =df_data_2['Neighborhood'].str.join(', ')
df_data_2.head(15)

Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park , Harbourfront"
6,M6A,North York,"Lawrence Manor , Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,"Malvern , Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill , Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [74]:
print ("df_data_2 shape =",df_data_2.shape)

df_data_2 shape = (106, 3)


### Get the latitude and the longitude coordinates of each neighborhood
#### Import geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [75]:

body = client_0c0e53d6a6954d8f808e525ab8b4fc0b.get_object(Bucket='pythonbasics1-donotdelete-pr-7qe9iiv6o4ubmz',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# If you are reading an Excel file into a pandas DataFrame, replace `read_csv` by `read_excel` in the next statement.
df_data_latLong = pd.read_csv(body)
#df_data_latLong['Postal code']=df_data_latLong['Postal Code']
#df_data_latLong_2= df_data_latLong.drop(columns = 'Postal Code')
df_data_latLong.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [80]:
df_data_latLong.shape

(103, 3)

In [76]:
df_data_latLong_2= df_data_latLong.rename(columns = {'Postal Code':'Postal code'})

In [77]:
df_data_latLong_2.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Toronto data set with the new one having Latitude/Longitude corresponding to Postal code

In [78]:
df_data_3 = pd.merge(df_data_2, df_data_latLong_2, on='Postal code', how='left')
df_data_3.head(15)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [79]:
df_data_3.shape

(106, 5)