# **Segmenting and Clustering Neighborhoods in Toronto**

#### Install the lxml package so pandas can parse HTML tables

In [80]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


### Import libraries

In [81]:
import pandas as pd
import numpy as np

### Read the Wikipedia page with the `read_html` function

In [82]:
# Wikipedia page listing the Canadian postal codes that start with "M".
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per table found on the page; keep the first.
wiki_tables = pd.read_html(wiki_url)
df = wiki_tables[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


### Save the data to CSV file

In [83]:
# Persist the table exactly as scraped, before any cleaning.
df.to_csv(path_or_buf="Toronto.csv")

### Count the occurrences of each value in every column of the data

In [84]:
# Number of rows carrying each distinct postal code.
postal_code_counts = df['Postal code'].value_counts()
postal_code_counts

M1M    1
M3J    1
M4Z    1
M8Z    1
M5B    1
      ..
M5T    1
M5K    1
M5X    1
M5P    1
M9A    1
Name: Postal code, Length: 180, dtype: int64

In [85]:
# Number of rows per borough (the "Not assigned" placeholder is counted too).
borough_counts = df['Borough'].value_counts()
borough_counts

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [86]:
# Number of rows per neighborhood label.
neighborhood_counts = df['Neighborhood'].value_counts()
neighborhood_counts

Downsview                                                           4
Don Mills                                                           2
Willowdale                                                          2
Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood    1
Birch Cliff / Cliffside West                                        1
                                                                   ..
St. James Town                                                      1
Guildwood / Morningside / West Hill                                 1
Milliken / Agincourt North / Steeles East / L'Amoreaux East         1
Lawrence Manor / Lawrence Heights                                   1
Woodbine Heights                                                    1
Name: Neighborhood, Length: 98, dtype: int64

### Flag the missing (NaN) cells and summarize each column

In [87]:
# Boolean frame shaped like df: True wherever a cell is missing.
missing_data = df.isna()

# Per-column summary (count / unique / top / freq).
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,180,180,103
unique,180,11,98
top,M1M,Not assigned,Downsview
freq,1,77,4


### Replace the 'Not assigned' value with NaN

In [88]:
# Turn the "Not assigned" placeholder into a real missing value (NaN).
df = df.replace(to_replace="Not assigned", value=np.nan)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


### Drop every row that contains a NaN value

In [89]:
# Remove, in place, every row that has a NaN in any of its columns.
df.dropna(axis="index", how="any", inplace=True)
df

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


### Reset the index after dropping rows

In [90]:
# Renumber the rows from 0 and discard the old, gappy index.
df = df.reset_index(drop=True)

### Replace the slash symbol "/" with a comma ","

In [91]:
# Swap every "/" separator for ",". regex=False forces a literal string match,
# so the result does not depend on the pandas version's default for `regex`.
df["Neighborhood"] = df["Neighborhood"].str.replace("/", ",", regex=False)

### Limit the displayed table to a maximum of 15 rows

In [92]:
# Cap how many rows pandas renders before it truncates a frame with "...".
pd.options.display.max_rows = 15
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


### Save the clean data to CSV file

In [93]:
# Persist the cleaned table under a separate file name.
df.to_csv(path_or_buf="Toronto_rev1.csv")

### Count the table (Rows, Columns)

In [95]:
# (rows, columns) of the cleaned table.
df_rows, df_cols = df.shape
(df_rows, df_cols)

(103, 3)

### Read Geospatial link into dataframe

In [99]:
# CSV with one latitude/longitude pair per postal code.
geo_url = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_url)
geo_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the geo_df table with the cleaned df table on the postal code

In [105]:
# Align the key column's spelling with df so both frames share 'Postal code'.
geo_df.rename(columns={'Postal Code': 'Postal code'}, inplace=True)

# Inner join: keep only the postal codes present in both tables.
merge_df = geo_df.merge(df, how='inner', on='Postal code')
merge_df

Unnamed: 0,Postal code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern , Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,York,Weston
99,M9P,43.696319,-79.532242,Etobicoke,Westmount
100,M9R,43.688905,-79.554724,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov..."
101,M9V,43.739416,-79.588437,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."


### Make a new DataFrame with the columns reordered so that 'Latitude' and 'Longitude' come last

In [107]:
# Same rows as merge_df, with the coordinate columns moved to the end.
column_order = ['Postal code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
new_df = merge_df.loc[:, column_order]
new_df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Check the table shape (rows, columns)

In [108]:
# (rows, columns) of the merged, reordered table.
new_rows, new_cols = new_df.shape
(new_rows, new_cols)

(103, 5)

In [109]:
pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl (104kB)
[K     |████████████████████████████████| 112kB 6.6MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0
Note: you may need to restart the kernel to use updated packages.


### Import necessary libraries

In [110]:
import folium
from geopy.geocoders import Nominatim

### Explore and cluster the neighborhoods in Toronto

In [111]:
# Look up the coordinates of Toronto through the Nominatim geocoding service.
address = 'Toronto'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Visualize the neighborhoods of Toronto

In [116]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of new_df, labelled "<neighborhood>, <borough>".
marker_rows = zip(
    new_df['Latitude'],
    new_df['Longitude'],
    new_df['Borough'],
    new_df['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of Popup, not of CircleMarker, so it is set only here.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto