# **Segmenting and Clustering Neighborhoods in Toronto**

#### Install lxml pacakge to read HTML links

In [80]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


### Import libraraies

In [81]:
import pandas as pd
import numpy as np

### Read wikipedia link with read_html func

In [82]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


### Save the data to CSV file

In [83]:
df.to_csv("Toronto.csv")

### Read and count every column of the data 

In [84]:
df['Postal code'].value_counts()

M1M    1
M3J    1
M4Z    1
M8Z    1
M5B    1
      ..
M5T    1
M5K    1
M5X    1
M5P    1
M9A    1
Name: Postal code, Length: 180, dtype: int64

In [85]:
df['Borough'].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [86]:
df['Neighborhood'].value_counts()

Downsview                                                           4
Don Mills                                                           2
Willowdale                                                          2
Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood    1
Birch Cliff / Cliffside West                                        1
                                                                   ..
St. James Town                                                      1
Guildwood / Morningside / West Hill                                 1
Milliken / Agincourt North / Steeles East / L'Amoreaux East         1
Lawrence Manor / Lawrence Heights                                   1
Woodbine Heights                                                    1
Name: Neighborhood, Length: 98, dtype: int64

### Giving missing data value "NaN" to be able to deal with it 

In [87]:
missing_data=df.isnull()
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,180,180,103
unique,180,11,98
top,M1M,Not assigned,Downsview
freq,1,77,4


### Replace 'Not assinged' value with 'Nan' value

In [88]:
df.replace("Not assigned", np.nan, inplace = True)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


### Drop any 'NaN' value 

In [89]:
df.dropna(axis=0, inplace=True)
df

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


### Reset the index due to all the drop rows we did

In [90]:
df.reset_index(drop=True, inplace=True)

### Remove the slash symbol "/" with comma ","

In [91]:
df["Neighborhood"] = df["Neighborhood"].str.replace("/",",")

### To display all the rows maximum 1000 row

In [92]:
pd.set_option('display.max_rows', 15)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


### Save the clean data to CSV file

In [93]:
df.to_csv("Toronto_rev1.csv")

### Count the table (Rows, Columns)

In [94]:
df.shape

(103, 3)