# STEP 1 : EXTRACT THE DATAFRAME 

# Import the Packages

In [247]:
import geocoder
import pandas as pd
import numpy as np
import os
import folium
import requests
import sklearn
from sklearn.cluster import KMeans

<h1> WEB SCRAPING </h1>

In [209]:
# Scrape the Toronto postal-code table from Wikipedia.
# pd.read_html returns a plain Python list with one DataFrame per <table> found
# on the page, so the first table is taken by list index instead of wrapping the
# list in another DataFrame (which produced a conversion warning).
POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html = pd.read_html(POSTAL_CODES_URL)
if not html:
    raise ValueError(f"No HTML table found at {POSTAL_CODES_URL}")

# NOTE(review): assumes the postal-code table is still the first table on the
# page; confirm against the live page layout.
df = html[0]
df

  values = np.array([convert(v) for v in values])


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


<h1> Transform the data </h1>

In [210]:
# Clean the scraped table:
#   1. turn the "Not assigned" placeholder into a real missing value,
#   2. drop rows that have no borough,
#   3. fall back to the borough name when a row has a borough but no neighbourhood,
#   4. collapse to one row per postal code, joining neighbourhood names with ", ".
# The replacement is done on the frame itself (not via chained
# `df[col].replace(..., inplace=True)`), which is not guaranteed to write back
# into `df` and is a silent no-op under pandas copy-on-write.
text_columns = ["Postal Code", "Borough", "Neighbourhood"]

cleaned = df.copy()
cleaned[text_columns] = cleaned[text_columns].replace("Not assigned", np.nan)
cleaned = (
    cleaned
    .dropna(subset=["Borough"])
    .assign(Neighbourhood=lambda frame: frame["Neighbourhood"].fillna(frame["Borough"]))
)

# `sum()` on string columns concatenates without a separator (and would repeat
# the borough name); aggregate each column explicitly instead.
df = cleaned.groupby("Postal Code", as_index=False).agg(
    {"Borough": "first", "Neighbourhood": ", ".join}
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


<h1> Add the coordinates </h1>

In [211]:
# Attach latitude/longitude to each postal code.
# The coordinates are joined on the postal code rather than copied column-wise,
# because column assignment aligns on the row index and silently yields wrong
# coordinates whenever the two tables differ in order or length.
lat_lng = pd.read_csv("Geospatial_Coordinates.csv")

# NOTE(review): assumes the CSV has a "Postal Code" column next to
# "Latitude"/"Longitude" - confirm against the file header.
df = (
    df
    # Drop coordinates left over from a previous run so the cell is re-runnable.
    .drop(columns=["Latitude", "Longitude"], errors="ignore")
    .merge(
        lat_lng[["Postal Code", "Latitude", "Longitude"]],
        on="Postal Code",
        how="left",
        validate="one_to_one",  # fail loudly on duplicated postal codes
    )
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


# STEP 2 : Neighborhood segmentation 

In [212]:
# @hidden_cell
# Foursquare API settings.
# Credentials are read from the environment so they never end up in the saved
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting
# the kernel. Any key pair that was previously committed in this cell should be
# treated as exposed and rotated.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
if not (CLIENT_ID and CLIENT_SECRET):
    print("WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
          "Foursquare requests will be rejected.")

VERSION = "20180605"  # API version date sent as the `v` request parameter
LIMIT = 100           # value sent as the `limit` request parameter

In [222]:
# Query the Foursquare venue-search endpoint once per postal code and collect one
# row per venue (category, venue name, postal code).
#
# Category and name are read from the same venue record, so they can no longer
# get out of step when a venue without a category is skipped. Rows are gathered
# in a list and turned into a DataFrame once, instead of growing the frame
# row by row with DataFrame.append.
SEARCH_URL = "https://api.foursquare.com/v2/venues/search"
RADIUS = 500  # value sent as the `radius` request parameter

venue_records = []
for lat, lng, postal_code in zip(df["Latitude"], df["Longitude"], df["Postal Code"]):
    response = requests.get(
        SEARCH_URL,
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "ll": f"{lat},{lng}",
            "v": VERSION,
            "radius": RADIUS,
            "limit": LIMIT,
        },
        timeout=30,
    )
    # Surface HTTP failures (bad credentials, quota exceeded, ...) immediately
    # instead of failing later on a missing key in the JSON payload.
    response.raise_for_status()
    result = response.json()

    for venue in result["response"]["venues"]:
        venue_categories = venue.get("categories") or []
        # Venues without a usable first category are skipped, as before.
        if not venue_categories or "name" not in venue_categories[0]:
            continue
        venue_records.append({
            'categorie': venue_categories[0]["name"],
            'names': venue["name"],
            "Postal code": postal_code,
        })

venues = pd.DataFrame(venue_records, columns=["categorie", "names", "Postal code"])

# One hot dataframe

In [239]:
# One-hot encode the venue category (one indicator column per category) and keep
# the postal code of each venue as the last column so rows can be grouped later.
# The former leading `venues.groupby("Postal code").count()` was removed: its
# result was never stored or displayed, so it was pure wasted work.
venues_one_hot = pd.get_dummies(venues["categorie"]).assign(
    **{"Postal code": venues["Postal code"]}
)

In [240]:
# Display the one-hot encoded venue table (one row per venue).
venues_one_hot

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit,Postal code
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W
8779,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W
8780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W
8781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W


In [242]:
# Average every one-hot category column within each postal code; 'Postal code'
# is kept as a regular first column rather than becoming the index.
venues_grouped = venues_one_hot.groupby('Postal code', as_index=False).mean()

# KMeans

In [250]:
# Cluster the postal codes on their per-category venue averages.
N_CLUSTERS = 5

# Feature matrix: every column of venues_grouped except the postal-code key.
cluster_features = venues_grouped.drop(columns='Postal code')

# random_state makes the cluster assignment reproducible across kernel restarts;
# n_init is pinned so the result does not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(cluster_features)

# Re-attach the postal code from the SAME frame the features came from. Taking it
# from `df` relied on positional alignment and mislabels rows as soon as one
# postal code has no venues (and is therefore absent from venues_grouped).
venues_grouped_clustering = cluster_features.assign(
    **{
        "Postal code": venues_grouped["Postal code"],
        "Labels": kmeans.labels_,
    }
)

In [251]:
# Display the clustering table: feature columns plus "Postal code" and "Labels".
venues_grouped_clustering

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit,Postal code,Labels
0,0.0,0.0,0.0,0.0,0.000000,0.011111,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.011111,M1B,0
1,0.0,0.0,0.0,0.0,0.000000,0.011236,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.011236,0.000000,0.0,0.0,0.0,0.000000,M1C,0
2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M1E,3
3,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M1G,3
4,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M1H,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0.0,0.0,0.0,0.0,0.011236,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M9N,3
99,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M9P,3
100,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.012195,0.0,0.0,0.0,0.000000,M9R,3
101,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,M9V,3


# Let's plot the map 

In [258]:
# Draw one circle marker per postal code, coloured by its KMeans cluster label.
carte = folium.Map(location=[df["Latitude"].mean(), df["Longitude"].mean()], zoom_start=11)

color = ['red', 'blue', 'yellow', "green", "pink"]

# Join coordinates and cluster labels on the postal code. Zipping the two frames
# side by side paired rows by position only, which puts labels on the wrong
# coordinates whenever the frames differ in length or order.
labelled = df.merge(
    venues_grouped_clustering[["Postal code", "Labels"]],
    left_on="Postal Code",
    right_on="Postal code",
    how="inner",
)

for lat, lng, postal, neighborhood, label in zip(
    labelled["Latitude"],
    labelled["Longitude"],
    labelled["Postal code"],
    labelled["Neighbourhood"],
    labelled["Labels"],
):
    # Wrap around the palette so a larger cluster count cannot raise IndexError.
    marker_color = color[int(label) % len(color)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=f"Neighborhood : {neighborhood}, Postal code : {postal}",
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(carte)
    

In [259]:
# Render the folium map as the cell output.
carte