In [8]:
import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
import requests

In [9]:
# THIS IS THE FIRST PART OF QUESTIONS
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', index_col=0)

In [10]:
source = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050').text
soup = BeautifulSoup(source, 'lxml')

In [11]:
class Scrapy:
       
        def parse_url(self, url):
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            return [(self.parse_html_table(table))\
                    for table in soup.find_all('table', class_="wikitable sortable")]  
    
        def parse_html_table(self, table):
            n_columns = 0
            n_rows=0
            column_names = []
            for row in table.find_all('tr'):
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows+=1
                    if n_columns == 0:
                        n_columns = len(td_tags)
                        
                th_tags = row.find_all('th') 
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(th.get_text())
    
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")
    
            columns = column_names if len(column_names) > 0 else range(0,n_columns)
            df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    df.iat[row_marker,column_marker] = column.get_text()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
                    
            for col in df:
                try:
                    df[col] = df[col].astype(float)
                except ValueError:
                    pass
            
            return df

In [12]:
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'
hp = Scrapy()
table = hp.parse_url(url)[0] 
table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Etobicoke,Islington Avenue\n


In [13]:
#not assigned has to be removed
table = table[table.Borough != 'Not assigned']
table = table.replace('\n',' ', regex=True)
table = table[table['Neighbourhood\n'] != 'Not assigned']
table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [14]:
#GROUPBY postcode and hoods
df = table.groupby(['Postcode','Borough'])['Neighbourhood\n'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)

In [15]:
print(df.shape)
print(df)

(103, 3)
    Postcode           Borough  \
0        M6R      West Toronto   
1        M6K      West Toronto   
2        M1W       Scarborough   
3        M9M        North York   
4        M6G  Downtown Toronto   
5        M4K      East Toronto   
6        M5J  Downtown Toronto   
7        M3H        North York   
8        M3K        North York   
9        M1C       Scarborough   
10       M3B        North York   
11       M1J       Scarborough   
12       M5S  Downtown Toronto   
13       M9W         Etobicoke   
14       M6C              York   
15       M1H       Scarborough   
16       M4B         East York   
17       M6H      West Toronto   
18       M4A        North York   
19       M9P         Etobicoke   
20       M1E       Scarborough   
21       M4Y  Downtown Toronto   
22       M9C         Etobicoke   
23       M8W         Etobicoke   
24       M1G       Scarborough   
25       M5P   Central Toronto   
26       M4G         East York   
27       M5H  Downtown Toronto   
28   

In [16]:
#BEGIN THE SECOND QUESTION!!!!!
url1="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url1)
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [17]:
full_table = df.set_index('Postcode').join(geo_data.set_index('Postal Code'))
full_table = full_table.sample(frac=1).reset_index(drop=True)
full_table.head(20)

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Agincourt North , L'Amoreaux East , Milliken ,...",43.815252,-79.284577
1,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848
2,West Toronto,"Little Portugal , Trinity",43.647927,-79.41975
3,Central Toronto,"Forest Hill North , Forest Hill West",43.696948,-79.411307
4,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
5,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
6,Etobicoke,"Alderwood , Long Branch",43.602414,-79.543484
7,Scarborough,"East Birchmount Park , Ionview , Kennedy Park",43.727929,-79.262029
8,Downtown Toronto,"Cabbagetown , St. James Town",43.667967,-79.367675
9,Central Toronto,North Toronto West,43.715383,-79.405678


In [18]:
print(list(df))
print(list(geo_data))

full_table = df.set_index('Postcode').join(geo_data.set_index('Postal Code'))
full_table = full_table.sample(frac=1).reset_index(drop=True)
full_table.head(20)

['Postcode', 'Borough', 'Neighbourhood\n']
['Postal Code', 'Latitude', 'Longitude']


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Clarks Corners , Sullivan , Tam O'Shanter",43.781638,-79.304302
1,East York,Woodbine Heights,43.695344,-79.318389
2,Etobicoke,"Alderwood , Long Branch",43.602414,-79.543484
3,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,Central Toronto,Lawrence Park,43.72802,-79.38879
5,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763
6,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
7,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848
8,East York,East Toronto,43.685347,-79.338106
9,North York,"Emery , Humberlea",43.724766,-79.532242


In [19]:
#LAST AND FINAL PART!
#Explore and cluster the neighborhoods in Toronto. 


In [20]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

In [21]:
address = 'Toronto'
geolocator = Nominatim()
location = geolocator.geocode(address)
Latitude = 43.653963
Longitude = -79.387207

  


In [34]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(full_table['Latitude'], full_table['Longitude'], full_table['Neighbourhood\n']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.10,
        parse_html=False).add_to(map_geo)  
#click on each button on the map to see the hood you're looking at. 

<folium.folium.Map object at 0x0000018B61085B38>


In [35]:
map_geo