## Battle of Neighborhood - The Preparation

### Import the libraries required for the entire project
#### The complete project is carried out in this single notebook

In [1]:
# Import all required libraries
import numpy as np # To handle data in a vectorize manner

import pandas as pd # To get the data in DataFrame

import json # To get and read the JSON file

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # To convert an address into latitude and longitude values

import requests # To handle JSON requests

from pandas.io.json import json_normalize # To conver json file data into DataFrame

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans # To create cluster by K-Means 

!conda install -c conda-forge folium=0.5.0 --yes
import folium # Map rendering library

from bs4 import BeautifulSoup # Install and import BeautifulSoup4 for scraping the Wikipedia page and get the table in csv format

import lxml.html as lh
import re

print('Libraries imported')

Solving environment: done

# All requested packages already installed.

Libraries imported


### Read the table from the Wikipedia page and load its data into a pandas DataFrame

In [4]:
# Scrape the postal-code table from the Wikipedia page below into a DataFrame.
input_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx) into
# an exception here instead of letting an error page be parsed as table data.
wiki_data = requests.get(input_url, timeout=30)
wiki_data.raise_for_status()

# Parse the response body with the 'lxml' parser.
wiki_content = BeautifulSoup(wiki_data.content, "lxml")

# Locate the table by its CSS classes. find() returns None when nothing
# matches, so fail with a clear message rather than an AttributeError below.
table_data = wiki_content.find("table", class_="wikitable sortable")
if table_data is None:
    raise ValueError(
        "No table with class 'wikitable sortable' found at " + input_url
    )

# Column names: the text of each <th> cell up to its first newline.
df_columns = [header.text.split("\n")[0] for header in table_data.find_all('th')]

# One list per <tr>, holding the text of each <td> cell up to its first newline.
# A row without any <td> cells produces an empty list.
table_rows = table_data.find_all('tr')
df_rows = [
    [cell.text.split("\n")[0] for cell in table_row.find_all('td')]
    for table_row in table_rows
]

# Skip the first row (presumably the header row, which has <th> cells and so
# yields an empty list -- confirm against the page markup).
PostalCode_df = pd.DataFrame(df_rows[1:], columns=df_columns)
PostalCode_df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Drop the rows whose Borough is 'Not assigned'
#### If only the Neighborhood is 'Not assigned', replace it with the value from the Borough column

In [5]:
# Keep only the rows whose Borough is assigned.
T_df = PostalCode_df[~PostalCode_df['Borough'].isin(['Not assigned'])]

# Collapse to one row per (Postcode, Borough), joining the neighbourhood names
# with commas. sort=False keeps the groups in order of first appearance. The
# columns are renamed as part of the chain instead of mutating in place.
Toronto_df = (
    T_df.groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .apply(','.join)
    .reset_index()
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
)

# Where the neighborhood is still 'Not assigned', fall back to the borough name.
# A single .loc assignment replaces the former chained-indexing write
# (Toronto_df['Neighborhood'][i] = ...), which raises SettingWithCopyWarning and
# silently does nothing when pandas Copy-on-Write is enabled.
unnamed = Toronto_df['Neighborhood'] == 'Not assigned'
Toronto_df.loc[unnamed, 'Neighborhood'] = Toronto_df.loc[unnamed, 'Borough']

Toronto_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [6]:
# Sanity check: show the dimensions of Toronto_df as a (rows, columns) tuple
Toronto_df.shape

(103, 3)

### Collect location details, i.e. latitude and longitude, for each postal code

In [8]:
# CSV of latitude/longitude values keyed by postal code
geo_data_url = 'http://cocl.us/Geospatial_data'

# Load the CSV and rename its 'Postal Code' column to 'PostalCode' in one step,
# so the key column carries the same name as the one in Toronto_df.
geo_data = pd.read_csv(geo_data_url).rename(columns={'Postal Code': 'PostalCode'})
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Add the latitude and longitude details to the postal code DataFrame

In [9]:
# Attach the coordinates to each postal code by joining on 'PostalCode'.
# The join is an inner one, so a postal code present in only one of the two
# frames is left out of the result.
Toronto_data = Toronto_df.merge(geo_data, on='PostalCode', how='inner')
Toronto_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
