## Website Scrapping Assignment Using BeautifulSoup 

In [1]:
import pandas as pd
import numpy as np
#Importing the Beautiful Soup Package from bs4(latest version)
from bs4 import BeautifulSoup
import requests

__To get HTML code of Wikipedia page which contains the data__

In [2]:
html_doc = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
Soup = BeautifulSoup(html_doc,'html.parser')

In [3]:
# Parsing to the div class which contains the Data of the Table
a = Soup.find('div',class_='mw-content-ltr')
#print(a.table.tr)

# Fetching the Column names for Data Frame
column_nm = []
for th in a.table.tr.find_all('th'):
    column_nm.append(th.text.strip())
print(column_nm)

['Postcode', 'Borough', 'Neighbourhood']


__The Data in the Soup Object has to be parsed to get the rows of data. So, the tags have to be parsed.__ 

In [4]:
df = [] # 
sub = [] # Sub list where the data fetched is made in a list of three items and then appended to main list df 
i=0
# looping through the data
for tbl in a.table.find_all('td'):
    
    sub.append(tbl.text.strip())
    #print(tbl.text)
    i+=1
    if (i%3) == 0: # to split the sub into list of three items because we need to create a Data frame of 3 Columns(Postcode,Borough,Neighbourhood)
        df.append(sub)
        sub = []

#Converting the list into Data Frame
Data = pd.DataFrame(df)

# Renaming the Columns
Data.columns=column_nm

In [5]:
Data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### We will be cleaning the Data from hereon:

In [6]:
# Replacing "Not assigned" as NA for Column: Borough
Data['Borough'] = Data['Borough'].replace('Not assigned',np.nan)

#Dropping rows with Borough as NA
Data.dropna(subset = ['Borough'],axis=0,inplace=True)

# Resetiing the Index
Data.reset_index(drop=True,inplace=True)

# Replace Neighbours "Not assigned" with Borough
Data["Neighbourhood"] = Data["Neighbourhood"].replace("Not assigned",Data["Borough"] )

In [7]:
Data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [8]:
# To Combine the Neighbourhood having same Postcodes

df10 = Data.groupby('Postcode')['Neighbourhood'].apply(lambda x: "%s" % ",".join(x))
df10 = df10.to_frame()
df10.reset_index(drop=False,inplace=True)

#No of Unique PostCodes in DataFrame
print(Data['Postcode'].unique().shape)

(103,)


__Need to add Column Borough in df10 because it was grouped on Postcode and contains just Post Code and Neighbourhood__

In [9]:
i=0
df11=[]
# cretaed a list df11 and appending it according to Postcodes in df10(Grouped DataFrame)
for i,pc in enumerate(df10['Postcode']):
    df11.append(Data[Data['Postcode']==pc].Borough.unique()[0])
    if i > len(df10):
        break

# Joining all the Colums to create the Final Data Frame 
df10 = pd.DataFrame({'Postcode':list(df10['Postcode']),'Borough':df11,'Neighbourhood':list(df10['Neighbourhood'])})

In [10]:
df10.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
df10.shape

(103, 3)

### Now we need to get the Geospacial data of these postal codes and append it to our data frame finalised above:

In [13]:
#!pip install geocoder
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [24]:
#import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None
postal_code='M5G'
# loop until you get the coordinates
#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#    lat_lng_coords = g.latlng

#g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#lat_lng_coords = g.latlng

#g
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

address = 'Toronto, Ontario'
address2 = '{}, Toronto, Ontario'.format(postal_code)
address3 = 'Central Bay Street, Downtown Toronto'

address4 = 'Regent Park, Downtown Toronto'
address5 = 'harbourfront, downtown Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address4)
latitude = location.latitude
longitude = location.longitude
print(latitude,longitude)

AttributeError: 'NoneType' object has no attribute 'latitude'

_I'm going to use the CSV provided as Geocoder API is returning [Request Denied] and while using Nominatim all Vaues of Latitude & Longitude are not returned_

In [29]:
url ='https://cocl.us/Geospatial_data'
Coord = pd.read_csv(url)

Coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
df10['Latitude'] = Coord[Coord['Postal Code']==df10['Postcode']]['Latitude']
df10['Longitude'] = Coord[Coord['Postal Code']==df10['Postcode']]['Longitude']

In [37]:
df10.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
