### Explore London city dataset for boroughs and neighborhoods

#### Download all the libraries needed.

In [1]:
#### import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# conda install -c anaconda beautiful-soup --yes
from bs4 import BeautifulSoup # package for parsing HTML and XML documents

import csv # implements classes to read and write tabular data in CSV form

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# import the library to open URLs
import urllib.request

print('Libraries imported.')


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


#### Web scrapping of relevant data of London city from Wikipedia.

In [2]:
# specify which URL/web page to be scraping
url = "https://en.wikipedia.org/wiki/List_of_areas_of_London"

# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

# import the BeautifulSoup library to parse HTML and XML documents
from bs4 import BeautifulSoup

# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(page, "lxml")

In [3]:
# take a look at our underlying HTML code with Beautiful Soup’s prettify function
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of areas of London - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_areas_of_London","wgTitle":"List of areas of London","wgCurRevisionId":903635220,"wgRevisionId":903635220,"wgArticleId":11915713,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Use dmy dates from August 2015","Use British English from August 2015","Lists of coordinates","Geographic coordinate lists","Articles with Geo","Articles with OS grid coordinates","Commons category link is on Wikidata","Areas of London","Geography of London","Lists of places in London","Lists of neighbourhoods"],"wgBreakFrames":!1,"wgPageContentLanguage":"en

In [4]:
# with title function, Beautiful Soup will return the HTML tags for the title and the content between them
soup.title

<title>List of areas of London - Wikipedia</title>

In [5]:
# get BS to only bring back the table data for this particular table and keep 
# that in a variable called ”right_table“:
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable" style="clear:both;">
<tbody><tr>
<th>Location</th>
<th>London borough</th>
<th>Post town</th>
<th>Postcode district</th>
<th>Dial code</th>
<th>OS grid ref
</th></tr>
<tr>
<td><a href="/wiki/Abbey_Wood" title="Abbey Wood">Abbey Wood</a></td>
<td>Bexley,  Greenwich <sup class="reference" id="cite_ref-mills1_2-0"><a href="#cite_note-mills1-2">[2]</a></sup></td>
<td>LONDON</td>
<td>SE2</td>
<td>020</td>
<td><span class="plainlinks nourlexpansion" style="white-space: nowrap"><a class="external text" href="https://tools.wmflabs.org/os/coor_g/?pagename=List_of_areas_of_London&amp;params=TQ465785_region%3AGB_scale%3A25000">TQ465785</a></span>
</td></tr>
<tr>
<td><a href="/wiki/Acton,_London" title="Acton, London">Acton</a></td>
<td>Ealing, Hammersmith and Fulham<sup class="reference" id="cite_ref-mills2_3-0"><a href="#cite_note-mills2-3">[3]</a></sup></td>
<td>LONDON</td>
<td>W3, W4</td>
<td>020</td>
<td><span class="plainlinks nourlexpansion" style="white-sp

In [6]:
# use the Beautiful Soup ‘find_all’ function to look for the string ‘tr’
# set up a FOR loop for each row within that array and set Python to loop through the rows, one by one.
# use find_all again to search each row for <td> tags with the ‘td’ string
# add all of these to a variable called ‘cells’ and then check to make sure that there are 6 items
# in our ‘cells’ array (i.e. one for each column).
# use the find(text=True)) option to extract the content string from within each <td> element
# in that row and add them to the A-4 lists

A=[]
B=[]
C=[]
D=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==6:
        A.append(cells[1].find(text=True))
        B.append(cells[0].find(text=True))
        C.append(cells[2].find(text=True))
        D.append(cells[3].find(text=True))
        

In [7]:
# use pandas and create a dataframe assigning each of the lists A-C into a column
# with the name of the source table columns i

df=pd.DataFrame(A,columns=['Borough'])
df['Neighborhood']=B
df['Post town']=C
df['Postcode']=D
df

Unnamed: 0,Borough,Neighborhood,Post town,Postcode
0,"Bexley, Greenwich",Abbey Wood,LONDON,SE2
1,"Ealing, Hammersmith and Fulham",Acton,LONDON,"W3, W4"
2,Croydon,Addington,CROYDON,CR0
3,Croydon,Addiscombe,CROYDON,CR0
4,Bexley,Albany Park,"BEXLEY, SIDCUP","DA5, DA14"
5,Redbridge,Aldborough Hatch,ILFORD,IG2
6,City,Aldgate,LONDON,EC3
7,Westminster,Aldwych,LONDON,WC2
8,Brent,Alperton,WEMBLEY,HA0
9,Bromley,Anerley,LONDON,SE20


In [8]:
# Reduce list of locations with only boroughs that contain the word London

dfL = df[df['Post town'].str.contains("LONDON")]
#df_London = df_London.reset_index(drop = True)

In [9]:
del dfL['Post town']

In [10]:
!conda install -c conda-forge geocoder --yes
print("Installation Done!")
import geocoder # import geocoder
print("Geo Coder imported!")

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/marcus/anaconda3

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          53 KB

The following packages will be UPDATED:

  geocoder                                      1.38.1-py_0 --> 1.38.1-py_1



Downloading and Extracting Packages
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Installation Done!
Geo Coder imported!


In [11]:
def get_geocoder(neighborhood_from_dfL):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, London, United Kingdom'.format(neighborhood_from_dfL.strip()))
        lat_lng_coords = g.latlng
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    return latitude,longitude

In [12]:
dfL['Latitude'], dfL['Longitude'] = zip(*dfL['Neighborhood'].apply(get_geocoder))
dfL.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Borough,Neighborhood,Postcode,Latitude,Longitude
0,"Bexley, Greenwich",Abbey Wood,SE2,51.49245,0.12127
1,"Ealing, Hammersmith and Fulham",Acton,"W3, W4",51.51324,-0.26746
6,City,Aldgate,EC3,51.513308,-0.077762
7,Westminster,Aldwych,WC2,51.513307,-0.117092
9,Bromley,Anerley,SE20,51.41233,-0.06539


In [13]:
print('The dataframe has {} boroughs and {} neighborhood.'.format(
        len(dfL['Borough'].unique()),
        dfL.shape[0]
    )
)

The dataframe has 47 boroughs and 308 neighborhood.


In [14]:
#dfL.to_csv('London.csv') .csv',index=False)

In [15]:
address = 'London City, UK'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of London are 51.5118606, -0.0780174.


In [16]:
# create map of London using latitude and longitude values
map_London = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfL['Latitude'], dfL['Longitude'], dfL['Borough'], dfL['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_London)  

legend_html =   '''
                <div style="position: fixed; 
                            top: 50px; right: 50px; height: 30px; background: white;
                            border:2px solid red; z-index:9999; font-size:15px;
                            ">&nbsp; London neighborhoods<br>
                              
                </div>
                ''' 

map_London.get_root().html.add_child(folium.Element(legend_html))  
    
    
    
map_London