In [1]:
# import libraries

import pandas as pd
import numpy as np

# For web scraping
from bs4 import BeautifulSoup as bs
import requests

# To get geo-location data
from geopy.geocoders import Nominatim

In [2]:
# Get list of neighborhoods in Boston

source = requests.get("https://en.wikipedia.org/wiki/Neighborhoods_in_Boston").text
soup = bs(source, 'lxml')

In [3]:
list = soup.find('div', class_='columns')
print(list.prettify())

<div class="div-col columns column-width" style="-moz-column-width: 30em; -webkit-column-width: 30em; column-width: 30em;">
 <ul>
  <li>
   <a href="/wiki/Allston" title="Allston">
    Allston
   </a>
  </li>
  <li>
   <a href="/wiki/Back_Bay,_Boston" title="Back Bay, Boston">
    Back Bay
   </a>
  </li>
  <li>
   <a class="mw-redirect" href="/wiki/Bay_Village,_Boston,_Massachusetts" title="Bay Village, Boston, Massachusetts">
    Bay Village
   </a>
  </li>
  <li>
   <a href="/wiki/Beacon_Hill,_Boston" title="Beacon Hill, Boston">
    Beacon Hill
   </a>
  </li>
  <li>
   <a class="mw-redirect" href="/wiki/Brighton,_Massachusetts" title="Brighton, Massachusetts">
    Brighton
   </a>
  </li>
  <li>
   <a class="mw-redirect" href="/wiki/Charlestown,_Massachusetts" title="Charlestown, Massachusetts">
    Charlestown
   </a>
  </li>
  <li>
   <a href="/wiki/Chinatown,_Boston" title="Chinatown, Boston">
    Chinatown
   </a>
   /
   <a class="mw-redirect" href="/wiki/Leather_District,_Bo

In [4]:
# Store the list in an array

neigh = []

for n in list.find_all('li'):
    neigh.append(n.text)

In [5]:
# Convert the array to a pandas dataframe

df = pd.DataFrame({'Neighborhoods': neigh})

In [6]:
# Clean up the dataframe

df.replace(regex={r'^Dorchester.*$': 'Dorchester', r'^Fenway.*$': 'Fenway Kenmore', r'^Chinatown.*$': 'Chinatown'}, inplace=True)

In [7]:
df.shape

(22, 1)

In [8]:
# Get latitude and longitude data for each neighborhood

geolocator = Nominatim(user_agent='boston_agent')

def get_geo_location(neighborhood):
    g = None
    while (g is None):
        location = geolocator.geocode('{}, Boston, MA'.format(neighborhood))
        g = [location.latitude, location.longitude]
    return g

In [9]:
coords = [ get_geo_location(nbhd) for nbhd in df["Neighborhoods"].tolist() ]

In [10]:
coords

[[42.3554344, -71.1321271],
 [42.3507067, -71.0797297],
 [42.35001105, -71.0669477958571],
 [42.3587085, -71.067829],
 [42.3500971, -71.1564423],
 [42.3778749, -71.0619957],
 [42.3513291, -71.0626228],
 [42.2973205, -71.0744952],
 [52.971148799999995, -0.059809371175602276],
 [42.3750973, -71.0392173],
 [42.34422445, -71.09444515776886],
 [42.2556543, -71.1244963],
 [42.3098201, -71.1203299],
 [42.2675657, -71.0924273],
 [42.33255965, -71.10360773640765],
 [42.3650974, -71.0544954],
 [42.2912093, -71.1244966],
 [42.3248426, -71.0950158],
 [42.3334312, -71.0494949],
 [42.34131, -71.0772298],
 [42.3639186, -71.0638993],
 [42.2792649, -71.1494972]]

In [11]:
df_coords = pd.DataFrame(coords, columns=["Latitude", "Longitude"])

In [12]:
df["Latitude"] = df_coords["Latitude"]
df["Longitude"] = df_coords["Longitude"]

df

Unnamed: 0,Neighborhoods,Latitude,Longitude
0,Allston,42.355434,-71.132127
1,Back Bay,42.350707,-71.07973
2,Bay Village,42.350011,-71.066948
3,Beacon Hill,42.358708,-71.067829
4,Brighton,42.350097,-71.156442
5,Charlestown,42.377875,-71.061996
6,Chinatown,42.351329,-71.062623
7,Dorchester,42.29732,-71.074495
8,Downtown,52.971149,-0.059809
9,East Boston,42.375097,-71.039217
