# Imports

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

In [2]:
from __future__ import division, print_function
import requests
import bs4 as bs 
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 RuxitSynthetic/1.0 v6709816200 t38550 ath9b965f92 altpub cvcv=2',
    'From': 'data-x@gmail.com' 
}

# Webscrapping

In [3]:
# Initiate stores addresses, and gather data from Yelp

stores_addresses = [] 

interval = [10*i for i in range(0,25)] # The Yelp website has 25 pages with stores located in Berkeley, and display 10 stores per page. So for page i, stores displayed are stores number 10*i-11*i-1  

for i in range(len(interval)):
    source = requests.get("https://www.yelp.com/search?cflt=shopping&find_loc=Berkeley%2C%20CA&sortby=review_count&start="+str(interval[i]), headers=headers) 
    soup = bs.BeautifulSoup(source.content, features='html.parser')
    for p in soup.find_all(class_='raw__09f24__3Obuy'): #This is the class in which the address is stored
        stores_addresses.append(p.text)

In [4]:
# Store addresses in dataFrame

berkeley_data = pd.DataFrame(stores_addresses)
berkeley_data.shape

(290, 1)

In [5]:
# Test if the addresses begin with a number, if yes, they will be kept in the dataframe

isAddress = []

for i in berkeley_data[0]:
    try:
        int(i[0])
        isAddress.append(True)
    except ValueError:
        isAddress.append(False)

In [6]:
# Adding the city to all addresses, and removing all rows that are not real addresses

berkeley_data['isAddress']=isAddress
berkeley_data_clean = berkeley_data[berkeley_data['isAddress']==True]

In [7]:
berkeley_data.head()

Unnamed: 0,0,isAddress
0,"Shopping in Berkeley, CA",False
1,2163 Shattuck Ave,True
2,1120 Davis St,True
3,4400 Shellmound St,True
4,1238 5th St,True


# Get the GPS Locations

For this task, we will gather all the addresses we've found, and use an external tool, available at https://www.gpsvisualizer.com/, that will convert them into GPS coordinates. Their tool is fairly easy to use, so we just copy/paste our addresses and the name of the city, and it will provide us with a .gpx file

In [8]:
for i in berkeley_data_clean[0]: #We add the city as it will be required to get the GPS location - Right now in the dataset we only have an address (# + name of the street).
    print(i+' Berkeley,')

2163 Shattuck Ave Berkeley,
1120 Davis St Berkeley,
4400 Shellmound St Berkeley,
1238 5th St Berkeley,
1555 40th St Berkeley,
1057 Eastshore Hwy Berkeley,
5616 Bay St Berkeley,
2510 Durant Ave Berkeley,
2338 Shattuck Ave Berkeley,
2840 College Ave Berkeley,
1901 Fourth St Berkeley,
3288 Pierce St Berkeley,
69 Bolinas Rd Berkeley,
2727 Milvia St Berkeley,
1937 Ashby Ave Berkeley,
1065 Ashby Ave Berkeley,
1600 University Ave Berkeley,
750 Hearst Ave Berkeley,
470A 49th St Berkeley,
2398 Telegraph Ave Berkeley,
2332 Telegraph Ave Berkeley,
2163 Shattuck Ave Berkeley,
1120 Davis St Berkeley,
5959 Shellmound St Berkeley,
4075 Telegraph Ave Berkeley,
1834 4th St Berkeley,
5640 College Ave Berkeley,
2513 Telegraph Ave Berkeley,
5606 Bay St Berkeley,
1564 Solano Ave Berkeley,
1809B 4th St Berkeley,
5010 Telegraph Ave Berkeley,
69 Bolinas Rd Berkeley,
4801 Central Ave Berkeley,
5630 Bay St Berkeley,
1405 Martin Luther King Jr Way Berkeley,
1240 Solano Ave Berkeley,
2187 Shattuck Ave Berkeley,
2

We now have the addresses in the 'addresses.gpx' file - we are going to parse it to generate coordinates that can be read by Hash.ai (latitude and longitude are inverted!). We will copy/paste the results directly in the Hash.ai code. 

In [9]:
!pip install gpxpy
import gpxpy
import gpxpy.gpx

# Parsing an existing file:

gpx_file = open('addresses.gpx', 'r')

gpx = gpxpy.parse(gpx_file)

for waypoint in gpx.waypoints: #this is what we have in the file. Hash.ai uses longitude first and then latitude, so we will have to invert them in the next step.
    print('waypoint {0} -> ({1},{2})'.format(waypoint.name, waypoint.latitude, waypoint.longitude))

waypoint 4244 Judah Street San Francisco, -> (37.760351,-122.507798)
waypoint 4000 Judah Street San Francisco -> (37.760467,-122.505079)
waypoint 4400 Shellmound St Berkeley, -> (37.849074,-122.295394)
waypoint 1238 5th St Berkeley, -> (37.879541,-122.302624)
waypoint 1555 40th St Berkeley, -> (41.887768,-87.913181)
waypoint 1057 Eastshore Hwy Berkeley, -> (37.88384,-122.308142)
waypoint 5616 Bay St Berkeley, -> (37.849074,-122.295394)
waypoint 2510 Durant Ave Berkeley, -> (37.867881,-122.258463)
waypoint 2338 Shattuck Ave Berkeley, -> (37.866924,-122.267772)
waypoint 2840 College Ave Berkeley, -> (37.858608,-122.253204)
waypoint 1901 Fourth St Berkeley, -> (37.868393,-122.300136)
waypoint 3288 Pierce St Berkeley, -> (37.887526,-122.305307)
waypoint 5959 Shellmound St Berkeley, -> (37.848932,-122.295351)
waypoint 1937 Ashby Ave Berkeley, -> (37.854369,-122.270429)
waypoint 1065 Ashby Ave Berkeley, -> (37.851856,-122.288156)
waypoint 1600 University Ave Berkeley, -> (37.870652,-122.2795

In [10]:
lat_long = []

for idx,waypoint in enumerate(gpx.waypoints):
    lat_long.append("\"Store"+str(idx)+"\""+' :'+str([waypoint.longitude,waypoint.latitude])+",") # This is the format of locations data that will work for Hash.ai
for i in lat_long:
    print(i) #We now just copy/paste the values in Hash.ai!

"Store0" :[-122.507798, 37.760351],
"Store1" :[-122.505079, 37.760467],
"Store2" :[-122.295394, 37.849074],
"Store3" :[-122.302624, 37.879541],
"Store4" :[-87.913181, 41.887768],
"Store5" :[-122.308142, 37.88384],
"Store6" :[-122.295394, 37.849074],
"Store7" :[-122.258463, 37.867881],
"Store8" :[-122.267772, 37.866924],
"Store9" :[-122.253204, 37.858608],
"Store10" :[-122.300136, 37.868393],
"Store11" :[-122.305307, 37.887526],
"Store12" :[-122.295351, 37.848932],
"Store13" :[-122.270429, 37.854369],
"Store14" :[-122.288156, 37.851856],
"Store15" :[-122.27958, 37.870652],
"Store16" :[-122.299856, 37.868963],
"Store17" :[-90.335165, 38.750032],
"Store18" :[-122.260091, 37.852889],
"Store19" :[-122.259059, 37.868115],
"Store20" :[-90.335165, 38.750032],
"Store21" :[-122.269376, 37.859595],
"Store22" :[-122.258812, 37.866919],
"Store23" :[-122.300712, 37.870173],
"Store24" :[-122.252444, 37.851005],
"Store25" :[-122.258451, 37.864898],
"Store26" :[-122.295394, 37.849074],
"Store27" :[-122

In [11]:
# Generate coordinates in Berkeley that will represent households

import random

NE = [-122.293597,37.880772] #We generate GPS coordinates within a square in Berkeley. The vertices are the four coordinates: NE, NW, SW, SE. 
NW = [-122.249037,37.880772]
SW = [-122.249037,37.854968]
SE = [-122.293597,37.854968]

households=[]
for longitude in range(20):
    for latitude in range(20):
        long = random.uniform(NW[0],SE[0]) # West and East
        lat = random.uniform(SW[1],NE[1]) # North and South
        households.append("\"Household"+str(longitude)+str(latitude)+"\""+' :'+str([long,lat])+",")

for i in households:
    print(i)

"Household00" :[-122.26376938149168, 37.86108571768871],
"Household01" :[-122.2830880300976, 37.86983329920735],
"Household02" :[-122.29121427803769, 37.86537803701824],
"Household03" :[-122.27159147023738, 37.87838574212015],
"Household04" :[-122.29109864233564, 37.865012258139],
"Household05" :[-122.25947615376313, 37.87764904091175],
"Household06" :[-122.27497164622515, 37.85833914938892],
"Household07" :[-122.2894908458562, 37.85835310870042],
"Household08" :[-122.2720998306281, 37.872377623765466],
"Household09" :[-122.29117211695657, 37.870842228149144],
"Household010" :[-122.25208298631159, 37.86971376434944],
"Household011" :[-122.2629562658542, 37.87351173473143],
"Household012" :[-122.28657039523297, 37.86360066338412],
"Household013" :[-122.24966189254283, 37.86738672285685],
"Household014" :[-122.26986776023273, 37.88059398416967],
"Household015" :[-122.28773769701166, 37.862326755127974],
"Household016" :[-122.25744599359274, 37.87106449685595],
"Household017" :[-122.25994

These locations were programmed into the json file for globals.json in our HASH.AI simulation. These are the key locations we see on the map.