In [30]:
import requests
import pandas as pd
import numpy as np
# from datetime import datetime, timedelta
from json.decoder import JSONDecodeError
import folium
from copy import deepcopy
from shapely import wkt
from shapely.geometry import Point, Polygon
from shapely.ops import unary_union
from geopandas import GeoDataFrame
import ast

### Scraping Data

In [15]:
# now = datetime.now() # datetime object containing current date and time
# yday = now - timedelta(days = 1) # yesterday's date and time 
# now = str(now)[:10] + 'T' + str(now)[10:] # proper format for api
# yday = str(yday)[:10] + 'T' + str(yday)[10:] 
# now = now.replace(' ', '') # removing spaces
# yday = yday.replace(' ', '')

# starting_page_url = '/browse/feed?clientVersion=embed&highlights.start=' + now + 'Z&highlights.finish=' + yday + 'Z&highlights.limit=20&highlights.eventLimit=25&highlights.filterToListIds=true&bBox=-74.24986002359906,40.55458101981121,-73.68063089762502,40.886081435378856&events.occurrences.start=' + now + 'Z&events.occurrences.finish=' + yday + 'Z&events.occurrences.limit=100&events.limit=100&hasEventsOnly=false&hasEventPostsOnly=false&searchAgainst=venues&searchAgainst=events&venuesPerPage=50&eventOccurrencesPerAggregate=16&listsPerAggregate=9&openOnly=false'


In [3]:
# uploading council geo dataset

council_geographies = pd.read_csv('../data/output/council_geographies.csv')

In [4]:
# api components

base_url = 'https://consumer-api.liveapp.com/v4.0'
starting_page_url = '/browse/feed?clientVersion=embed&highlights.start=2024-03-29T01%3A29%3A28.464Z&highlights.finish=2024-04-28T01%3A29%3A28.464Z&highlights.limit=20&highlights.eventLimit=25&highlights.filterToListIds=true&bBox=-74.24986002359906%2C40.55458101981121%2C-73.68063089762502%2C40.886081435378856&events.occurrences.start=2024-03-29T01%3A29%3A28.464Z&events.occurrences.finish=2024-04-28T01%3A29%3A28.464Z&events.occurrences.limit=100&events.limit=100&hasEventsOnly=false&hasEventPostsOnly=false&searchAgainst=venues&searchAgainst=events&venuesPerPage=50&eventOccurrencesPerAggregate=16&listsPerAggregate=9&openOnly=false'

original_url = base_url + starting_page_url # first url

# empty df

scraped_df = pd.DataFrame()

In [5]:
# getting example of relative url for "next page," will serve as template to use for accessing further pages beyond the landing page

response = requests.get(original_url) # making api request
response_list = response.json()

next_page_template = response_list['nextPage']

In [6]:
# dividing NYC into smaller sections to gather all the data without reaching the api page limit (going region by region)

bottom_left = (-74.24986002359906, 40.55458101981121) # bottom-left coordinates of the bounding box (from original api for entire city)
top_right = (-73.68063089762502, 40.886081435378856)  # top-right coordinates of the bounding box (from original api for entire city)
rows = 30 # number of rows to divide the bounding box into
cols = 30 #number of columns to divide the bounding box into

def subdivide_bounding_box(bottom_left, top_right, rows, cols):
    """
    Subdivide a bounding box into smaller boxes of equal size.

    Args:
    - bottom_left (tuple): Bottom-left coordinates of the bounding box (x, y).
    - top_right (tuple): Top-right coordinates of the bounding box (x, y).
    - rows (int): Number of rows to divide the bounding box into.
    - cols (int): Number of columns to divide the bounding box into.

    Returns:
    - list of tuples: List of smaller bounding boxes defined by top-left and bottom-right coordinates.
    """
    x1, y1 = bottom_left
    x2, y2 = top_right

    width = (x2 - x1) / cols
    height = (y2 - y1) / rows

    boxes = []

    for i in range(rows):
        for j in range(cols):
            # Calculate coordinates for each sub-box
            sub_box_bottom_left = (y1 + (rows - i - 1) * height, x1 + j * width)
            sub_box_top_right = (y1 + (rows - i) * height, x1 + (j + 1) * width)
            boxes.append((sub_box_bottom_left, sub_box_top_right))

    return boxes

nyc_sub_boxes = subdivide_bounding_box(bottom_left, top_right, rows, cols)

In [7]:
def visualize_bounding_boxes_on_map(boxes):
    """
    Visualize bounding boxes on a map of New York City using Folium.

    Args:
    - boxes (list of tuples): List of bounding boxes defined by top-left and bottom-right coordinates.
    """
    # create a map centered around New York City
    nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)

    # add bounding boxes to the map
    for box in boxes:
        # extract coordinates
        (lat1, lon1), (lat2, lon2) = box
        # create a rectangle as a Folium rectangle marker
        folium.Rectangle(bounds=[(lat1, lon1), (lat2, lon2)], color='red', fill=False).add_to(nyc_map)

    return nyc_map

visualize_bounding_boxes_on_map(nyc_sub_boxes)

In [8]:
# creating a list of URLs that pull storefront data for each bbox 

orig_bbox = '-74.24986002359906%2C40.55458101981121%2C-73.68063089762502%2C40.886081435378856' # bbox for all of NYC contained in the original url

def url_maker(base_url, starting_page_url, next_page_url, sub_boxes, orig_bbox):
    
    """
    Create a dictionary with live.xyz URLs that pull data from different geographical bounding boxes

    Args:
    - base_url (str): The base url to access the live.xyz api
    - starting_page_url (str): The relative url that takes us to the landing page for a given bbox region
    - next_page_url (str): The relative url that takes us to the pages beyond the landing page (using page=# in url)
    - sub_boxes (list of tuples): List of sub bboxes that comprise NYC
    - orig_bbox (str): The bbox as found in the starting URL (so that it can be cut from the original link and replaced)

    Returns:
    - dict: Dictionary of live.xyz URLs with unique IDs
    """

    final_urls = {} # list for final URLs
    bbox_num = 0 # for labeling final dict

    for box in sub_boxes:

        # extracting new coordinates for bbox

        lat1 = str(box[0][0])
        lon1 = str(box[0][1])

        lat2 = str(box[1][0])
        lon2 = str(box[1][1])

        regional_bbox = lon1 + '%2C' + lat1 + '%2C' + lon2 + '%2C' + lat2 # creating url string version of new coordinates

        regional_starting_page = starting_page_url.replace(orig_bbox, regional_bbox) # replacing original bbox string with regional string
        regional_next_page = next_page_url.replace(orig_bbox, regional_bbox) 

        regional_starting_page_final = base_url + regional_starting_page
        regional_next_page_final = base_url + regional_next_page

        bbox_num += 1 # creating ID for each bbox

        final_urls[bbox_num] = [regional_starting_page_final, regional_next_page_final] # adding to running list
    
    return final_urls

api_dict = url_maker(base_url, starting_page_url, next_page_template, nyc_sub_boxes, orig_bbox)


In [None]:
def pull_storefronts(api_dict, scraped_df):

        """
        Create a dataframe with storefront data pulled from live.xyz API

        Args:
        - api_dict (dict): dictionary of live.xyz URLs with unique IDs
        - scraped_df (dataframe): empty dataframe

        Returns:
        - dataframe: with storefront data
        """

        # 1) ITERATING THROUGH EACH BBOX REGION

        for key in api_dict.keys(): # iterating through URLs for each bbox 

                landing_page_api = api_dict[key][0] # accessing landing page api for given bbox

                # creating first rows of the df

                response = requests.get(landing_page_api) # making api request
                response_list = response.json()

                if len(response_list['items']) > 0: # if this list is not empty

                        try:
                                # 2) ITERATING THROUGH EACH STOREFRONT ON THE LANDING PAGE

                                for storefront in response_list['items']: # each item is a dict

                                        # adding entries to dict (will become columns in scraped_df)

                                        storefront['value']['pageURL'] = landing_page_api # recording the link that was used to access this data
                                        storefront['value']['nextPage'] = api_dict[key][1] # recording the link that will be used in next loop
                                        storefront['value']['bbox'] = key # to keep track of what bbox region the storefront is in

                                        # adding new row to scraped_df 

                                        new_row = pd.DataFrame([storefront['value']])
                                        scraped_df = pd.concat([scraped_df, new_row], ignore_index=True) # adding dict values as new row in df

                        except (TypeError, KeyError, JSONDecodeError) as e: # passing by any errors (no errors are expected)

                                print("An error occurred in the first loop:", e)
                                print(key, ':', landing_page_api) # to view the page with the error to see if there's cause for concern

                                break

                else: # if list empty, this means this region has no storefronts on live.xyz (most likely covers a solely non-NYC area), so should be skipped

                        continue # proceeds to next item in for loop

                next_page_url = api_dict[key][1] # using pre-generated template of the url that accesses pages with subsequent batches of venues after landing page (template drawn from 'nextPage' entry found on landing pages)
                page = 1 # starting at page 1 (parameter in the nextPage url is -> page=1)
                # total_venues = response_list['venueCount'] # total venues on the site

                # 3) ITERATING THROUGH EACH PAGE THAT COMES AFTER THE LANDING PAGE IN GIVEN BBOX REGION 

                while len(scraped_df) < 999999: # will continue in loop until number of pages in the api runs out, triggering break in the except statement caused by JSONDecodeError
                # while len(scraped_df) < total_venues: # continue loop until all venues are scraped

                        try:

                                this_page_url = next_page_url
                                # final_url = base_url + this_page_url

                                response = requests.get(this_page_url) # making api request
                                response_list = response.json()

                                this_page_string = 'page=' + str(page) # current page string
                                page+=1 # preparing to visit the next page
                                next_page_string = 'page=' + str(page) # next page string
                                next_page_url = this_page_url.replace(this_page_string, next_page_string) # replacing current page with next page in URL

                                # 4) ITERATING THROUGH EACH STOREFRONT ON A GIVEN PAGE

                                for storefront in response_list['items']:  # each item is a dict

                                        # adding entries to dict (will become columns in scraped_df)

                                        storefront['value']['pageURL'] = this_page_url # recording the link that was used to access this data
                                        storefront['value']['nextPage'] = next_page_url # recording the link that will be used in next loop
                                        storefront['value']['bbox'] = key # to keep track of what bbox region the storefront is in

                                        # adding new row to scraped_df

                                        new_row = pd.DataFrame([storefront['value']])
                                        scraped_df = pd.concat([scraped_df, new_row], ignore_index=True) # adding dict values as new row in df

                        except (TypeError, KeyError, JSONDecodeError) as e: # expecting JSONDecodeError when number of pages runs out for each region

                                # print(key)

                                if not isinstance(e, JSONDecodeError): # not expecting any other errors, so will investigate if these come up

                                        print("An error occurred in the second loop:", e)
                                        print(key, ':', this_page_url) 
                                
                                break # exit the inner loop and continue to the next iteration of the outer loop

        return scraped_df


storefronts_df = pull_storefronts(api_dict, scraped_df)

# storefronts_df.to_csv('../data/output/scraped_storefronts_livexyz.csv')


In [220]:
# so don't have to rerun code

storefronts_df = pd.read_csv('../data/scraped_storefronts_livexyz.csv').drop(columns=['Unnamed: 0'])

  storefronts_df = pd.read_csv('../data/scraped_storefronts_livexyz.csv').drop(columns=['Unnamed: 0'])


### Data Cleaning

In [201]:
# Rerun orginal df and compare... why are some rows off? Does it stem from original df or is it from the saving process?
# Can remove by sorting for IDs that contains '-'

# storefronts_df_drop_dups.loc[83209]

In [255]:
# cleaning 

# editing df columns

storefronts_df_drop_dups = storefronts_df.drop_duplicates(subset=['id'], keep='first') # dropping duplicate IDs
storefronts_df_drop_dups = storefronts_df_drop_dups[~storefronts_df_drop_dups['id'].str.contains('-')].reset_index().drop(columns=['index']) # dropping weird errors where all the column values are shifted
storefronts_df_drop_dups['name'] = storefronts_df_drop_dups['name'].replace('**Vacant', 'Vacant') # cleaning names
storefronts_df_drop_dups = storefronts_df_drop_dups.reset_index().drop(columns=['index']) # reset index
storefronts_df_drop_dups['vacant'] = np.where(storefronts_df_drop_dups['name'].isin(['Vacant']), 'Yes', 'No') # creating vacancy column
storefronts_df_drop_dups['localizedAddress'] = storefronts_df_drop_dups['localizedAddress'].apply(lambda x: x if isinstance(x, float) else ast.literal_eval(x)) # converting to list from string, converting floats (NaN) to empty dict
# storefronts_df_drop_dups['localizedAddress'] = storefronts_df_drop_dups['localizedAddress'].apply(lambda x: x if (isinstance(x, float) or 'https' in x) else ast.literal_eval(x)) # converting to list from string, passing over floats (NaN)
storefronts_df_drop_dups = storefronts_df_drop_dups.assign(localizedAddress=storefronts_df_drop_dups['localizedAddress'].apply(lambda l: ' '.join(l) if isinstance(l, list) else l)) # cleaning addresses

# extracting coordinates from dicts in this column

unknown_dict_geom = {'type': np.nan, 'coordinates': np.nan} # replace NaN values with empty dict to avoid error when creating new DF 
storefronts_df_drop_dups['geometry'] = storefronts_df_drop_dups['geometry'].apply(lambda x: unknown_dict_geom if isinstance(x, float) else ast.literal_eval(x)) # converting to dict from string, converting floats (NaN) to empty dict
# storefronts_df_drop_dups['geometry'] = storefronts_df_drop_dups['geometry'].apply(lambda x: unknown_dict_geom if (isinstance(x, float) or isinstance(x, list)) else x) # converting floats and lists to empty dict
geom=pd.DataFrame(data=storefronts_df_drop_dups['geometry'].tolist()) # converting dict keys to columns
storefronts_df_drop_dups=pd.concat((storefronts_df_drop_dups,geom),axis=1) # adding columns to original df
storefronts_df_drop_dups=storefronts_df_drop_dups.drop(columns=['type','geometry']) # dropping unneeded columns
storefronts_df_drop_dups['longitude'] = storefronts_df_drop_dups['coordinates'].str[0]
storefronts_df_drop_dups['latitude'] = storefronts_df_drop_dups['coordinates'].str[1] # extracting lat and lon
storefronts_df_drop_dups.drop_duplicates(subset=(['name','latitude', 'longitude'])) # dropping duplicates of stores with same name and location

# extracting primary tags from dicts in this column

unknown_dict_tag = {'id': np.nan, 'color': np.nan, 'name': np.nan, 'type': np.nan, 'count': np.nan, 'searchable': np.nan} # replace NaN values with empty dict to avoid error when creating new DF 
storefronts_df_drop_dups['primaryTag'] = storefronts_df_drop_dups['primaryTag'].apply(lambda x: unknown_dict_tag if isinstance(x, float) else ast.literal_eval(x)) # converting to dict from string, converting floats (NaN) to empty dict
# storefronts_df_drop_dups['primaryTag'] = storefronts_df_drop_dups['primaryTag'].apply(lambda x: unknown_dict_tag if (isinstance(x, float) or isinstance(x, int)) else x) # converting floats and ints to empty dict
tag=pd.DataFrame(data=storefronts_df_drop_dups['primaryTag'].tolist()) # converting dict keys to columns
tag = tag.rename(columns={'name':'tag', 'id': 'drop_id'}) # so no repeats of column names
storefronts_df_drop_dups=pd.concat((storefronts_df_drop_dups,tag),axis=1) # adding columns to original df
storefronts_df_drop_dups=storefronts_df_drop_dups.drop(columns=['drop_id', 'color', 'type', 'count', 'searchable', 'primaryTag']) # dropping unneeded columns
storefronts_df_drop_dups=storefronts_df_drop_dups.rename(columns={'tag':'primaryTag'}) # returning to original column name

# extracting link to picture of storefronts from dicts in this column

unknown_dict_media_part1 = {'items': np.nan, 'count': np.nan} # replace NaN values with empty dict to avoid error when creating new DF 
storefronts_df_drop_dups['media'] = storefronts_df_drop_dups['media'].apply(lambda x: unknown_dict_media_part1 if isinstance(x, float) else ast.literal_eval(x)) # converting to dict from string, converting floats (NaN) to empty dict
media=pd.DataFrame(data=storefronts_df_drop_dups['media'].tolist()) # converting dict keys to columns
media['items'] = media['items'].str[0] # extracting dict from list
unknown_dict_media_part2 = {'id': np.nan, 'url': np.nan, 'tagIds': np.nan} # replace NaN values with empty dict to avoid error when creating new DF 
media['items'] = media['items'].apply(lambda x: unknown_dict_media_part2 if isinstance(x, float) else x) # replacing NaN
media=pd.DataFrame(data=media['items'].tolist()) # converting dict keys to columns
media=media.rename(columns={'id':'drop_id'}) # so no repeats of column names
storefronts_df_drop_dups=pd.concat((storefronts_df_drop_dups,media),axis=1) # adding columns to original df
storefronts_df_drop_dups=storefronts_df_drop_dups.drop(columns=['drop_id', 'tagIds', 'media']) # dropping unneeded columns
storefronts_df_drop_dups=storefronts_df_drop_dups.rename(columns={'url':'media'}) # returning to original column name

# polished df

storefronts_df_cleaned = storefronts_df_drop_dups[['id','name', 'localizedAddress', 'genre', 'primaryTag', 'spaceStatusType', 'vacant', 'statusType', 'media', 'latitude', 'longitude']]

# storefronts_df_cleaned.to_csv('../data/output/storefronts_data_cleaned.csv')

In [269]:
storefronts_df_cleaned['genre'].value_counts()

genre
Misc              25826
Food              22533
Essentials        15960
Services          13284
Municipal         11366
Body              10369
Groups             9578
Home & Hobby       9489
Fashion            7100
Auto               4136
Drinks             2495
Fitness            1368
Arts & Culture      934
Lodging             768
Entertainment       564
Transport           557
Parks & Rec         116
Name: count, dtype: int64

In [263]:
sorted(storefronts_df_cleaned[storefronts_df_cleaned['genre'].notnull()]['genre'].unique())

['Arts & Culture',
 'Auto',
 'Body',
 'Drinks',
 'Entertainment',
 'Essentials',
 'Fashion',
 'Fitness',
 'Food',
 'Groups',
 'Home & Hobby',
 'Lodging',
 'Misc',
 'Municipal',
 'Parks & Rec',
 'Services',
 'Transport']

In [270]:
sorted(storefronts_df_cleaned[storefronts_df_cleaned['genre'] == 'Misc']['primaryTag'].unique())

['Accounting Firm',
 'Advertising Agency',
 'Architect',
 'Armory',
 'Army Base',
 'Bail Bondsman',
 'Book Binding Shop',
 'Bookbinding Shop',
 'Branding Agency',
 'Business Center',
 'Business Office',
 'Business Services',
 'Business, Commercial & Professional Services',
 'Cable Provider Store',
 'Campaign Office',
 'Catering Company',
 'Cemeteries & Funeral Homes',
 'Certified Public Accountant',
 'Chamber Of Commerce',
 'Chemical Manufacturer',
 'City Hall',
 'Cleaning Company',
 'Clothing Store Supply Store',
 'Coffee Roasters',
 'Concrete Company',
 'Construction Company',
 'Construction Site',
 'Consulting Firm',
 'Councilman’s Office',
 'Courthouse',
 'Customer Service Office',
 'Delivery Company',
 'Department Of Education',
 'Department of Health',
 'Design Studio',
 'Distribution Company',
 'E-Commerce Company',
 'Electric Utility Company',
 'Electrician',
 'Employment Agency',
 'Environmental Services Company',
 'Event Equipment Rental Company',
 'Event Planning Specialist'

In [None]:
# taking a look at distribution of stores around NYC

# map specifications
zoom = 9.8
lat = 40.706000
lon = -73.976300

# create a Folium map centered around New York City
m = folium.Map(location=[lat, lon], zoom_start=zoom, tiles='cartodbpositron')

# add the council boundary lines as layers to the map
council_style = {'color': '#CACACA', 'weight': '4'}
for idx, row in council_geographies.iterrows():
    folium.GeoJson(data=row['the_geom'], style_function=lambda x: council_style).add_to(m)
    
# define popup columns and their aliases
popup_mapping = {'name': 'Name', 
                 'localizedAddress': 'Address', 
                 'genre': 'Genre', 
                 'primaryTag': 'Primary Tag',
                 'spaceStatusType': 'Status'}

# function to set color based on condition
def set_circle_color(vacancy_status):
    if vacancy_status == 'Yes':
        return '#B63F26'  # change color for vacant properties
    else:
        return '#1D5FD6'  # default color for other properties

# create markers and add them to the map
for index, location_info in storefronts_df_cleaned[storefronts_df_cleaned['latitude'].notnull()].iterrows():
    # Determine circle color based on vacancy status
    circle_color = set_circle_color(location_info['vacant'])
    popup_content = '<br>'.join([f"<b>{popup_mapping[col]}:</b> {location_info[col]}" for col in popup_mapping.keys()])
    folium.Marker(
        [location_info['longitude'], location_info['latitude']], 
        popup=popup_content,
        opacity=0.5,
        icon=folium.DivIcon(html=f"""
            <div><svg>
                <circle cx="2" cy="2" r="2" fill="{circle_color}" opacity="1"/>
            </svg>< /div>""")
    ).add_to(m)


# define legend HTML content
legend_html = """
    <div style="font-family: Georgia; font-size: 13pt; position: fixed; bottom: 50px; left: 50px; background-color: white; padding: 10px; border-radius: 5px; z-index: 1000;">
        <p><b>Legend</b></p>
        <p style="font-size: 10pt;"><svg width="20" height="20"><circle cx="10" cy="10" r="5" fill="#B63F26" opacity="1"/></svg> <span style="font-size: 11pt;">Vacant</span></p>
        <p style="font-size: 10pt;"><svg width="20" height="20"><circle cx="10" cy="10" r="5" fill="#1D5FD6" opacity="1"/></svg> <span style="font-size: 11pt;">Occupied</span></p>
    </div>
"""

# add legend as a control to the map
m.get_root().html.add_child(folium.Element(legend_html))

# title
text = "Business Corridor: 86th Street between 18th Ave and 25th Ave"
text_html = f'<div style="font-family: Georgia; position: absolute; z-index: 10000; bottom: 180px; left: 48px; max-width: 350px; font-size: 16pt" ><b>{text}</b></div>'
m.get_root().html.add_child(folium.Element(text_html))

map_name = '../visuals/storefonts_nyc-wide.html'
    
m.save(map_name)

m

In [249]:
# finding vacancy rate for NYC

vacancy_rate = round((100*len(storefronts_df_drop_dups[storefronts_df_drop_dups['name'].isin(['Vacant', '**Vacant'])]) / len(storefronts_df_drop_dups)),2)

print('The NYC storefront vacancy rate as of is', vacancy_rate, '%')

The NYC storefront vacancy rate as of is 10.3 %
