In [1]:
# Ignore unimportant system warnings
import warnings
warnings.filterwarnings("ignore")

# Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import usaddress
import os

In [2]:
# Download the studios/stages listing page and parse it for the scraping step.
url = "https://www1.nyc.gov/site/mome/industries/studios-stages.page"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an empty table.
req.raise_for_status()
soup = BeautifulSoup(req.text, "lxml")

In [3]:
#get addresses from each table row
facilities = []

content = soup.find_all('tr')
# The first <tr> is the table header, so iteration starts at the second row.
for row in content[1:]:
    cells = row.find_all('td')
    # A row with no <td> cells has nothing to extract; indexing it would raise IndexError.
    if not cells:
        continue

    cell = cells[0]
    facility = cells[-1].text

    # The name is read from the first <strong> tag, falling back to the first <a> tag.
    name_tag = cell.find('strong')
    if name_tag is None:
        name_tag = cell.find('a')
    if name_tag is not None:
        title = name_tag.text
    else:
        # Neither tag is present: use the cell's first text fragment as the name
        # (empty string when the cell holds no text at all) instead of crashing.
        title = next(cell.stripped_strings, '')

    # Join the cell's text fragments with commas, then remove the first occurrence
    # of the name and the first comma so that the remaining text is the address.
    text = cell.get_text(separator = ',', strip = True)
    address = text.replace(title,'', 1)
    address = address.replace(',','', 1)
    facilities.append([title,address,facility])

In [4]:
# Turn the scraped [name, address, type] rows into a table and preview the first rows.
facilities_df = pd.DataFrame(data=facilities, columns=['Name', 'Address', 'Type'])
facilities_df.head(n=5)

Unnamed: 0,Name,Address,Type
0,1717 Troutman,"1717 Troutman Street, #300,Ridgewood, NY 11385",Not applicable
1,The 1896 Studio & Stages,"592 Johnson Avenue/,211-215 Ingraham Avenue,Br...",Level 1
2,19th Avenue Stage,"19-02 Steinway Street,Astoria, NY 11105",Level 2
3,718 Studios,"130 Thames Street,Brooklyn, NY 11247,718studio...",Not applicable
4,94 Jewel St.,"94 Jewel Street, Ground level,Brooklyn, NY 112...",Not applicable


In [36]:
# Prefer the saved CSV when it exists. It is only written by the geocoding cell
# further down, so on a first run (fresh kernel, no file yet) fall back to the
# freshly scraped table instead of failing with FileNotFoundError.
if os.path.exists('facilities_df_geocoded.csv'):
    facilities_df2 = pd.read_csv('facilities_df_geocoded.csv')
else:
    facilities_df2 = facilities_df.copy()

In [32]:
#clean up addresses before geocoding
def cleanAddreses(row):
    """Reduce a facility's free-form address to the parts passed to the geocoder.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'Address' entry holding the raw address text.

    Returns
    -------
    str
        The house number, street pre-direction, street name and street type
        (whichever of these usaddress found), followed by ' , <zip>' when a zip
        code was found. Every kept part is preceded by one space, so a non-empty
        result starts with a space (e.g. ' 55 Broadway , 10007').
        Returns '' when the address is missing, blank, or cannot be tagged.
    """
    addr = row['Address']
    # Missing values (None / NaN) and blank strings cannot be parsed; returning ''
    # lets the geocoding step skip them instead of querying for a lone comma.
    if not isinstance(addr, str) or not addr.strip():
        return ''

    # pass to usaddress; if tagging fails use blank
    try:
        addr_tags = usaddress.tag(addr)[0]
    except Exception:
        # Best effort: any tagging failure yields a blank address. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt / SystemExit.
        return ''

    # Street-level parts, in the order they should appear in the result.
    street_tags = ['AddressNumber', 'StreetNamePreDirectional', 'StreetName', 'StreetNamePostType']
    parts = [addr_tags[tag] for tag in street_tags if tag in addr_tags]

    # The comma only separates the street from the zip code, so it is added
    # together with the zip code rather than left dangling when there is none.
    if 'ZipCode' in addr_tags:
        parts += [',', addr_tags['ZipCode']]

    return ''.join(f' {part}' for part in parts)
        
# Compute the cleaned address for every row (one call per row) and show the new column.
facilities_df2['_Address'] = facilities_df2.apply(cleanAddreses, axis='columns')

facilities_df2['_Address']

0        1717 Troutman Street , 11385
1          592 Johnson Avenue , 11237
2       19-02 Steinway Street , 11105
3           130 Thames Street , 11247
4             94 Jewel Street , 11222
                    ...              
124         21-10 51st Avenue , 11101
125           3905 2nd Avenue , 11232
126      320 West 66th Street , 10023
127      300 Kingsland Avenue , 11234
128               55 Broadway , 10007
Name: _Address, Length: 129, dtype: object

In [38]:
# Preview the first five rows, including the derived _Address column.
facilities_df2.head(n=5)

Unnamed: 0,Name,Address,Type,_Address,lat,lng,bbl
0,1717 Troutman,"1717 Troutman Street, #300,Ridgewood, NY 11385",Not applicable,"1717 Troutman Street , 11385",,,
1,The 1896 Studio & Stages,"592 Johnson Avenue,Brooklyn, NY 11237",Level 1,"592 Johnson Avenue , 11237",,,
2,19th Avenue Stage,"19-02 Steinway Street,Astoria, NY 11105",Level 2,"19-02 Steinway Street , 11105",,,
3,718 Studios,"130 Thames Street,Brooklyn, NY 11247",Not applicable,"130 Thames Street , 11247",,,
4,94 Jewel St.,"94 Jewel Street, Ground level,Brooklyn, NY 11222",Not applicable,"94 Jewel Street , 11222",,,


In [57]:
# Reuse the saved geocoding results when the CSV is present (this keeps any manual
# fixes made to it); otherwise seed the working table from facilities_df2 and
# write it to disk right away.
file = 'facilities_df_geocoded.csv'
if not os.path.exists(file):
    facilities_df_geocoded = facilities_df2.copy()
    facilities_df_geocoded.to_csv(file, index = False)
    print('facilities_df_geocoded generated')
else:
    facilities_df_geocoded = pd.read_csv(file)
# To ignore the saved file and start over: facilities_df_geocoded = facilities_df2.copy()

geocoded = []  # kept from the original cell; not referenced by the code visible here
def geocode(row):
    """Look up coordinates and BBL for one facility row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide '_Address'. May provide 'lat', 'lng' and 'bbl' from an
        earlier run; those values are carried through unchanged when no lookup
        is made.

    Returns
    -------
    pandas.Series
        Three values in the order [lat, lng, bbl]. Values that are neither
        present in the row nor found by the lookup are NaN.

    Notes
    -----
    A lookup is made only when 'bbl' is missing and '_Address' is non-empty.
    A failed request is reported with print() and leaves the row's values
    as they were, so the row is attempted again on the next run.
    """
    # Start from the values already in the row: rows geocoded on a previous run
    # must keep their coordinates instead of being reset to NaN.
    lat = row['lat'] if 'lat' in row else np.nan
    lng = row['lng'] if 'lng' in row else np.nan
    bbl = row['bbl'] if 'bbl' in row else np.nan
    addr = row['_Address']

    #only geocode if bbl is blank (NA) and addr is not empty
    if pd.isna(bbl) and (addr != '' and pd.notna(addr)):
        print(bbl, addr)

        #geocode!
        try:
            # Passing the address via `params` URL-encodes it, so characters
            # such as '#' or '&' cannot truncate or split the query string.
            r = requests.get(
                'https://geosearch.planninglabs.nyc/v1/search',
                params={'text': addr, 'size': 120},
                timeout=30,
            )
            r.raise_for_status()
            features = r.json().get('features') or []
        except (requests.RequestException, ValueError) as err:
            # Network/HTTP failures and non-JSON bodies: skip this row rather
            # than abort the whole run before results are saved.
            print(f'geocoding failed for {addr!r}: {err}')
            features = []

        if features:
            # Only the first returned feature is used.
            feature = features[0]
            # The coordinates pair is ordered longitude first, then latitude.
            lng, lat = feature['geometry']['coordinates']
            bbl = feature['properties'].get('pad_bbl', np.nan)

    return pd.Series([lat, lng, bbl])
    

# Run geocode on every row; the three values it returns are assigned, in order,
# to the lat / lng / bbl columns.
facilities_df_geocoded[['lat', 'lng', 'bbl']] = facilities_df_geocoded.apply(geocode, axis='columns')

# Write the table back to the CSV that the start of this cell reads from.
facilities_df_geocoded.to_csv('facilities_df_geocoded.csv', index=False)

nan  1717 Troutman Street , 11385
nan  592 Johnson Avenue , 11237
nan  19-02 Steinway Street , 11105
nan  130 Thames Street , 11247
nan  94 Jewel Street , 11222
nan  630 Flushing Avenue , 11206
nan  221 West 26th Street , 10001
nan  123 Bowery , 10002
nan  81 Walker Street , 10013
nan  570 Lexington Avenue , 10022
nan  524 Broadway , 10012
nan  11-05 44th Road , 11101
nan  268 Mulberry Street , 10012
nan  52 Bridge Street , 11201
nan  42-24 Ninth Street , 11101
nan 152 West 25th Street, 10001
nan  1298-1304 Willoughby Ave , 11237
nan  352 Troutman Street , 11237
nan  102-120 Ingraham Street , 11237
nan  1028 Grand Street , 11237
nan  272 Seigel Street , 11206
nan  229 West 28th Street , 10001
nan  445 Albee Square , 11201
nan  457 Broome Street , 10013
nan  18 West 21st Street , 10010
nan  34-31 10th Street , 11106
nan  235 Bond Street , 11217
nan 40 West 27th Street, 10001
nan  20 Brick Ct , 10309
nan  203 Meserole Ave , 11222
nan  259 Greene Street , 11222
nan  47-60 29th Street ,
na