# Enriching NYC Airbnb Dataset

## WEB SCRAPING

In [65]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import src.enriching_utils as eu

In [66]:
# Wikipedia page listing the National Historic Landmarks in New York City (scraping target)
url = "https://en.wikipedia.org/wiki/List_of_National_Historic_Landmarks_in_New_York_City"

In [67]:
# Download the page. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops the run on an HTTP error instead of
# silently parsing an error page in the following cells.
html = requests.get(url, timeout=30)
html.raise_for_status()

In [68]:
# Parse the raw response bytes with Python's built-in "html.parser" backend
soup = BeautifulSoup(html.content, "html.parser")

In [69]:
# The information we need lives in a table, so collect every <table> on the
# page first and then decide which one to use
tablas = soup.find_all("table")

In [70]:
# How many tables does the page contain?
len(tablas)

5

In [71]:
# Work with the first table found on the page
monumento = tablas[0]

In [72]:
# Probe the links inside the table: the anchor at index 2 turned out to be the
# link of the first listed landmark (see the output below)
elemento = monumento.find_all("a")[2]
elemento

<a href="/wiki/69th_Regiment_Armory" title="69th Regiment Armory">69th Regiment Armory</a>

In [73]:
# The visible label of the link is the landmark name
elemento.text

'69th Regiment Armory'

In [74]:
# Landmark table: first entry of the table list collected above
monumentos = tablas[0]

In [75]:
# Inspect the table row at index 1, which is the first landmark entry
# (row 0 is presumably the header row; confirm against the page)
monumentos.find_all("tr")[1]

<tr class="vcard">
<th style="background-color: #87CEEB;"><small>1</small>
</th>
<td><span class="mapframe-coord-name"><a href="/wiki/69th_Regiment_Armory" title="69th Regiment Armory">69th Regiment Armory</a></span>
</td>
<td><div class="center"><div class="floatnone"><a class="image" href="/wiki/File:69th-regiment-armory.JPG" title="69th Regiment Armory"><img alt="69th Regiment Armory" class="thumbborder" data-file-height="2946" data-file-width="3464" decoding="async" height="102" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/69th-regiment-armory.JPG/120px-69th-regiment-armory.JPG" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/d5/69th-regiment-armory.JPG/180px-69th-regiment-armory.JPG 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/d5/69th-regiment-armory.JPG/240px-69th-regiment-armory.JPG 2x" width="120"/></a></div></div><div class="noprint" style="text-align:center;"><a class="image" href="/wiki/File:Commons-logo.svg"><img alt="" class="noviewer" data-f

In [76]:
# The first <td> of a row holds the landmark name inside a link
monumentos.find_all("tr")[1].find_all("td")[0].find("a").text.strip()

'69th Regiment Armory'

In [77]:
# The fourth <td> holds the locality followed by the coordinates, all as one string
monumentos.find_all("tr")[1].find_all("td")[3].text

'Manhattan 40°44′30″N 73°59′01″W\ufeff / \ufeff40.741648°N 73.983607°W\ufeff / 40.741648; -73.983607\ufeff (69th Regiment Armory)\n'

In [78]:
# Walk every table row and build one record per landmark:
#   "Monument" -> landmark name (text of the link in the first cell)
#   "Borough"  -> raw text of the fourth cell (locality + coordinates; it is
#                 split into separate columns in a later cell)
mon_barr = []
for row in monumentos.find_all("tr"):
    cells = row.find_all("td")
    # Rows without <td> cells (e.g. a header made of <th>) and rows too short
    # to contain the fourth cell cannot be parsed, so skip them instead of
    # raising IndexError.
    if len(cells) < 4:
        continue
    link = cells[0].find("a")
    # Fall back to the plain cell text when the name is not wrapped in a link
    name_source = link if link is not None else cells[0]
    mon_barr.append({
        "Monument": name_source.text.strip(),
        "Borough": cells[3].text,
    })

In [79]:
# Turn the list of per-landmark records into a DataFrame and display it
data_monument = pd.DataFrame.from_records(mon_barr)
data_monument

Unnamed: 0,Monument,Borough
0,69th Regiment Armory,Manhattan 40°44′30″N 73°59′01″W﻿ / ﻿40.741648°...
1,Admiral David Glasgow Farragut Gravesite,Bronx 40°53′32″N 73°51′57″W﻿ / ﻿40.892165°N 73...
2,African Burial Ground,Manhattan 40°42′52″N 74°00′16″W﻿ / ﻿40.714558°...
3,Ambrose (lightship),Manhattan 40°42′17″N 74°00′09″W﻿ / ﻿40.704844°...
4,American Stock Exchange,Manhattan 40°42′32″N 74°00′45″W﻿ / ﻿40.7090°N ...
...,...,...
111,Wards Point Archeological Site,Tottenville 40°29′56″N 74°15′07″W﻿ / ﻿40.49888...
112,Woodlawn Cemetery,Bronx 40°53′21″N 73°52′24″W﻿ / ﻿40.889167°N 73...
113,Woolworth Building,Manhattan 40°42′44″N 74°00′29″W﻿ / ﻿40.712222°...
114,Wyckoff House,Brooklyn 40°38′40″N 73°55′15″W﻿ / ﻿40.644342°N...


In [80]:
# Split the raw cell text at the first space: the leading word becomes
# "Borough" and the remainder (coordinates) becomes "Location".
# `n` is passed by keyword because recent pandas versions no longer accept it
# positionally.
data_monument[['Borough','Location']] = data_monument["Borough"].str.split(" ", n=1, expand=True)
data_monument.head()

Unnamed: 0,Monument,Borough,Location
0,69th Regiment Armory,Manhattan,40°44′30″N 73°59′01″W﻿ / ﻿40.741648°N 73.98360...
1,Admiral David Glasgow Farragut Gravesite,Bronx,40°53′32″N 73°51′57″W﻿ / ﻿40.892165°N 73.86586...
2,African Burial Ground,Manhattan,40°42′52″N 74°00′16″W﻿ / ﻿40.714558°N 74.00438...
3,Ambrose (lightship),Manhattan,40°42′17″N 74°00′09″W﻿ / ﻿40.704844°N 74.00246...
4,American Stock Exchange,Manhattan,40°42′32″N 74°00′45″W﻿ / ﻿40.7090°N 74.0126°W﻿...


In [81]:
# Check the result of the split. Keeping only the first word apparently
# truncates multi-word localities (note entries such as "New", "Van", "Kew"
# and "Sailors'" in the counts below).
data_monument["Borough"].value_counts(dropna=False)

Manhattan       84
Brooklyn        10
Bronx            5
Columbia         3
Tottenville      2
Kew              1
Jamaica          1
Richmondtown     1
Flushing         1
Pelham           1
Rosebank         1
Midtown          1
New              1
Harlem           1
Van              1
Sailors'         1
Corona           1
Name: Borough, dtype: int64

In [82]:
# Normalise the Borough values with the project helper eu.cleaning.
# NOTE(review): judging by the counts shown after this step, the helper keeps
# Manhattan, Brooklyn and Bronx and maps everything else to "other" - confirm
# in src/enriching_utils.
data_monument["Borough"] = data_monument["Borough"].apply(eu.cleaning)
data_monument

Unnamed: 0,Monument,Borough,Location
0,69th Regiment Armory,Manhattan,40°44′30″N 73°59′01″W﻿ / ﻿40.741648°N 73.98360...
1,Admiral David Glasgow Farragut Gravesite,Bronx,40°53′32″N 73°51′57″W﻿ / ﻿40.892165°N 73.86586...
2,African Burial Ground,Manhattan,40°42′52″N 74°00′16″W﻿ / ﻿40.714558°N 74.00438...
3,Ambrose (lightship),Manhattan,40°42′17″N 74°00′09″W﻿ / ﻿40.704844°N 74.00246...
4,American Stock Exchange,Manhattan,40°42′32″N 74°00′45″W﻿ / ﻿40.7090°N 74.0126°W﻿...
...,...,...,...
111,Wards Point Archeological Site,other,40°29′56″N 74°15′07″W﻿ / ﻿40.498889°N 74.25194...
112,Woodlawn Cemetery,Bronx,40°53′21″N 73°52′24″W﻿ / ﻿40.889167°N 73.87333...
113,Woolworth Building,Manhattan,40°42′44″N 74°00′29″W﻿ / ﻿40.712222°N 74.00805...
114,Wyckoff House,Brooklyn,40°38′40″N 73°55′15″W﻿ / ﻿40.644342°N 73.92077...


In [93]:
# The coordinates are not needed downstream, so drop the "Location" column.
# errors="ignore" keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
drop_cols = ["Location"]
data_monument = data_monument.drop(columns=drop_cols, errors="ignore")
data_monument

Unnamed: 0,Monument,Borough
0,69th Regiment Armory,Manhattan
1,Admiral David Glasgow Farragut Gravesite,Bronx
2,African Burial Ground,Manhattan
3,Ambrose (lightship),Manhattan
4,American Stock Exchange,Manhattan
...,...,...
111,Wards Point Archeological Site,other
112,Woodlawn Cemetery,Bronx
113,Woolworth Building,Manhattan
114,Wyckoff House,Brooklyn


In [83]:
# Distribution of landmarks per borough after cleaning
data_monument["Borough"].value_counts(dropna=False)

Manhattan    84
other        17
Brooklyn     10
Bronx         5
Name: Borough, dtype: int64

In [84]:
# Export the cleaned landmarks table. The target folder is created first
# because to_csv does not create missing parent directories.
monuments_csv = Path("output/data_NYmonuments_output.csv")
monuments_csv.parent.mkdir(parents=True, exist_ok=True)
data_monument.to_csv(monuments_csv, index=False)

## API

In [26]:
import requests

In [27]:
# CSV endpoint of the NYC Open Data resource.
# NOTE(review): this rebinds `url` (previously the Wikipedia address) and is
# not used by the cells shown below, which request the JSON endpoint instead.
url = "https://data.cityofnewyork.us/resource/pitm-atqc.csv"

In [28]:
# JSON endpoint of the same resource; this is the address actually requested
url_params = "https://data.cityofnewyork.us/resource/pitm-atqc.json"

In [29]:
# Request the records as JSON. A timeout avoids hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors before the body is
# decoded.
# NOTE(review): the resulting frame has exactly 1000 rows, which looks like the
# API's default page size - confirm whether an explicit `$limit` is needed to
# retrieve the full dataset.
api_response = requests.get(url_params, timeout=30)
api_response.raise_for_status()
response_params = api_response.json()

In [30]:
# Load the JSON records into a DataFrame and preview the first rows
data_api = pd.DataFrame(response_params)
data_api.head()

Unnamed: 0,objectid,globalid,seating_interest_sidewalk,restaurant_name,legal_business_name,doing_business_as_dba,bulding_number,street,borough,zip,...,community_board,council_district,census_tract,bin,bbl,nta,roadway_dimensions_length,roadway_dimensions_width,roadway_dimensions_area,landmarkdistrict_terms
0,100,c4b3155b-31a0-4e95-846f-fce09f245437,sidewalk,Pomp and Circumstance Hospitality,Pomp and Circumstance Hospitality LLC,Pomp and Circumstance Hospitality LLC,577,Lorimer Street,Brooklyn,11211,...,1,34,501,3068653.0,3027560028.0,East Williamsburg,,,,
1,1000,753495d8-4429-43e5-85a3-dcf6230ef749,both,Charm Kao,193 Schemerhorn INC,Charm Kao,193,Schermerhorn St.,Brooklyn,11201,...,2,33,37,3000493.0,3001640041.0,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill,24.0,8.0,192.0,
2,10000,{3842B5C5-EF04-41A4-8216-D6EA627DCE5E},openstreets,SAKE BAR HAGI 46,"HAMA NEW YORK, INC.",SAKE BAR HAGI 46,358,W. 46TH STREET,Manhattan,10036,...,4,3,121,1025025.0,1010360057.0,Clinton,,,,
3,10001,{C212A0FC-C115-4425-8F95-931B12C5F86A},openstreets,Yum yum too,Boythaicorp,Boythaicorp,662,9ave,Manhattan,10036,...,4,3,127,1025038.0,1010370001.0,Clinton,,,,
4,10002,{DA48265D-7730-416F-8E1C-EBC8C8ACE2C2},openstreets,Xochil Pizza Corp,Xochil Pizza Corp,Xochil Pizza Corp,4632,5th Avenue,Brooklyn,11220,...,7,38,80,,,Sunset Park West,,,,


In [31]:
# Check the column data types: every column is reported as object
data_api.dtypes

objectid                         object
globalid                         object
seating_interest_sidewalk        object
restaurant_name                  object
legal_business_name              object
doing_business_as_dba            object
bulding_number                   object
street                           object
borough                          object
zip                              object
business_address                 object
food_service_establishment       object
sidewalk_dimensions_length       object
sidewalk_dimensions_width        object
sidewalk_dimensions_area         object
approved_for_sidewalk_seating    object
approved_for_roadway_seating     object
qualify_alcohol                  object
sla_serial_number                object
sla_license_type                 object
landmark_district_or_building    object
healthcompliance_terms           object
time_of_submission               object
latitude                         object
longitude                        object


In [32]:
# As with the other datasets, the focus is on the borough. Compute the share of
# missing values per column (highest first): the columns we care about are
# very clean.
data_api.isnull().mean().sort_values(ascending=False)

landmarkdistrict_terms           0.848
sla_serial_number                0.405
sla_license_type                 0.405
roadway_dimensions_width         0.378
roadway_dimensions_length        0.378
roadway_dimensions_area          0.378
sidewalk_dimensions_length       0.217
sidewalk_dimensions_width        0.217
sidewalk_dimensions_area         0.217
bin                              0.091
bbl                              0.091
longitude                        0.085
latitude                         0.085
council_district                 0.085
census_tract                     0.085
nta                              0.085
community_board                  0.085
doing_business_as_dba            0.000
bulding_number                   0.000
globalid                         0.000
legal_business_name              0.000
restaurant_name                  0.000
seating_interest_sidewalk        0.000
borough                          0.000
street                           0.000
qualify_alcohol          

In [33]:
# Share of missing values per row (highest first): rows are also fairly clean
data_api.isnull().mean(axis=1).sort_values(ascending=False)

17     0.485714
61     0.428571
279    0.428571
81     0.400000
652    0.400000
         ...   
197    0.000000
752    0.000000
133    0.000000
296    0.000000
522    0.000000
Length: 1000, dtype: float64

In [34]:
# Count fully duplicated rows (the result is 0, so there are none)
data_api.duplicated().sum()

0

In [64]:
# Keep only the columns that are useful downstream and discard the rest
col_list = [
    "restaurant_name",
    "borough",
    "street",
    "longitude",
    "latitude",
]
data_api_clean = data_api.loc[:, col_list]
data_api_clean

Unnamed: 0,restaurant_name,borough,street,longitude,latitude
0,Pomp and Circumstance Hospitality,Brooklyn,Lorimer Street,-73.949416,40.714264
1,Charm Kao,Brooklyn,Schermerhorn St.,-73.986352,40.689107
2,SAKE BAR HAGI 46,Manhattan,W. 46TH STREET,-73.989528,40.760463
3,Yum yum too,Manhattan,9ave,-73.990683,40.761081
4,Xochil Pizza Corp,Brooklyn,5th Avenue,-74.008216,40.64714
...,...,...,...,...,...
995,DONT TELL MAMA,Manhattan,343 WEST 46 STREET,-73.989196,40.76034
996,OLD MAN HUSTLE,Manhattan,39 ESSEX STREET,-73.989467,40.716135
997,Tsurutontan Udon Noodle Brasserie,Manhattan,48th,,
998,PHO PLUS,Queens,13351 37TH AVE,-73.83328,40.760757


In [86]:
# Discard restaurants without a latitude value
data_api_clean = data_api_clean.dropna(subset=["latitude"])
data_api_clean

Unnamed: 0,restaurant_name,borough,street,longitude,latitude
0,Pomp and Circumstance Hospitality,Brooklyn,Lorimer Street,-73.949416,40.714264
1,Charm Kao,Brooklyn,Schermerhorn St.,-73.986352,40.689107
2,SAKE BAR HAGI 46,Manhattan,W. 46TH STREET,-73.989528,40.760463
3,Yum yum too,Manhattan,9ave,-73.990683,40.761081
4,Xochil Pizza Corp,Brooklyn,5th Avenue,-74.008216,40.64714
...,...,...,...,...,...
994,THE GUTTER LES,Manhattan,242 BROOME STREET,-73.989279,40.717845
995,DONT TELL MAMA,Manhattan,343 WEST 46 STREET,-73.989196,40.76034
996,OLD MAN HUSTLE,Manhattan,39 ESSEX STREET,-73.989467,40.716135
998,PHO PLUS,Queens,13351 37TH AVE,-73.83328,40.760757


In [89]:
# Discard restaurants without a longitude value
data_api_clean = data_api_clean.dropna(subset=["longitude"])
data_api_clean

Unnamed: 0,restaurant_name,borough,street,longitude,latitude
0,Pomp and Circumstance Hospitality,Brooklyn,Lorimer Street,-73.949416,40.714264
1,Charm Kao,Brooklyn,Schermerhorn St.,-73.986352,40.689107
2,SAKE BAR HAGI 46,Manhattan,W. 46TH STREET,-73.989528,40.760463
3,Yum yum too,Manhattan,9ave,-73.990683,40.761081
4,Xochil Pizza Corp,Brooklyn,5th Avenue,-74.008216,40.64714
...,...,...,...,...,...
994,THE GUTTER LES,Manhattan,242 BROOME STREET,-73.989279,40.717845
995,DONT TELL MAMA,Manhattan,343 WEST 46 STREET,-73.989196,40.76034
996,OLD MAN HUSTLE,Manhattan,39 ESSEX STREET,-73.989467,40.716135
998,PHO PLUS,Queens,13351 37TH AVE,-73.83328,40.760757


In [90]:
# Restaurants per borough after removing the rows without coordinates
data_api_clean["borough"].value_counts(dropna=False)

Manhattan        445
Brooklyn         222
Queens           195
Bronx             45
Staten Island      8
Name: borough, dtype: int64

In [91]:
# Export the cleaned restaurants table. The target folder is created first
# because to_csv does not create missing parent directories.
restaurants_csv = Path("output/data_NYrestaurants_output.csv")
restaurants_csv.parent.mkdir(parents=True, exist_ok=True)
data_api_clean.to_csv(restaurants_csv, index=False)