# Enriching NYC Airbnb Dataset

## WEB SCRAPING

In [80]:
import numpy as np
import pandas as pd
import src.enriching_utils as eu
import requests
from bs4 import BeautifulSoup
import re

In [81]:
url_NYmonuments = "https://en.wikipedia.org/wiki/List_of_buildings,_sites,_and_monuments_in_New_York_City"

In [82]:
response = requests.get(url_NYmonuments)

In [83]:
soup = BeautifulSoup(response.content)

In [84]:
NYmonuments_tags = soup.find_all(
    name="div", 
    class_="div-col"
)

In [85]:
NYmonuments_tags_li = soup.find_all("li")

In [86]:
NYmonuments_tags_text_li = [monument.text for monument in NYmonuments_tags_li]
NYmonuments_tags_text_li[2]

"America's Response Monument (Manhattan)"

In [87]:
len(NYmonuments_tags_text_li)

307

In [88]:
NYmonuments_tags_text_li2 = NYmonuments_tags_text_li[0:105]
NYmonuments_tags_text_li2[:5]

['American Museum of Natural History (Manhattan)\nRose Center for Earth and Space',
 'Rose Center for Earth and Space',
 "America's Response Monument (Manhattan)",
 'Apollo Theater (Manhattan)',
 'Bank of America Tower (Manhattan)']

In [89]:
df_NYmonuments = pd.DataFrame(NYmonuments_tags_text_li2)
df_NYmonuments.columns = ["monuments_neighbourhood"]
df_NYmonuments.head()

Unnamed: 0,monuments_neighbourhood
0,American Museum of Natural History (Manhattan)...
1,Rose Center for Earth and Space
2,America's Response Monument (Manhattan)
3,Apollo Theater (Manhattan)
4,Bank of America Tower (Manhattan)


In [90]:
df_NYmonuments[["monuments","neighbourhood"]] = df_NYmonuments.monuments_neighbourhood.str.split("\(|\)", expand=True).iloc[:,[0,1]]
df_NYmonuments.head()

Unnamed: 0,monuments_neighbourhood,monuments,neighbourhood
0,American Museum of Natural History (Manhattan)...,American Museum of Natural History,Manhattan
1,Rose Center for Earth and Space,Rose Center for Earth and Space,
2,America's Response Monument (Manhattan),America's Response Monument,Manhattan
3,Apollo Theater (Manhattan),Apollo Theater,Manhattan
4,Bank of America Tower (Manhattan),Bank of America Tower,Manhattan


In [91]:
drop_cols = ["monuments_neighbourhood"]
df_NYmonuments = df_NYmonuments.drop(drop_cols, axis =1)
df_NYmonuments

Unnamed: 0,monuments,neighbourhood
0,American Museum of Natural History,Manhattan
1,Rose Center for Earth and Space,
2,America's Response Monument,Manhattan
3,Apollo Theater,Manhattan
4,Bank of America Tower,Manhattan
...,...,...
100,Woodlawn Cemetery,Bronx
101,Woolworth Building,Manhattan
102,World Trade Center site,Manhattan
103,World Financial Center,Manhattan


In [92]:
df_NYmonuments["neighbourhood"].value_counts(dropna=False)

Manhattan                                       64
NaN                                             12
Brooklyn                                         8
Bronx                                            6
Queens                                           5
connects Manhattan and New Jersey                3
connects Manhattan and Brooklyn                  2
connects Brooklyn and Manhattan                  1
Queens; demolished 2009                          1
The Bronx                                        1
formerly known as the New York State Theater     1
Staten Island                                    1
Name: neighbourhood, dtype: int64

In [93]:
df_NYmonuments = df_NYmonuments[df_NYmonuments["neighbourhood"].notnull()]
df_NYmonuments

Unnamed: 0,monuments,neighbourhood
0,American Museum of Natural History,Manhattan
2,America's Response Monument,Manhattan
3,Apollo Theater,Manhattan
4,Bank of America Tower,Manhattan
5,Battery Park,Manhattan
...,...,...
100,Woodlawn Cemetery,Bronx
101,Woolworth Building,Manhattan
102,World Trade Center site,Manhattan
103,World Financial Center,Manhattan


In [94]:
df_NYmonuments["neighbourhood"] = df_NYmonuments["neighbourhood"].apply(eu.cleaning)
df_NYmonuments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,monuments,neighbourhood
0,American Museum of Natural History,manhattan
2,America's Response Monument,manhattan
3,Apollo Theater,manhattan
4,Bank of America Tower,manhattan
5,Battery Park,manhattan
...,...,...
100,Woodlawn Cemetery,bronx
101,Woolworth Building,manhattan
102,World Trade Center site,manhattan
103,World Financial Center,manhattan


In [32]:
df_NYmonuments["neighbourhood"].value_counts(dropna=False)

manhattan        70
brooklyn          8
bronx             7
queens            6
other             1
staten island     1
Name: neighbourhood, dtype: int64

In [33]:
df_NYmonuments.to_csv("output/df_NYmonuments_scraping.csv", index = False)

## API

In [53]:
import requests

In [60]:
url = "https://data.cityofnewyork.us/resource/pitm-atqc.csv"

In [66]:
url_params = "https://data.cityofnewyork.us/resource/pitm-atqc.json"

In [67]:
response_params = requests.get(url_params).json()

In [76]:
data_api = pd.DataFrame(response_params)
data_api.head()

Unnamed: 0,objectid,globalid,seating_interest_sidewalk,restaurant_name,legal_business_name,doing_business_as_dba,bulding_number,street,borough,zip,...,community_board,council_district,census_tract,bin,bbl,nta,roadway_dimensions_length,roadway_dimensions_width,roadway_dimensions_area,landmarkdistrict_terms
0,100,c4b3155b-31a0-4e95-846f-fce09f245437,sidewalk,Pomp and Circumstance Hospitality,Pomp and Circumstance Hospitality LLC,Pomp and Circumstance Hospitality LLC,577,Lorimer Street,Brooklyn,11211,...,1,34,501,3068653.0,3027560028.0,East Williamsburg,,,,
1,1000,753495d8-4429-43e5-85a3-dcf6230ef749,both,Charm Kao,193 Schemerhorn INC,Charm Kao,193,Schermerhorn St.,Brooklyn,11201,...,2,33,37,3000493.0,3001640041.0,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill,24.0,8.0,192.0,
2,10000,{3842B5C5-EF04-41A4-8216-D6EA627DCE5E},openstreets,SAKE BAR HAGI 46,"HAMA NEW YORK, INC.",SAKE BAR HAGI 46,358,W. 46TH STREET,Manhattan,10036,...,4,3,121,1025025.0,1010360057.0,Clinton,,,,
3,10001,{C212A0FC-C115-4425-8F95-931B12C5F86A},openstreets,Yum yum too,Boythaicorp,Boythaicorp,662,9ave,Manhattan,10036,...,4,3,127,1025038.0,1010370001.0,Clinton,,,,
4,10002,{DA48265D-7730-416F-8E1C-EBC8C8ACE2C2},openstreets,Xochil Pizza Corp,Xochil Pizza Corp,Xochil Pizza Corp,4632,5th Avenue,Brooklyn,11220,...,7,38,80,,,Sunset Park West,,,,


In [71]:
data_api.dtypes

objectid                         object
globalid                         object
seating_interest_sidewalk        object
restaurant_name                  object
legal_business_name              object
doing_business_as_dba            object
bulding_number                   object
street                           object
borough                          object
zip                              object
business_address                 object
food_service_establishment       object
sidewalk_dimensions_length       object
sidewalk_dimensions_width        object
sidewalk_dimensions_area         object
approved_for_sidewalk_seating    object
approved_for_roadway_seating     object
qualify_alcohol                  object
sla_serial_number                object
sla_license_type                 object
landmark_district_or_building    object
healthcompliance_terms           object
time_of_submission               object
latitude                         object
longitude                        object


In [73]:
data_api.isnull().sum().apply(lambda x: x/data_api.shape[0]).sort_values(ascending=False)

landmarkdistrict_terms           0.848
sla_serial_number                0.405
sla_license_type                 0.405
roadway_dimensions_width         0.378
roadway_dimensions_length        0.378
roadway_dimensions_area          0.378
sidewalk_dimensions_length       0.217
sidewalk_dimensions_width        0.217
sidewalk_dimensions_area         0.217
bin                              0.091
bbl                              0.091
longitude                        0.085
latitude                         0.085
council_district                 0.085
census_tract                     0.085
nta                              0.085
community_board                  0.085
doing_business_as_dba            0.000
bulding_number                   0.000
globalid                         0.000
legal_business_name              0.000
restaurant_name                  0.000
seating_interest_sidewalk        0.000
borough                          0.000
street                           0.000
qualify_alcohol          

In [74]:
data_api.isnull().sum(axis=1).apply(lambda x: x/data_api.shape[1]).sort_values(ascending=False)

17     0.485714
61     0.428571
279    0.428571
81     0.400000
652    0.400000
         ...   
197    0.000000
752    0.000000
133    0.000000
296    0.000000
522    0.000000
Length: 1000, dtype: float64

In [75]:
data_api.duplicated().sum()

0

In [79]:
col_list = ["restaurant_name", "borough", "street","longitude","latitude"]
data_api_clean = data_api[col_list]
data_api_clean

Unnamed: 0,restaurant_name,borough,street,longitude,latitude
0,Pomp and Circumstance Hospitality,Brooklyn,Lorimer Street,-73.949416,40.714264
1,Charm Kao,Brooklyn,Schermerhorn St.,-73.986352,40.689107
2,SAKE BAR HAGI 46,Manhattan,W. 46TH STREET,-73.989528,40.760463
3,Yum yum too,Manhattan,9ave,-73.990683,40.761081
4,Xochil Pizza Corp,Brooklyn,5th Avenue,-74.008216,40.64714
...,...,...,...,...,...
995,DONT TELL MAMA,Manhattan,343 WEST 46 STREET,-73.989196,40.76034
996,OLD MAN HUSTLE,Manhattan,39 ESSEX STREET,-73.989467,40.716135
997,Tsurutontan Udon Noodle Brasserie,Manhattan,48th,,
998,PHO PLUS,Queens,13351 37TH AVE,-73.83328,40.760757
