# Airbnb Capstone Project

## London listings.csv test

In [1]:
import gzip
import json
import os

import numpy as np
import pandas as pd
import requests

from py_functions import increase_bbox

In [2]:
# London: where the Inside Airbnb snapshot lives and where it is stored locally
# Local data folder and archive file name
path = 'data/'
gz_file = "listings.csv.gz"

# URL components: country / state / city / snapshot date
country, state, city = "united-kingdom", "england", "london"
snapshot_date = "2023-03-14"
url = (
    "http://data.insideairbnb.com/"
    f"{country}/{state}/{city}/{snapshot_date}/data/{gz_file}"
)

In [None]:
# Create the city's data directory, including any missing parent folder.
# exist_ok=True keeps the cell idempotent: re-running it once the directory
# exists is a no-op instead of a shell error, and it works without a POSIX shell.
os.makedirs(path + city, exist_ok=True)

In [None]:
# Download the .gz archive and store it under <path><city>/.
# requests has no default timeout, so set one to avoid hanging forever on a
# stalled connection; raise_for_status() stops a 4xx/5xx error page from being
# written to disk as if it were the archive.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [3]:
# Decompress the downloaded archive on the fly and parse the CSV into a DataFrame
archive_path = path + city + '/' + gz_file
with gzip.open(archive_path, "rb") as gz_handle:
    listings = pd.read_csv(gz_handle)

London OSM dataframe

In [4]:
# Compute an enlarged bounding box around the listings with increase_bbox
# (imported from py_functions). The result is indexed below by the keys
# "south_shifted", "west_shifted", "north_shifted" and "east_shifted".
london_bbox = increase_bbox(listings)

In [5]:
# Get OSM data for the slightly enlarged bounding box.
# The query selects nodes whose "amenity" tag is exactly one of
# bar, pub, restaurant, cafe or fast_food; the bbox is passed in the order
# (south, west, north, east).
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(
    node["amenity"~"^(bar|pub|restaurant|cafe|fast_food)$"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
);
out body;
"""
# requests has no default timeout; without one a stalled server blocks the cell forever.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=180)
# Fail with the HTTP status if the server answers with an error, instead of
# an opaque JSON decode error from the error page.
response.raise_for_status()
data = response.json()

# One row per returned element; nested keys are flattened into dotted
# column names such as "tags.name".
data_norm = pd.json_normalize(data, record_path="elements")

In [6]:
# (rows, columns) of the flattened OSM result: one row per returned element
data_norm.shape

(15234, 466)

In [None]:
# Preview the first rows of the flattened OSM result
data_norm.head()

In [None]:
# Narrow the wide frame to identifier, coordinates and the name/amenity/cuisine/diet tags
data_norm[["id", "lat", "lon","tags.name", "tags.amenity", "tags.cuisine", "tags.diet:vegetarian", "tags.diet:vegan"]]

In [None]:
# Distinct values of the "vegetarian" tag.
# NOTE(review): the column overview in the previous cell uses "tags.diet:vegetarian";
# confirm that a plain "tags.vegetarian" column exists in the response,
# otherwise this lookup raises KeyError.
data_norm["tags.vegetarian"].unique()

In [None]:
# Number of returned nodes per amenity value
data_norm["tags.amenity"].value_counts()

In [None]:
# Display only the well-populated columns: a column is kept when it has at least
# len(data_norm) - 50 non-missing values, i.e. at most 50 missing entries.
# The result is only shown, not assigned, so data_norm itself is unchanged.
data_norm.dropna(thresh=len(data_norm)-50, axis=1)

In [None]:
# Single-column frame holding just the amenity tag
df_amenity = data_norm["tags.amenity"].to_frame()

In [None]:
# Occurrences per amenity value, turned back into a regular two-column frame.
# NOTE(review): the name of the counts column depends on the pandas version
# (0 before pandas 2.0, "count" from 2.0 on) — confirm against the installed
# version before referring to it by name in later cells.
df_amenity_vc = df_amenity.value_counts().reset_index()

In [None]:
# Attach the per-amenity occurrence count to every row (left join keeps all rows of data_norm)
df_new = pd.merge(
    data_norm,
    df_amenity_vc,
    how="left",
    on="tags.amenity",
)

In [None]:
# Preview the first ten rows of the merged frame
df_new.head(10)

In [None]:
# (rows, columns) of the original, un-merged OSM frame
data_norm.shape

In [None]:
# Keep only rows whose amenity value occurs more than 100 times.
# The counts column created by value_counts().reset_index() is named 0 on
# pandas < 2.0 and "count" on pandas >= 2.0, so look the name up instead of
# hard-coding 0 (which raises KeyError on newer pandas).
count_col = "count" if "count" in df_new.columns else 0
df_new[df_new[count_col] > 100]

In [None]:
# Sum every column within each amenity group.
# NOTE(review): besides id/lat/lon the frame presumably holds mostly text
# "tags.*" columns, for which a sum is unlikely to be meaningful — if
# per-amenity counts were intended, .size() may be the better fit; confirm intent.
data_norm.groupby("tags.amenity").sum()

In [None]:
# Amenity column as a frame, with the former row index exposed as a regular "index" column
data_norm[["tags.amenity"]].reset_index()

In [None]:
# Extreme coordinates of the Airbnb listings: smallest and largest latitude/longitude
listings[["latitude", "longitude"]].agg(["min", "max"])

(northern hemisphere)

latitude max = north

latitude min = south

longitude max = east

longitude min = west

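Increasing the maxima by 0.01° and decreasing the minima by 0.01° shifts the outline's border outwards: by about 1.1 km to the north and south, and by somewhat less to the east and west (roughly 0.7 km at London's latitude), because the length of a degree of longitude shrinks with the cosine of the latitude.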

See increase_bbox function in py_functions.py

requests and json method:

In [None]:
import requests
import json
from py_functions import increase_bbox

In [None]:
# Query Overpass for bar/pub/restaurant nodes inside a hard-coded bounding box,
# given in the order (south, west, north, east).
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
(node["amenity"~"^(bar|pub|restaurant)$"](51.2867602,-0.5103751,51.6918741,0.3340155);
);
out body;
"""
# requests has no default timeout; without one a stalled server blocks the cell forever.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=180)
# Fail with the HTTP status if the server answers with an error, instead of
# an opaque JSON decode error from the error page.
response.raise_for_status()
data = response.json()

In [None]:
# Flatten the "elements" list of the response into a frame, one row per element
data_norm = pd.json_normalize(data, record_path="elements")

In [None]:
# Show up to 50 columns when displaying frames. This is a global pandas display
# option, so it also applies to every cell run afterwards. Then preview the first rows.
pd.set_option('display.max_columns', 50)
data_norm.head()

In [None]:
# Narrow the frame to the name, amenity and cuisine tags
data_norm[["tags.name", "tags.amenity", "tags.cuisine"]]

In [None]:
# (rows, columns) of the result for the hard-coded bounding box
data_norm.shape

## Lisbon pipeline test

In [None]:
import gzip

In [None]:
# Lisbon: where the Inside Airbnb snapshot lives and where it is stored locally
# Local data folder and archive file name
path = 'data/'
gz_file = "listings.csv.gz"

# URL components: country / state / city / snapshot date
country, state, city = "portugal", "lisbon", "lisbon"
snapshot_date = "2023-03-19"
url = (
    "http://data.insideairbnb.com/"
    f"{country}/{state}/{city}/{snapshot_date}/data/{gz_file}"
)

In [None]:
# Create the city's data directory, including any missing parent folder.
# exist_ok=True keeps the cell idempotent: re-running it once the directory
# exists is a no-op instead of a shell error, and it works without a POSIX shell.
os.makedirs(path + city, exist_ok=True)

In [None]:
# Download the .gz archive and store it under <path><city>/.
# requests has no default timeout, so set one to avoid hanging forever on a
# stalled connection; raise_for_status() stops a 4xx/5xx error page from being
# written to disk as if it were the archive.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [None]:
# Decompress the downloaded archive on the fly and parse the CSV into a DataFrame
archive_path = path + city + '/' + gz_file
with gzip.open(archive_path, "rb") as gz_handle:
    listings = pd.read_csv(gz_handle)

In [None]:
# Compute an enlarged bounding box around the listings with increase_bbox
# (imported from py_functions). The result is indexed below by the keys
# "south_shifted", "west_shifted", "north_shifted" and "east_shifted".
lisbon_bbox = increase_bbox(listings)

In [None]:
# Display the enlarged bounding box computed for Lisbon
lisbon_bbox

In [None]:
# Get OSM data for the slightly enlarged bounding box.
# The query selects every node that carries a "cuisine" tag; the bbox is
# passed in the order (south, west, north, east).
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(node["cuisine"]({lisbon_bbox["south_shifted"]},{lisbon_bbox["west_shifted"]},{lisbon_bbox["north_shifted"]},{lisbon_bbox["east_shifted"]});
);
out body;
"""
# requests has no default timeout; without one a stalled server blocks the cell forever.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=180)
# Fail with the HTTP status if the server answers with an error, instead of
# an opaque JSON decode error from the error page.
response.raise_for_status()
data = response.json()

# One row per returned element; nested keys are flattened into dotted
# column names such as "tags.name".
data_norm = pd.json_normalize(data, record_path="elements")

In [None]:
# (rows, columns) of the flattened Lisbon result: one row per returned element
data_norm.shape

In [None]:
# Preview the first rows of the flattened Lisbon result
data_norm.head()

In [None]:
# Narrow the frame to the name, cuisine and amenity tags
data_norm[["tags.name","tags.cuisine","tags.amenity"]]

Web scraping test

Check out Selenium