# Airbnb Capstone Project

## 1.Import all Libraries

In [1]:
### import all libraries and set settings 
import pandas as pd
import numpy as np
import requests
import json
import gzip
from py_functions import increase_bbox 

pd.set_option('display.max_columns', None) # show all columns  

## 2.Inside Airbnb pipeline

In [2]:
### Define path, .gz archive file name, country and city for url
path ='data/'
gz_file = "listings.csv.gz"
country = "united-kingdom"
state = "england"
city = "london"
url = f"http://data.insideairbnb.com/{country}/{state}/{city}/2023-03-14/data/{gz_file}"

In [3]:
### Create new directory for city
!mkdir {path}{city}

mkdir: data/london: File exists


In [4]:
### Download the .gz file
r = requests.get(url)
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [5]:
### Unzip the .gz file and save the content as pd.DataFrame via read_csv
with gzip.open(path+city+'/'+gz_file) as f:

    listings = pd.read_csv(f)

In [6]:
### select only desired columns 
columns_keeper = (["id",
                   "listing_url",
                   "scrape_id",
                   "last_scraped",
                   "name",
                   "picture_url",
                   "host_id",
                   "host_response_time",
                   "host_response_rate",
                   "host_acceptance_rate",
                   "host_is_superhost",
                   "host_listings_count",
                   "host_total_listings_count",
                   "neighbourhood_cleansed",
                   "neighbourhood_group_cleansed",
                   "latitude",
                   "longitude",
                   "property_type",
                   "room_type",
                   "accommodates",
                   "bathrooms",
                   "bathrooms_text",
                   "bedrooms",
                   "beds",
                   "amenities",
                   "price",
                   "minimum_nights",
                   "maximum_nights",
                   "instant_bookable",
                   "number_of_reviews",
                   "number_of_reviews_ltm",
                   "number_of_reviews_l30d",
                   "first_review",
                   "last_review",
                   "review_scores_rating",
                   "review_scores_accuracy",
                   "review_scores_cleanliness",
                   "review_scores_checkin",
                   "review_scores_communication",
                   "review_scores_location",
                   "review_scores_value",
                   "reviews_per_month"]
                  )

In [7]:
### filter columns 
listings_short = listings[columns_keeper]

### 2.2.First Look - Airbnb Data

In [8]:
listings_short.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,picture_url,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,20230314070633,2023-03-14,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,,f,1.0,1.0,Haringey,,51.59728,-0.13933,Private room in condo,Private room,1,,1 shared bath,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",$100.00,1,365,f,0,0,0,,,,,,,,,,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,20230314070633,2023-03-14,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,within an hour,100%,100%,f,14.0,31.0,Barnet,,51.636518,-0.177475,Entire rental unit,Entire home/apt,1,,1 bath,1.0,1.0,[],$65.00,180,365,t,0,0,0,,,,,,,,,,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,20230314070633,2023-03-14,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,within an hour,100%,91%,t,4.0,8.0,Harrow,,51.60818,-0.2774,Entire rental unit,Entire home/apt,4,,2 baths,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",$132.00,2,28,t,0,0,0,,,,,,,,,,
3,3518856,https://www.airbnb.com/rooms/3518856,20230314070633,2023-03-14,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,,100%,f,2.0,5.0,Merton,,51.42231,-0.18841,Private room in rental unit,Private room,1,,1 private bath,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",$100.00,5,1125,f,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,20230314070633,2023-03-14,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,within a few hours,75%,46%,f,1.0,1.0,Barnet,,51.602282,-0.193606,Entire condo,Entire home/apt,2,,1 bath,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",$120.00,5,90,f,0,0,0,,,,,,,,,,


In [9]:
listings_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75241 entries, 0 to 75240
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            75241 non-null  int64  
 1   listing_url                   75241 non-null  object 
 2   scrape_id                     75241 non-null  int64  
 3   last_scraped                  75241 non-null  object 
 4   name                          75210 non-null  object 
 5   picture_url                   75241 non-null  object 
 6   host_id                       75241 non-null  int64  
 7   host_response_time            46285 non-null  object 
 8   host_response_rate            46285 non-null  object 
 9   host_acceptance_rate          51028 non-null  object 
 10  host_is_superhost             75223 non-null  object 
 11  host_listings_count           75236 non-null  float64
 12  host_total_listings_count     75236 non-null  float64
 13  n

In [10]:
listings_short.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
count,75241.0,75241.0,75241.0,75236.0,75236.0,0.0,75241.0,75241.0,75241.0,0.0,71768.0,74135.0,75241.0,75241.0,75241.0,75241.0,75241.0,56548.0,55595.0,55606.0,55564.0,55592.0,55565.0,55562.0,56548.0
mean,2.368628e+17,20230310000000.0,139076500.0,39.525958,71.3791,,51.509708,-0.128108,3.105793,,1.513153,1.772833,5.750748,7790.3,17.974668,5.736301,0.456467,4.588159,4.723349,4.623915,4.783393,4.801027,4.729358,4.607755,0.877064
std,3.425911e+17,12.96884,152962100.0,222.170789,420.039233,,0.048369,0.099341,1.936972,,0.885015,1.228013,24.240947,1914055.0,41.984021,12.991805,1.277612,0.779083,0.489328,0.550721,0.453835,0.448759,0.418873,0.521839,1.234003
min,13913.0,20230310000000.0,2594.0,1.0,1.0,,51.295937,-0.4978,0.0,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,19817400.0,20230310000000.0,19959230.0,1.0,1.0,,51.48354,-0.18939,2.0,,1.0,1.0,1.0,42.0,1.0,0.0,0.0,4.5,4.67,4.5,4.75,4.79,4.64,4.5,0.13
50%,39338750.0,20230310000000.0,67455190.0,2.0,2.0,,51.51384,-0.12628,2.0,,1.0,1.0,2.0,365.0,4.0,0.0,0.0,4.82,4.89,4.8,4.94,4.97,4.85,4.75,0.45
75%,6.562985e+17,20230310000000.0,224867000.0,5.0,8.0,,51.53945,-0.06846,4.0,,2.0,2.0,4.0,1125.0,17.0,6.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,4.97,1.09
max,8.463271e+17,20230310000000.0,505040000.0,2138.0,24047.0,,51.681142,0.28857,16.0,,22.0,38.0,1125.0,524855600.0,1328.0,564.0,68.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,51.05


## 3.Overpass Pipeline

In [11]:
### Increase outside border of listings
london_bbox = increase_bbox(listings)

<!-- Increasing the maxs by 0.01 and decreasing the mins by 0.01 will shift the outline's border by a bit more than 1km in each direction.

See increase_bbox function in py_functions.py -->

(northern hemisphere)

latitude max = north

latitude min = south

longitude max = east

longitude min = west

In [12]:
### Get OSM data for slightly bigger bbox
### Example for nodes with an entry for "cuisine"
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(node["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
);
out body;
"""
response = requests.get(overpass_url, 
                        params={'data': overpass_query})
data = response.json()

data_norm = pd.json_normalize(data, record_path="elements")

In [13]:
### select only desired columns
data_norm_keepers = (["id",
                     "lat",
                     "lon",
                     "tags.name",
                     "tags.amenity",
                     "tags.cuisine",
                     "tags.diet:vegetarian",
                     "tags.diet:vegan"]
                     )

In [14]:
### filter columns 
streetmap_short = data_norm[data_norm_keepers]

### 3.1. First Look - Open Street Maps-Data

In [15]:
streetmap_short.head()

Unnamed: 0,id,lat,lon,tags.name,tags.amenity,tags.cuisine,tags.diet:vegetarian,tags.diet:vegan
0,451152,51.60084,-0.194608,King of Prussia,pub,pizza;burger,yes,yes
1,21593237,51.517347,-0.118957,The Polish Bar (Na Zdrowie),bar,polish,,
2,25729218,51.453406,-0.038511,The Brockley Jack,pub,international,,
3,26544484,51.398014,-0.172235,Casuarina Tree,restaurant,indian,,
4,26604024,51.525732,-0.458548,Jin Li,restaurant,chinese,,


In [16]:
streetmap_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8157 entries, 0 to 8156
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    8157 non-null   int64  
 1   lat                   8157 non-null   float64
 2   lon                   8157 non-null   float64
 3   tags.name             8061 non-null   object 
 4   tags.amenity          7993 non-null   object 
 5   tags.cuisine          8157 non-null   object 
 6   tags.diet:vegetarian  568 non-null    object 
 7   tags.diet:vegan       401 non-null    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 509.9+ KB


In [17]:
streetmap_short.describe()

Unnamed: 0,id,lat,lon
count,8157.0,8157.0,8157.0
mean,4971301000.0,51.499522,-0.133331
std,3253489000.0,0.068799,0.13189
min,451152.0,51.286426,-0.507334
25%,1977683000.0,51.466187,-0.192639
50%,4865710000.0,51.509336,-0.128814
75%,7637750000.0,51.53612,-0.073668
max,10809290000.0,51.687652,0.298538


## 4. Web scraping test

In [None]:
from bs4 import BeautifulSoup

In [None]:
url = "http://insideairbnb.com/get-the-data/"
page = requests.get(url)

In [None]:
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
print(soup)