# Airbnb Capstone Project

## 1.Import all Libraries

In [1]:
### import all libraries and set settings 
import pandas as pd
import numpy as np
import requests
import json
import gzip
from py_functions import increase_bbox 

pd.set_option('display.max_columns', None) # show all columns  

## 2.Inside Airbnb pipeline

In [2]:
### Define path, .gz archive file name, country and city for url
path ='data/'
gz_file = "listings.csv.gz"
country = "united-kingdom"
state = "england"
city = "london"
url = f"http://data.insideairbnb.com/{country}/{state}/{city}/2023-03-14/data/{gz_file}"

In [4]:
### Create new directory for city
!mkdir {path}{city}

mkdir: data/london: File exists


In [38]:
### Download the .gz file
r = requests.get(url)
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [3]:
### Unzip the .gz file and save the content as pd.DataFrame via read_csv
with gzip.open(path+city+'/'+gz_file) as f:
    listings = pd.read_csv(f)


In [4]:
### select only desired columns 
columns_keeper = (["id",
                   "listing_url",
                   "name",
                   "picture_url",
                   "host_id",
                   "host_response_rate",
                   "host_acceptance_rate",
                   "host_is_superhost",
                   "host_listings_count",
                   "host_total_listings_count",
                   "neighbourhood_cleansed",
                   "latitude",
                   "longitude",
                   "room_type",
                   "accommodates",
                   "bathrooms_text",
                   "bedrooms",
                   "beds",
                   "amenities",
                   "price",
                   "minimum_nights",
                   "maximum_nights",
                   "instant_bookable",
                   "number_of_reviews",
                   "number_of_reviews_ltm",
                   "number_of_reviews_l30d",
                   "first_review",
                   "last_review",
                   "review_scores_rating",
                   "review_scores_accuracy",
                   "review_scores_cleanliness",
                   "review_scores_checkin",
                   "review_scores_communication",
                   "review_scores_location",
                   "review_scores_value",
                   "reviews_per_month"]
                  )

In [5]:
### filter columns 
listings_short = listings[columns_keeper]

### 2.2.First Look - Airbnb Data

In [6]:
listings_short.head()

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room,1,1 shared bath,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",$100.00,1,365,f,0,0,0,,,,,,,,,,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100%,100%,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire home/apt,1,1 bath,1.0,1.0,[],$65.00,180,365,t,0,0,0,,,,,,,,,,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,100%,91%,t,4.0,8.0,Harrow,51.60818,-0.2774,Entire home/apt,4,2 baths,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",$132.00,2,28,t,0,0,0,,,,,,,,,,
3,3518856,https://www.airbnb.com/rooms/3518856,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,100%,f,2.0,5.0,Merton,51.42231,-0.18841,Private room,1,1 private bath,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",$100.00,5,1125,f,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,75%,46%,f,1.0,1.0,Barnet,51.602282,-0.193606,Entire home/apt,2,1 bath,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",$120.00,5,90,f,0,0,0,,,,,,,,,,


In [7]:
listings_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75241 entries, 0 to 75240
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           75241 non-null  int64  
 1   listing_url                  75241 non-null  object 
 2   name                         75210 non-null  object 
 3   picture_url                  75241 non-null  object 
 4   host_id                      75241 non-null  int64  
 5   host_response_rate           46285 non-null  object 
 6   host_acceptance_rate         51028 non-null  object 
 7   host_is_superhost            75223 non-null  object 
 8   host_listings_count          75236 non-null  float64
 9   host_total_listings_count    75236 non-null  float64
 10  neighbourhood_cleansed       75241 non-null  object 
 11  latitude                     75241 non-null  float64
 12  longitude                    75241 non-null  float64
 13  room_type       

In [8]:
listings_short.describe()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
count,75241.0,75241.0,75236.0,75236.0,75241.0,75241.0,75241.0,71768.0,74135.0,75241.0,75241.0,75241.0,75241.0,75241.0,56548.0,55595.0,55606.0,55564.0,55592.0,55565.0,55562.0,56548.0
mean,2.368628e+17,139076500.0,39.525958,71.3791,51.509708,-0.128108,3.105793,1.513153,1.772833,5.750748,7790.3,17.974668,5.736301,0.456467,4.588159,4.723349,4.623915,4.783393,4.801027,4.729358,4.607755,0.877064
std,3.425911e+17,152962100.0,222.170789,420.039233,0.048369,0.099341,1.936972,0.885015,1.228013,24.240947,1914055.0,41.984021,12.991805,1.277612,0.779083,0.489328,0.550721,0.453835,0.448759,0.418873,0.521839,1.234003
min,13913.0,2594.0,1.0,1.0,51.295937,-0.4978,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,19817400.0,19959230.0,1.0,1.0,51.48354,-0.18939,2.0,1.0,1.0,1.0,42.0,1.0,0.0,0.0,4.5,4.67,4.5,4.75,4.79,4.64,4.5,0.13
50%,39338750.0,67455190.0,2.0,2.0,51.51384,-0.12628,2.0,1.0,1.0,2.0,365.0,4.0,0.0,0.0,4.82,4.89,4.8,4.94,4.97,4.85,4.75,0.45
75%,6.562985e+17,224867000.0,5.0,8.0,51.53945,-0.06846,4.0,2.0,2.0,4.0,1125.0,17.0,6.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,4.97,1.09
max,8.463271e+17,505040000.0,2138.0,24047.0,51.681142,0.28857,16.0,22.0,38.0,1125.0,524855600.0,1328.0,564.0,68.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,51.05


## 2.3. Clean Airbnb 

### 2.3.1. Handling Missing Data 

In [10]:
listings_short.shape

(75241, 36)

In [11]:
listings_short.isnull().sum()

id                                 0
listing_url                        0
name                              31
picture_url                        0
host_id                            0
host_response_rate             28956
host_acceptance_rate           24213
host_is_superhost                 18
host_listings_count                5
host_total_listings_count          5
neighbourhood_cleansed             0
latitude                           0
longitude                          0
room_type                          0
accommodates                       0
bathrooms_text                   124
bedrooms                        3473
beds                            1106
amenities                          0
price                              0
minimum_nights                     0
maximum_nights                     0
instant_bookable                   0
number_of_reviews                  0
number_of_reviews_ltm              0
number_of_reviews_l30d             0
first_review                   18693
l

**host_is_superhost**

In [12]:
# check the different values of "host_is_superhost"
listings_short["host_is_superhost"].value_counts(dropna=False)

f      64574
t      10649
NaN       18
Name: host_is_superhost, dtype: int64

In [13]:
# check how many listings the hosts with nan value for "host_is_superhost" have: 
listings_short[listings_short['host_is_superhost'].isna()]["host_total_listings_count"].value_counts()

5.0     4
2.0     3
6.0     2
10.0    2
7.0     2
4.0     2
26.0    2
1.0     1
Name: host_total_listings_count, dtype: int64

In [14]:
# we can fill values with "f" for false 
listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")


In [15]:
# renaming Rows with NaN to "Unknown"
listings_short[["name", "host_response_rate",
                "host_acceptance_rate"]] = listings_short[["name", "host_response_rate",
                                                           "host_acceptance_rate"]].fillna("Unknown")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short[["name", "host_response_rate",


**host_listings_count & host_total_listings_count**

In [16]:
# set the mode for host_listings_count & host_total_listings_count
listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)

listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)


**bedrooms , beds & bathrooms_text**

In [17]:
# set the mode for above columns
listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)

listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)

listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)


**Convert host_response_rate & host_acceptance_rate**

In [18]:
## Convert response rate/acceptance rate from % in integer
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate"].str[:-1]
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].replace('Unknow', np.nan)
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].astype("float64")
listings_short["host_acceptance_rate"] = listings_short["host_acceptance_rate_int"]
listings_short.drop("host_acceptance_rate_int", axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate"].str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].replace('Unknow', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
#same for host_acceptance_rate 

listings_short["host_response_rate_int"] = listings_short["host_response_rate"].str[:-1]
listings_short["host_response_rate_int"] = listings_short["host_response_rate_int"].replace('Unknow', np.nan)
listings_short["host_response_rate"] = listings_short["host_response_rate_int"].astype("float64")
listings_short.drop("host_response_rate_int", axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_response_rate_int"] = listings_short["host_response_rate"].str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_response_rate_int"] = listings_short["host_response_rate_int"].replace('Unknow', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listin

**price**

In [20]:
#convert Price in Integer

listings_short["price"] = listings_short["price"].str[1:]
listings_short["price"] = listings_short["price"].str.replace(",", "")
listings_short["price"] = listings_short["price"].astype("float64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].str[1:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].str.replace(",", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].astype("float64")


**Bathroom_text & private_bath**

In [21]:
#convert bathroom text top bool ('private_bath)
listings_short['private_bath'] = ~listings_short['bathrooms_text'].str.contains('shared|Shared')
listings_short.drop('bathrooms_text', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short['private_bath'] = ~listings_short['bathrooms_text'].str.contains('shared|Shared')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short.drop('bathrooms_text', inplace = True, axis = 1)


In [22]:
listings_short.head(2)

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,private_bath
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room,1,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",100.0,1,365,f,0,0,0,,,,,,,,,,,False
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100.0,100.0,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire home/apt,1,1.0,1.0,[],65.0,180,365,t,0,0,0,,,,,,,,,,,True


**room_type**

In [23]:
#renaming the values 
listings_short["room_type"] = listings_short["room_type"].str.replace("Entire home/apt", "Entire home")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["room_type"] = listings_short["room_type"].str.replace("Entire home/apt", "Entire home")


**Instant_bookable and Host_is_superhost as bool**

In [None]:
listings_short['instant_bookable'] = listings_short['instant_bookable'].map({'f': False, 't': True})
listings_short['host_is_superhost'] = listings_short['host_is_superhost'].map({'f': False, 't': True})

**amenities**

In [134]:
test = listings_short.copy()

In [135]:
# convert items in "amenities" to a list
test["amenities"] = test["amenities"].str.lower().str.replace('[','').str.replace(']','').str.replace('"','').str.replace(' ','_').str.split(',')


  test["amenities"] = test["amenities"].str.lower().str.replace('[','').str.replace(']','').str.replace('"','').str.replace(' ','_').str.split(',')


In [136]:
# create new columns for each amenity 
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
amenities = test.join(pd.DataFrame(mlb.fit_transform(test.pop('amenities')),
                          columns=mlb.classes_,
                          index=test.index))

In [137]:
# create a list of amenity with fewer than 10% of listings
infrequent_amenities = []
for col in amenities.iloc[: , 35:].columns:
    if amenities[col].sum() < len(amenities)/10:
        infrequent_amenities.append(col)

# drop infrequent amenity features
amenities.drop(infrequent_amenities, axis=1, inplace=True)


In [138]:
# combine _coffee & _coffe_maker to one column
amenities['_coffee_'] = (amenities['_coffee_maker'] | amenities['_coffee']).astype(int)


In [139]:
# focus on relevant columns
amenity_keeper = ["id",
                  "_wifi",
                  "_long_term_stays_allowed",
                  "_private_patio_or_balcony",
                  "_private_entrance",
                  "_pets_allowed",
                  "_outdoor_dining_area",
                  "_lockbox",
                  "_kitchen",
                  "_hair_dryer",
                  "_free_street_parking",
                  "_free_parking_on_premises",
                  "_dedicated_workspace",
                  "_coffee_maker",
                  "_coffee",
                  "_bed_linens",
                  "_bathtub"]


In [140]:
# keep only relevant columns 
amenities_short = amenities[amenity_keeper]

In [142]:
# get rid of first "_"
amenities_short.columns = amenities_short.columns.str.replace('_','', 1)


In [148]:
# merge to one dataframe 
airbnb = listings_short.merge(amenities_short, how="left", on="id")

In [149]:
airbnb.shape

(75241, 52)

## 3.Overpass Pipeline

In [None]:
### Increase outside border of listings
london_bbox = increase_bbox(listings)

In [None]:
# Increasing the maxs by 0.01 and decreasing the mins by 0.01 
# will shift the outline's border by a bit more than 1km in each direction.

# See increase_bbox function in py_functions.py

In [None]:
# (northern hemisphere)
# latitude max = north
# latitude min = south
# longitude max = east
# longitude min = west

In [None]:
### Get OSM data for slightly bigger bbox
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(
    node["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    );
    (._;>;);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

osm = pd.json_normalize(data, record_path="elements")


In [None]:
### clean column names 
osm.columns = osm.columns.str.replace(".", "_", regex=False)
osm.columns = osm.columns.str.replace(":", "_", regex=False)


In [None]:
osm["lat"] = np.where(osm["lat"].isna(), osm["center_lat"], osm['lat'])
osm["lon"] = np.where(osm["lon"].isna(), osm["center_lon"], osm['lon'])

In [None]:
### select only desired columns
osm_keepers = ["id",
                     "lat",
                     "lon",
                     "tags_name",
                     "tags_amenity",
                     "tags_cuisine",
                     "tags_diet_vegetarian",
                     "tags_diet_vegan",
                     "tags_railway"]
                     

In [None]:
osm_short = osm[osm_keepers]

In [None]:
### combine lat & lon to a new column lat_lon, to check for duplicates 
# osm_short["lat_lon"] = osm_short["lat"].astype(str) + osm_short["lon"].astype(str)

In [None]:
### drop all rows with no lat/lon
osm_short = osm_short.dropna(subset=['lat'])


In [None]:
osm_short = osm_short.drop(osm_short[(osm_short['tags_name'].isna()) & (osm_short['tags_amenity'].isna())].index)

In [None]:
osm_short['tags_amenity'] = osm_short['tags_amenity'].str.replace('pub', 'bar')

In [None]:
osm_short['gastronomy'] = np.where(osm_short['tags_amenity'].isin(['restaurant', 'fast_food']), True, False)

In [None]:
osm_short = osm_short[osm_short['tags_amenity'].isin(['bar', 'restaurant', np.nan, 'cafe', 'fast_food', 'bakery', 'food_court'])]

In [None]:
osm_short[osm_short['tags_name'] == 'White Lion']

Unnamed: 0,id,lat,lon,tags_name,tags_amenity,tags_cuisine,tags_diet_vegetarian,tags_diet_vegan,tags_railway,gastronomy
84154,626036832,51.309568,-0.053992,White Lion,bar,,limited,limited,,False


In [None]:
osm_short['tags_diet_vegan'].unique()

array(['yes', nan, 'no', 'only', 'limited'], dtype=object)

In [None]:
osm_short['tags_diet_vegetarian'] = np.where(osm_short['tags_diet_vegetarian'].isin(['yes', 'only', 'limited']), True, osm_short['tags_diet_vegetarian'])
osm_short['tags_diet_vegetarian'] = np.where(osm_short['tags_diet_vegetarian'] == 'no', False, osm_short['tags_diet_vegetarian'])
osm_short['tags_diet_vegan'] = np.where(osm_short['tags_diet_vegan'].isin(['yes', 'only', 'limited']), True, osm_short['tags_diet_vegan'])
osm_short['tags_diet_vegan'] = np.where(osm_short['tags_diet_vegan'] == 'no', False, osm_short['tags_diet_vegan'])

In [None]:
osm_short[(osm_short['tags_name'].duplicated(keep=False)) & (~osm_short['tags_name'].isna())].sort_values('tags_name')

Unnamed: 0,id,lat,lon,tags_name,tags_amenity,tags_cuisine,tags_diet_vegetarian,tags_diet_vegan,tags_railway,gastronomy
69299,9154045091,51.551510,0.025445,% Arabica,cafe,coffee_shop,,,,False
81868,366302157,51.536501,-0.062015,% Arabica,cafe,coffee_shop,,,,False
56910,7162287707,51.511600,-0.124178,% Arabica,cafe,,,,,False
75624,10266440589,51.481276,-0.009764,15grams,cafe,coffee_shop,,,,False
75802,10280308656,51.466273,0.008868,15grams,cafe,,,,,False
...,...,...,...,...,...,...,...,...,...,...
85121,835331045,51.517020,-0.083122,itsu,fast_food,asian,,,,True
81068,294886288,51.522595,-0.136622,itsu,fast_food,asian,,,,True
81607,349295161,51.519376,-0.135280,mooboo,cafe,bubble_tea,,,,False
75459,10243071184,51.581536,-0.339206,mooboo,cafe,bubble_tea,,,,False


In [None]:
osm_short[(osm_short[['tags_name', 'lon']].duplicated(keep=False))]

Unnamed: 0,id,lat,lon,tags_name,tags_amenity,tags_cuisine,tags_diet_vegetarian,tags_diet_vegan,tags_railway,gastronomy
77013,10804698810,51.517783,-0.119808,Hiba,,lebanese;kebab,,,,False
77014,10804707265,51.517783,-0.119808,Hiba,fast_food,lebanese;kebab,,,,True
78421,148523747,51.510973,-0.119257,Pret a Manger,cafe,,,,,False
78950,161420170,51.513528,-0.112037,Wrap It Up!,fast_food,,,,,True
78955,161420193,51.513528,-0.112037,Wrap It Up!,fast_food,,True,,,True
79145,175155045,51.514317,-0.126766,Monmouth Coffee Company,,coffee_shop,,,,False
80097,232232236,51.514317,-0.126766,Monmouth Coffee Company,cafe,,,,,False
84331,671375498,51.510973,-0.119257,Pret a Manger,fast_food,,,,,True
85644,917487765,51.413644,-0.124324,,fast_food,burger,,,,True
85645,917487766,51.413613,-0.124324,,fast_food,,,,,True


In [None]:
osm_short.columns = osm_short.columns.str.replace('tags_' , '')

In [None]:
osm_short.head()

Unnamed: 0,id,lat,lon,name,amenity,cuisine,diet_vegetarian,diet_vegan,railway,gastronomy
0,451152,51.60084,-0.194608,King of Prussia,bar,pizza;burger,True,True,,False
1,451153,51.602031,-0.193503,Central Restaurant,restaurant,,,,,True
2,451154,51.599579,-0.196028,The Catcher in the Rye,bar,,,,,False
3,451271,51.614104,-0.176556,The Tally Ho,bar,,,,,False
4,12242503,51.592016,0.027962,Railway Bell,bar,,,,,False


### 3.1. First Look - Open Street Maps-Data

In [None]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"].shape

NameError: name 'streetmap_short' is not defined

In [None]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"]["tags.cuisine"].value_counts(dropna=False)

In [None]:
streetmap_short["tags.cuisine"].value_counts(dropna=False)

In [None]:
streetmap_short.head()

In [None]:
streetmap_short["tags.amenity"].unique()

In [None]:
streetmap_short.info()

In [None]:
streetmap_short.describe()

In [None]:
### Increase outside border of listings
london_bbox = increase_bbox(listings)

In [None]:
# Increasing the maxs by 0.01 and decreasing the mins by 0.01 
# will shift the outline's border by a bit more than 1km in each direction.

# See increase_bbox function in py_functions.py

In [None]:
# (northern hemisphere)
# latitude max = north
# latitude min = south
# longitude max = east
# longitude min = west

In [None]:
### Get OSM data for slightly bigger bbox
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(
    node["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    );
    (._;>;);
out body;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

osm = pd.json_normalize(data, record_path="elements")


In [None]:
### clean column names 
osm.columns = osm.columns.str.replace(".", "_", regex=False)
osm.columns = osm.columns.str.replace(":", "_", regex=False)


In [None]:
### select only desired columns
osm_keepers = (["id",
                     "lat",
                     "lon",
                     "tags_name",
                     "tags_amenity",
                     "tags_cuisine",
                     "tags_diet_vegetarian",
                     "tags_diet_vegan",
                     "tags_railway"]
                     )

In [None]:
osm_short = osm[osm_keepers]

In [None]:
### combine lat & lon to a new column lat_lon, to check for duplicates 
osm_short["lat_lon"] = osm_short["lat"].astype(str) + osm_short["lon"].astype(str)

In [None]:
### drop all rows with no lat/lon
osm_short = osm_short.dropna(subset=['lat'])


In [None]:
osm_short.head()

### 3.1. First Look - Open Street Maps-Data

In [None]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"].shape

In [None]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"]["tags.cuisine"].value_counts(dropna=False)

In [None]:
streetmap_short["tags.cuisine"].value_counts(dropna=False)

In [None]:
streetmap_short.head()

In [None]:
streetmap_short["tags.amenity"].unique()

In [None]:
streetmap_short.info()

In [None]:
streetmap_short.describe()

## 4. Web scraping test

In [None]:
from bs4 import BeautifulSoup

In [None]:
url = "http://insideairbnb.com/get-the-data/"
page = requests.get(url)

In [None]:
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
print(soup)

## 5.Area-Calc-Test

In [None]:
streetmap.head()

In [None]:
# Import libraries and set test subsets of the data
from sklearn.neighbors import BallTree
list_test = listings_short[["id", "name", "latitude", "longitude"]]
street_test = streetmap[["id", "lat", "lon", "tags.amenity", "tags.railway"]]

In [None]:
#rename the column name so that they are equal 
streetmap = streetmap.rename(columns={"lat": "latitude", "lon": "longitude"})


In [None]:
# Calculate the needed radius when converted to unit sphere.
distance_in_meter = 500
earth_radius_in_meter = 6_371_000

radius = distance_in_meter / earth_radius_in_meter

In [None]:
# Convert the latitude and longitude columns to radians
list_test = list_test.copy()
list_test.loc[:, 'lat_rad'] = np.radians(list_test['latitude'])
list_test.loc[:, 'lon_rad'] = np.radians(list_test['longitude'])
street_test = street_test.copy()
street_test.loc[:, 'lat_rad'] = np.radians(street_test['latitude'])
street_test.loc[:, 'lon_rad'] = np.radians(street_test['longitude'])

# Create a BallTree object with the latitude and longitude columns
tree = BallTree(street_test[['lat_rad', 'lon_rad']],
                leaf_size=15, metric='haversine')

# Find the indices of all neighbors within a radius of 500 meters
# for each row in list_test
indices = tree.query_radius(
    list_test[['lat_rad', 'lon_rad']], r=radius, count_only=False)

# Calculate the number of neighbors for each amenity type
amenity_types = street_test['tags.amenity'].unique()
amenity_counts = np.zeros((list_test.shape[0], amenity_types.shape[0]))
for i, amenity in enumerate(amenity_types):
    street_indices = street_test[street_test['tags.amenity'] == amenity].index
    intersection_counts = np.array(
        [np.intersect1d(street_indices, idx).size for idx in indices])
    amenity_counts[:, i] = intersection_counts

# Add the new columns to list_test
list_test = pd.concat([list_test, pd.DataFrame(amenity_counts, columns=[
                      f'num_neighbors_{amenity}' for amenity in amenity_types])], axis=1)

# Calculate the number of neighbors for each railway type
railway_types = street_test['tags.railway'].unique()
railway_counts = np.zeros((list_test.shape[0], railway_types.shape[0]))
for i, railway in enumerate(railway_types):
    street_indices = street_test[street_test['tags.railway'] == railway].index
    intersection_counts = np.array(
        [np.intersect1d(street_indices, idx).size for idx in indices])
    railway_counts[:, i] = intersection_counts

# Add the new columns to list_test
list_test = pd.concat([list_test, pd.DataFrame(railway_counts, columns=[
                      f'num_neighbors_{railway}' for railway in railway_types])], axis=1)


# Remove the temporary columns
list_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
street_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)


In [None]:
list_test.describe()

## 6.Nearest Station-Test

In [None]:
streetmap["tags.railway"].unique()

In [None]:
streetmap.dropna(subset=["tags.railway"], inplace=True)


In [None]:
from haversine import haversine, Unit

# define a function to calculate distance between two points
def calc_distance(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# get all unique values in "tags.railway"
railway_tags = streetmap["tags.railway"].unique()

# loop through each row in listings_short and calculate the minimum distance
# for each value in streetmap["tags.railway"]
for tag in railway_tags:
    distances = []
    for _, row in listings_short.iterrows():
        min_distance = None
        for _, sm_row in streetmap[streetmap["tags.railway"] == tag].iterrows():
            distance = calc_distance(row["latitude"], row["longitude"], sm_row["latitude"], sm_row["longitude"])
            if min_distance is None or distance < min_distance:
                min_distance = distance
        distances.append(min_distance)

    # add the calculated minimum distances as a new column in listings_short
    col_name = "min_distance_{}".format(tag)
    listings_short[col_name] = distances


In [None]:
# from haversine import haversine, Unit

# # define a function to calculate distance between two points
# def calc_distance(lat1, lon1, lat2, lon2):
#     return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# # loop through each row in listings_short and calculate the minimum distance
# # for "subway_entrance" tag
# distances = []
# for _, row in listings_short.iterrows():
#     min_distance = None
#     for _, sm_row in streetmap[streetmap["tags.railway"] == "subway_entrance"].iterrows():
#         distance = calc_distance(row["latitude"], row["longitude"], sm_row["latitude"], sm_row["longitude"])
#         if min_distance is None or distance < min_distance:
#             min_distance = distance
#     distances.append(min_distance)

# # add the calculated minimum distances as a new column in listings_short
# listings_short["min_distance_subway_entrance"] = distances


In [None]:
listings_short.describe()

Price correlation

In [None]:
list_test["price"] = listings_short["price"]

In [None]:
list_test.price = list_test.price.str[1:-3]
list_test.price = list_test.price.str.replace(",", "")
list_test.price = list_test.price.astype('int64')

In [None]:
list_test.corrwith(list_test['price'])