# Airbnb Capstone Project

## 1.Import all Libraries

In [1]:
### import all libraries and set settings 
import pandas as pd
import numpy as np
import requests
import json
import gzip
from py_functions import increase_bbox 

pd.set_option('display.max_columns', None) # show all columns  

## 2.Inside Airbnb pipeline

In [2]:
### Define path, .gz archive file name, country and city for url
path ='data/'
gz_file = "listings.csv.gz"
country = "united-kingdom"
state = "england"
city = "london"
url = f"http://data.insideairbnb.com/{country}/{state}/{city}/2023-03-14/data/{gz_file}"

In [3]:
### Create new directory for city
!mkdir {path}{city}

mkdir: data/london: File exists


In [4]:
### Download the .gz file
r = requests.get(url)
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [5]:
### Unzip the .gz file and save the content as pd.DataFrame via read_csv
with gzip.open(path+city+'/'+gz_file) as f:
    listings = pd.read_csv(f)


In [6]:
### select only desired columns 
columns_keeper = (["id",
                   "listing_url",
                   "name",
                   "picture_url",
                   "host_id",
                   "host_response_rate",
                   "host_acceptance_rate",
                   "host_is_superhost",
                   "host_listings_count",
                   "host_total_listings_count",
                   "neighbourhood_cleansed",
                   "latitude",
                   "longitude",
                   "property_type",
                   "room_type",
                   "accommodates",
                   "bathrooms_text",
                   "bedrooms",
                   "beds",
                   "amenities",
                   "price",
                   "minimum_nights",
                   "maximum_nights",
                   "instant_bookable",
                   "number_of_reviews",
                   "number_of_reviews_ltm",
                   "number_of_reviews_l30d",
                   "first_review",
                   "last_review",
                   "review_scores_rating",
                   "review_scores_accuracy",
                   "review_scores_cleanliness",
                   "review_scores_checkin",
                   "review_scores_communication",
                   "review_scores_location",
                   "review_scores_value",
                   "reviews_per_month"]
                  )

In [7]:
### filter columns 
listings_short = listings[columns_keeper]

### 2.2.First Look - Airbnb Data

In [8]:
listings_short.head()

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room in condo,Private room,1,1 shared bath,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",$100.00,1,365,f,0,0,0,,,,,,,,,,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100%,100%,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire rental unit,Entire home/apt,1,1 bath,1.0,1.0,[],$65.00,180,365,t,0,0,0,,,,,,,,,,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,100%,91%,t,4.0,8.0,Harrow,51.60818,-0.2774,Entire rental unit,Entire home/apt,4,2 baths,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",$132.00,2,28,t,0,0,0,,,,,,,,,,
3,3518856,https://www.airbnb.com/rooms/3518856,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,100%,f,2.0,5.0,Merton,51.42231,-0.18841,Private room in rental unit,Private room,1,1 private bath,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",$100.00,5,1125,f,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,75%,46%,f,1.0,1.0,Barnet,51.602282,-0.193606,Entire condo,Entire home/apt,2,1 bath,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",$120.00,5,90,f,0,0,0,,,,,,,,,,


In [9]:
listings_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75241 entries, 0 to 75240
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           75241 non-null  int64  
 1   listing_url                  75241 non-null  object 
 2   name                         75210 non-null  object 
 3   picture_url                  75241 non-null  object 
 4   host_id                      75241 non-null  int64  
 5   host_response_rate           46285 non-null  object 
 6   host_acceptance_rate         51028 non-null  object 
 7   host_is_superhost            75223 non-null  object 
 8   host_listings_count          75236 non-null  float64
 9   host_total_listings_count    75236 non-null  float64
 10  neighbourhood_cleansed       75241 non-null  object 
 11  latitude                     75241 non-null  float64
 12  longitude                    75241 non-null  float64
 13  property_type   

In [10]:
listings_short.describe()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
count,75241.0,75241.0,75236.0,75236.0,75241.0,75241.0,75241.0,71768.0,74135.0,75241.0,75241.0,75241.0,75241.0,75241.0,56548.0,55595.0,55606.0,55564.0,55592.0,55565.0,55562.0,56548.0
mean,2.368628e+17,139076500.0,39.525958,71.3791,51.509708,-0.128108,3.105793,1.513153,1.772833,5.750748,7790.3,17.974668,5.736301,0.456467,4.588159,4.723349,4.623915,4.783393,4.801027,4.729358,4.607755,0.877064
std,3.425911e+17,152962100.0,222.170789,420.039233,0.048369,0.099341,1.936972,0.885015,1.228013,24.240947,1914055.0,41.984021,12.991805,1.277612,0.779083,0.489328,0.550721,0.453835,0.448759,0.418873,0.521839,1.234003
min,13913.0,2594.0,1.0,1.0,51.295937,-0.4978,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,19817400.0,19959230.0,1.0,1.0,51.48354,-0.18939,2.0,1.0,1.0,1.0,42.0,1.0,0.0,0.0,4.5,4.67,4.5,4.75,4.79,4.64,4.5,0.13
50%,39338750.0,67455190.0,2.0,2.0,51.51384,-0.12628,2.0,1.0,1.0,2.0,365.0,4.0,0.0,0.0,4.82,4.89,4.8,4.94,4.97,4.85,4.75,0.45
75%,6.562985e+17,224867000.0,5.0,8.0,51.53945,-0.06846,4.0,2.0,2.0,4.0,1125.0,17.0,6.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,4.97,1.09
max,8.463271e+17,505040000.0,2138.0,24047.0,51.681142,0.28857,16.0,22.0,38.0,1125.0,524855600.0,1328.0,564.0,68.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,51.05


## 2.3. Clean Airbnb 

### 2.3.1. Handling Missing Data 

In [11]:
listings_short.shape

(75241, 37)

In [12]:
listings_short.isnull().sum()

id                                 0
listing_url                        0
name                              31
picture_url                        0
host_id                            0
host_response_rate             28956
host_acceptance_rate           24213
host_is_superhost                 18
host_listings_count                5
host_total_listings_count          5
neighbourhood_cleansed             0
latitude                           0
longitude                          0
property_type                      0
room_type                          0
accommodates                       0
bathrooms_text                   124
bedrooms                        3473
beds                            1106
amenities                          0
price                              0
minimum_nights                     0
maximum_nights                     0
instant_bookable                   0
number_of_reviews                  0
number_of_reviews_ltm              0
number_of_reviews_l30d             0
f

**host_is_superhost**

In [13]:
# check the different values of "host_is_superhost"
listings_short["host_is_superhost"].value_counts(dropna=False)

f      64574
t      10649
NaN       18
Name: host_is_superhost, dtype: int64

In [14]:
# check how many listings the hosts with nan value for "host_is_superhost" have: 
listings_short[listings_short['host_is_superhost'].isna()]["host_total_listings_count"].value_counts()

5.0     4
2.0     3
6.0     2
10.0    2
7.0     2
4.0     2
26.0    2
1.0     1
Name: host_total_listings_count, dtype: int64

In [15]:
# we can fill values with "f" for false 
listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")


In [16]:
# renaming Rows with NaN to "Unknown"
listings_short[["name", "host_response_time", "host_response_rate",
                "host_acceptance_rate"]] = listings_short[["name", "host_response_time", "host_response_rate",
                                                           "host_acceptance_rate"]].fillna("Unknown")


KeyError: "['host_response_time'] not in index"

**host_listings_count & host_total_listings_count**

In [17]:
# set the mode for host_listings_count & host_total_listings_count
listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)

listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)


**neighbourhood_group_cleansed & bathrooms**

**bedrooms , beds & bathrooms_text**

In [18]:
# set the mode for above columns
listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)

listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)

listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)


In [19]:
#listings_short[(listings_short["bedrooms"].isna()) & (listings_short["beds"].isna())].count()

In [20]:
#listings_short[listings_short["bedrooms"].isna()][['property_type',"room_type"]].value_counts()

In [21]:
listings_short.head()

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room in condo,Private room,1,1 shared bath,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",$100.00,1,365,f,0,0,0,,,,,,,,,,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100%,100%,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire rental unit,Entire home/apt,1,1 bath,1.0,1.0,[],$65.00,180,365,t,0,0,0,,,,,,,,,,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,100%,91%,t,4.0,8.0,Harrow,51.60818,-0.2774,Entire rental unit,Entire home/apt,4,2 baths,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",$132.00,2,28,t,0,0,0,,,,,,,,,,
3,3518856,https://www.airbnb.com/rooms/3518856,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,100%,f,2.0,5.0,Merton,51.42231,-0.18841,Private room in rental unit,Private room,1,1 private bath,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",$100.00,5,1125,f,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,75%,46%,f,1.0,1.0,Barnet,51.602282,-0.193606,Entire condo,Entire home/apt,2,1 bath,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",$120.00,5,90,f,0,0,0,,,,,,,,,,


In [22]:
listings_short.duplicated().any()

False

In [23]:
listings_short["bathrooms_text"].value_counts()

1 bath               32853
1 shared bath        11817
2 baths               8867
1 private bath        6493
1.5 baths             5677
1.5 shared baths      3145
2.5 baths             2083
2 shared baths        1183
3 baths               1110
3.5 baths              566
2.5 shared baths       369
4 baths                217
3 shared baths         173
4.5 baths              120
0 shared baths          92
0 baths                 82
Half-bath               78
5 baths                 58
Shared half-bath        49
3.5 shared baths        42
Private half-bath       28
5.5 baths               27
6.5 baths               19
4.5 shared baths        15
4 shared baths          13
6 baths                 12
12 baths                 9
5 shared baths           7
7.5 baths                5
7 baths                  4
5.5 shared baths         4
23 baths                 3
9 baths                  3
8 baths                  3
12.5 baths               2
7 shared baths           2
8 shared baths           2
1

In [24]:
listings_short["amenities"].value_counts()

[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     568
["First aid kit", "Long term stays allowed", "Kitchen", "Refrigerator", "Elevator", "Lock on bedroom door", "Dedicated workspace", "Bed linens", "Heating", "Wifi", "Dishes and silverware", "Stove", "Microwave", "Essentials", "Oven"]                                                                                                                                                                                                                                                     

In [25]:
listings_short[listings_short["first_review"].isna()]["number_of_reviews"].value_counts()

0    18693
Name: number_of_reviews, dtype: int64

## 3.Overpass Pipeline

In [26]:
### Increase outside border of listings
london_bbox = increase_bbox(listings)

In [27]:
# Increasing the maxs by 0.01 and decreasing the mins by 0.01 
# will shift the outline's border by a bit more than 1km in each direction.

# See increase_bbox function in py_functions.py

In [28]:
# (northern hemisphere)
# latitude max = north
# latitude min = south
# longitude max = east
# longitude min = west

In [127]:
### Get OSM data for slightly bigger bbox
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(
    node["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    );
    (._;>;);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

osm = pd.json_normalize(data, record_path="elements")


In [128]:
### clean column names 
osm.columns = osm.columns.str.replace(".", "_", regex=False)
osm.columns = osm.columns.str.replace(":", "_", regex=False)


In [142]:
osm["lat"] = np.where(osm["lat"].isna(), osm["center_lat"], osm['lat'])
osm["lon"] = np.where(osm["lon"].isna(), osm["center_lon"], osm['lon'])

In [180]:
### select only desired columns
osm_keepers = ["id",
                     "lat",
                     "lon",
                     "tags_name",
                     "tags_amenity",
                     "tags_cuisine",
                     "tags_diet_vegetarian",
                     "tags_diet_vegan",
                     "tags_railway"]
                     

In [181]:
osm_short = osm[osm_keepers]

In [175]:
### combine lat & lon to a new column lat_lon, to check for duplicates 
# osm_short["lat_lon"] = osm_short["lat"].astype(str) + osm_short["lon"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  osm_short["lat_lon"] = osm_short["lat"].astype(str) + osm_short["lon"].astype(str)


In [182]:
### drop all rows with no lat/lon
osm_short = osm_short.dropna(subset=['lat'])


In [183]:
osm_short = osm_short.drop(osm_short[(osm_short['tags_name'].isna()) & (osm_short['tags_amenity'].isna())].index)

In [188]:
osm_short['tags_amenity'] = osm_short['tags_amenity'].str.replace('pub', 'bar')

In [200]:
osm_short['gastronomy'] = np.where(osm_short['tags_amenity'].isin(['restaurant', 'fast_food']), True, False)

In [190]:
osm_short = osm_short[osm_short['tags_amenity'].isin(['bar', 'restaurant', np.nan, 'cafe', 'fast_food', 'bakery', 'food_court'])]

In [202]:
osm_short[osm_short['tags_name'] == 'White Lion']

Unnamed: 0,id,lat,lon,tags_name,tags_amenity,tags_cuisine,tags_diet_vegetarian,tags_diet_vegan,tags_railway,restaurant_total,gastronomy
84167,626036832,51.309568,-0.053992,White Lion,bar,,limited,limited,,0,False


In [208]:
osm_short['tags_diet_vegan'].unique()

array([True, nan, False], dtype=object)

In [207]:
osm_short['tags_diet_vegetarian'] = np.where(osm_short['tags_diet_vegetarian'].isin(['yes', 'only', 'limited']), True, osm_short['tags_diet_vegetarian'])
osm_short['tags_diet_vegetarian'] = np.where(osm_short['tags_diet_vegetarian'] == 'no', False, osm_short['tags_diet_vegetarian'])
osm_short['tags_diet_vegan'] = np.where(osm_short['tags_diet_vegan'].isin(['yes', 'only', 'limited']), True, osm_short['tags_diet_vegan'])
osm_short['tags_diet_vegan'] = np.where(osm_short['tags_diet_vegan'] == 'no', False, osm_short['tags_diet_vegan'])

In [231]:
osm_short[(osm_short['tags_name'].duplicated(keep=False)) & (~osm_short['tags_name'].isna())].sort_values('tags_name')

Unnamed: 0,id,lat,lon,tags_name,tags_amenity,tags_cuisine,tags_diet_vegetarian,tags_diet_vegan,tags_railway,restaurant_total,gastronomy
69310,9154045091,51.551510,0.025445,% Arabica,cafe,coffee_shop,,,,0,False
81881,366302157,51.536501,-0.062015,% Arabica,cafe,coffee_shop,,,,0,False
56921,7162287707,51.511600,-0.124178,% Arabica,cafe,,,,,0,False
75813,10280308656,51.466273,0.008868,15grams,cafe,,,,,0,False
75635,10266440589,51.481276,-0.009764,15grams,cafe,coffee_shop,,,,0,False
...,...,...,...,...,...,...,...,...,...,...,...
56618,7118074388,51.504409,-0.018260,itsu,fast_food,asian,,,,1,True
3068,444285245,51.490152,-0.162302,itsu,fast_food,asian,,,,1,True
12812,1540888903,51.557974,-0.118892,mooboo,cafe,bubble tea,,,,0,False
75470,10243071184,51.581536,-0.339206,mooboo,cafe,bubble_tea,,,,0,False


In [None]:
osm_short[(osm_short[['tags_name', 'lon']].duplicated(keep=False))]

In [246]:
osm_short.columns = osm_short.columns.str.replace('tags_' , '')

In [247]:
osm_short.head()

Unnamed: 0,id,lat,lon,name,amenity,cuisine,diet_vegetarian,diet_vegan,railway,restaurant_total,gastronomy
0,451152,51.60084,-0.194608,King of Prussia,bar,pizza;burger,True,True,,0,False
1,451153,51.602031,-0.193503,Central Restaurant,restaurant,,,,,1,True
2,451154,51.599579,-0.196028,The Catcher in the Rye,bar,,,,,0,False
3,451271,51.614104,-0.176556,The Tally Ho,bar,,,,,0,False
4,12242503,51.592016,0.027962,Railway Bell,bar,,,,,0,False


### 3.1. First Look - Open Street Maps-Data

In [36]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"].shape

NameError: name 'streetmap_short' is not defined

In [None]:
streetmap_short[streetmap_short["tags.amenity"] == "restaurant"]["tags.cuisine"].value_counts(dropna=False)

NaN                      1619
indian                    480
italian                   448
pizza                     273
chinese                   233
                         ... 
italian;chinese             1
sichuan;chinese             1
spanish;mediterranean       1
scottish                    1
uyghur                      1
Name: tags.cuisine, Length: 358, dtype: int64

In [None]:
streetmap_short["tags.cuisine"].value_counts(dropna=False)

NaN                      8554
coffee_shop               849
pizza                     613
indian                    601
italian                   485
                         ... 
coffee_shop;danish          1
tapas;spanish;seafood       1
English breakfast           1
ramen;japanese              1
lebanese;kebab              1
Name: tags.cuisine, Length: 640, dtype: int64

In [None]:
streetmap_short.head()

Unnamed: 0,id,lat,lon,tags.name,tags.amenity,tags.cuisine,tags.railway,tags.public_transport,tags.diet:vegetarian,tags.diet:vegan
0,451152,51.60084,-0.194608,King of Prussia,pub,pizza;burger,,,yes,yes
1,451153,51.602031,-0.193503,Central Restaurant,restaurant,,,,,
2,451154,51.599579,-0.196028,The Catcher in the Rye,pub,,,,,
3,451271,51.614104,-0.176556,The Tally Ho,pub,,,,,
4,638645,51.372908,-0.414344,Walton-on-Thames,,,station,station,,


In [None]:
streetmap_short["tags.amenity"].unique()

array(['pub', 'restaurant', nan, 'cafe', 'bar', 'bus_station',
       'fast_food', 'ferry_terminal'], dtype=object)

In [None]:
streetmap_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16540 entries, 0 to 16539
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     16540 non-null  int64  
 1   lat                    16540 non-null  float64
 2   lon                    16540 non-null  float64
 3   tags.name              15925 non-null  object 
 4   tags.amenity           15260 non-null  object 
 5   tags.cuisine           7986 non-null   object 
 6   tags.railway           1280 non-null   object 
 7   tags.public_transport  764 non-null    object 
 8   tags.diet:vegetarian   753 non-null    object 
 9   tags.diet:vegan        541 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB


In [None]:
streetmap_short.describe()

Unnamed: 0,id,lat,lon
count,16540.0,16540.0,16540.0
mean,4684916000.0,51.500684,-0.131178
std,3258168000.0,0.068268,0.130715
min,451152.0,51.286089,-0.507334
25%,1650741000.0,51.469531,-0.190261
50%,4603416000.0,51.51009,-0.125446
75%,7147501000.0,51.535076,-0.072672
max,10819590000.0,51.691127,0.298538


## 4. Web scraping test

In [None]:
from bs4 import BeautifulSoup

In [None]:
url = "http://insideairbnb.com/get-the-data/"
page = requests.get(url)

In [None]:
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
print(soup)

## 5.Area-Calc-Test

In [None]:
streetmap.head()

Unnamed: 0,id,lat,lon,tags.name,tags.amenity,tags.cuisine,tags.railway,tags.public_transport,tags.diet:vegetarian,tags.diet:vegan
0,15262026,51.518156,-0.169879,,,,,,,
1,20576176,51.32522,-0.006881,,,,,,,
2,20576312,51.30953,-0.054178,,,,,,,
3,21141374,51.52227,-0.163755,,,,,,,
4,21310453,51.486288,-0.121714,,,,,,,


In [None]:
# Import libraries and set test subsets of the data
from sklearn.neighbors import BallTree
list_test = listings_short[["id", "name", "latitude", "longitude"]]
street_test = streetmap[["id", "lat", "lon", "tags.amenity", "tags.railway"]]

In [None]:
#rename the column name so that they are equal 
streetmap = streetmap.rename(columns={"lat": "latitude", "lon": "longitude"})


In [None]:
# Calculate the needed radius when converted to unit sphere.
distance_in_meter = 500
earth_radius_in_meter = 6_371_000

radius = distance_in_meter / earth_radius_in_meter

In [None]:
# Convert the latitude and longitude columns to radians
list_test = list_test.copy()
list_test.loc[:, 'lat_rad'] = np.radians(list_test['latitude'])
list_test.loc[:, 'lon_rad'] = np.radians(list_test['longitude'])
street_test = street_test.copy()
street_test.loc[:, 'lat_rad'] = np.radians(street_test['latitude'])
street_test.loc[:, 'lon_rad'] = np.radians(street_test['longitude'])

# Create a BallTree object with the latitude and longitude columns
tree = BallTree(street_test[['lat_rad', 'lon_rad']],
                leaf_size=15, metric='haversine')

# Find the indices of all neighbors within a radius of 500 meters
# for each row in list_test
indices = tree.query_radius(
    list_test[['lat_rad', 'lon_rad']], r=radius, count_only=False)

# Calculate the number of neighbors for each amenity type
amenity_types = street_test['tags.amenity'].unique()
amenity_counts = np.zeros((list_test.shape[0], amenity_types.shape[0]))
for i, amenity in enumerate(amenity_types):
    street_indices = street_test[street_test['tags.amenity'] == amenity].index
    intersection_counts = np.array(
        [np.intersect1d(street_indices, idx).size for idx in indices])
    amenity_counts[:, i] = intersection_counts

# Add the new columns to list_test
list_test = pd.concat([list_test, pd.DataFrame(amenity_counts, columns=[
                      f'num_neighbors_{amenity}' for amenity in amenity_types])], axis=1)

# Calculate the number of neighbors for each railway type
railway_types = street_test['tags.railway'].unique()
railway_counts = np.zeros((list_test.shape[0], railway_types.shape[0]))
for i, railway in enumerate(railway_types):
    street_indices = street_test[street_test['tags.railway'] == railway].index
    intersection_counts = np.array(
        [np.intersect1d(street_indices, idx).size for idx in indices])
    railway_counts[:, i] = intersection_counts

# Add the new columns to list_test
list_test = pd.concat([list_test, pd.DataFrame(railway_counts, columns=[
                      f'num_neighbors_{railway}' for railway in railway_types])], axis=1)


# Remove the temporary columns
list_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
street_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)


In [None]:
list_test.describe()

Unnamed: 0,id,latitude,longitude,num_neighbors_nan,num_neighbors_pub,num_neighbors_fast_food,num_neighbors_clock,num_neighbors_atm,num_neighbors_toilets,num_neighbors_ferry_terminal,num_neighbors_restaurant,num_neighbors_dentist,num_neighbors_watering_place,num_neighbors_drinking_water,num_neighbors_bench,num_neighbors_parking_entrance,num_neighbors_bicycle_parking,num_neighbors_cafe,num_neighbors_bar,num_neighbors_bus_station,num_neighbors_nan.1,num_neighbors_pub.1,num_neighbors_fast_food.1,num_neighbors_clock.1,num_neighbors_atm.1,num_neighbors_toilets.1,num_neighbors_ferry_terminal.1,num_neighbors_restaurant.1,num_neighbors_dentist.1,num_neighbors_watering_place.1,num_neighbors_drinking_water.1,num_neighbors_bench.1,num_neighbors_parking_entrance.1,num_neighbors_bicycle_parking.1,num_neighbors_cafe.1,num_neighbors_bar.1,num_neighbors_bus_station.1,num_neighbors_nan.2,num_neighbors_subway_entrance,num_neighbors_entrance,num_neighbors_stop,num_neighbors_station,num_neighbors_tram_stop
count,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0
mean,2.368628e+17,51.509708,-0.128108,0.0,4.409511,10.803378,0.013809,0.020946,0.004187,0.069603,16.642974,0.001063,0.000173,0.002778,0.002831,0.001489,0.003203,12.25211,2.122952,0.039207,0.0,4.409511,10.803378,0.013809,0.020946,0.004187,0.069603,16.642974,0.001063,0.000173,0.002778,0.002831,0.001489,0.003203,12.25211,2.122952,0.039207,0.0,1.448878,0.001688,0.003588,2.308223,0.249917
std,3.425911e+17,0.048369,0.099341,0.0,9.416939,26.866415,0.116698,0.173963,0.06559,0.415069,42.29629,0.03259,0.013143,0.052631,0.053131,0.054543,0.056505,30.387748,4.906068,0.238124,0.0,9.416939,26.866415,0.116698,0.173963,0.06559,0.415069,42.29629,0.03259,0.013143,0.052631,0.053131,0.054543,0.056505,30.387748,4.906068,0.238124,0.0,3.823028,0.056099,0.059797,6.168676,1.346805
min,13913.0,51.295937,-0.4978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19817400.0,51.48354,-0.18939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39338750.0,51.51384,-0.12628,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.562985e+17,51.53945,-0.06846,0.0,4.0,8.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,0.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,8.463271e+17,51.681142,0.28857,0.0,77.0,247.0,1.0,2.0,2.0,5.0,383.0,1.0,1.0,1.0,1.0,2.0,1.0,287.0,47.0,2.0,0.0,77.0,247.0,1.0,2.0,2.0,5.0,383.0,1.0,1.0,1.0,1.0,2.0,1.0,287.0,47.0,2.0,0.0,43.0,2.0,1.0,52.0,14.0


## 6.Nearest Station-Test

In [None]:
streetmap["tags.railway"].unique()

array([nan, 'subway_entrance', 'entrance', 'stop', 'station', 'tram_stop'],
      dtype=object)

In [None]:
streetmap.dropna(subset=["tags.railway"], inplace=True)


In [None]:
from haversine import haversine, Unit

# define a function to calculate distance between two points
def calc_distance(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# get all unique values in "tags.railway"
railway_tags = streetmap["tags.railway"].unique()

# loop through each row in listings_short and calculate the minimum distance
# for each value in streetmap["tags.railway"]
for tag in railway_tags:
    distances = []
    for _, row in listings_short.iterrows():
        min_distance = None
        for _, sm_row in streetmap[streetmap["tags.railway"] == tag].iterrows():
            distance = calc_distance(row["latitude"], row["longitude"], sm_row["latitude"], sm_row["longitude"])
            if min_distance is None or distance < min_distance:
                min_distance = distance
        distances.append(min_distance)

    # add the calculated minimum distances as a new column in listings_short
    col_name = "min_distance_{}".format(tag)
    listings_short[col_name] = distances


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short[col_name] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short[col_name] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short[col_name] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

In [None]:
# from haversine import haversine, Unit

# # define a function to calculate distance between two points
# def calc_distance(lat1, lon1, lat2, lon2):
#     return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# # loop through each row in listings_short and calculate the minimum distance
# # for "subway_entrance" tag
# distances = []
# for _, row in listings_short.iterrows():
#     min_distance = None
#     for _, sm_row in streetmap[streetmap["tags.railway"] == "subway_entrance"].iterrows():
#         distance = calc_distance(row["latitude"], row["longitude"], sm_row["latitude"], sm_row["longitude"])
#         if min_distance is None or distance < min_distance:
#             min_distance = distance
#     distances.append(min_distance)

# # add the calculated minimum distances as a new column in listings_short
# listings_short["min_distance_subway_entrance"] = distances


In [None]:
listings_short.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,min_distance_subway_entrance,min_distance_entrance,min_distance_stop,min_distance_station,min_distance_tram_stop
count,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,56548.0,55595.0,55606.0,55564.0,55592.0,55565.0,55562.0,56548.0,75241.0,75241.0,75241.0,75241.0,75241.0
mean,2.368628e+17,20230310000000.0,139076500.0,39.523398,71.374424,51.509708,-0.128108,3.105793,1.489467,1.761473,5.750748,7790.3,17.974668,5.736301,0.456467,4.588159,4.723349,4.623915,4.783393,4.801027,4.729358,4.607755,0.877064,1175.880056,16688.645228,6880.722254,498.407503,11457.687812
std,3.425911e+17,12.96884,152962100.0,222.163629,420.025668,0.048369,0.099341,1.936972,0.871029,1.222497,24.240947,1914055.0,41.984021,12.991805,1.277612,0.779083,0.489328,0.550721,0.453835,0.448759,0.418873,0.521839,1.234003,1546.349009,5240.929974,3926.887173,350.291556,4912.939491
min,13913.0,20230310000000.0,2594.0,1.0,1.0,51.295937,-0.4978,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,1.077511,78.976398,20.607206,1.150966,32.403778
25%,19817400.0,20230310000000.0,19959230.0,1.0,1.0,51.48354,-0.18939,2.0,1.0,1.0,1.0,42.0,1.0,0.0,0.0,4.5,4.67,4.5,4.75,4.79,4.64,4.5,0.13,345.896172,13687.440042,4022.241663,266.743858,8236.041146
50%,39338750.0,20230310000000.0,67455190.0,2.0,2.0,51.51384,-0.12628,2.0,1.0,1.0,2.0,365.0,4.0,0.0,0.0,4.82,4.89,4.8,4.94,4.97,4.85,4.75,0.45,649.363801,16742.68294,6321.069869,425.348579,11379.540076
75%,6.562985e+17,20230310000000.0,224867000.0,5.0,8.0,51.53945,-0.06846,4.0,2.0,2.0,4.0,1125.0,17.0,6.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,4.97,1.09,1280.920791,19581.228836,8836.758199,641.052645,14414.214378
max,8.463271e+17,20230310000000.0,505040000.0,2138.0,24047.0,51.681142,0.28857,16.0,22.0,38.0,1125.0,524855600.0,1328.0,564.0,68.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,51.05,20340.499863,39554.20209,27737.123484,7394.015515,30478.709223


Price correlation

In [None]:
list_test["price"] = listings_short["price"]

In [None]:
list_test.price = list_test.price.str[1:-3]
list_test.price = list_test.price.str.replace(",", "")
list_test.price = list_test.price.astype('int64')

In [None]:
list_test.corrwith(list_test['price'])

  list_test.corrwith(list_test['price'])


id                                0.064004
latitude                          0.005344
longitude                        -0.034336
num_neighbors_nan                      NaN
num_neighbors_pub                 0.044490
num_neighbors_fast_food           0.046168
num_neighbors_clock               0.004750
num_neighbors_atm                -0.004762
num_neighbors_toilets            -0.001710
num_neighbors_restaurant          0.046288
num_neighbors_dentist            -0.005625
num_neighbors_watering_place     -0.002393
num_neighbors_drinking_water     -0.004003
num_neighbors_bench               0.080874
num_neighbors_parking_entrance   -0.005989
num_neighbors_bicycle_parking    -0.009015
num_neighbors_cafe                0.044919
num_neighbors_bar                 0.048390
price                             1.000000
num_neighbors_nan                      NaN
num_neighbors_pub                 0.013740
num_neighbors_fast_food           0.017038
num_neighbors_clock               0.000094
num_neighbo