# Airbnb Capstone Project

## 1.Import all Libraries

In [1]:
### import all libraries and set settings 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import json
import gzip
import pyproj
import math

from py_functions import increase_bbox 
from sklearn.neighbors import BallTree
from scipy.spatial import cKDTree
from shapely.geometry import Point
from shapely.ops import transform
from functools import partial

pd.set_option('display.max_columns', None) # show all columns  

## 2.Inside Airbnb pipeline

In [2]:
### Define path, .gz archive file name, country and city for url
path ='data/'
gz_file = "listings.csv.gz"
country = "united-kingdom"
state = "england"
city = "london"
url = f"http://data.insideairbnb.com/{country}/{state}/{city}/2023-03-14/data/{gz_file}"

In [3]:
### Create new directory for city
!mkdir {path}{city}

mkdir: data/london: File exists


In [4]:
### Download the .gz file
r = requests.get(url)
with open(path+city+'/'+gz_file, 'wb') as f:
    f.write(r.content)

In [5]:
### Unzip the .gz file and save the content as pd.DataFrame via read_csv
with gzip.open(path+city+'/'+gz_file) as f:
    listings = pd.read_csv(f)


In [6]:
### select only desired columns 
columns_keeper = (["id",
                   "listing_url",
                   "name",
                   "picture_url",
                   "host_id",
                   "host_response_rate",
                   "host_acceptance_rate",
                   "host_is_superhost",
                   "host_listings_count",
                   "host_total_listings_count",
                   "neighbourhood_cleansed",
                   "latitude",
                   "longitude",
                   "room_type",
                   "accommodates",
                   "bathrooms_text",
                   "bedrooms",
                   "beds",
                   "amenities",
                   "price",
                   "availability_90",
                   "minimum_nights",
                   "maximum_nights",
                   "instant_bookable",
                   "number_of_reviews",
                   "number_of_reviews_ltm",
                   "number_of_reviews_l30d",
                   "first_review",
                   "last_review",
                   "review_scores_rating",
                   "review_scores_accuracy",
                   "review_scores_cleanliness",
                   "review_scores_checkin",
                   "review_scores_communication",
                   "review_scores_location",
                   "review_scores_value",
                   "reviews_per_month"]
                  )

In [7]:
### filter columns 
listings_short = listings[columns_keeper]

### 2.2.First Look - Airbnb Data

In [8]:
listings_short.head()

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,availability_90,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room,1,1 shared bath,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",$100.00,90,1,365,f,0,0,0,,,,,,,,,,
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100%,100%,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire home/apt,1,1 bath,1.0,1.0,[],$65.00,90,180,365,t,0,0,0,,,,,,,,,,
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,100%,91%,t,4.0,8.0,Harrow,51.60818,-0.2774,Entire home/apt,4,2 baths,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",$132.00,35,2,28,t,0,0,0,,,,,,,,,,
3,3518856,https://www.airbnb.com/rooms/3518856,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,100%,f,2.0,5.0,Merton,51.42231,-0.18841,Private room,1,1 private bath,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",$100.00,90,5,1125,f,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05
4,4876550,https://www.airbnb.com/rooms/4876550,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,75%,46%,f,1.0,1.0,Barnet,51.602282,-0.193606,Entire home/apt,2,1 bath,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",$120.00,83,5,90,f,0,0,0,,,,,,,,,,


In [9]:
listings_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75241 entries, 0 to 75240
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           75241 non-null  int64  
 1   listing_url                  75241 non-null  object 
 2   name                         75210 non-null  object 
 3   picture_url                  75241 non-null  object 
 4   host_id                      75241 non-null  int64  
 5   host_response_rate           46285 non-null  object 
 6   host_acceptance_rate         51028 non-null  object 
 7   host_is_superhost            75223 non-null  object 
 8   host_listings_count          75236 non-null  float64
 9   host_total_listings_count    75236 non-null  float64
 10  neighbourhood_cleansed       75241 non-null  object 
 11  latitude                     75241 non-null  float64
 12  longitude                    75241 non-null  float64
 13  room_type       

In [10]:
listings_short.describe()

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,availability_90,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
count,75241.0,75241.0,75236.0,75236.0,75241.0,75241.0,75241.0,71768.0,74135.0,75241.0,75241.0,75241.0,75241.0,75241.0,75241.0,56548.0,55595.0,55606.0,55564.0,55592.0,55565.0,55562.0,56548.0
mean,2.368628e+17,139076500.0,39.525958,71.3791,51.509708,-0.128108,3.105793,1.513153,1.772833,27.229409,5.750748,7790.3,17.974668,5.736301,0.456467,4.588159,4.723349,4.623915,4.783393,4.801027,4.729358,4.607755,0.877064
std,3.425911e+17,152962100.0,222.170789,420.039233,0.048369,0.099341,1.936972,0.885015,1.228013,32.742591,24.240947,1914055.0,41.984021,12.991805,1.277612,0.779083,0.489328,0.550721,0.453835,0.448759,0.418873,0.521839,1.234003
min,13913.0,2594.0,1.0,1.0,51.295937,-0.4978,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,19817400.0,19959230.0,1.0,1.0,51.48354,-0.18939,2.0,1.0,1.0,0.0,1.0,42.0,1.0,0.0,0.0,4.5,4.67,4.5,4.75,4.79,4.64,4.5,0.13
50%,39338750.0,67455190.0,2.0,2.0,51.51384,-0.12628,2.0,1.0,1.0,9.0,2.0,365.0,4.0,0.0,0.0,4.82,4.89,4.8,4.94,4.97,4.85,4.75,0.45
75%,6.562985e+17,224867000.0,5.0,8.0,51.53945,-0.06846,4.0,2.0,2.0,55.0,4.0,1125.0,17.0,6.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,4.97,1.09
max,8.463271e+17,505040000.0,2138.0,24047.0,51.681142,0.28857,16.0,22.0,38.0,90.0,1125.0,524855600.0,1328.0,564.0,68.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,51.05


## 2.3. Clean Airbnb 

### 2.3.1. Handling Missing Data 

In [11]:
listings_short.shape

(75241, 37)

In [12]:
listings_short.isnull().sum()

id                                 0
listing_url                        0
name                              31
picture_url                        0
host_id                            0
host_response_rate             28956
host_acceptance_rate           24213
host_is_superhost                 18
host_listings_count                5
host_total_listings_count          5
neighbourhood_cleansed             0
latitude                           0
longitude                          0
room_type                          0
accommodates                       0
bathrooms_text                   124
bedrooms                        3473
beds                            1106
amenities                          0
price                              0
availability_90                    0
minimum_nights                     0
maximum_nights                     0
instant_bookable                   0
number_of_reviews                  0
number_of_reviews_ltm              0
number_of_reviews_l30d             0
f

**host_is_superhost**

In [13]:
# check the different values of "host_is_superhost"
listings_short["host_is_superhost"].value_counts(dropna=False)

f      64574
t      10649
NaN       18
Name: host_is_superhost, dtype: int64

In [14]:
# check how many listings the hosts with nan value for "host_is_superhost" have: 
listings_short[listings_short['host_is_superhost'].isna()]["host_total_listings_count"].value_counts()

5.0     4
2.0     3
6.0     2
10.0    2
7.0     2
4.0     2
26.0    2
1.0     1
Name: host_total_listings_count, dtype: int64

In [15]:
# we can fill values with "f" for false 
listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_is_superhost"] = listings_short["host_is_superhost"].fillna("f")


In [16]:
# renaming Rows with NaN to "Unknown"
listings_short[["name", "host_response_rate",
                "host_acceptance_rate"]] = listings_short[["name", "host_response_rate",
                                                           "host_acceptance_rate"]].fillna("Unknown")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short[["name", "host_response_rate",


**host_listings_count & host_total_listings_count**

In [17]:
# set the mode for host_listings_count & host_total_listings_count
listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)

listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_listings_count"].fillna(listings_short["host_listings_count"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_total_listings_count"].fillna(listings_short["host_total_listings_count"].mode()[0], inplace=True)


**bedrooms , beds & bathrooms_text**

In [18]:
# set the mode for above columns
listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)

listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)

listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bathrooms_text"].fillna(listings_short["bathrooms_text"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["bedrooms"].fillna(listings_short["bedrooms"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["beds"].fillna(listings_short["beds"].mode()[0], inplace=True)


**Convert host_response_rate & host_acceptance_rate**

In [19]:
## Convert response rate/acceptance rate from % in integer
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate"].str[:-1]
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].replace('Unknow', np.nan)
listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].astype("float64")
listings_short["host_acceptance_rate"] = listings_short["host_acceptance_rate_int"]
listings_short.drop("host_acceptance_rate_int", axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate"].str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_acceptance_rate_int"] = listings_short["host_acceptance_rate_int"].replace('Unknow', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
#same for host_acceptance_rate 

listings_short["host_response_rate_int"] = listings_short["host_response_rate"].str[:-1]
listings_short["host_response_rate_int"] = listings_short["host_response_rate_int"].replace('Unknow', np.nan)
listings_short["host_response_rate"] = listings_short["host_response_rate_int"].astype("float64")
listings_short.drop("host_response_rate_int", axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_response_rate_int"] = listings_short["host_response_rate"].str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["host_response_rate_int"] = listings_short["host_response_rate_int"].replace('Unknow', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listin

**price**

In [21]:
#convert Price in Integer

listings_short["price"] = listings_short["price"].str[1:]
listings_short["price"] = listings_short["price"].str.replace(",", "")
listings_short["price"] = listings_short["price"].astype("float64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].str[1:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].str.replace(",", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["price"] = listings_short["price"].astype("float64")


**Bathroom_text & private_bath**

In [22]:
#convert bathroom text top bool ('private_bath)
listings_short['private_bath'] = ~listings_short['bathrooms_text'].str.contains('shared|Shared')
listings_short.drop('bathrooms_text', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short['private_bath'] = ~listings_short['bathrooms_text'].str.contains('shared|Shared')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short.drop('bathrooms_text', inplace = True, axis = 1)


In [23]:
listings_short.head(2)

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price,availability_90,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,private_bath
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,f,1.0,1.0,Haringey,51.59728,-0.13933,Private room,1,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",100.0,90,1,365,f,0,0,0,,,,,,,,,,,False
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100.0,100.0,f,14.0,31.0,Barnet,51.636518,-0.177475,Entire home/apt,1,1.0,1.0,[],65.0,90,180,365,t,0,0,0,,,,,,,,,,,True


**room_type**

In [24]:
#renaming the values 
listings_short["room_type"] = listings_short["room_type"].str.replace("Entire home/apt", "Entire home")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short["room_type"] = listings_short["room_type"].str.replace("Entire home/apt", "Entire home")


**Instant_bookable and Host_is_superhost as bool**

In [25]:
listings_short['instant_bookable'] = listings_short['instant_bookable'].map({'f': False, 't': True})
listings_short['host_is_superhost'] = listings_short['host_is_superhost'].map({'f': False, 't': True})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short['instant_bookable'] = listings_short['instant_bookable'].map({'f': False, 't': True})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_short['host_is_superhost'] = listings_short['host_is_superhost'].map({'f': False, 't': True})


**amenities**

In [26]:
test = listings_short.copy()

In [27]:
# convert items in "amenities" to a list
test["amenities"] = test["amenities"].str.lower().str.replace('[','').str.replace(']','').str.replace('"','').str.replace(' ','_').str.split(',')


  test["amenities"] = test["amenities"].str.lower().str.replace('[','').str.replace(']','').str.replace('"','').str.replace(' ','_').str.split(',')


In [28]:
# create new columns for each amenity 
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
amenities = test.join(pd.DataFrame(mlb.fit_transform(test.pop('amenities')),
                          columns=mlb.classes_,
                          index=test.index))

In [29]:
# create a list of amenity with fewer than 10% of listings
infrequent_amenities = []
for col in amenities.iloc[: , 35:].columns:
    if amenities[col].sum() < len(amenities)/10:
        infrequent_amenities.append(col)

# drop infrequent amenity features
amenities.drop(infrequent_amenities, axis=1, inplace=True)


In [30]:
# combine _coffee & _coffe_maker to one column
amenities['_coffee_'] = (amenities['_coffee_maker'] | amenities['_coffee']).astype(int)


In [31]:
# focus on relevant columns
amenity_keeper = ["id",
                  "_wifi",
                  "_long_term_stays_allowed",
                  "_private_patio_or_balcony",
                  "_private_entrance",
                  "_pets_allowed",
                  "_outdoor_dining_area",
                  "_lockbox",
                  "_kitchen",
                  "_hair_dryer",
                  "_free_street_parking",
                  "_free_parking_on_premises",
                  "_dedicated_workspace",
                  "_coffee_",
                  "_bed_linens",
                  "_bathtub"]


In [32]:
# keep only relevant columns 
amenities_short = amenities[amenity_keeper]

In [33]:
# get rid of first "_"
amenities_short.columns = amenities_short.columns.str.replace('_','', 1)


In [34]:
# merge to one dataframe 
airbnb = listings_short.merge(amenities_short, how="left", on="id")

In [35]:
airbnb.shape

(75241, 52)

In [36]:
airbnb.head()

Unnamed: 0,id,listing_url,name,picture_url,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price,availability_90,minimum_nights,maximum_nights,instant_bookable,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,private_bath,wifi,long_term_stays_allowed,private_patio_or_balcony,private_entrance,pets_allowed,outdoor_dining_area,lockbox,kitchen,hair_dryer,free_street_parking,free_parking_on_premises,dedicated_workspace,coffee_,bed_linens,bathtub
0,714569379355913481,https://www.airbnb.com/rooms/714569379355913481,Lovely private bedroom in Muswell Hill.,https://a0.muscache.com/pictures/miso/Hosting-...,39009854,,,False,1.0,1.0,Haringey,51.59728,-0.13933,Private room,1,1.0,1.0,"[""Iron"", ""Hangers"", ""Hair dryer"", ""Outdoor din...",100.0,90,1,365,False,0,0,0,,,,,,,,,,,False,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
1,808038970516277767,https://www.airbnb.com/rooms/808038970516277767,Studio Flat Franklin London,https://a0.muscache.com/pictures/miso/Hosting-...,495977998,100.0,100.0,False,14.0,31.0,Barnet,51.636518,-0.177475,Entire home,1,1.0,1.0,[],65.0,90,180,365,True,0,0,0,,,,,,,,,,,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,822557738577472503,https://www.airbnb.com/rooms/822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,https://a0.muscache.com/pictures/d77957d5-695a...,325629338,100.0,91.0,True,4.0,8.0,Harrow,51.60818,-0.2774,Entire home,4,2.0,2.0,"[""Dining table"", ""Washer"", ""Outdoor furniture""...",132.0,35,2,28,True,0,0,0,,,,,,,,,,,True,1,1,1,0,0,0,0,1,1,0,1,0,0,1,1
3,3518856,https://www.airbnb.com/rooms/3518856,Wimbledon Double Bedroom Ensuite,https://a0.muscache.com/pictures/23a18442-fc1d...,187811,,100.0,False,2.0,5.0,Merton,51.42231,-0.18841,Private room,1,1.0,1.0,"[""Washer"", ""Iron"", ""Hangers"", ""Kitchen"", ""Smok...",100.0,90,5,1125,False,4,0,0,2015-12-27,2016-07-11,3.67,3.0,4.33,4.67,5.0,3.67,3.67,0.05,True,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
4,4876550,https://www.airbnb.com/rooms/4876550,Stunning Apartment 2 minutes walk to Tube Station,https://a0.muscache.com/pictures/miso/Hosting-...,25087384,75.0,46.0,False,1.0,1.0,Barnet,51.602282,-0.193606,Entire home,2,1.0,1.0,"[""First aid kit"", ""Washer"", ""Fire extinguisher...",120.0,83,5,90,False,0,0,0,,,,,,,,,,,True,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0


## 3.Overpass Pipeline

### 3.1. Get the Data 

In [37]:
### Increase outside border of listings
london_bbox = increase_bbox(listings)

In [38]:
# Increasing the maxs by 0.01 and decreasing the mins by 0.01 
# will shift the outline's border by a bit more than 1km in each direction.

# See increase_bbox function in py_functions.py

In [39]:
# (northern hemisphere)
# latitude max = north
# latitude min = south
# longitude max = east
# longitude min = west

In [40]:
# read in data only once, then export and read csv file locally / via sql
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json];
(
    node["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    node["tourism"="attraction"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["tourism"="artwork"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["tourism"="gallery"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["tourism"="museum"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["shop"="boutique"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["shop"="clothes"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    node["leisure"="park"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    way["amenity"="bar"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="pub"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="restaurant"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="cafe"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["amenity"="fast_food"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["railway"="subway_entrance"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["cuisine"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    way["tourism"="attraction"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["tourism"="artwork"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["tourism"="gallery"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["tourism"="museum"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["shop"="boutique"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["shop"="clothes"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});
    way["leisure"="park"]({london_bbox["south_shifted"]},{london_bbox["west_shifted"]},{london_bbox["north_shifted"]},{london_bbox["east_shifted"]});

    );
    (._;>;);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

osm = pd.json_normalize(data, record_path="elements")

In [None]:
osm.shape

In [None]:
osm_copy = osm 

In [None]:
osm.head(2)

### 3.2. Data Cleaning OSM

In [None]:
### clean column names 
osm.columns = osm.columns.str.replace(".", "_", regex=False)
osm.columns = osm.columns.str.replace(":", "_", regex=False)


In [None]:
### drop all columns with no lat or lon values 
osm["lat"] = np.where(osm["lat"].isna(), osm["center_lat"], osm['lat'])
osm["lon"] = np.where(osm["lon"].isna(), osm["center_lon"], osm['lon'])

In [None]:
### rename lat/lon to latitude/longitude 
osm = osm.rename(columns={"lat": "latitude", "lon": "longitude"})


In [None]:
### drop "tags_" in the column names 
osm.columns = osm.columns.str.replace('tags_' , '')

In [None]:
### select only desired columns
osm_keepers = ["id",
                     "latitude",
                     "longitude",
                     "name",
                     "amenity",
                     "tourism",
                     "shop",
                     "railway",
                     "leisure",
                     "cuisine",
                     "diet_vegetarian",
                     "diet_vegan"]

### 3.3. Data Cleaning OSM_SHORT

In [None]:
osm_short = osm[osm_keepers]

In [None]:
# drop all rows with no name AND no amenity 
osm_short = osm_short.drop(osm_short[(osm_short['name'].isna()) & (osm_short['amenity'].isna())].index)

In [None]:
# combine pub & bar in one column
osm_short['amenity'] = osm_short['amenity'].str.replace('pub', 'bar')

In [None]:
# create a new column `gastronomy` wh
osm_short['gastronomy'] = np.where(osm_short['amenity'].isin(['restaurant', 'fast_food']), True, False)

In [None]:
# create seperate df's for each POI-Type in order to filter only relevant values for each type 
df_amenity = osm_short[["id", "amenity"]]
df_tourism = osm_short[["id", "tourism"]]
df_shop = osm_short[["id", "shop"]]
df_railway = osm_short[["id", "railway"]]
df_leisure = osm_short[["id", "leisure"]]

In [None]:
# filter only relevant values for each type 
# fyi: df's had to be split as otherwise, POIs with values in more than one column would have been deleted 
df_amenity = df_amenity[df_amenity['amenity'].isin(['bar', 'restaurant', np.nan, 'cafe', 'fast_food', 'bakery', 'food_court'])]
df_tourism = df_tourism[df_tourism['tourism'].isin(['artwork', 'attraction', np.nan, 'gallery', 'museum'])]
df_shop = df_shop[df_shop['shop'].isin(['clothes', np.nan])]
df_railway = df_railway[df_railway['railway'].isin(['subway_entrance', np.nan])]
df_leisure = df_leisure[df_leisure['leisure'].isin(['park', np.nan])]


In [None]:
# merge the splitted df's 
df_splitted = df_amenity.merge(df_leisure, on="id", how="outer")
df_splitted = df_splitted.merge(df_railway, on="id", how="outer")
df_splitted = df_splitted.merge(df_leisure, on="id", how="outer")
df_splitted = df_splitted.merge(df_shop, on="id", how="outer")
df_splitted = df_splitted.merge(df_tourism, on="id", how="outer")

In [None]:
# define "keepers" for merge 
keep = ["id", "latitude", "longitude", "name", "cuisine", "diet_vegetarian", "diet_vegan"]

In [None]:
# merge df 
df_splitted = df_splitted.merge(osm_short[keep], on="id", how="outer")

In [None]:
# rename new columns 
df_splitted.columns = df_splitted.columns.str.replace("_y", "", regex=False)


In [None]:
df_splitted.head(2)

In [None]:
# assign df back to osm_short 
osm_short = df_splitted

In [None]:
# set True/False values for vegetarian/vegan columns 
osm_short['diet_vegetarian'] = np.where(osm_short['diet_vegetarian'].isin(['yes', 'only', 'limited']), True, osm_short['diet_vegetarian'])
osm_short['diet_vegetarian'] = np.where(osm_short['diet_vegetarian'] == 'no', False, osm_short['diet_vegetarian'])
osm_short['diet_vegan'] = np.where(osm_short['diet_vegan'].isin(['yes', 'only', 'limited']), True, osm_short['diet_vegan'])
osm_short['diet_vegan'] = np.where(osm_short['diet_vegan'] == 'no', False, osm_short['diet_vegan'])

In [None]:
osm_short.head()

In [None]:
osm_short.shape

In [None]:
osm_short["tourism"].count()

In [None]:
osm_short["amenity"].value_counts()

### 3.4 Create new columns for cuisines

In [None]:
cuisine_test = osm_short.copy()

In [None]:
# convert Nullvalues to an empty string
cuisine_test["cuisine"] = cuisine_test["cuisine"].replace(np.nan,' ',regex=True)

In [None]:
# convert items in "cuisine" to a list
cuisine_test["cuisine"] = cuisine_test["cuisine"].str.lower().str.split(';')

In [None]:
# create new columns for each cuisine
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
cuisine_type = cuisine_test.join(pd.DataFrame(mlb.fit_transform(cuisine_test.pop('cuisine')),
                          columns=mlb.classes_,
                          index=cuisine_test.index))

In [None]:
# create a list of cuisine with fewer than 25% of listings
infrequent_cuisine = []
for col in cuisine_type.iloc[: , 500:].columns:
    if cuisine_type[col].sum() < len(cuisine_type)/25:
        infrequent_cuisine.append(col)

# drop infrequent amenity features
cuisine_type.drop(infrequent_cuisine, axis=1, inplace=True)

In [None]:
#let chatgpt sort cuisines & create cuisine_bins 

cuisine_type['asian_cuisine'] = (cuisine_type['japanese'] | 
                                 cuisine_type['malaysian'] | 
                                 cuisine_type['thai'] | 
                                 cuisine_type['bangladesh'] | 
                                 cuisine_type['bengali'] | 
                                 cuisine_type['biryani'] | 
                                 cuisine_type['asian fusion'] | 
                                 cuisine_type['cantonese'] | 
                                 cuisine_type['chinese'] | 
                                 cuisine_type['chinese seafood'] | 
                                 cuisine_type['chinese+indian'] | 
                                 cuisine_type['chinese_fish_and_chips'] | 
                                 cuisine_type['chinese_tea'] | 
                                 cuisine_type['dumplings'] | 
                                 cuisine_type['east_asian'] | 
                                 cuisine_type['filipino'] | 
                                 cuisine_type['gyoza'] | 
                                 cuisine_type['indian'] | 
                                 cuisine_type['indochina'] | 
                                 cuisine_type['indonesian'] | 
                                 cuisine_type['cambodian'] | 
                                 cuisine_type['japanese_tea'] | 
                                 cuisine_type['korean'] | 
                                 cuisine_type['laotian'] | 
                                 cuisine_type['malay'] | 
                                 cuisine_type['mongolian'] | 
                                 cuisine_type['nepalese'] | 
                                 cuisine_type['pan-asian'] | 
                                 cuisine_type['ramen'] | 
                                 cuisine_type['sichuan'] | 
                                 cuisine_type['singaporean'] | 
                                 cuisine_type['sri lankan'] | 
                                 cuisine_type['sri_lankan'] | 
                                 cuisine_type['sushi'] | 
                                 cuisine_type['taiwan'] | 
                                 cuisine_type['taiwanese'] | 
                                 cuisine_type['tandoori'] | 
                                 cuisine_type['thai'] | 
                                 cuisine_type['thailandese'] | 
                                 cuisine_type['vietnamese']).astype(bool)

In [None]:
cuisine_type['italian_cuisine'] = (cuisine_type['pizza'] | 
                                   cuisine_type['italian'] | 
                                   cuisine_type['italian_pizza'] | 
                                   cuisine_type['pasta']).astype(bool)

In [None]:
cuisine_type['british_cuisine'] = (cuisine_type['afternoon_tea'] | 
                                   cuisine_type['british'] | 
                                   cuisine_type['british_cafe'] | 
                                   cuisine_type['bubble tea'] | 
                                   cuisine_type['bubble_tea'] | 
                                   cuisine_type['bubbles'] | 
                                   cuisine_type['breakfast'] | 
                                   cuisine_type['brunch'] | 
                                   cuisine_type['carvery'] | 
                                   cuisine_type['chips'] | 
                                   cuisine_type['cornish'] | 
                                   cuisine_type['cornish_pasty'] | 
                                   cuisine_type['english'] | 
                                   cuisine_type['english breakfast'] | 
                                   cuisine_type['english_breakfast'] | 
                                   cuisine_type['fry_ups'] | 
                                   cuisine_type['grilled'] | 
                                   cuisine_type['grill'] | 
                                   cuisine_type['jerk_chicken'] | 
                                   cuisine_type['pie'] | 
                                   cuisine_type['pie & mash'] | 
                                   cuisine_type['pie&mash'] | 
                                   cuisine_type['pie_and_mash']| 
                                   cuisine_type['fish_and_chips']).astype(bool)

In [None]:
cuisine_type['african_cuisine'] = (cuisine_type['african'] | 
                                   cuisine_type['afro-caribbean'] | 
                                   cuisine_type['afro-carribbean'] | 
                                   cuisine_type['afro-carribean'] | 
                                   cuisine_type['afro_caribbean'] | 
                                   cuisine_type['algerian'] | 
                                   cuisine_type['ethiopean'] | 
                                   cuisine_type['ethiopian'] | 
                                   cuisine_type['ghanaian'] | 
                                   cuisine_type['libyan'] | 
                                   cuisine_type['mauritian'] | 
                                   cuisine_type['lebanese'] | 
                                   cuisine_type['moroccan'] | 
                                   cuisine_type['afghan'] | 
                                   cuisine_type['nigerian'] | 
                                   cuisine_type['nigerian_cuisines'] | 
                                   cuisine_type['somali'] | 
                                   cuisine_type['somalian'] | 
                                   cuisine_type['south_african']).astype(bool)

In [None]:
cuisine_type['arab_cuisine'] = (cuisine_type['arab'] | 
                                cuisine_type['arabic'] | 
                                cuisine_type['egyptian'] | 
                                cuisine_type['egyptian,arab,african'] | 
                                cuisine_type['iraqi'] | 
                                cuisine_type['israeli'] | 
                                cuisine_type['kuwaiti'] | 
                                cuisine_type['lebanese'] | 
                                cuisine_type['levantine'] | 
                                cuisine_type['palestinian'] | 
                                cuisine_type['persian'] | 
                                cuisine_type['syrian'] | 
                                cuisine_type['kebab'] | 
                                cuisine_type['shakshuka'] | 
                                cuisine_type['shawarma'] | 
                                cuisine_type['falafel']).astype(bool)

In [None]:
cuisine_type['yoghurt'] = (cuisine_type['yoghurt']).astype(bool)

In [None]:
#relevant columns: 
cuisine_keeper = ['id',
                  'asian_cuisine',
                  'italian_cuisine',
                  'british_cuisine',
                  'african_cuisine',
                  'arab_cuisine',
                  'yoghurt']

In [None]:
# keep only relevant columns 
cuisine_type_short = cuisine_type[cuisine_keeper]

In [None]:
# merge to one dataframe 
osm_short = osm_short.merge(cuisine_type_short, how="left", on="id")

In [None]:
#delete old cuisine-column
del osm_short['cuisine']

In [None]:
#now 1/5 of our rows has a cuisine type
osm_short.loc[(osm_short['italian_cuisine'] == True) | (osm_short['british_cuisine'] == True) | (osm_short['arab_cuisine'] == True) | (osm_short['african_cuisine'] == True) | (osm_short['asian_cuisine'] == True)]

In [None]:
osm_short["amenity"].value_counts(dropna=False)

In [None]:
osm_short["leisure"].value_counts(dropna=False)

In [None]:
osm_short.shape

### Add scraped review Data to OSM 

In [None]:
osm_keepers = ["id",
               "name",
               "latitude",
               "longitude",
               "leisure",
               "shop",
               "tourism",
               "railway",
               "amenity"]


In [None]:
osm_tab = osm_short[osm_keepers]

In [None]:
# read in review data 
r_reviews = pd.read_csv(f"data/london/g1_restaurant_reviews.csv")

In [None]:
# merge to osm df 
osm_tab = osm_tab.merge(r_reviews[["id", "rating", "reviews", "price", "closed", "url"]], on="id", how="left")


In [None]:
# drop all rows with closed restaurants 
osm_tab = osm_tab[(osm_tab["closed"].isna()) | (osm_tab["closed"] == False)]

In [None]:
# drop closed column
osm_tab = osm_tab.drop("closed", axis=1)

In [None]:
# replace € with int for further EDA 
osm_tab["price_cat"] = osm_tab["price"].replace({"€": 1, "€€": 2, "€€€": 3, "€€€€": 4, "3-star hotel": np.nan,
                             "4-star hotel": np.nan, "2-star hotel": np.nan, "-star hotel": np.nan, "5-star hotel": np.nan})


In [None]:
# replace , and convert to float 
osm_tab["reviews"] = osm_tab["reviews"].str.replace(",", "").astype(float)

In [None]:
# export to csv for tableau 
osm_tab.to_csv("tab_export_osm.csv")

In [None]:
osm_tab.head(2)

In [None]:
osm_tab[osm_tab["amenity"] == "restaurant"].shape

In [None]:
osm_tab[osm_tab["amenity"] == "restaurant"].isnull().sum()

## 4.Combine airbnb Dataframe with POI's (Gastro)

The Number of POI's in a Distance of 500m has been calculated in Tableau* as listed below. After exporting it, it will now be merged into the `airbnb` Dataframe

Tableau Code: 
```Tableau
MAKEPOINT([Lat], [Lon])
 
BUFFER(MAKEPOINT([Latitude], [Longitude]), 500, 'meters')
```

* as you will see in 11. Calculation in Python, the calculation of the distances in Python didn't match the calculated distances in Tableau, due to converting issues.
Therefore we sticked with the Tableau calculation so that the distances match to the Dashboard

In [None]:
airbnb.head(2)

In [None]:
# read in table with number of POI in 500 meter radius. calculated in Tableau
amenities_500 = pd.read_excel(f'data/london/number_amenities_per_airbnb_500.xlsx', skiprows=1)

In [None]:
poi_gastro = (
    airbnb.merge(amenities_500, left_on="id", right_on="Id", how="left")
    .rename(columns={"bar": "bar_500", "cafe": "cafe_500", "fast_food": "fast_food_500", "restaurant": "restaurant_500"})
    .drop("Id", axis=1)
)

## 5.Combine with remaining POI Themes

Leisure / Railway / Shop / Tourism

In [None]:
# read in tables with number of POI in 500 meter radius. calculated in Tableau
# fyi: after first analysis on gastro POI's, we decided to keep a radius of 500m only 
leisure_500 = pd.read_excel(f'data/london/pois_leisure_500.xlsx', skiprows=1)
shop_500 = pd.read_excel(f'data/london/pois_shop_500.xlsx', skiprows=1)
tourism_500 = pd.read_excel(f'data/london/pois_tourism_500.xlsx', skiprows=1)
railway_500 = pd.read_excel(f'data/london/pois_railway_500.xlsx', skiprows=1)
railway_200 = pd.read_excel(f'data/london/pois_railway_200.xlsx', skiprows=1)


In [None]:
# merge leisure
poi = (
    poi_gastro.merge(leisure_500, left_on="id", right_on="Id", how="outer")
    .drop(["Id", "Listing Url"], axis=1)
    .rename(columns={"park": "park_500"})
)

In [None]:
# merge shop
poi = (
    poi.merge(shop_500, left_on="id", right_on="Id", how="outer")
    .drop(["Id", "Listing Url"], axis=1)
    .rename(columns={"clothes": "clothes_shop_500"})
)

In [None]:
# merge tourism
poi = (
    poi.merge(tourism_500, left_on="id", right_on="Id", how="outer")
    .drop(["Id", "Listing Url", "artwork"], axis=1)
    .rename(columns={"attraction": "attraction_500", "gallery": "gallery_500", "museum": "museum_500"})
)
poi.shape

In [None]:
# merge railway_500
poi = (
    poi.merge(railway_500, left_on="id", right_on="Id", how="outer")
    .drop(["Id", "Listing Url"], axis=1)
    .rename(columns={"subway_entrance": "subway_entrance_500"})
)
poi.shape

In [None]:
# merge railway_200
poi = (
    poi.merge(railway_200, left_on="id", right_on="Id", how="outer")
    .drop(["Id", "Listing Url"], axis=1)
    .rename(columns={"subway_entrance": "subway_entrance_200"})
)
poi.shape

## 6.Clean poi

In [None]:
# fill NaN values with 0 for reviews_per_month & poi cols
nan_cols = ['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500','attraction_500', 'gallery_500', 'museum_500', 'reviews_per_month', 'park_500', 'clothes_shop_500']

poi[nan_cols] = poi[nan_cols].fillna(0)

### new column: gastro_500

all bars, cafes, restaurants + fast_food POI's

In [None]:
# calculate a new field with all gastronomy POI's in a radius of 500 m
poi['gastro_500'] = poi['bar_500'] + poi['cafe_500'] + poi['restaurant_500'] + poi['fast_food_500']

### new column: art_500 

In [None]:
# calculate a new field with all museums & galleries combined 
poi['art_500'] = poi['gallery_500'] + poi['museum_500']

### new column: price_category

In [None]:
# check distribution
poi["price"].describe(percentiles=[.01, .25, .50, .75, .90])

In [None]:
# 0-19 - 0 
# 20-60 - 1 = low-budget
# 61-100 - 2 = budget
# 101-180 - 3 = standard
# 181+ - 4 = luxury 

# set conditions & values for each price category:
conditions = [
    (poi["price"] == 0),
    (poi["price"] > 0) & (poi["price"] <= 60),
    (poi["price"] > 60) & (poi["price"] <= 100),
    (poi["price"] > 100) & (poi["price"] <= 180),
    (poi["price"] > 180)
]

values = [0, 1, 2, 3, 4]

# create new column
poi["price_category"] = np.select(conditions, values)

### new column: "roomtype_int"  

In [None]:
# Shared room = 1
# Private room = 2
# Hotel room = 3
# Entire home = 4 

# set conditions & values for each price category:
conditions = [poi["room_type"] == "Shared room", 
              poi["room_type"] == "Private room", 
              poi["room_type"] == "Hotel room", 
              poi["room_type"] == "Entire home"
              ]

values = (1,2,3,4)

# create new column
poi["room_type_int"] = np.select(conditions, values)


### convert subway_entrance_xy to bool

In [None]:
# Convert columns to boolean values
poi['subway_entrance_500'] = poi['subway_entrance_500'].astype(bool)
poi['subway_entrance_200'] = poi['subway_entrance_200'].astype(bool)

### re-order columns in a suitable way

In [None]:
print([f"{col}" for col in poi.columns])


In [None]:
# shorten neighbourhood column-name 
poi = poi.rename(columns={"neighbourhood_cleansed": "neighbourhood"})

In [None]:
# shorten _coffee column-name 
poi = poi.rename(columns={"coffee_": "coffee"})

In [None]:
# specify needed columns in new order
new_col_order = ['id', 'listing_url', 'name', 'picture_url',                                                            #basics
                 'host_id', 'host_response_rate', 'host_acceptance_rate',                                               #host
                 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 
                 'neighbourhood', 'latitude', 'longitude',                                                              #location
                 'room_type', 'room_type_int', 'accommodates', 'bedrooms', 'beds',                                      #type of accommodation
                 'price', 'price_category', 'availability_90',                                                          # price
                 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm',                                     #reviews
                 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
                 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                 'long_term_stays_allowed', 'pets_allowed',                                                             #amenities               
                 'private_bath', 'bathtub', 'private_patio_or_balcony', 'private_entrance', 'outdoor_dining_area', 
                 'wifi', 'lockbox', 'kitchen', 'hair_dryer', 'coffee', 'bed_linens', 
                 'free_street_parking', 'free_parking_on_premises', 'dedicated_workspace', 
                 'bar_500','cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500',                                 #POI's
                 'park_500', 
                 'clothes_shop_500', 
                 'attraction_500', 'gallery_500', 'museum_500', 'art_500','subway_entrance_500', 'subway_entrance_200']


In [None]:
tab = ['id', 'listing_url', 'name', 'neighbourhood', 'latitude', 'longitude', 
       'room_type','accommodates', 'price', 'price_category', 'number_of_reviews','review_scores_rating' , 
       'private_bath', 'pets_allowed', 'private_patio_or_balcony' ]

In [None]:
poi[tab].to_csv("tab_export_airbnb.csv")

In [None]:
poi[tab].shape

In [None]:
# assign back to poi
poi = poi[new_col_order]

## 7.Add Average House-Prices for each Neighbourhood 

In [None]:
borough_prices = pd.read_csv(f'data/london/borough_costs_2022.csv')
borough_prices.head()


In [None]:
borough_prices.info()

In [None]:
# drop old values & difference 
del borough_prices['December 2021']
del borough_prices['Difference']

In [None]:
# convert prices to int 
borough_prices['December 2022'] = borough_prices['December 2022'].str[1:]
borough_prices['December 2022'] = borough_prices['December 2022'].str.replace(',','')
borough_prices['December 2022'] = borough_prices['December 2022'].astype("int")


In [None]:
# apply snakecase & rename 
borough_prices = borough_prices.rename({'London borough': 'neighbourhood', 'December 2022': 'avg_housing_price_22'}, axis=1)


In [None]:
# rename westminster
borough_prices['neighbourhood'] = borough_prices['neighbourhood'].str.replace('City of Westminster', 'Westminster')


In [None]:
poi = poi.merge(borough_prices, on="neighbourhood", how="left")

### add review data

In [None]:
# import xlsx file, calculated in tableau
rating_bars = pd.read_excel(f"data/london/avg_pricecat_rating_bars_500.xlsx")
# rename columns 
rating_bars = rating_bars.rename(columns={"Id": "id", "Avg. Price Cat": "price_cat_bar", "Unnamed: 2": "rating_bar"})

rating_restaurant = pd.read_excel(f"data/london/avg_pricecat_rating_restaurants_500.xlsx")
rating_restaurant = rating_restaurant.rename(columns={"Id": "id", "Avg. Price Cat": "price_cat_restaurant", "Unnamed: 2": "rating_restaurant"})

# merge 
ratings = rating_restaurant.merge(rating_bars, on="id", how="outer")

In [None]:
# merge to cleaned poi
poi = poi.merge(ratings, on="id", how="left")

In [None]:
# specify needed columns in new order
new_col_order = ['id', 'listing_url', 'name', 'picture_url',                                                            #basics
                 'host_id', 'host_response_rate', 'host_acceptance_rate',                                               #host
                 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 
                 'neighbourhood', 'avg_housing_price_22', 'latitude', 'longitude',                                      #location
                 'room_type', 'room_type_int', 'accommodates', 'bedrooms', 'beds',                                      #type of accommodation
                 'price', 'price_category', 'availability_90',                                                           # price
                 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm',                                     #reviews
                 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
                 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                 'long_term_stays_allowed', 'pets_allowed',                                                             #amenities               
                 'private_bath', 'bathtub', 'private_patio_or_balcony', 'private_entrance', 'outdoor_dining_area', 
                 'wifi', 'lockbox', 'kitchen', 'hair_dryer', 'coffee', 'bed_linens', 
                 'free_street_parking', 'free_parking_on_premises', 'dedicated_workspace', 
                 'bar_500', 'price_cat_bar', 'rating_bar', 'cafe_500', 'fast_food_500',                                 #POI's 
                 'restaurant_500', 'price_cat_restaurant' , 'rating_restaurant', 'gastro_500', 'park_500', 
                 'clothes_shop_500', 
                 'attraction_500', 'gallery_500', 'museum_500', 'art_500','subway_entrance_500', 'subway_entrance_200']

In [None]:
poi = poi[new_col_order]

In [None]:
# convert price categories into float
poi['price_cat_bar'] = poi['price_cat_bar'].str.replace(',','')
poi['price_cat_bar'] = poi['price_cat_bar'].astype("float")

poi['price_cat_restaurant'] = poi['price_cat_restaurant'].str.replace(',','')
poi['price_cat_restaurant'] = poi['price_cat_restaurant'].astype("float")

In [None]:
poi.head(2)

In [None]:
poi.columns

## 8.Price prediction

### data preparation

In [None]:
# create a new column: accommodates_per_bedroom
poi["accommodates_per_bedroom"] = poi["bedrooms"] / poi["accommodates"]

In [None]:
# specify needed columns for prediction
reg_cols = ['id',                                                                                                           
                 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 
                 'neighbourhood', 'avg_housing_price_22',                                      
                 'room_type', 'accommodates_per_bedroom', 'bedrooms',                                     
                 'price', 'price_category', 'availability_90',                                                           
                 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm',                                    
                 'review_scores_rating',
                 'long_term_stays_allowed', 'pets_allowed',                                                                           
                 'private_bath', 'bathtub', 'private_patio_or_balcony', 'private_entrance', 'outdoor_dining_area', 
                 'wifi', 'lockbox', 'kitchen', 'hair_dryer', 'coffee', 'bed_linens', 
                 'free_street_parking', 'free_parking_on_premises', 'dedicated_workspace', 
                 'bar_500', 'price_cat_bar', 'rating_bar',                                 
                 'restaurant_500', 'price_cat_restaurant' , 'rating_restaurant', 'gastro_500', 'park_500', 
                 'clothes_shop_500', 
                 'attraction_500', 'art_500', 'subway_entrance_200']

In [None]:
df = poi[reg_cols]

In [None]:
# set categorical values for average housing price 

# define the conditions and categories
conditions = [
    df["avg_housing_price_22"] < 400000,
    (df["avg_housing_price_22"] >= 400000) & (df["avg_housing_price_22"] < 500000),
    (df["avg_housing_price_22"] >= 500000) & (df["avg_housing_price_22"] < 600000),
    (df["avg_housing_price_22"] >= 600000) & (df["avg_housing_price_22"] < 750000),
    df["avg_housing_price_22"] >= 750000
]
categories = [
    "very_low",
    "low",
    "medium",
    "high",
    "very_high"
]

# create the new column based on the conditions and categories
df["housing_price_cat"] = np.select(conditions, categories)

In [None]:
# convert bool columns to int
df[["private_bath", "host_is_superhost", "subway_entrance_200"]] = df[["private_bath", "host_is_superhost", "subway_entrance_200"]].astype(int)

In [None]:
# convert price categories into float
df['price_cat_bar'] = df['price_cat_bar'].str.replace(',','')
df['price_cat_bar'] = df['price_cat_bar'].astype("float")

df['price_cat_restaurant'] = df['price_cat_restaurant'].str.replace(',','')
df['price_cat_restaurant'] = df['price_cat_restaurant'].astype("float")

In [None]:
# set nan's for review_scores_rating to 0 
df['review_scores_rating'] = df['review_scores_rating'].fillna(0)

In [None]:
# remove outliers from price 
from scipy import stats

# Calculate the z-scores for poi[price] and filter the df
z_scores = np.abs(stats.zscore(df['price']))
threshold = 3
df = df[z_scores < threshold]

In [None]:
# remove all rows where room type is "shared room" or "hotel room", also rows with price = 0 
df = df[(df["room_type"] == "Entire home") | (df["room_type"] == "Private room")]
df = df[~df["price"].isnull()]


In [None]:
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=["room_type", "neighbourhood", "housing_price_cat"])

In [None]:
# set nan's for remaining columns with nan to median 
foo = ["rating_bar", "price_cat_bar", "price_cat_restaurant", "rating_restaurant"]

for x in foo: 
    df[x] = df[x].fillna(df[x].mean())

In [None]:
df["price"].max()

### check for correlation

In [None]:
import re 
# define a function to convert a string to snake_case
def to_snake_case(string):
    s0 = re.sub('\s+', '_', string)  # replace whitespace with underscores
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s0)
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    s3 = re.sub('^_+|_+$', '', s2)  # remove leading/trailing underscores
    return re.sub('_+', '_', s3)  # remove consecutive underscores

# rename columns to snake_case
df.columns = [to_snake_case(col) for col in df.columns]


In [None]:
df_basic = df[["price", "host_is_superhost", "host_listings_count", "host_total_listings_count", 
              "accommodates_per_bedroom", "bedrooms", "availability_90", "number_of_reviews", 
              "reviews_per_month", "number_of_reviews_ltm", "review_scores_rating", 
              "room_type_entire_home" , "room_type_private_room", "housing_price_cat_high", 
              "housing_price_cat_low", "housing_price_cat_medium", "housing_price_cat_very_high", "housing_price_cat_very_low"]]

In [None]:
df_amm = df[["price", "long_term_stays_allowed", "pets_allowed", "private_bath", "bathtub", 
             "private_patio_or_balcony", "private_entrance", "outdoor_dining_area", 
             "wifi", "lockbox", "kitchen", "hair_dryer", "coffee", "bed_linens", 
             "free_street_parking", "free_parking_on_premises" , "dedicated_workspace"]]

In [None]:
df_poi = df[["price", "bar_500", "price_cat_bar", "rating_bar", "restaurant_500", 
             "price_cat_restaurant", "rating_restaurant","gastro_500", 
             "park_500", "clothes_shop_500", "attraction_500", "art_500", "subway_entrance_200"]]

In [None]:
df_borough = df[["price", "neighbourhood_barking_and_dagenham", "neighbourhood_barnet", "neighbourhood_bexley", 
                 "neighbourhood_brent", "neighbourhood_bromley", "neighbourhood_camden", "neighbourhood_city_of_london",  
                 "neighbourhood_croydon", "neighbourhood_ealing", "neighbourhood_enfield", "neighbourhood_greenwich", 
                 "neighbourhood_hackney", "neighbourhood_hammersmith_and_fulham", "neighbourhood_haringey", "neighbourhood_harrow",
                 "neighbourhood_havering", "neighbourhood_hillingdon", "neighbourhood_hounslow", "neighbourhood_islington", 
                 "neighbourhood_kensington_and_chelsea", "neighbourhood_kingston_upon_thames", "neighbourhood_lambeth", "neighbourhood_lewisham", "neighbourhood_merton", 
                 "neighbourhood_newham", "neighbourhood_redbridge", "neighbourhood_richmond_upon_thames", "neighbourhood_southwark", "neighbourhood_sutton", 
                 "neighbourhood_tower_hamlets", "neighbourhood_waltham_forest", "neighbourhood_wandsworth", "neighbourhood_westminster"
                       ]]

In [None]:
corr = round(df_borough.corr(numeric_only=True),2)

sns.set(rc={"figure.figsize":(32, 25)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

In [None]:
corr = round(df_poi.corr(numeric_only=True),2)

sns.set(rc={"figure.figsize":(32, 25)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

In [None]:
corr = round(df_amm.corr(numeric_only=True),2)

sns.set(rc={"figure.figsize":(32, 25)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

In [None]:
corr = round(df_basic.corr(numeric_only=True),2)

sns.set(rc={"figure.figsize":(32, 25)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

### define predicted columns

In [None]:
pred = ["id", "price",
        "private_bath",
        "host_listings_count",
        "review_scores_rating",
        "bedrooms",
        "accommodates_per_bedroom",
        "availability_90",
        "bathtub",
        "room_type_private_room",
        "restaurant_500",
        "price_cat_restaurant",
        "clothes_shop_500",
        "attraction_500",
        "subway_entrance_200",
        "art_500",
        "housing_price_cat_very_high",
        "housing_price_cat_low"]

df_pred = df[pred]


In [None]:
corr = round(df_pred.corr(numeric_only=True),2)

sns.set(rc={"figure.figsize":(32, 25)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### define X & y // Split Dataframe // Scale all numerical features

In [None]:
df_pred.describe()

In [None]:
X = df_pred.drop(["id", "price"], axis=1)
y = df_pred["price"]

# Identify numeric columns
numeric_cols = X.select_dtypes(include=np.number).columns

# Scale numeric columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### targeting multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# create a new dataframe with only the predictor variables
#X = df_pred[columns_to_calc]
X = df_pred.drop(["id", "price"], axis=1)

# calculate VIF for each predictor variable
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["predictor"] = X.columns

print(vif)


### linear regression

In [None]:
# Import the statsmodels module
import statsmodels.api as sm

# Choose the predictor X and add a constant term
X2 = df_pred.drop(["id", "price"], axis=1)
X2 = sm.add_constant(X)
# Define dependent variable
y = df_pred["price"]

# fit model and get model summery in one step
sm.OLS(y, X2).fit().summary()


In [None]:
# Build the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr.predict(X_test)

# Evaluate the performance of the linear regression model
print('Linear Regression:')
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_lr))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_lr))
print('R-squared:', r2_score(y_test, y_pred_lr))

### knn model

In [None]:
# Build the KNN model
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred_knn = knn.predict(X_test)

# Evaluate the performance of the KNN model
print('KNN:')
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_knn))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_knn))
print('R-squared:', r2_score(y_test, y_pred_knn))

### random forest

In [None]:
# Import the required libraries
from sklearn.ensemble import RandomForestRegressor

# Build the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate the performance of the Random Forest model
print('Random Forest:')
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_rf))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_rf))
print('R-squared:', r2_score(y_test, y_pred_rf))

### random forest on whole df

In [None]:
# Import the required libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np

# Split the data into X (independent variables) and y (dependent variable)
X = df_pred.drop(["id", "price"], axis=1)
y = df_pred["price"]

# Identify numeric columns
numeric_cols = X.select_dtypes(include=np.number).columns

# Scale numeric columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Build the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Make predictions on the whole dataset
y_pred_rf = rf.predict(X)

# Create a new dataframe with the predicted values and the same index as df_pred
df_pred_pred = pd.DataFrame({'id': df_pred['id'], 'pred_price': y_pred_rf}, index=df_pred.index)

# Merge df_pred and df_pred_pred on the 'id' column
df_pred_with_pred = pd.merge(df_pred, df_pred_pred, on='id')

# Update df_pred to include the predicted values
df_pred = df_pred_with_pred


### 8.1.Add Predicted Price Column to Airbnb Dataframe

In [None]:
df_pred["pred_price"] = round(df_pred["pred_price"],2)

In [None]:
# calculate a new column to determine if the price is fair 
df_pred["fair_price"] = df_pred.apply(lambda x: "no" if x["pred_price"] < (
    x["price"] - 26) else ("yes" if x["pred_price"] > (x["price"] + 26) else "probably"), axis=1)


In [None]:
df_pred["fair_price"].value_counts()

In [None]:
airbnb[airbnb["id"] == 42708163]

In [None]:
df_pred[["id","price", "pred_price", "fair_price"]].sample(20)

### merge to old df & export for Tableau

In [None]:
# filter only relevant columns to merge
price_predicted = df_pred[["id", "pred_price", "fair_price"]]

In [None]:
airbnb_predicted = poi[tab]

In [None]:
airbnb_predicted.shape

In [None]:
# merge to airbnb df 
airbnb_predicted = airbnb_predicted.merge(price_predicted, on="id")

In [None]:
poi[tab].head(2)

In [None]:
airbnb_predicted.head(2)

In [None]:
# export to csv 
airbnb_predicted.to_csv("g1_airbnb_tab.csv")

In [None]:
# check for best parameters 
# (runs 100min!!)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20],
    'max_features': ['sqrt', 'log2', None]
}

# create a random forest regressor
rf = RandomForestRegressor()

# perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# train a random forest regressor with the best hyperparameters
best_rf = RandomForestRegressor(n_estimators=best_params['n_estimators'], 
                                 max_depth=best_params['max_depth'], 
                                 max_features=best_params['max_features'])
best_rf.fit(X_train, y_train)

# evaluate the performance of the best random forest regressor on the test set
test_score = best_rf.score(X_test, y_test)
print("Test set score:", test_score)


### gradient boosting 

In [None]:
# Import the required libraries
from sklearn.ensemble import GradientBoostingRegressor

# Build the Gradient Boosting model
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gbr = gbr.predict(X_test)

# Evaluate the performance of the Gradient Boosting model
print('Gradient Boosting:')
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_gbr))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_gbr))
print('R-squared:', r2_score(y_test, y_pred_gbr))


In [None]:
# Check MAE and R2 for each individual column in random forest model
# # Iterate over the columns of X
# for col in X.columns:
#     # Select the current column and the target variable
#     X_col = X[[col]]
#     y = df_pred["price"]

#     # Scale the current column
#     scaler = StandardScaler()
#     X_col = scaler.fit_transform(X_col)

#     # Split the data into training and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X_col, y, test_size=0.2, random_state=42)

#     # Build the Random Forest model
#     rf = RandomForestRegressor(n_estimators=100, random_state=42)
#     rf.fit(X_train, y_train)

#     # Make predictions on the test set
#     y_pred_rf = rf.predict(X_test)

#     # Evaluate the performance of the Random Forest model
#     print(f'Column {col}:')
#     print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_rf))
#     print('R-squared:', r2_score(y_test, y_pred_rf))
#     print()


## 9.EDA Customer oriented

###  #1: Budget Traveller: 

* Where can i find cheap Airbnb's with good (and cheap) Infrastructure?
* Only Price Category 1
* Number of POI's above average 
* Only restaurants with price category below average 


In [None]:
poi.shape

In [None]:
poi_cols = ['bar_500','cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500','park_500',  'clothes_shop_500', 
        'attraction_500', 'gallery_500', 'museum_500', 'art_500','subway_entrance_500', 'subway_entrance_200']

In [None]:
# Filter only Airbnb's where the amount of POI's is above the average + for 4 people with 2 bedrooms + 
# the Price Category of the Airnbnb is 1 + the Price Category of Restaurants is below average

# Filter out rows where price_cat_restaurant is NaN
budget_poi = poi[~poi['price_cat_restaurant'].isna()]

high_cols = ['clothes_shop_500', 'cafe_500', 'gallery_500', 'museum_500', 'art_500']

high_mask = budget_poi[high_cols].apply(lambda x: x >= x.quantile(0.50)).all(axis=1)
price_mask = budget_poi["price_category"] <= 2 
p_rest_mask = budget_poi['price_cat_restaurant'] < budget_poi['price_cat_restaurant'].mean()
acc_mask = budget_poi['accommodates'] >= 4
bed_mask = budget_poi['bedrooms'] >= 2

budget_poi = budget_poi[high_mask & price_mask & p_rest_mask & acc_mask & bed_mask]

In [None]:
# # Filter only Airbnb's where the amount of POI's is above the average +
# # the Price Category of the Airnbnb is 1 + the Price Category of Restaurants is below average

# # Filter out rows where price_cat_restaurant is NaN
# budget_poi = poi[~poi['price_cat_restaurant'].isna()]

# # calculate the mean of each column
# mean_values = budget_poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500', 'park_500',
#                           'clothes_shop_500', 'attraction_500', 'gallery_500', 'museum_500', 'art_500']].mean()

# # calculate the 75th percentile of each column
# percentile_values = budget_poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500', 'park_500',
#                                 'clothes_shop_500', 'attraction_500', 'gallery_500', 'museum_500', 'art_500']].quantile(0.75)

# # create a boolean mask based on the mean values and the price category conditions
# mask = ((budget_poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500', 'park_500',
#                      'clothes_shop_500', 'attraction_500', 'gallery_500', 'museum_500', 'art_500']] > mean_values).any(axis=1)) & \
#     (budget_poi['price_category'] <= 2) & \
#     (budget_poi['price_cat_restaurant'] < budget_poi['price_cat_restaurant'].mean()) & \
#     (budget_poi['accommodates'] >= 4) & \
#     (budget_poi['bedrooms'] >= 2)

# # filter the dataframe using the boolean mask
# budget_poi = budget_poi.loc[mask]


In [None]:
budget_poi[['clothes_shop_500', 'cafe_500', 'gallery_500', 'museum_500', 'art_500']].describe()

In [None]:
poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 
           'gastro_500', 'park_500', 'clothes_shop_500', 'attraction_500', 
           'gallery_500', 'museum_500', 'art_500']].describe()

In [None]:
# group by neighbourhood and calculate the mean number of POI's + Total Number of Airbnb's  
budget_grouped = budget_poi.groupby(by="neighbourhood").agg(count=("id", "size"),
                                                       gastro_mean=("gastro_500", "mean"),
                                                       price_cat_restaurant_mean=("price_cat_restaurant", "mean"),
                                                       clothes_shop_mean=("clothes_shop_500", "mean"),
                                                       attraction_mean=("attraction_500", "mean"),
                                                       art_mean=("art_500", "mean")).reset_index(level=None)


In [None]:
# calculate the same for all Airbnb's, in order to then calculate the % of Budget friendly Airbnbs 
grouped = poi.groupby(by="neighbourhood").agg(count=("id", "size"),
                                               gastro_mean=("gastro_500", "mean"),
                                               price_cat_restaurant_mean=("price_cat_restaurant", "mean"),
                                               clothes_shop_mean=("clothes_shop_500", "mean"),
                                               attraction_mean=("attraction_500", "mean"),
                                               art_mean=("art_500", "mean")).reset_index(level=None)


In [None]:
# Calculate percentage of count
budget_grouped['count_percentage'] = budget_grouped['count'] / grouped.loc[grouped['neighbourhood'].isin(budget_grouped['neighbourhood']), 'count'].values * 100
# round 
budget_grouped['count_percentage'] = round(budget_grouped['count_percentage'],1)

# Select only 'count' and 'count_percentage' columns
count_df = budget_grouped[['count', 'count_percentage']]

# Concatenate the two dataframes vertically
budget_result = pd.concat([count_df, budget_grouped.drop(
    columns=['count', 'count_percentage'])], axis=1)
budget_result = budget_result[['neighbourhood', 'count', 'count_percentage', 'gastro_mean',
                 'price_cat_restaurant_mean', 'clothes_shop_mean', 'attraction_mean', 'art_mean']].sort_values(by="count", ascending=False)


In [None]:
# create a DataFrame with all the desired rows and the corresponding budget values
desired_rows = ["City of London", "Westminster", "Camden", "Kensington and Chelsea", "Tower Hamlets", "Islington", "Hackney", "Southwark"]
budget_result_desired = pd.DataFrame({"neighbourhood": desired_rows})
budget_result_desired = budget_result_desired.merge(budget_result, on="neighbourhood", how="left")

# select the desired rows in the desired order
budget_result_desired = budget_result_desired.set_index("neighbourhood").loc[desired_rows, :].reset_index()

budget_result_desired


In [None]:
budget_result_desired = budget_result_desired.fillna("0")
budget_result_desired["count"] = budget_result_desired["count"].astype("int")
budget_result_desired["count_percentage"] = budget_result_desired["count_percentage"].astype("float")

In [None]:
# create a df to plot only the neighbourhoods with the most Airbnbs for this filter, and only the number of of Airbnbs + % 
plotting = budget_result_desired[["neighbourhood", "count", "count_percentage"]].head(11)
# Set the neighbourhood column as the index
plotting = plotting.set_index('neighbourhood')


In [None]:
# set colors for graphs
r = "#FF5A5F"
b = "#25328C"
w = "#FFFEF6"

In [None]:
import matplotlib.pyplot as plt

plt.rcParams["axes.grid"] = False

# Create figure and axis objects
fig, ax1 = plt.subplots(figsize=(12, 8))

plt.style.use(['default'])
fig.set_facecolor(w)

# Create bar chart for count column
ax1.bar(plotting.index, plotting['count'], color=[b, b, b, b ,b, b, r,b])
ax1.set_ylabel('Count')
ax1.set_xticklabels(plotting.index, rotation=45, ha="right")
ax1.get_yaxis().set_visible(False)

# # Create secondary axis for count_percentage column
# ax2 = ax1.twinx()
# ax2.plot(plotting.index, plotting['count_percentage'], color='red', marker='o')
# ax2.set_ylim(0, 100)
# ax2.set_ylabel('Count Percentage')
# ax2.get_yaxis().set_visible(False)

# Add count values inside bars
for i, val in enumerate(plotting['count']):
    ax1.text(i, val, str(val), ha='center', va='bottom', fontweight='bold')

# # Add count_percentage values inside line plot
# for i, val in enumerate(plotting['count_percentage']):
#     ax2.text(i, val, f"{val:.1f}%", ha='center', va='top', fontweight='bold')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

plt.title("Amount of Airbnb's for Active Budget Families")
# Show the plot
plt.show();


### #2 Luxury Traveller

Where can i find the "best" Airbnb's: 
* high price-category (3+4)
* high rating: 4,5+ 
* more than average poi's 

In [None]:
# Filter only Airbnb's where the amount of POI's is above the average +
# the Price Category of the Airnbnb is 3 or 4 + the Price Category of Restaurants is above average + 
# rating of airbnb is 4.5+ 

# Filter out rows where price_cat_restaurant or review_scores_rating is NaN
luxury_poi = poi[~poi['price_cat_restaurant'].isna()]
luxury_poi = poi[~poi['review_scores_rating'].isna()]

# calculate the mean of each column
mean_values = luxury_poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500', 'park_500',
                   'clothes_shop_500', 'attraction_500', 'gallery_500', 'museum_500', 'art_500']].mean()

# create a boolean mask based on the mean values and the price category conditions
mask = ((luxury_poi[['bar_500', 'cafe_500', 'fast_food_500', 'restaurant_500', 'gastro_500', 'park_500',
              'clothes_shop_500', 'attraction_500', 'gallery_500', 'museum_500', 'art_500']] > mean_values).any(axis=1)) & \
    (luxury_poi['price_category'] >= 4) & \
    (luxury_poi['price_cat_restaurant'] > luxury_poi['price_cat_restaurant'].mean()) & \
    (luxury_poi['rating_restaurant'] > 4.3)  & \
    (luxury_poi['review_scores_rating'] > 4.5)

# filter the dataframe using the boolean mask
luxury_poi = luxury_poi.loc[mask]

In [None]:
luxury_poi.shape

In [None]:
# group by neighbourhood and calculate the mean number of POI's + Total Number of Airbnb's  
luxury_grouped = luxury_poi.groupby(by="neighbourhood").agg(count=("id", "size"),
                                                       gastro_mean=("gastro_500", "mean"),
                                                       price_cat_restaurant_mean=("price_cat_restaurant", "mean"),
                                                       clothes_shop_mean=("clothes_shop_500", "mean"),
                                                       attraction_mean=("attraction_500", "mean"),
                                                       art_mean=("art_500", "mean")).reset_index(level=None)


In [None]:
# Calculate percentage of count
luxury_grouped['count_percentage'] = luxury_grouped['count'] / grouped.loc[grouped['neighbourhood'].isin(luxury_grouped['neighbourhood']), 'count'].values * 100

luxury_grouped['count_percentage'] = round(luxury_grouped['count_percentage'],1)

# Select only 'count' and 'count_percentage' columns
count_df = luxury_grouped[['count', 'count_percentage']]

# Concatenate the two dataframes vertically
luxury_result = pd.concat([count_df, luxury_grouped.drop(
    columns=['count', 'count_percentage'])], axis=1)
luxury_result = luxury_result[['neighbourhood', 'count', 'count_percentage', 'gastro_mean',
                 'price_cat_restaurant_mean', 'clothes_shop_mean', 'attraction_mean', 'art_mean']].sort_values(by="count", ascending=False)

In [None]:
# the desired order of rows
desired_rows = ["City of London", "Westminster", "Camden", "Kensington and Chelsea", "Tower Hamlets", "Islington", "Hackney", "Southwark"]

# select the desired rows in the desired order
luxury_result = luxury_result.loc[luxury_result["neighbourhood"].isin(desired_rows), :]
luxury_result = luxury_result.loc[luxury_result["neighbourhood"].isin(desired_rows), :]
luxury_result = luxury_result.set_index("neighbourhood").loc[desired_rows, :].reset_index()

luxury_result

In [None]:
# create a df to plot only the neighbourhoods with the most Airbnbs for this filter, and only the number of of Airbnbs + % 
plotting_lux = luxury_result[["neighbourhood", "count", "count_percentage"]].head(11)
# Set the neighbourhood column as the index
plotting_lux = plotting_lux.set_index('neighbourhood')

In [None]:
# Create figure and axis objects
fig, ax1 = plt.subplots(figsize=(12, 8))

plt.style.use(['default'])
fig.set_facecolor(w)

# Create bar chart for count column
ax1.bar(plotting_lux.index, plotting_lux['count'], color=[b,r, r,r,b,b,b,b])
ax1.set_ylabel('Count')
ax1.set_xticklabels(plotting_lux.index, rotation=45, ha="right")
ax1.get_yaxis().set_visible(False)

# # Create secondary axis for count_percentage column
# ax2 = ax1.twinx()
# ax2.plot(plotting_lux.index, plotting_lux['count_percentage'], color='red', marker='o')
# ax2.set_ylim(0, 100)
# ax2.set_ylabel('Count Percentage')
# ax2.get_yaxis().set_visible(False)

# Add count values inside bars
for i, val in enumerate(plotting_lux['count']):
    ax1.text(i, val, str(val), ha='center', va='bottom', fontweight='bold')

# # Add count_percentage values inside line plot
# for i, val in enumerate(plotting_lux['count_percentage']):
#     ax2.text(i, val, f"{val:.1f}%", ha='center', va='top', fontweight='bold')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)


plt.title("Amount of good-rated-luxury Airbnb's with decent amount of POI's, good & high-priced Restaurants")
# Show the plot
plt.show();


### #3 "Green City Explorer"

* Where can i find good Airbnb's in quiet Areas with less POI's. Lots of Parks and good public transport

In [None]:
poi.head(2)

In [None]:
# Filter only Airbnb's where the amount of POI's is below the average +
# rating of airbnb is 4+ plus access to subway

q25_cols = ['bar_500',  
            'clothes_shop_500', 'gallery_500', 'museum_500', 'art_500']
park_col = 'park_500'
subway_col = 'subway_entrance_500'
rating_col = 'review_scores_rating'

q25_mask = poi[q25_cols].apply(lambda x: x <= x.quantile(0.45)).all(axis=1)
park_mask = poi[park_col] >= poi[park_col].quantile(0.60)
subway_mask = poi[subway_col] == True
rating_mask = poi[rating_col] > 4.0
balcony_mask = poi["private_patio_or_balcony"] == 1 

green_poi = poi[q25_mask & park_mask & rating_mask & balcony_mask]

In [None]:
green_poi[['bar_500', 'clothes_shop_500', 'gallery_500', 'museum_500', 'art_500', 'park_500']].describe()

In [None]:
green_poi.shape

In [None]:
# group by neighbourhood and calculate the mean number of POI's + Total Number of Airbnb's  
green_grouped = green_poi.groupby(by="neighbourhood").agg(count=("id", "size"),
                                                       park_mean=("park_500", "mean")).reset_index(level=None)


In [None]:
# Calculate percentage of count
green_grouped['count_percentage'] = green_grouped['count'] / grouped.loc[grouped['neighbourhood'].isin(green_grouped['neighbourhood']), 'count'].values * 100

green_grouped['count_percentage'] = round(green_grouped['count_percentage'],1)

# Select only 'count' and 'count_percentage' columns
count_df = green_grouped[['count', 'count_percentage']]

# Concatenate the two dataframes vertically
green_result = pd.concat([count_df, green_grouped.drop(
    columns=['count', 'count_percentage'])], axis=1)
green_result = green_result[['neighbourhood', 'count', 'count_percentage', 'park_mean']].sort_values(by="count", ascending=False)

In [None]:
green_result.head(8)

In [None]:
# create a DataFrame with all the desired rows and the corresponding budget values
desired_rows = ["City of London", "Westminster", "Camden", "Kensington and Chelsea", "Tower Hamlets", "Islington", "Hackney", "Southwark"]
green_result_2 = pd.DataFrame({"neighbourhood": desired_rows})
green_result_2 = green_result_2.merge(green_result, on="neighbourhood", how="left")

# select the desired rows in the desired order
green_result_2 = green_result_2.set_index("neighbourhood").loc[desired_rows, :].reset_index()

green_result_2

In [None]:
green_result_2 = green_result_2.fillna("0")
green_result_2["count"] = green_result_2["count"].astype("int")
green_result_2["count_percentage"] = green_result_2["count_percentage"].astype("float")

In [None]:
# create a df to plot only the neighbourhoods with the most Airbnbs for this filter, and only the number of of Airbnbs + % 
plotting_green = green_result_2[["neighbourhood", "count", "count_percentage"]].head(8)
# Set the neighbourhood column as the index
plotting_green = plotting_green.set_index('neighbourhood')

In [None]:
plotting_green

In [None]:
import matplotlib.pyplot as plt

# Create figure and axis objects
fig, ax1 = plt.subplots(figsize=(12, 8))

plt.style.use(['default'])
fig.set_facecolor(w)

# Create bar chart for count column
ax1.bar(plotting_green.index, plotting_green['count'], color=[b,b, b,b,r,b,b,b])
ax1.set_ylabel('Count')
ax1.set_xticklabels(plotting_green.index, rotation=45, ha="right")
ax1.get_yaxis().set_visible(False)

#props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
#ax1.text(6, 190,"Test", verticalalignment='top', bbox=props)

# # Create secondary axis for count_percentage column
# ax2 = ax1.twinx()
# ax2.plot(plotting_green.index, plotting_green['count_percentage'], color='red', marker='o')
# ax2.set_ylim(0, 100)
# ax2.set_ylabel('Count Percentage')
# ax2.get_yaxis().set_visible(False)

# Add count values inside bars
for i, val in enumerate(plotting_green['count']):
    ax1.text(i, val, str(val), ha='center', va='bottom', fontweight='bold')

# # Add count_percentage values inside line plot
# for i, val in enumerate(plotting_green['count_percentage']):
#     ax2.text(i, val, f"{val:.1f}%", ha='center', va='top', fontweight='bold')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)


plt.title("Amount of Airbnb's for Green City Explorer")
# Show the plot
plt.show();


In [None]:
# # the desired order of rows
# desired_rows = ["City of London", "Westminster", "Camden", "Kensington and Chelsea", "Tower Hamlets", "Islington", "Hackney", "Southwark"]

# # select the desired rows in the desired order
# green_result = green_result.loc[green_result["neighbourhood"].isin(desired_rows), :]
# green_result = green_result.loc[green_result["neighbourhood"].isin(desired_rows), :]
# green_result = green_result.set_index("neighbourhood").loc[desired_rows, :].reset_index()

# green_result

### #4 Which Neighbourhood is good for: 

* going out 
* foodies 
* shopping 
* culture 

In [None]:
# create new columns for culture & for going out 
poi["culture_500"] = poi["art_500"] + poi["attraction_500"]
poi["going_out_500"] = poi["bar_500"] + poi["cafe_500"]

In [None]:
# calculate the same for all Airbnb's, in order to then calculate the % of Budget friendly Airbnbs 
grouped_3 = poi.groupby(by="neighbourhood").agg(count=("id", "size"),
                                                restaurant_mean=("restaurant_500", "mean"),
                                                clothes_shop_mean=("clothes_shop_500", "mean"),
                                                culture_mean=("culture_500", "mean"),
                                                going_out_mean=("going_out_500", "mean")).reset_index(level=None)


In [None]:
grouped_3["total_mean"] = grouped_3["restaurant_mean"] + grouped_3["clothes_shop_mean"]	+ grouped_3["culture_mean"] + grouped_3["going_out_mean"]

In [None]:
plot_theme = grouped_3.sort_values(by="total_mean", ascending=False).reset_index().head(8)

In [None]:
plot_theme[['restaurant_mean', 'clothes_shop_mean', 'culture_mean', 'going_out_mean', 'total_mean']] = plot_theme[['restaurant_mean', 'clothes_shop_mean', 'culture_mean', 'going_out_mean', 'total_mean']].astype(int)

In [None]:
plot_theme[['restaurant_mean',
            'clothes_shop_mean', 'culture_mean', 'going_out_mean', 'total_mean']] = round(plot_theme[['restaurant_mean',
                                                                                                      'clothes_shop_mean', 'culture_mean', 'going_out_mean', 'total_mean']], 0)


In [None]:
plot_theme

In [None]:
import matplotlib.pyplot as plt

# define the figure size
fig, axs = plt.subplots(figsize=(18, 10), nrows=2, ncols=2)

plt.style.use(['default'])
fig.set_facecolor(w)

# create the first barplot in the upper left corner
axs[0, 0].bar(plot_theme['neighbourhood'], plot_theme['restaurant_mean'], color=[r,r, b,b,b,b,b,b])
axs[0, 0].set_title('Restaurants')
#axs[0, 0].tick_params(axis='x', rotation=45, ha="right")
axs[0, 0].set_xticklabels(plot_theme['neighbourhood'], rotation=45, ha="right")
axs[0, 0].tick_params(axis='y', labelleft=False)


# add values in the bars
for i, v in enumerate(plot_theme['restaurant_mean']):
    axs[0, 0].text(i, v, str(v), color='black', ha='center', va='bottom')

# create the second barplot in the upper right corner
axs[0, 1].bar(plot_theme['neighbourhood'], plot_theme['clothes_shop_mean'], color=[b,r, b,r,b,b,b,b])
axs[0, 1].set_title('Shopping')
axs[0, 1].set_xticklabels(plot_theme['neighbourhood'], rotation=45, ha="right")
axs[0, 1].tick_params(axis='y', labelleft=False)

# add values in the bars
for i, v in enumerate(plot_theme['clothes_shop_mean']):
    axs[0, 1].text(i, v, str(v), color='black', ha='center', va='bottom')

# create the third barplot in the lower left corner
axs[1, 0].bar(plot_theme['neighbourhood'], plot_theme['culture_mean'], color=[r,r, r,r,b,b,b,b])
axs[1, 0].set_title('Culture')
axs[1, 0].set_xticklabels(plot_theme['neighbourhood'], rotation=45, ha="right")
axs[1, 0].tick_params(axis='y', labelleft=False)

# add values in the bars
for i, v in enumerate(plot_theme['culture_mean']):
    axs[1, 0].text(i, v, str(v), color='black', ha='center', va='bottom')

# create the fourth barplot in the lower right corner
axs[1, 1].bar(plot_theme['neighbourhood'], plot_theme['going_out_mean'], color=[r,b, b,b,b,b,b,b])
axs[1, 1].set_title('Going Out')
axs[1, 1].set_xticklabels(plot_theme['neighbourhood'], rotation=45, ha="right")
axs[1, 1].tick_params(axis='y', labelleft=False)

# add values in the bars
for i, v in enumerate(plot_theme['going_out_mean']):
    axs[1, 1].text(i, v, str(v), color='black', ha='center', va='bottom')

# adjust the spacing between the subplots
plt.subplots_adjust(hspace=0.6)

# add a title for the whole figure
plt.suptitle("Average Count of POI's per Category")

# display the plot
plt.show()



In [None]:
import matplotlib.pyplot as plt

# define the figure size
plt.figure(figsize=(18, 10))

plt.style.use(['default'])
#fig.set_facecolor(w)

# create the first barplot in the upper left corner
plt.subplot(2, 2, 1)
plt.bar(plot_theme['neighbourhood'], plot_theme['restaurant_mean'], color=[r,r, b,b,b,b,b,b])
plt.title('Restaurants')
plt.xticks(rotation=45, ha="right")
plt.yticks([])

# add values in the bars
for i, v in enumerate(plot_theme['restaurant_mean']):
    plt.text(i, v, str(v), color='black', ha='center', va='bottom')

# create the second barplot in the upper right corner
plt.subplot(2, 2, 2)
plt.bar(plot_theme['neighbourhood'], plot_theme['clothes_shop_mean'], color=[b,r, b,r,b,b,b,b])
plt.title('Shopping')
plt.xticks(rotation=45, ha="right")
plt.yticks([])

# add values in the bars
for i, v in enumerate(plot_theme['clothes_shop_mean']):
    plt.text(i, v, str(v), color='black', ha='center', va='bottom')

# create the third barplot in the lower left corner
plt.subplot(2, 2, 3)
plt.bar(plot_theme['neighbourhood'], plot_theme['culture_mean'], color=[r,r, r,r,b,b,b,b])
plt.title('Culture')
plt.xticks(rotation=45, ha="right")
plt.yticks([])

# add values in the bars
for i, v in enumerate(plot_theme['culture_mean']):
    plt.text(i, v, str(v), color='black', ha='center', va='bottom')

# create the fourth barplot in the lower right corner
plt.subplot(2, 2, 4)
plt.bar(plot_theme['neighbourhood'], plot_theme['going_out_mean'], color=[r,b, b,b,b,b,b,b])
plt.title('Going Out')
plt.xticks(rotation=45, ha="right")
plt.yticks([])

# add values in the bars
for i, v in enumerate(plot_theme['going_out_mean']):
    plt.text(i, v, str(v), color='black', ha='center', va='bottom')

# adjust the spacing between the subplots
plt.subplots_adjust(hspace=0.6)

# add a title for the whole figure
plt.suptitle("Average Count of POI's per Category")

# display the plot
plt.show()



### correlation of agglomeration of POI's + Airbnbs

In [None]:
result.corr()["count"].tail(5)

In [None]:
result.corr()["count"].tail(5).plot(kind="bar")

plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.title("Generall correlation of POI's to Number of Airbnbs in all Neighbourhoods")
plt.show()

## 10.EDA: poi (gastro)

### Heatmap

In [None]:
# set relevant columns for hypothesis 
heat = poi[["price", 
                     "price_category", 
                     "number_of_reviews", 
                     "number_of_reviews_ltm", 
                     "reviews_per_month", 
                     "review_scores_rating",
                     "room_type", 
                     "room_type_int", 
                     "restaurant_500", 
                     "fast_food_500", 
                     "bar_500", 
                     "cafe_500", 
                     "gastro_500"]]

In [None]:
corr = heat.corr(numeric_only=True)

sns.set(rc={"figure.figsize":(16, 12)})

# getting the upper triangle of the co-relation matrix
matrix = np.triu(corr)

# using the upper triangle matrix as mask 
sns.heatmap(corr, annot=True, mask=matrix)

### Hypothesis: In Area's with lot's of POI's, there are less Airbnb's which offer Entire Homes

In [None]:
# group by room_type_int and calculate the mean value 
poi.groupby(by="room_type_int").agg({"gastro_500": "mean", 
                                             "restaurant_500": "mean",
                                             "fast_food_500": "mean",
                                             "cafe_500": "mean",
                                             "bar_500": "mean",
                                             "id": "size"})[["gastro_500","restaurant_500","fast_food_500","cafe_500", "bar_500", "id"]]

In [None]:
# plot without category "hotel room(3)" or "shared room (1)"
poi[(poi["room_type_int"] != 1) & (poi["room_type_int"] != 3)].groupby(by="room_type_int").agg({"gastro_500": "mean", 
                                             "restaurant_500": "mean",
                                             "fast_food_500": "mean",
                                             "cafe_500": "mean",
                                             "bar_500": "mean"})[["restaurant_500","fast_food_500","cafe_500", "bar_500", "gastro_500"]].plot(kind="bar")
plt.xticks([0, 1], ["Private Room", "Entire Home"], rotation=360)
plt.title("Average # of POI's per Room Type")
plt.show()

### Hypothesis: POI's have direct impact on Price & Demand of Airbnb's

In [None]:
# group by price category and calculate the mean value 
poi.groupby(by="price_category").agg({"gastro_500": "mean", 
                                             "restaurant_500": "mean",
                                             "fast_food_500": "mean",
                                             "cafe_500": "mean",
                                             "bar_500": "mean",
                                             "id": "size"})[["gastro_500","restaurant_500","fast_food_500","cafe_500", "bar_500", "id"]]

In [None]:
# plot without category 0 
poi[poi["price_category"] != 0].groupby(by="price_category").agg({"gastro_500": "mean", 
                                             "restaurant_500": "mean",
                                             "fast_food_500": "mean",
                                             "cafe_500": "mean",
                                             "bar_500": "mean"})[["restaurant_500","fast_food_500","cafe_500", "bar_500", "gastro_500"]].plot(kind="bar")

plt.xticks([0, 1, 2, 3], ["<60", "61-100", "101-180", ">180"], rotation=360)
plt.title("Average # of POI's in each Airbnb-Price Category")
plt.show()

### Hypothesis: More POI's == better reviews (for location & general)

### Rating vs. Gastro POI's vs. Price Category

In [None]:
sns.set(rc={"figure.figsize":(16, 12)})

# Filter only Reviews with > 0 Reviews per Month, without Category 0
poi_gastro_filtered = poi[(poi["reviews_per_month"] > 0) & (poi["price_category"] > 0)]

# Define the red-green color palette
custom_palette = sns.color_palette("RdYlGn", as_cmap=True)

# Create the scatter plot with hue based on review_scores_rating
ax = sns.scatterplot(x="reviews_per_month", y="gastro_500", hue="price_category", data=poi_gastro_filtered, palette=custom_palette)

# Rename the x-axis and y-axis labels
ax.set_xlabel("Number of Reviews per Month")
ax.set_ylabel("Number of Gastronomy POIs")

plt.title("Number of Reviews vs. Number of Gastronomy-POI's vs. Price Category")
plt.show()

In [None]:
sns.set(rc={"figure.figsize":(16, 12)})

# Filter only Reviews with > 0 Reviews per Month, without Category 0, and without outliers
poi_gastro_filtered = poi[(poi["reviews_per_month"] > 0) & (poi["reviews_per_month"] < 10) & (poi["price_category"] > 0)]

# Define the red-green color palette
custom_palette = sns.color_palette("RdYlGn", as_cmap=True)

# Create the scatter plot with hue based on review_scores_rating
ax = sns.scatterplot(x="reviews_per_month", y="gastro_500", hue="price_category", data=poi_gastro_filtered, palette=custom_palette)

# Rename the x-axis and y-axis labels
ax.set_xlabel("Number of Reviews per Month")
ax.set_ylabel("Number of Gastronomy POIs")

plt.title("Number of Reviews vs. Number of Gastronomy-POI's vs. Price Category \n Without Outliers ")
plt.show()

### Hypothesis: Certain Amenities have direct impact on Price & Demand of Airbnb's

In [None]:
#Correlation between price & amenities 

columns = ['private_bath', 'wifi', 'long_term_stays_allowed',
       'private_patio_or_balcony', 'private_entrance', 'pets_allowed',
       'outdoor_dining_area', 'lockbox', 'kitchen', 'hair_dryer',
       'free_street_parking', 'free_parking_on_premises',
       'dedicated_workspace', 'coffee', 'bed_linens',
       'bathtub']


selected_columns = poi[columns]
corr = selected_columns.corrwith(poi["price_category"]).sort_values(ascending=False)

print(corr)

In [None]:
plt.figure(figsize=(8, 6))

corr.plot(kind="bar")
plt.title("Correlation Amenities vs. Price ")

plt.ylim(-0.1, 0.5)

In [None]:
#Correlation between amenities & reviews 

columns = ['private_bath', 'wifi', 'long_term_stays_allowed',
       'private_patio_or_balcony', 'private_entrance', 'pets_allowed',
       'outdoor_dining_area', 'lockbox', 'kitchen', 'hair_dryer',
       'free_street_parking', 'free_parking_on_premises',
       'dedicated_workspace', 'coffee', 'bed_linens',
       'bathtub']


selected_columns = poi[columns]
corr = selected_columns.corrwith(poi["review_scores_rating"]).sort_values(ascending=False)

print(corr)

In [None]:
plt.figure(figsize=(8, 6))

corr.plot(kind="bar")
plt.title("Correlation Amenities vs. Review Score")

plt.ylim(-0.1, 0.5)

## 11.Calculation in Python

the calculation of the distances below didnt match the calculated distances in Tableau. Therefore we sticked with the Tableau calculations.

### POI in Area

In [None]:
# ### subset's of df's for each amenity 
# restaurant = osm_short[osm_short["amenity"] == "restaurant"][['id', 'latitude', 'longitude', 'name', 'amenity']]
# fast_food = osm_short[osm_short["amenity"] == "fast_food"][['id', 'latitude', 'longitude', 'name', 'amenity']]
# cafe = osm_short[osm_short["amenity"] == "cafe"][['id', 'latitude', 'longitude', 'name', 'amenity']]
# bar = osm_short[osm_short["amenity"] == "bar"][['id', 'latitude', 'longitude', 'name', 'amenity']]
# subway = osm_short[osm_short["railway"] == "subway_entrance"][['id', 'latitude', 'longitude', 'name', 'railway']]

# street_test = osm_short[["id", "latitude", "longitude", "amenity"]]

In [None]:
# airbnb_short = airbnb[['id', 'latitude', 'longitude']]

#### Function meters_to_degrees

In [None]:
# # Define the conversion factor from meters to degrees based on the latitude
# def meters_to_degrees(meters, latitude):
#     proj_meters = pyproj.CRS("EPSG:3857")  # meters
#     proj_latlon = pyproj.CRS("EPSG:4326")  # degrees
#     transformer = pyproj.Transformer.from_crs(
#         proj_meters, proj_latlon, always_xy=True)
#     lon, lat = transformer.transform(meters, 0)

#     # Calculate the distance per degree of latitude
#     lat_dist_per_deg = 111132.954 - 559.822 * math.cos(2 * math.radians(latitude)) + 1.175 * math.cos(
#         4 * math.radians(latitude)) - 0.0023 * math.cos(6 * math.radians(latitude))

#     # Calculate the distance per degree of longitude
#     lon_dist_per_deg = math.pi / 180 * 6378137 * \
#         math.cos(math.radians(latitude))

#     lat_degrees = meters / lat_dist_per_deg
#     lon_degrees = meters / lon_dist_per_deg
#     return lat_degrees, lon_degrees

### 100 Meter

In [None]:
# # ignore seetingswithcopy only for this cell. will be set back to warn at the end of the code
# pd.options.mode.chained_assignment = None

# # Convert the airbnb_short DataFrame to a GeoDataFrame with a Point geometry column
# airbnb_geo = gpd.GeoDataFrame(airbnb_short, geometry=gpd.points_from_xy( airbnb_short["longitude"], airbnb_short["latitude"]))

# # Convert the street_test DataFrame to a GeoDataFrame with a Point geometry column
# street_test_geo = gpd.GeoDataFrame(street_test, geometry=gpd.points_from_xy(street_test["longitude"], street_test["latitude"]))

# # Create an array of coordinates for the street_test GeoDataFrame
# X = np.column_stack((street_test_geo["longitude"].values, street_test_geo["latitude"].values))

# # Create a BallTree spatial index for the street_test GeoDataFrame
# tree = BallTree(X, leaf_size=40)

# # Define the radius of the search in meters
# radius_meters = 100

# # Loop through each row in airbnb_geo
# for index, row in airbnb_geo.iterrows():
#     # Convert the radius from meters to degrees based on the latitude
#     lat, lon = row["latitude"], row["longitude"]
#     lat_deg, lon_deg = meters_to_degrees(radius_meters, lat)

#     # Use the BallTree spatial index to find the street_test rows within the search radius
#     indices = tree.query_radius([[row["longitude"], row["latitude"]]], r=lon_deg)[0]

#     # Filter the street_test rows to only those within the search radius
#     candidate_rows = street_test_geo.iloc[indices]

#     # Count the occurrences of each amenity in the candidate rows
#     counts = candidate_rows["amenity"].value_counts().to_dict()

#     # Add the counts as new columns in the airbnb_short DataFrame
#     for amenity_type, count in counts.items():
#         airbnb_short.at[index, amenity_type] = count

# #    # Add the list of ids as a new column in the airbnb_short
# #    airbnb_short.at[index, "street_test_ids"] = str(candidate_rows["id"].tolist())

#     # If there are no amenities in the given radius, append "no amenities" in the list of ids
# #    if not candidate_rows["id"].tolist():
# #        airbnb_short.at[index, "street_test_ids"] = "no amenities"

#     # Print progress
#     if index % 10000 == 0:
#         print(f"Processed {index} rows")

# # Replace NaN values with 0
# airbnb_short.fillna(value=0, inplace=True)

# pd.options.mode.chained_assignment = 'warn'

### Nearest distance 

In [None]:
# ### is calculating values, but they seem to small
# from scipy.spatial import cKDTree

# # Import the radians function from numpy
# from numpy import radians

# # Convert the latitude and longitude columns in both dataframes to radians
# airbnb[['latitude', 'longitude']] = radians(airbnb[['latitude', 'longitude']])
# subway[['latitude', 'longitude']] = radians(subway[['latitude', 'longitude']])

# # Build the KDTree index using the radians converted latitude and longitude columns in the subway dataframe
# subway_tree = cKDTree(subway[['latitude', 'longitude']])

# # Query the KDTree index for the nearest subway station to each airbnb location
# distances, indices = subway_tree.query(airbnb[['latitude', 'longitude']], k=1)

# # Convert the distance from radians to meters
# earth_radius = 6371000  # radius of the Earth in meters
# distances_meters = distances * earth_radius

# # Add the nearest subway station distance to each airbnb row
# airbnb['nearest_subway_distance'] = distances_meters



In [None]:
#airbnb['nearest_subway_distance'].describe()

## Old Versions

In [None]:
# ### runs, but with too smal results 

# # ignore seetingswithcopy only for this cell. will be set back to warn at the end of the code 
# pd.options.mode.chained_assignment = None

# # Define the conversion factor from meters to degrees based on the latitude
# def meters_to_degrees(meters, latitude):
#     proj_meters = pyproj.CRS("EPSG:3857")  # meters
#     proj_latlon = pyproj.CRS("EPSG:4326")  # degrees
#     transformer = pyproj.Transformer.from_crs(proj_meters, proj_latlon, always_xy=True)
#     lon, lat = transformer.transform(meters, 0)
#     lat_dist_per_deg = 111132.954 - 559.822 * math.cos(2 * math.radians(latitude)) + 1.175 * math.cos(4 * math.radians(latitude))
#     lon_dist_per_deg = 111412.84 * math.cos(math.radians(latitude))
#     lat_degrees = meters / lat_dist_per_deg
#     lon_degrees = meters / lon_dist_per_deg
#     return lat_degrees, lon_degrees


# airbnb_test["closest_amenity"] = ""


# # Convert the airbnb_test DataFrame to a GeoDataFrame with a Point geometry column
# airbnb_test_geo = gpd.GeoDataFrame(airbnb_test, geometry=gpd.points_from_xy(airbnb_test["longitude"], airbnb_test["latitude"]))

# # Convert the street_test DataFrame to a GeoDataFrame with a Point geometry column
# street_test_geo = gpd.GeoDataFrame(street_test, geometry=gpd.points_from_xy(street_test["longitude"], street_test["latitude"]))

# # Create an R-tree spatial index for the street_test GeoDataFrame
# street_test_sindex = street_test_geo.sindex

# # Define the radius of the search in meters
# radius_meters = 1_000

# # Loop through each row in airbnb_test_geo
# for index, row in airbnb_test_geo.iterrows():
#     # Convert the radius from meters to degrees based on the latitude
#     lat, lon = row["latitude"], row["longitude"]
#     lat_deg, lon_deg = meters_to_degrees(radius_meters, lat)
    
#     # Use the R-tree spatial index to find the street_test rows within the search radius
#     candidate_indices = list(street_test_sindex.intersection(row.geometry.buffer(lon_deg).bounds))

#     # Filter the street_test rows to only those within the search radius
#     candidate_rows = street_test_geo.iloc[candidate_indices]
# #
#     if len(candidate_rows) == 0:
#         # No amenities within the search radius
#         closest_amenity_distance = np.nan
#     else:
#         # Calculate the distances from the current Airbnb location to all the amenities in the search radius
#         candidate_rows["distance"] = candidate_rows.geometry.distance(row.geometry)

#         # Sort the candidate rows by distance
#         candidate_rows = candidate_rows.sort_values("distance")

#         # Find the closest amenity and its distance
#         closest_amenity = candidate_rows["amenity"].iloc[0]
#         closest_amenity_distance = candidate_rows["distance"].iloc[0]

#     # Add the closest amenity and its distance as new columns in the airbnb_test DataFrame
#     airbnb_test.at[index, "closest_amenity"] = closest_amenity
#     airbnb_test.at[index, "closest_amenity_distance_m"] = closest_amenity_distance

#     # Print progress
#     if index % 10000 == 0:
#         print(f"Processed {index} rows")

# # set seetingswithcopy back only for this cell. will be set back to warn at the end of the code 
# pd.options.mode.chained_assignment = 'warn'


In [None]:
#airbnb_test.describe()

In [None]:
# ### runs, but with less accurate results, than in 4.1.

# ### Number of amenities + liste 

# import geopandas as gpd
# from shapely.geometry import Point
# from shapely.ops import transform
# from functools import partial
# import pyproj
# import math

# # Define the conversion factor from meters to degrees based on the latitude
# def meters_to_degrees(meters, latitude):
#     proj_meters = pyproj.CRS("EPSG:3857")  # meters
#     proj_latlon = pyproj.CRS("EPSG:4326")  # degrees
#     transformer = pyproj.Transformer.from_crs(
#         proj_meters, proj_latlon, always_xy=True)
#     lon, lat = transformer.transform(meters, 0)

#     # Calculate the distance per degree of latitude
#     lat_dist_per_deg = 111132.954 - 559.822 * math.cos(2 * math.radians(latitude)) + 1.175 * math.cos(
#         4 * math.radians(latitude)) - 0.0023 * math.cos(6 * math.radians(latitude))

#     # Calculate the distance per degree of longitude
#     lon_dist_per_deg = math.pi / 180 * 6378137 * \
#         math.cos(math.radians(latitude))

#     lat_degrees = meters / lat_dist_per_deg
#     lon_degrees = meters / lon_dist_per_deg
#     return lat_degrees, lon_degrees


# # Convert the airbnb_test DataFrame to a GeoDataFrame with a Point geometry column
# airbnb_test_geo = gpd.GeoDataFrame(airbnb_test, geometry=gpd.points_from_xy(airbnb_test["longitude"], airbnb_test["latitude"]))

# # Convert the street_test DataFrame to a GeoDataFrame with a Point geometry column
# street_test_geo = gpd.GeoDataFrame(street_test, geometry=gpd.points_from_xy(street_test["longitude"], street_test["latitude"]))

# # Create an R-tree spatial index for the street_test GeoDataFrame
# street_test_sindex = street_test_geo.sindex

# # Define the radius of the search in meters
# radius_meters = 200

# # Loop through each row in airbnb_test_geo
# for index, row in airbnb_test_geo.iterrows():
#     # Convert the radius from meters to degrees based on the latitude
#     lat, lon = row["latitude"], row["longitude"]
#     lat_deg, lon_deg = meters_to_degrees(radius_meters, lat)
    
#     # Use the R-tree spatial index to find the street_test rows within the search radius
#     candidate_indices = list(street_test_sindex.intersection(row.geometry.buffer(lon_deg).bounds))

#     # Filter the street_test rows to only those within the search radius
#     candidate_rows = street_test_geo.iloc[candidate_indices]

#     # Create an empty list to store the id's of street_test rows
#     ids = []

#     # Group the candidate rows by amenity and count the occurrences
#     counts = candidate_rows.groupby("amenity").size().to_dict()

#     # Add the counts as new columns in the airbnb_test DataFrame
#     for amenity_type, count in counts.items():
#         airbnb_test.at[index, amenity_type] = count
#         ids.extend(candidate_rows[candidate_rows["amenity"] == amenity_type]["id"].tolist())

#     # If there are no amenities in the given radius, append "no amenities" in the list of ids
#     if not ids:
#         ids.append("no amenities")
        
#     # Add the list of ids as a new column in the airbnb_test DataFrame
#     airbnb_test.at[index, "street_test_ids"] = str(ids)

#     # Print progress
#     if index % 10000 == 0:
#         print(f"Processed {index} rows")

# # Replace NaN values with 0
# airbnb_test.fillna(value=0, inplace=True)



In [None]:
# ### Calculation of POI's in Area with Balltree (old, not working properly)

# # Calculate the needed radius when converted to unit sphere.
# distance_in_meter = 200
# earth_radius_in_meter = 6_371_000

# radius = distance_in_meter / earth_radius_in_meter

# # Convert the latitude and longitude columns to radians
# airbnb_test = airbnb_test.copy()
# airbnb_test.loc[:, 'lat_rad'] = np.radians(airbnb_test['latitude'])
# airbnb_test.loc[:, 'lon_rad'] = np.radians(airbnb_test['longitude'])
# street_test = street_test.copy()
# street_test.loc[:, 'lat_rad'] = np.radians(street_test['latitude'])
# street_test.loc[:, 'lon_rad'] = np.radians(street_test['longitude'])

# # Create a BallTree object with the latitude and longitude columns
# tree = BallTree(street_test[['lat_rad', 'lon_rad']],
#                 leaf_size=15, metric='haversine')

# # Find the indices of all neighbors within a radius of 500 meters
# # for each row in list_test
# indices = tree.query_radius(
#     airbnb_test[['lat_rad', 'lon_rad']], r=radius, count_only=False)

# # Calculate the number of neighbors for each amenity type
# amenity_types = street_test['amenity'].unique()
# amenity_counts = np.zeros((airbnb_test.shape[0], amenity_types.shape[0]))
# for i, amenity in enumerate(amenity_types):
#     street_indices = street_test[street_test['amenity'] == amenity].index
#     intersection_counts = np.array(
#         [np.intersect1d(street_indices, idx).size for idx in indices])
#     amenity_counts[:, i] = intersection_counts

# # Add the new columns to list_test
# list_test = pd.concat([airbnb_test, pd.DataFrame(amenity_counts, columns=[
#                       f'num_neighbors_{amenity}' for amenity in amenity_types])], axis=1)

# # Calculate the number of neighbors for each railway type
# railway_types = street_test['railway'].unique()
# railway_counts = np.zeros((list_test.shape[0], railway_types.shape[0]))
# for i, railway in enumerate(railway_types):
#     street_indices = street_test[street_test['railway'] == railway].index
#     intersection_counts = np.array(
#         [np.intersect1d(street_indices, idx).size for idx in indices])
#     railway_counts[:, i] = intersection_counts

# # Add the new columns to list_test
# list_test = pd.concat([list_test, pd.DataFrame(railway_counts, columns=[
#                       f'num_neighbors_{railway}' for railway in railway_types])], axis=1)


# # Remove the temporary columns
# list_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
# street_test.drop(columns=['lat_rad', 'lon_rad'], inplace=True)
