# Does the Tourism Preferences Changes after COVID19?

## CASA0013: Foundations of Spatial Data Science

### Student Ids: ucfnjji, ucfnlun, ucfnpar, ucfnrli.

## Green Space Data

In [2]:
# Import visualisation modules
import matplotlib as mpl 
%matplotlib inline 
import matplotlib.pyplot as plt 

#Import modules
import osmnx as ox
import pandas as pd
import geopandas as gpd
import numpy as np
import os

import warnings 
warnings.simplefilter(action='ignore')
ox.__version__

'1.2.2'

In [3]:
# Set up query
q1 = {
    "tourism":"camp_site",
    'leisure':'common',
    'leisure':'dog_park',
    'natural':'scrub',
    'natural':'fell',
    'landuse':'forest',
    'leisure':'garden',
    'landuse':'greenfield',
    'leisure':'golf_course',
    'landuse':'grass',
    'natural':'grassland',
    'natural':'heath',
    'landuse':'meadow',
    'natural':'moor',
    'leisure':'nature_reserve',
    'landuse':'orchard',
    'leisure':'park',
    'leisure':'pitch',
    'landuse':'recreation_ground',
    'landuse':'village_green',
    'landuse':'vineyard',
    'natural':'wood'
    }

greenspace = ox.geometries.geometries_from_place(
            'Greater London, UK',
            tags = q1,
            which_result=1)

greenspace.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,created_by,barrier,bicycle,foot,source,leisure,name,name:ru,sport,...,oneway,construction,proposed,religion,danger,genus:en,informal,ways,type,network
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,20851184,POINT (-0.33622 51.40443),,,,,,pitch,The Royal Tennis Court,Реал-теннис,real_tennis,...,,,,,,,,,,
node,92273182,POINT (-0.40698 51.48916),JOSM,,,,,,,,,...,,,,,,,,,,
node,895874399,POINT (-0.23028 51.55593),,,,,,pitch,Kilburn Cosmos RFC,,rugby,...,,,,,,,,,,
node,920063079,POINT (-0.06894 51.56576),,,,,,pitch,,,,...,,,,,,,,,,
node,1296074660,POINT (-0.17313 51.41807),,,,,,pitch,,,table_tennis,...,,,,,,,,,,


In [4]:
greenspace.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 15452 entries, ('node', 20851184) to ('relation', 15268904)
Columns: 278 entries, geometry to network
dtypes: geometry(1), object(277)
memory usage: 33.4+ MB


In [5]:
path = os.path.join('data')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
greenspace.to_csv(os.path.join(path,'greenspace.csv'), index=False)

## Tourism Attraction Data

In [6]:
# Set up query
q2 = {'tourism':'attraction'}

# Run query
# Jin: I change ox.pois.pois_from_place to ox.geometries.geometries_from_place, for the adjustment of osmnx version change.
# details: https://stackoverflow.com/questions/71559143/what-happened-to-the-pois-module-in-osmnx-and-what-to-use-now
tourism_attraction = ox.geometries.geometries_from_place(
            'Greater London, UK',
            tags = q2,
            which_result=1)

tourism_attraction.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,historic,name,tourism,wikidata,wikipedia,geometry,access,barrier,bicycle,place,...,int_name,source:description,name:ban,name:eo,name:hak,name:mai,name:pms,name:tl,name:sw,name:xmf
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,25524252,building,Blewcoat School,attraction,Q4926413,en:Blewcoat School,POINT (-0.13606 51.49830),,,,,...,,,,,,,,,,
node,26559743,,,attraction,,,POINT (-0.14525 51.39520),,,,,...,,,,,,,,,,
node,252602371,,London Bridge Experience,attraction,Q7748032,en:The London Bridge Experience,POINT (-0.08826 51.50639),,,,,...,,,,,,,,,,
node,269236138,,Little Holland House,attraction,,,POINT (-0.17065 51.35530),,,,,...,,,,,,,,,,
node,293221901,,Hall Place and Gardens,attraction,Q5642615,,POINT (0.16023 51.44819),,,,,...,,,,,,,,,,


In [7]:
tourism_attraction.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 344 entries, ('node', 25524252) to ('relation', 12942436)
Columns: 359 entries, historic to name:xmf
dtypes: geometry(1), object(358)
memory usage: 976.9+ KB


In [8]:
path = os.path.join('data')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
tourism_attraction.to_csv(os.path.join(path,'tourism_attraction.csv'), index=False)

## Airbnb data (Pre-precessing)

Aim: our dependent variable which represents the tourists' area(?) preference on choosing accommondation. 
It would be the estimates bookings of Airbnb listings in each area.

Workflow: 

    1. to get the number of review of all the listing has in the last 12 months,
    
    2. and calculate the estimates bookings for each Airbnb listings in the last 12 months according to review rate in London.
    
    3. ...sum up or times _an average length of stay_ (e.g. 5.5 nights)

### read in data

In [89]:
IA_2022 = pd.read_csv('./data/2022-09-10-listings.csv.gz',compression='gzip', low_memory=False)
IA_2022.info()
# we need to filter out the columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69358 entries, 0 to 69357
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            69354 non-null  object 
 1   listing_url                                   69357 non-null  object 
 2   scrape_id                                     69357 non-null  object 
 3   last_scraped                                  69357 non-null  object 
 4   source                                        69358 non-null  object 
 5   name                                          69337 non-null  object 
 6   description                                   67839 non-null  object 
 7   neighborhood_overview                         39966 non-null  object 
 8   picture_url                                   69358 non-null  object 
 9   host_id                                       69358 non-null 

In [68]:
IA_2019 = pd.read_csv('./data/London-2019-11-listings.csv',low_memory=False)
IA_2019.info()
# we need to fix the data type.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85236 entries, 0 to 85235
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   id                              85236 non-null  object
 1   name                            85236 non-null  object
 2   host_id                         85236 non-null  object
 3   host_name                       85236 non-null  object
 4   neighbourhood_group             85236 non-null  object
 5   neighbourhood                   85236 non-null  object
 6   latitude                        85236 non-null  object
 7   longitude                       85236 non-null  object
 8   room_type                       85236 non-null  object
 9   price                           85236 non-null  object
 10  minimum_nights                  85236 non-null  object
 11  number_of_reviews               85236 non-null  object
 12  last_review                     85236 non-null

### clean data

In [61]:
# select the column we need
df_raw1 = IA_2022
df_raw2 = IA_2019
print(df_raw1.columns.to_list())
print(df_raw2.columns.to_list())

['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availabil

In [62]:
IA_2019.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,room_class,last_review_timestamp
0,9582415,Single/Twin/Double Ensuite near Twickenham Sta...,49602995,Daniela,,Richmond upon Thames,51.44473,-0.379,Private room,35,1,47,2019-11-06,2.27,1,278,Private room,11/6/2019 0:00:00
1,23013522,"Spacious room with double bed for 2, Twickenham",70374572,Monk,,Hounslow,51.45867,-0.34444,Private room,19,1,131,2019-11-06,6.52,3,23,Private room,11/6/2019 0:00:00
2,25796711,Stunning Central London Apartment Close The River,25587547,Matthew,,Hammersmith and Fulham,51.47398,-0.21531,Entire home/apt,103,7,8,2019-11-06,0.51,1,317,Entire home/apt,11/6/2019 0:00:00
3,29060134,Recently renovated fabulous four bedroom house,57607790,Shofraz,,Hillingdon,51.52716,-0.44164,Entire home/apt,85,14,5,2019-11-06,0.4,1,346,Entire home/apt,11/6/2019 0:00:00
4,30374086,Lovely cosy flat for 4 in Heart of North London,228095684,Yashar,,Barnet,51.59118,-0.1667,Entire home/apt,85,3,16,2019-11-06,1.52,3,139,Entire home/apt,11/6/2019 0:00:00


#### filter out the columns (2022 dataset)

In [26]:
# select the columns we may need. All fields about review are reserved.
cols = ['id', 'listing_url', 'last_scraped', 'name', 'description', 'host_id', 'host_name', 
        'host_since','host_listings_count', 'host_total_listings_count', 
        'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 
        'amenities', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 
        'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
        'review_scores_communication', 'review_scores_location', 'review_scores_value', 'license', 'reviews_per_month']

In [37]:
df1 = df_raw1[cols]
df1.sample(5)

Unnamed: 0,id,listing_url,last_scraped,name,description,host_id,host_name,host_since,host_listings_count,host_total_listings_count,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,reviews_per_month
56439,595325688935564954,https://www.airbnb.com/rooms/595325688935564954,2022-09-11,"Entire house in the center, duplex with garden",,227144138.0,Alex,2018-11-23,5.0,11.0,...,2022-08-02,4.63,4.63,4.63,4.75,4.63,4.88,4.13,,1.48
56626,601440622903951582,https://www.airbnb.com/rooms/601440622903951582,2022-09-11,Modern 1 bed flat with office space in West Lo...,Our 1-bed apartment is an ideal base to explor...,9968726.0,Jonathan,2013-11-12,1.0,1.0,...,2022-09-03,5.0,5.0,5.0,5.0,5.0,4.83,5.0,,4.19
63880,673909121711100746,https://www.airbnb.com/rooms/673909121711100746,2022-09-11,Cute & spacious 1 bed flat with parking,A great central location in South East London....,65334911.0,Jess,2016-04-01,1.0,1.0,...,,,,,,,,,,
8264,9410577,https://www.airbnb.com/rooms/9410577,2022-09-11,2 bed spacious flat East London,"The flat is a bright, spacious, cosy 2 double ...",10730601.0,Oliver,2013-12-20,2.0,2.0,...,,,,,,,,,,
29882,28884075,https://www.airbnb.com/rooms/28884075,2022-09-12,Stunning 2 Bedroom Apartment in Belsize Park,This beautiful 2 bedroom apartment sits in the...,215907872.0,Charlie,2018-09-17,1.0,1.0,...,2019-01-02,3.5,3.0,4.5,2.5,4.0,4.5,3.0,,0.04


In [40]:
df1.sample(5)[['latitude','longitude', 'property_type', 'number_of_reviews', 'number_of_reviews_ltm']]

Unnamed: 0,latitude,longitude,property_type,number_of_reviews,number_of_reviews_ltm
55207,51.579205,0.085855,Private room in home,7.0,7.0
7405,51.5129,-0.12224,Entire rental unit,126.0,22.0
19788,51.53905,-0.29733,Entire rental unit,1.0,0.0
26693,51.50958,-0.30621,Entire serviced apartment,7.0,1.0
46156,51.52958,-0.17547,Private room in rental unit,0.0,0.0


#### fixing data type (2019 dataset)

In [69]:
df2 = IA_2019

In [75]:
df2['reviews_per_month']

0        2.27 
1        6.52 
2        0.51 
3        0.40 
4        1.52 
         ...  
85231         
85232         
85233         
85234         
85235         
Name: reviews_per_month, Length: 85236, dtype: object

In [84]:
# there was some empty string in the dataset...
df2 = df2.replace(r'^\s*$', np.nan, regex=True)

In [93]:
# transfer "object" to numeric 
ints  = ['latitude', 'longitude', 'price', 
         'number_of_reviews', 'reviews_per_month']
for i in ints:
    print(f"Converting {i}")
    df2[i] = df2[i].astype('float')

Converting latitude
Converting longitude
Converting price
Converting number_of_reviews
Converting reviews_per_month


In [94]:
# check that they have been changed to float
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85236 entries, 0 to 85235
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              85068 non-null  object 
 1   name                            85056 non-null  object 
 2   host_id                         84912 non-null  object 
 3   host_name                       84900 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   84911 non-null  object 
 6   latitude                        84911 non-null  float64
 7   longitude                       84911 non-null  float64
 8   room_type                       84911 non-null  object 
 9   price                           84911 non-null  float64
 10  minimum_nights                  84911 non-null  object 
 11  number_of_reviews               84911 non-null  float64
 12  last_review                     

#### drop NA/NULL/missing values

but we should keep "0".

In [105]:
df1.isnull().sum(axis=0).sort_values(ascending=False)[:12]
# little null value in column 'number_of_reviews_ltm' in IA 2022. We can just drop them.

license                        69358
review_scores_value            17849
review_scores_checkin          17848
review_scores_location         17846
review_scores_communication    17815
review_scores_accuracy         17814
review_scores_cleanliness      17801
reviews_per_month              16792
review_scores_rating           16792
last_review                    16787
first_review                   16785
description                     1519
dtype: int64

In [103]:
df2.isnull().sum(axis=0).sort_values(ascending=False)
# There are 20298 missing values in column 'reviews_per_month' in IA 2019. 
# And I checked that these listings have no review either. therefore, we can just drop them...

neighbourhood_group               85236
total_bookings_2019               20298
last_review_timestamp             20298
reviews_per_month                 20298
last_review                       20298
host_name                           336
minimum_nights                      325
availability_365                    325
calculated_host_listings_count      325
number_of_reviews                   325
price                               325
room_type                           325
longitude                           325
latitude                            325
neighbourhood                       325
host_id                             324
name                                180
id                                  168
room_class                            0
dtype: int64

In [112]:
df2[df2['reviews_per_month'].isna()]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,room_class,last_review_timestamp,total_bookings_2019
64938,38151,Double room/ lounge,163634,Lisa,,Croydon,51.41940,-0.08722,Private room,65.0,1,0.0,,,1,365,Private room,,
64939,38950,Room 1 Large Double Bedroom - front ground floor,167107,Paul,,Haringey,51.58730,-0.08606,Private room,45.0,1,0.0,,,4,90,Private room,,
64940,38995,ROOM IN CONTEMPORARY STYLE FLAT,167281,C,,Southwark,51.47892,-0.06040,Private room,45.0,1,0.0,,,1,87,Private room,,
64941,40228,Room 4 Cosy Double Bedroom on First Floor,167107,Paul,,Haringey,51.58860,-0.08805,Private room,29.0,1,0.0,,,4,45,Private room,,
64942,96008,Light airy double bedroom for rent,510424,Dionne,,Lambeth,51.43923,-0.10421,Private room,80.0,2,0.0,,,1,365,Private room,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85231,39868875,"The Apartment, East London Village",13553423,Melissa,,Waltham Forest,51.58208,-0.00915,Entire home/apt,70.0,3,0.0,,,1,88,Entire home/apt,,
85232,39869123,Elegant stylish modern house with amazing view!,21145791,Ingrid,,Merton,51.40628,-0.23244,Private room,69.0,2,0.0,,,2,113,Private room,,
85233,39869214,Holland Road - Kensington Olympia,251922260,Christian,,Hammersmith and Fulham,51.50244,-0.21547,Entire home/apt,90.0,3,0.0,,,4,63,Entire home/apt,,
85234,39869249,Functional single room 18 min to London Bridge,255372486,Nudrat,,Bromley,51.35742,0.10764,Private room,30.0,2,0.0,,,4,1,Private room,,


there are 

### calculate the occupancy rate

In [32]:
# The Occupancy Model is constructed by following the modified methodology from Inside Airbnb and ...

In [99]:
# A Review Rate of 50% is used to convert reviews to estimated bookings.
review_rate = 0.5
# Airbnb 2022 
df1['total_bookings_2022'] = df1['number_of_reviews_ltm']/review_rate
df1[['total_bookings_2022','number_of_reviews_ltm']]

Unnamed: 0,total_bookings_2022,number_of_reviews_ltm
0,18.0,9.0
1,0.0,0.0
2,2.0,1.0
3,0.0,0.0
4,16.0,8.0
...,...,...
69353,2.0,1.0
69354,24.0,12.0
69355,12.0,6.0
69356,8.0,4.0


In [100]:
# Airbnb 2019 
df2['total_bookings_2019'] = df2['reviews_per_month']*12/review_rate
df2[['total_bookings_2019','number_of_reviews']]

Unnamed: 0,total_bookings_2019,number_of_reviews
0,54.48,47.0
1,156.48,131.0
2,12.24,8.0
3,9.60,5.0
4,36.48,16.0
...,...,...
85231,,0.0
85232,,0.0
85233,,0.0
85234,,0.0
