# Does the Tourism Preferences Changes after COVID19?

## CASA0013: Foundations of Spatial Data Science

### Student Ids: ucfnjji, ucfnlun, ucfnpar, ucfnrli.

## Green Space Data

In [3]:
# Import visualisation modules
import matplotlib as mpl 
%matplotlib inline 
import matplotlib.pyplot as plt 

#Import modules
import osmnx as ox
import pandas as pd
import geopandas as gpd
import numpy as np
import os

import warnings 
warnings.simplefilter(action='ignore')
ox.__version__

'1.2.2'

In [4]:
# Set up query
q1 = {
    "tourism":"camp_site",
    'leisure':'common',
    'leisure':'dog_park',
    'natural':'scrub',
    'natural':'fell',
    'landuse':'forest',
    'leisure':'garden',
    'landuse':'greenfield',
    'leisure':'golf_course',
    'landuse':'grass',
    'natural':'grassland',
    'natural':'heath',
    'landuse':'meadow',
    'natural':'moor',
    'leisure':'nature_reserve',
    'landuse':'orchard',
    'leisure':'park',
    'leisure':'pitch',
    'landuse':'recreation_ground',
    'landuse':'village_green',
    'landuse':'vineyard',
    'natural':'wood'
    }

greenspace = ox.geometries.geometries_from_place(
            'Greater London, UK',
            tags = q1,
            which_result=1)

greenspace.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,created_by,barrier,bicycle,foot,source,leisure,name,name:ru,sport,...,oneway,construction,proposed,religion,danger,genus:en,informal,ways,type,network
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,20851184,POINT (-0.33622 51.40443),,,,,,pitch,The Royal Tennis Court,Реал-теннис,real_tennis,...,,,,,,,,,,
node,92273182,POINT (-0.40698 51.48916),JOSM,,,,,,,,,...,,,,,,,,,,
node,895874399,POINT (-0.23028 51.55593),,,,,,pitch,Kilburn Cosmos RFC,,rugby,...,,,,,,,,,,
node,920063079,POINT (-0.06894 51.56576),,,,,,pitch,,,,...,,,,,,,,,,
node,1296074660,POINT (-0.17313 51.41807),,,,,,pitch,,,table_tennis,...,,,,,,,,,,


In [5]:
greenspace.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 15452 entries, ('node', 20851184) to ('relation', 15268904)
Columns: 278 entries, geometry to network
dtypes: geometry(1), object(277)
memory usage: 33.4+ MB


In [6]:
path = os.path.join('data')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
greenspace.to_csv(os.path.join(path,'greenspace.csv'), index=False)

## Tourism Attraction Data

In [7]:
# Set up query
q2 = {'tourism':'attraction'}

# Run query
# Jin: I change ox.pois.pois_from_place to ox.geometries.geometries_from_place, for the adjustment of osmnx version change.
# details: https://stackoverflow.com/questions/71559143/what-happened-to-the-pois-module-in-osmnx-and-what-to-use-now
tourism_attraction = ox.geometries.geometries_from_place(
            'Greater London, UK',
            tags = q2,
            which_result=1)

tourism_attraction.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,historic,name,tourism,wikidata,wikipedia,geometry,access,barrier,bicycle,place,...,int_name,source:description,name:ban,name:eo,name:hak,name:mai,name:pms,name:tl,name:sw,name:xmf
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,25524252,building,Blewcoat School,attraction,Q4926413,en:Blewcoat School,POINT (-0.13606 51.49830),,,,,...,,,,,,,,,,
node,26559743,,,attraction,,,POINT (-0.14525 51.39520),,,,,...,,,,,,,,,,
node,252602371,,London Bridge Experience,attraction,Q7748032,en:The London Bridge Experience,POINT (-0.08826 51.50639),,,,,...,,,,,,,,,,
node,269236138,,Little Holland House,attraction,,,POINT (-0.17065 51.35530),,,,,...,,,,,,,,,,
node,293221901,,Hall Place and Gardens,attraction,Q5642615,,POINT (0.16023 51.44819),,,,,...,,,,,,,,,,


In [8]:
tourism_attraction.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 344 entries, ('node', 25524252) to ('relation', 12942436)
Columns: 359 entries, historic to name:xmf
dtypes: geometry(1), object(358)
memory usage: 976.9+ KB


In [9]:
path = os.path.join('data')

if not os.path.exists(path):
    print(f"Creating {path} under {os.getcwd()}")
    os.makedirs(path)
    
tourism_attraction.to_csv(os.path.join(path,'tourism_attraction.csv'), index=False)

## Airbnb data (Pre-precessing)

My workflow planning of this:

Aim: calculate the availablity for each Airbnb listing in 2019 and 2022.

    Count f or t in field "available" by rows and also group by "listing_id".
    
    Get a field we need finally, which represent the number of days booked/occupied in the period of one year.
    
    Join this field to listing.csv by listing_id.

### read in data

In [10]:
IA_cal_2022 = pd.read_csv('./data/calendar.csv.gz',compression='gzip', low_memory=False)
IA_cal_2022

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,13913,2022-09-11,f,$50.00,$50.00,1.0,29.0
1,106332,2022-09-11,f,$55.00,$55.00,3.0,365.0
2,106332,2022-09-12,f,$55.00,$55.00,3.0,365.0
3,106332,2022-09-13,f,$55.00,$55.00,3.0,365.0
4,106332,2022-09-14,f,$55.00,$55.00,3.0,365.0
...,...,...,...,...,...,...,...
25310907,554172169432589107,2023-09-07,f,$134.00,$134.00,3.0,5.0
25310908,554172169432589107,2023-09-08,f,$134.00,$134.00,3.0,4.0
25310909,554172169432589107,2023-09-09,f,$134.00,$134.00,3.0,3.0
25310910,554172169432589107,2023-09-10,f,$134.00,$134.00,2.0,1125.0


### fixing data type

In [12]:
#check data type, and find the column "available" is object type. so change it to boolean?
IA_cal_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25310912 entries, 0 to 25310911
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           object 
 4   adjusted_price  object 
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.3+ GB


In [13]:
bools = ['available']
IA_cal_2022.sample(5, random_state=43)[bools]

Unnamed: 0,available
9656045,t
9976390,f
4219951,t
9336946,f
22511373,f


In [14]:
# to map 't' and 'f' to True and False
for b in bools:
    print(f"Converting {b}")
    IA_cal_2022[b] = IA_cal_2022[b].replace({'f':False, 't':True}).astype('bool')

Converting available


In [None]:
IA_cal_2022.sample(5, random_state=43)[bools]

In [16]:
# check that it has been transformed to boolean value
IA_cal_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25310912 entries, 0 to 25310911
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       bool   
 3   price           object 
 4   adjusted_price  object 
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: bool(1), float64(2), int64(1), object(3)
memory usage: 1.2+ GB


In [23]:
# split a test dataset first, because the original whole table is toooooo large.
# p.s. What is a feather file? Can I transfer csv.gz to feather to make the memory usage smaller?
from sklearn.model_selection import train_test_split 
df = IA_cal_2022.copy()  
df_test = train_test_split(df,test_size = 0.00001,random_state=44)

In [27]:
df_test

[                  listing_id        date  available    price adjusted_price  \
 21360490  626018464134708493  2022-10-31       True  $125.00        $125.00   
 2111221              7108631  2023-01-04      False  $140.00        $140.00   
 17537166            49777101  2023-06-26      False  $236.00        $236.00   
 8645338             22630583  2023-04-13      False   $55.00         $55.00   
 14448004            39330005  2022-10-23       True  $190.00        $190.00   
 ...                      ...         ...        ...      ...            ...   
 12242819            32948088  2023-01-13      False  $105.00        $105.00   
 10535483            27753972  2023-06-15       True  $120.00        $120.00   
 2253997              7522224  2022-10-02      False   $30.00         $30.00   
 13888241            37697462  2023-09-09      False   $39.00         $39.00   
 12258723            32986196  2023-06-09      False  $122.00        $122.00   
 
           minimum_nights  maximum_nig

In [25]:
print(f"  Testing data size:  {df_test.info()} ")
# it's a list, not a dateframe...

AttributeError: 'list' object has no attribute 'info'

In [28]:
df_test.groupby('listing_id')['available'].sum()

AttributeError: 'list' object has no attribute 'groupby'