# Parameter Setup

In [1]:
import json
import pandas as pd
import numpy as np
import math

from datetime import datetime
from random import sample 

In [2]:
# Elasticsearch
from elasticsearch import Elasticsearch
# No hosts or credentials are passed, so the client uses its built-in default
# connection settings (presumably a local node — confirm for the installed client).
es = Elasticsearch()

In [3]:
# Elastic App Search
import os

from elastic_enterprise_search import AppSearch

# The endpoint and the private API key can be supplied through the environment
# so the credential does not have to live in the notebook. The fallbacks are the
# original local-development values, so existing runs behave exactly as before.
# NOTE(review): the fallback key is a credential committed to source — rotate it
# and drop the default once APP_SEARCH_API_KEY is set wherever this notebook runs.
APP_SEARCH_URL = os.environ.get("APP_SEARCH_URL", "http://localhost:3002")
APP_SEARCH_API_KEY = os.environ.get(
    "APP_SEARCH_API_KEY", "private-6jj3ai4ckkq2xykcocosmv6o"
)

app_search = AppSearch(
    APP_SEARCH_URL,
    http_auth=APP_SEARCH_API_KEY,
)

In [4]:
# Sanity check of the App Search connection: show the engines this client can list.
app_search.list_engines()

{'meta': {'page': {'current': 1, 'total_pages': 1, 'total_results': 1, 'size': 25}}, 'results': [{'name': 'airbnb-history-boston', 'type': 'default', 'language': None, 'document_count': 162737}]}

# Get All Time Snapshots

In [5]:
from elasticsearch.helpers import scan

# Query parameters.
# `k`, `start_date` and `end_date` are defined here but are not used by the
# scan issued in this cell.
k = 10000
location = "boston"
start_date = '2020-12-21'  # default start of the window
end_date = '2021-03-21'    # default end of the window: start_date + 90 days
index_name = "airbnb_history_{}".format(location)

# Match-all query: fetch every document stored in the index.
match_all_query = {"query": {"match_all": {}}}

res = scan(es, index=index_name, query=match_all_query)

In [6]:
# Drain the scan results into a list so the hits can be counted and re-read.
data = list(res)
# Number of hits retrieved from the index.
len(data)

162737

In [7]:
# Flatten every hit into one flat record: the hit's `_id` / `_index` metadata
# followed by the selected `_source` fields, in the order listed below.
source_fields = (
    'listing_url',
    'scrape_id',
    'last_scraped',
    'crawled_date',
    'name',
    'host_id',
    'host_is_superhost',
    'host_identity_verified',
    'room_type',
    'accommodates',
    'guests_included',
    'minimum_nights',
    'maximum_nights',
    'calendar_updated',
    'instant_bookable',
    'is_business_travel_ready',
    'cancellation_policy',
    'price',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'first_review',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'overall_rating',
)

docs = []
for item in data:
    source = item['_source']
    doc = {'id': item['_id'], 'index': item['_index']}
    # A missing field raises KeyError, exactly like a direct lookup would.
    for field in source_fields:
        doc[field] = source[field]
    docs.append(doc)

In [8]:
# Tabulate the flattened records: one row per entry of `docs`.
df = pd.DataFrame(docs)

In [9]:
# Distinct `crawled_date` values, sorted.
date_snapshots = np.sort(df['crawled_date'].unique())

date_snapshots

array(['20151002', '20160906', '20171005', '20180414', '20180517',
       '20180718', '20180817', '20180914', '20181011', '20181117',
       '20181213', '20190117', '20190209', '20190312', '20190415',
       '20190519', '20190614', '20190714', '20190819', '20190922',
       '20191018', '20191121', '20191204', '20200105', '20200213',
       '20200316', '20200414', '20200512', '20200610', '20200711',
       '20200831', '20200928', '20201024'], dtype=object)

# Get Num. of Listings per Time Snapshot

In [10]:
# Number of listings captured in each snapshot, indexed by crawl date in
# ascending order.
doc_dist = (
    df['crawled_date']
    .value_counts()
    .sort_index()
)

doc_dist

20151002    2558
20160906    3585
20171005    4870
20180414    6393
20180517    5968
20180718    5997
20180817    6036
20180914    5957
20181011    6014
20181117    6296
20181213    6217
20190117    6247
20190209    6155
20190312    6221
20190415    6202
20190519    6241
20190614    6246
20190714    6264
20190819    6214
20190922    5711
20191018    5647
20191121    5387
20191204    3507
20200105    3746
20200213    3903
20200316    3799
20200414    3845
20200512    3688
20200610    3440
20200711     497
20200831    3339
20200928    3293
20201024    3254
Name: crawled_date, dtype: int64

# Get Num. of Listings per Different Feature Values

In [35]:
# Frequency of each `availability_30` value among the listings of the
# 20151002 snapshot, ordered by value.
availability_30d_dist = (
    df.loc[df['crawled_date'] == '20151002', 'availability_30']
    .value_counts()
    .sort_index()
)

availability_30d_dist

0     823
1     131
2     127
3      88
4      87
5      74
6     101
7      82
8      65
9      70
10     51
11     65
12     43
13     55
14     49
15     66
16     49
17     39
18     34
19     33
20     41
21     36
22     40
23     25
24     28
25     38
26     20
27     26
28     23
29     25
30    124
Name: availability_30, dtype: int64

In [36]:
# Frequency of each `host_is_superhost` value among the listings of the
# 20151002 snapshot, ordered by value.
host_is_superhost_dist = (
    df.loc[df['crawled_date'] == '20151002', 'host_is_superhost']
    .value_counts()
    .sort_index()
)

host_is_superhost_dist

        2
f    2345
t     211
Name: host_is_superhost, dtype: int64

In [37]:
# Frequency of each `instant_bookable` value among the listings of the
# 20151002 snapshot, ordered by value.
instant_bookable_dist = (
    df.loc[df['crawled_date'] == '20151002', 'instant_bookable']
    .value_counts()
    .sort_index()
)

instant_bookable_dist

f    2254
t     304
Name: instant_bookable, dtype: int64

In [43]:
# Frequency of each `is_business_travel_ready` value, ordered by value.
# Note: this cell inspects the 20201024 snapshot, whereas the neighbouring
# distribution cells inspect 20151002.
is_business_travel_ready_dist = (
    df.loc[df['crawled_date'] == '20201024', 'is_business_travel_ready']
    .value_counts()
    .sort_index()
)

is_business_travel_ready_dist

f    3254
Name: is_business_travel_ready, dtype: int64

In [47]:
# Frequency of each `accommodates` value among the listings of the
# 20151002 snapshot, ordered by value.
accommodates_dist = (
    df.loc[df['crawled_date'] == '20151002', 'accommodates']
    .value_counts()
    .sort_index()
)

accommodates_dist

1      325
2     1170
3      282
4      448
5      128
6      124
7       21
8       42
9        4
10       8
11       2
12       2
14       2
Name: accommodates, dtype: int64

In [48]:
# Frequency of each `room_type` value among the listings of the
# 20151002 snapshot, ordered by value.
room_type_dist = (
    df.loc[df['crawled_date'] == '20151002', 'room_type']
    .value_counts()
    .sort_index()
)

room_type_dist

Entire home/apt    1445
Private room       1059
Shared room          54
Name: room_type, dtype: int64

In [49]:
# Frequency of each `minimum_nights` value among the listings of the
# 20151002 snapshot, ordered by value.
minimum_nights_dist = (
    df.loc[df['crawled_date'] == '20151002', 'minimum_nights']
    .value_counts()
    .sort_index()
)

minimum_nights_dist

1     1165
2      652
3      404
4       91
5       60
6       22
7      113
9        1
10       3
14      29
15       3
18       1
20       1
21       3
25       3
27       4
28       1
30       1
60       1
Name: minimum_nights, dtype: int64

# Simulate Search using Dates (or Crawled Snapshots)

In [63]:
# Query parameters for the search simulation.
# Each list below is a pool of candidate values; the simulation loop draws one
# entry from every pool with `sample(..., 1)[0]`.
k = 300  # not referenced by the simulation loop in this notebook
location = "boston"

# Candidate stay lengths, in nights.
availability_30d = [1, 2, 3, 5, 7, 14, 30]
# Each entry is passed as a filter value: either the single flag 't' or the
# list ['t', 'f'] (presumably "either value", i.e. no restriction — confirm
# against the App Search filter semantics).
host_is_superhost = ['t', ['t', 'f']]
instant_bookable  = ['t', ['t', 'f']]
# Business-travel filtering is currently disabled:
#is_business_travel_ready = ['t', 'f']
room_type = ['Entire home/apt', 'Private room', 'Shared room']
# Candidate party sizes.
accommodates = [1, 2, 3, 4, 5]

# TODO:
# price: require a distribution analysis
# cancellation_policy

# Name of the App Search engine to query (hyphen-separated, unlike the
# underscore-separated Elasticsearch index name used for the scan earlier).
index_name = "airbnb-history-" + location

In [None]:
# Replay simulated searches against App Search, one batch per crawled snapshot.
#
# For every snapshot in `doc_dist` a single random search profile is drawn
# (stay length, superhost flag, instant-booking flag, party size, room type).
# That query is then issued `num_searches` times, and each search walks through
# NUM_PAGES result pages. Only the response of the very last request survives in
# `resp` once the loops finish.
#
# NOTE(review): the profile is sampled once per snapshot, not once per search,
# so all searches of a snapshot are identical — confirm this is intended.
SEARCH_RATIO = 0.234  # searches issued per listing of a snapshot
NUM_PAGES = 10        # result pages requested per search
PAGE_SIZE = 10        # results per page

res = None  # not read below; only rebinds the name that held the scan results earlier
for crawled_date, num_doc in doc_dist.items():
    # The search volume scales with the snapshot size (rounded up).
    num_searches = math.ceil(SEARCH_RATIO * num_doc)

    # Draw this snapshot's search profile. The draw order is kept stable so a
    # seeded run consumes random numbers in the same sequence as before.
    num_nights = sample(availability_30d, 1)[0]

    # A listing fits the stay when its minimum_nights is within 1..num_nights ...
    valid_nights = list(range(1, num_nights + 1))
    # ... and its maximum_nights is not shorter than the stay (1..num_nights-1).
    invalid_nights = list(range(1, num_nights))

    superhost = sample(host_is_superhost, 1)[0]
    instant = sample(instant_bookable, 1)[0]
    # is_business_travel_ready is not filtered on (its candidate pool is
    # commented out in the parameter cell).
    persons = sample(accommodates, 1)[0]
    room = sample(room_type, 1)[0]

    # The filters depend only on the snapshot's profile, so build them once
    # instead of on every request.
    filters = {
        "all": [
            {"crawled_date": crawled_date},
            {"host_is_superhost": superhost},
            {"instant_bookable": instant},
            {"room_type": room},
            {"minimum_nights": valid_nights},
            {"availability_30": {"from": num_nights}},
            {"accommodates": {"from": persons}},
        ],
    }
    # For a one-night stay there is no too-short maximum to exclude. An empty
    # exclusion list rules nothing out, so the "none" clause is omitted rather
    # than sent with an empty value array.
    if invalid_nights:
        filters["none"] = [{"maximum_nights": invalid_nights}]

    # range() excludes its end value, hence the +1 to really issue
    # `num_searches` searches (the previous bound stopped one short).
    for search_id in range(1, num_searches + 1):
        # The loop variable must be the one used as the page number below; it
        # was previously bound to a different name than the request body read,
        # which raised NameError on a fresh kernel.
        for page in range(1, NUM_PAGES + 1):
            resp = app_search.search(
                engine_name=index_name,
                body={
                    "query": "",
                    "page": {
                        "current": page,
                        "size": PAGE_SIZE,
                    },
                    "filters": filters,
                    "sort": [
                        {"_score": "desc"},
                        {"overall_rating": "desc"},
                    ],
                },
            )

In [61]:
# Show the response of the last search request issued by the simulation loop
# (`resp` is overwritten on every request, so only the final one remains).
print(resp)



In [None]:
# Turn the last search response into a list.
# NOTE(review): if `resp` is a dict-like object with 'meta' and 'results' keys
# (as the list_engines output earlier in the notebook suggests), list(resp)
# yields those top-level keys rather than the result documents —
# resp['results'] may be what was intended; confirm.
# This also rebinds `data`, which earlier held the Elasticsearch scan hits.
data = list(resp)

In [None]:
# Number of items obtained from the last search response.
len(data)

In [None]:
# Peek at the first item obtained from the last search response.
data[0]