# Parameter Setup

In [1]:
import json
import pandas as pd
import numpy as np
import math

from datetime import datetime
from random import sample 

In [2]:
# ElasticSeach
from elasticsearch import Elasticsearch
es = Elasticsearch()

In [3]:
# Elastic App Search
from elastic_enterprise_search import AppSearch

app_search = AppSearch(
    "http://localhost:3002",
    http_auth="private-6jj3ai4ckkq2xykcocosmv6o"
)

In [4]:
app_search.list_engines()

{'meta': {'page': {'current': 1, 'total_pages': 1, 'total_results': 2, 'size': 25}}, 'results': [{'name': 'airbnb-history-geneva', 'type': 'default', 'language': None, 'document_count': 129094}, {'name': 'airbnb-history-boston', 'type': 'default', 'language': None, 'document_count': 162737}]}

# Get All Time Snapshots

In [5]:
from elasticsearch.helpers import scan

# Query parameters
k = 10000
location = "geneva"
start_date = '2020-12-21' # default is '2020-12-21'
end_date = '2021-03-21'   # default is '2020-12-21' + 90d
index_name = "airbnb_history_" + location

res = scan(
    es,
    index=index_name,
    query={"query": { "match_all" : {}}}
)

In [6]:
data = list(res)
len(data)

129094

In [7]:
# appending rows 
docs = []
for item in data:  
    doc = {
        'id': item['_id'],
        'index': item['_index'],
        'listing_url': item['_source']['listing_url'],
        'scrape_id': item['_source']['scrape_id'],
        'last_scraped': item['_source']['last_scraped'],
        'crawled_date': item['_source']['crawled_date'],
        'name': item['_source']['name'],
        'host_id': item['_source']['host_id'],
        'host_is_superhost': item['_source']['host_is_superhost'],
        'host_identity_verified': item['_source']['host_identity_verified'],
        'room_type': item['_source']['room_type'],
        'accommodates': item['_source']['accommodates'],
        'guests_included': item['_source']['guests_included'],
        'minimum_nights': item['_source']['minimum_nights'],
        'maximum_nights': item['_source']['maximum_nights'],
        'calendar_updated': item['_source']['calendar_updated'],
        'instant_bookable': item['_source']['instant_bookable'],
        'is_business_travel_ready': item['_source']['is_business_travel_ready'],
        'cancellation_policy': item['_source']['cancellation_policy'],
        'price': item['_source']['price'],
        'availability_30': item['_source']['availability_30'],
        'availability_60': item['_source']['availability_60'],
        'availability_90': item['_source']['availability_90'],
        'availability_365': item['_source']['availability_365'],
        'first_review': item['_source']['first_review'],
        'last_review': item['_source']['last_review'],
        'review_scores_rating': item['_source']['review_scores_rating'],
        'review_scores_accuracy': item['_source']['review_scores_accuracy'],
        'review_scores_cleanliness': item['_source']['review_scores_cleanliness'],
        'review_scores_checkin': item['_source']['review_scores_checkin'],
        'review_scores_communication': item['_source']['review_scores_communication'],
        'review_scores_location': item['_source']['review_scores_location'],
        'review_scores_value': item['_source']['review_scores_value'],
        'overall_rating': item['_source']['overall_rating']
    }
    
    docs.append(doc) 

In [8]:
df = pd.DataFrame(docs)

In [9]:
date_snapshots = np.sort(df.crawled_date.unique())

date_snapshots

array(['20160506', '20160605', '20160709', '20160808', '20160909',
       '20161215', '20170115', '20170219', '20170314', '20170414',
       '20170512', '20170613', '20170714', '20170814', '20170918',
       '20171017', '20171119', '20171217', '20180127', '20180418',
       '20180728', '20180820', '20180916', '20181016', '20181119',
       '20181216', '20190123', '20190216', '20190323', '20190422',
       '20190525', '20190625', '20190722', '20190829', '20190925',
       '20191026', '20191128', '20191230', '20200128', '20200227',
       '20200321', '20200428', '20200528', '20200623', '20201027'],
      dtype=object)

# Get Num. of Listings per Time Snapshot

In [10]:
doc_dist = df['crawled_date'].value_counts().sort_index(ascending=True)

doc_dist

20160506    1941
20160605    2066
20160709    2273
20160808    2408
20160909    2401
20161215    2355
20170115    2325
20170219    2350
20170314    2413
20170414    2509
20170512    2540
20170613    2587
20170714    2822
20170814    2861
20170918    2809
20171017    2815
20171119    2851
20171217    2946
20180127    3060
20180418    3107
20180728    3087
20180820    3026
20180916    2972
20181016    2948
20181119    2991
20181216    3049
20190123    2985
20190216    2966
20190323    2980
20190422    2964
20190525    2976
20190625    3087
20190722    3260
20190829    3212
20190925    3166
20191026    3194
20191128    3271
20191230    3387
20200128    3447
20200227    3403
20200321    3330
20200428    3314
20200528    3312
20200623    3349
20201027    1979
Name: crawled_date, dtype: int64

# Get Num. of Listings per Different Feature Values

In [11]:
availability_30d_dist = df[df['crawled_date'] == '20151002']['availability_30'].value_counts().sort_index(ascending=True)

availability_30d_dist

Series([], Name: availability_30, dtype: int64)

In [12]:
host_is_superhost_dist = df[df['crawled_date'] == '20151002']['host_is_superhost'].value_counts().sort_index(ascending=True)

host_is_superhost_dist

Series([], Name: host_is_superhost, dtype: int64)

In [13]:
instant_bookable_dist = df[df['crawled_date'] == '20151002']['instant_bookable'].value_counts().sort_index(ascending=True)

instant_bookable_dist

Series([], Name: instant_bookable, dtype: int64)

In [14]:
is_business_travel_ready_dist = df[df['crawled_date'] == '20201024']['is_business_travel_ready'].value_counts().sort_index(ascending=True)

is_business_travel_ready_dist

Series([], Name: is_business_travel_ready, dtype: int64)

In [15]:
accommodates_dist = df[df['crawled_date'] == '20151002']['accommodates'].value_counts().sort_index(ascending=True)

accommodates_dist

Series([], Name: accommodates, dtype: int64)

In [16]:
room_type_dist = df[df['crawled_date'] == '20151002']['room_type'].value_counts().sort_index(ascending=True)

room_type_dist

Series([], Name: room_type, dtype: int64)

In [17]:
minimum_nights_dist = df[df['crawled_date'] == '20151002']['minimum_nights'].value_counts().sort_index(ascending=True)

minimum_nights_dist

Series([], Name: minimum_nights, dtype: int64)

# Simulate Search using Dates (or Crawled Snapshots)

In [18]:
# Query parameters
k = 300
#location = "boston"
location = "geneva"

availability_30d = [1, 2, 3, 5, 7, 14, 30]
host_is_superhost = ['t', ['t', 'f']]
instant_bookable  = ['t', ['t', 'f']]
#is_business_travel_ready = ['t', 'f']
room_type = ['Entire home/apt', 'Private room', 'Shared room']
accommodates = [1, 2, 3, 4, 5]

#TO-DO:
# price: require a distribution analysis
# cancellation_policy

index_name = "airbnb-history-" + location

In [19]:
res = None
for crawled_date, num_doc in doc_dist.items():

    #print("crawled_date: " + crawled_date + "\tnum_doc: " + str(num_doc))

    num_searches = math.ceil(0.234 * num_doc)
    #print("num_searches: " + str(num_searches))

    num_nights = sample(availability_30d, 1)[0]
    #print("num_nights: " + str(num_nights))

    valid_nights = [i for i in range(1, num_nights+1)]
    #print("valid_nights: " + str(valid_nights))

    invalid_nights = [i for i in range(0, num_nights)] # handle the case when num_nights = 1
    #print("invalid_nights: " + str(invalid_nights))

    superhost = sample(host_is_superhost, 1)[0]
    #print("superhost: " + str(superhost))

    instant = sample(instant_bookable, 1)[0]
    #print("instant: " + str(instant))

    #business = sample(is_business_travel_ready, 1)[0]
    #print("business: " + str(business))

    persons = sample(accommodates, 1)[0]
    #print("persons: " + str(persons))

    room = sample(room_type, 1)[0]
    #print("room: " + str(room))

    for s in range(1, num_searches):
    #for s in range(1,2):
        #print("search_id: " + str(s))

        # num. of paging = 10
        #for i in range(1,11):
        for p in range(1,2):
            #print("page_id: " + str(p))

            resp = app_search.search(
            engine_name = index_name,
            body={
                "query": "",
                "page": {
                    "current": p,
                    "size": 10
                },
                "filters": {
                    "all": [
                        { "crawled_date": crawled_date },
                        { "host_is_superhost": superhost },
                        { "instant_bookable": instant },
                        #{ "is_business_travel_ready": business }#,
                        { "room_type": room },
                        { "minimum_nights": valid_nights },
                        { "availability_30": { "from": num_nights } },
                        { "accommodates": { "from": persons } }
                    ],
                    "none": [
                        { "maximum_nights": invalid_nights }
                    ]
                },
                "sort": [
                    { "_score": "desc" },
                    { "overall_rating": "desc" }
                ]
            }
        )

In [29]:
r = app_search.get_api_logs(
        index_name,
        "2021-02-20T00:00:00+00:00",
        "2021-02-26T00:00:00+00:00",
        current_page=2,
        page_size=80)
    

In [30]:
print(r)



In [20]:
print(resp)



In [21]:
data = list(resp)

In [22]:
len(data)

2

In [23]:
data[0]

'meta'