In [1]:
# These are boiler plate imports that seem useful
# Perhaps cleaner would be to delete or comment out the ones that aren't used in this script...

import sys
import json
import csv
import yaml

import copy

import pandas as pd
import numpy as np

import matplotlib as mpl

import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

import pprint

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text

# the following appears to be deprecated, so sqlalchemy is used instead
# from psycopg2 import sqlio

# the file in benchmarking/util.py should hold utilities useful for your benchmarking exercise
# This notebook imports util and calls it in the cells below, so benchmarking/util.py
#    must be in place before the notebook is run
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: this should be modified so that the user name/password are not hard-coded in the program. </span>

In [2]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

# Imported here so this cell stays runnable on its own.
import os
from sqlalchemy.engine import URL

# Connection settings are read from the environment so that real credentials
# never have to be written into the notebook.  Each setting falls back to the
# value this notebook has always used for a stock local Postgres install, so
# the cell still runs unchanged when none of the variables are set.
#   AIRBNB_DB_USER, AIRBNB_DB_PASSWORD, AIRBNB_DB_HOST, AIRBNB_DB_PORT, AIRBNB_DB_NAME
# URL.create (rather than string formatting) escapes any special characters
# in the password, and the URL's repr masks the password if it is displayed.
db_url = URL.create(
    drivername='postgresql+psycopg2',
    username=os.environ.get('AIRBNB_DB_USER', 'postgres'),
    password=os.environ.get('AIRBNB_DB_PASSWORD', 'postgres'),
    host=os.environ.get('AIRBNB_DB_HOST', 'localhost'),
    port=int(os.environ.get('AIRBNB_DB_PORT', '5432')),
    database=os.environ.get('AIRBNB_DB_NAME', 'airbnb'),
)

db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level = 'SERIALIZABLE')
#    , echo=True)
#    , echo_pool="debug")

print("Successfully created db engine.")

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database

# isolation_level SERIALIZABLE is the strictest isolation level: concurrent transactions
#      behave as if they had run one after another, which is good
#      for the benchmarking we will be doing

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


### <span style=color:blue>Cleaning up indexes before working to build the listings_join_reviews .json file </span>

<span style=color:blue>This cell illustrates a few things.  First, I have a list of "all_indexes" which holds most (but actually not all) of the indexes I have been using; this includes all indexes used when building the three .json files.  I do not list comments_tsv_in_reviews in all_indexes, because comments_tsv_in_reviews gets special treatment, and once created is never dropped.  </span>

<span style=color:blue>In this cell I drop all the indexes in all_indexes, and check that everything (except for comments_tsv_in_reviews) has been dropped.</span> 


In [3]:
# Each entry is a [column, table] pair naming one index that the benchmarks
# may create.  comments_tsv_in_reviews is deliberately absent from this list.
all_indexes = [
    ['datetime', 'reviews'],
    ['id', 'listings'],
    ['neighbourhood', 'listings'],
    ['neighbourhood_group', 'listings'],
    ['price', 'listings'],  # used when I was playing with price ranges
]

# Drop every listed index, one at a time, echoing what util reports back.
# Several specs share a table, so the same table name is printed more than once.
# Re-running this cell is expected: indexes that are already gone stay gone.
for i_spec in all_indexes:
    result = util.add_drop_index(db_eng, 'drop', i_spec)
    print(f'Inside the loop, here is the result of calling add_drop_index on the table "{i_spec[1]}"')
    pprint.pp(result)

# Final check of which indexes remain on the tables involved.
print('\n\nCurrent set of indexes in effect is:')
indexes_in_effect = util.fetch_all_index_info(db_eng, all_indexes)
pprint.pp(indexes_in_effect, width=150)

Inside the loop, here is the result of calling add_drop_index on the table "reviews"
[('new_york_city', 'reviews', 'comments_tsv_in_reviews', None, 'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[('new_york_city', 'listings', 'neighbourhood_group_in_listings', None, 'CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[('new_york_city', 'listings', 'neighbourhood_group_in_listings', None, 'CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]


Current set of indexes in effect is:
{'reviews'

### <span style=color:blue>Now working to build the file listings_join_reviews_on_mac_v02.json    </span>

<span style=color:blue>First step is to build up a dictionary of the queries to be used</span>

In [4]:
# One query per calendar year, keyed as 'listings_join_reviews_<year>'.
q_dict = {}
# reviews has data for years 2009 to 2024
for yr in range(2009, 2025):
    date_start = f'{yr}-01-01'
    date_end = f'{yr}-12-31'
    q_name = f'listings_join_reviews_{yr}'
    q_dict[q_name] = util.build_query_listings_join_reviews(date_start, date_end)

# Show a single representative query instead of dumping the whole dictionary.
pprint.pp(q_dict['listings_join_reviews_2015'])

('SELECT DISTINCT l.id, l.name\n'
 'FROM listings l, reviews r \n'
 'WHERE l.id = r.listing_id\n'
 "  AND r.datetime >= '2015-01-01'\n"
 "  AND r.datetime <= '2015-12-31'\n"
 'ORDER BY l.id;')


<span style=color:blue>Now invoking a loop that will build the file</span>

In [5]:
# keeping all_indexes in this cell, in case I run the cell stand-alone during development
all_indexes = [['datetime','reviews'],
               ['id','listings'],
               ['neighbourhood','listings'],
               ['neighbourhood_group','listings'],
               ['price','listings']]                 # used when I was playing with price ranges

# The indexes to benchmark with.  The logged output shows the helper working
# through the empty set, each index alone, and both together.
i_spec_list = [['datetime','reviews'], ['id', 'listings']]

# Results are merged into this file after every query.
perf_file = 'listings_join_reviews_on_mac_v02.json'

# count is passed straight to util.run_one_query_and_multi_index_specs
#    (presumably the number of timed runs per query/index combination -- confirm in util.py)
# 50 is the full benchmark setting; lower it (e.g. to 3) so that things run faster while testing
count = 50

# Bind the name before the loop so the final print never fails with a NameError
#    (or silently shows a value left over from an earlier run) when nothing is iterated.
updated_perf_summary = None

for q in q_dict:    # for testing, replace q_dict with : ['listings_join_reviews_2015']:
    print('\n====>>> Now working on query:', q)
    perf_summary = util.run_one_query_and_multi_index_specs(db_eng, all_indexes, q, q_dict[q], i_spec_list, count)
    # print('\nThe perf_summary of running query', q, 'is as follows:')
    # pprint.pp(perf_summary)
    updated_perf_summary = util.fetch_and_update_perf_data_deeper_merge(perf_file, perf_summary)

# Only the summary returned for the last query is shown here.
print('\nThe value of updated perf_summary is:')
pprint.pp(updated_perf_summary, sort_dicts=True)


====>>> Now working on query: listings_join_reviews_2009

Now working on the i_spec: []
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'], 'listings': []}

Now invoking run_one_query

Now working on the i_spec: [['datetime', 'reviews']]
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX datetime_in_reviews ON new_york_city.reviews USING btree (datetime)',
             'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'],
 'listings': []}

Now invoking run_one_query

Now working on the i_spec: [['id', 'listings']]
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'],
 'listings': ['CREATE INDEX id_in_listings ON new_york_city.listings USING btree (id)']}

Now invoking run_one_query

Now working on the i_spec: [['datetime', 'reviews'], ['id', 'listings']]
Current 