In [1]:
# These are boiler plate imports that seem useful
# Perhaps cleaner would be to delete or comment out the ones that aren't used in this script...

import copy
import csv
import json
import os
import pprint
import sys
import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

import matplotlib as mpl
import numpy as np
import pandas as pd
import yaml

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL

# the following is deprecated, it seems, so using the sqlalchemy
# from pyscopg2 import sqlio

# the file in benchmarking/util.py should hold utilities useful for your benchmarking exercise
# This notebook calls functions from util throughout, so benchmarking/util.py must
#    exist (and define the functions used below) before the notebook can be run
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: this should be modified so that the user name/password are not included in the program. </span>

In [2]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

# Connection settings are read from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) so that real credentials do
# not have to be written into the notebook.  The fallback values are the ones
# this notebook used originally (a default local Postgres install).
# URL.create() escapes special characters (e.g. '@' or '/') in the password,
# which a hand-formatted URL string would not do.
db_url = URL.create(
    drivername='postgresql+psycopg2',
    username=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'postgres'),
    host=os.environ.get('PGHOST', 'localhost'),
    port=int(os.environ.get('PGPORT', '5432')),
    database=os.environ.get('PGDATABASE', 'airbnb'),
)

db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level='SERIALIZABLE')
#    , echo=True)
#    , echo_pool="debug")

# NOTE: create_engine() only configures the engine; per the SQLAlchemy docs the
#      first real connection is opened lazily, when the engine is first used
print("Successfully created db engine.")

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database

# isolation_level SERIALIZABLE makes concurrent transactions behave as if they had
#      been executed one after another, which is good for the benchmarking we will be doing

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


### <span style=color:blue>Cleaning up indexes before working to build reviews_join_listings.json </span>

<span style=color:blue>This cell illustrates a few things.  First, I have a list of "all_indexes" which holds most (but actually not all) of the indexes I have been using; this includes all indexes used when building the three .json files.  I do not list comments_tsv_in_reviews in all_indexes, because comments_tsv_in_reviews gets special treatment, and once created is never dropped.  </span>

<span style=color:blue>In this cell I drop all the indexes in all_indexes, and check that everything (except for comments_tsv_in_reviews) has been dropped.</span> 

In [3]:
# Each entry is a [column, table] pair naming one index that may have been created.
# comments_tsv_in_reviews is deliberately not listed: it is never dropped.
all_indexes = [
    ['datetime', 'reviews'],
    ['id', 'listings'],
    ['neighbourhood', 'listings'],
    ['neighbourhood_group', 'listings'],
    ['price', 'listings'],                 # used when I was playing with price ranges
]

# Drop every listed index, showing what add_drop_index returns for each one.
for index_spec in all_indexes:
    table_name = index_spec[1]
    drop_result = util.add_drop_index(db_eng, 'drop', index_spec)
    print(f'Inside the loop, here is the result of calling add_drop_index on the table "{table_name}"')
    # Note: add_drop_index is called multiple times for some tables
    # Also, when creating this notebook I have run this cell multiple times, so the indexes are already flushed out
    pprint.pp(drop_result)

# Confirm which indexes remain on the tables mentioned in all_indexes.
print('\n\nCurrent set of indexes in effect is:')
indexes_in_effect = util.fetch_all_index_info(db_eng, all_indexes)
pprint.pp(indexes_in_effect, width=150)

Inside the loop, here is the result of calling add_drop_index on the table "reviews"
[('new_york_city', 'reviews', 'comments_tsv_in_reviews', None, 'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]


Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'], 'listings': []}


### <span style=color:blue>Displaying some statistics about occurrences of words in reviews by year</span>

<span style=color:blue>In order to reduce the time of benchmarking runs, we will focus only on years\
   2009: 56 reviews\
   2010: 449 reviews\
   2011: 1905 reviews\
   2012: 3872 reviews\
   2013: 7317 reviews\
   2014: 14203 reviews\
   2017: 66146 reviews\
   2019: 126469 reviews\
   2023: 228831 reviews</span>

In [4]:
# Build the SQL that counts reviews per year (the query text comes from util).
q = util.query_reviews_by_year_counts()

with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    # Fetch the rows while the connection is still open: SQLAlchemy recommends
    # consuming a Result inside the connect block rather than after it has closed.
    reviews_by_year = result.fetchall()

pprint.pp(reviews_by_year)

[('2009', 56),
 ('2010', 449),
 ('2011', 1905),
 ('2012', 3872),
 ('2013', 7317),
 ('2014', 14203),
 ('2015', 28465),
 ('2016', 48527),
 ('2017', 66146),
 ('2018', 95137),
 ('2019', 126469),
 ('2020', 51172),
 ('2021', 109415),
 ('2022', 196136),
 ('2023', 228831),
 ('2024', 8710)]


### <span style=color:blue>Working with text indexing</span>

<span style=color:blue>First, here are examples of the 2 kinds of query we are working with.  I might decide to ask the students to experiment with a second word.  (I need to find a word with different behavior!)</span>

### <span style=color:blue>NOTE: please see the double "%%" in the first query -- this is a kind of escape character so that sqlalchemy will be able to work with the "%" signs.  </span>

In [5]:
# Example search: one word, restricted to reviews dated within 2015.
word, start_date, end_date = 'awesome', '2015-01-01', '2015-12-31'

# q1 searches the raw comments column; q2 searches the comments_tsv column.
q1 = util.build_query_word_year_no_ts_index(word, start_date, end_date)
q2 = util.build_query_word_year_ts_index(word, start_date, end_date)

# Show both generated SQL strings, each under its own heading.
for heading, query_text in (('q1 is:', q1), ('\nq2 is:', q2)):
    print(heading)
    print(query_text)

q1 is:
SELECT *
FROM reviews r 
WHERE comments ILIKE '%%awesome%%' 
  AND date >= '2015-01-01'
  AND date <= '2015-12-31';

q2 is:
SELECT *
FROM reviews r 
WHERE comments_tsv @@ to_tsquery('awesome')
  AND date >= '2015-01-01'
  AND date <= '2015-12-31';


<span style=color:blue>The next cell generates the file 'text_search_query_on_mac_v01.json'</span>

<span style=color:blue>For these experiments the "no-index" case is making a search against the comments column of reviews, whereas the "with-index" case is making a search against the comments_tsv column of reviews.  So I don't need to drop the index on comments_tsv when running the "no-index" case.  This makes it simpler than the b-tree situation</span>

In [9]:
# this list does not include comments_tsv_in_reviews, which is left unchanged during the runs
all_indexes = [['datetime','reviews'],
               ['id','listings'],
               ['neighbourhood','listings'],
               ['neighbourhood_group','listings'],
               ['price','listings']]                 # used when I was playing with price ranges

# the testing will be on all combinations of the indexes in i_spec_list, and also with/without comments_tsv_in_reviews
i_spec_list = [['datetime', 'reviews']]

perf_file = 'text_search_query_on_mac_v01.json'

# using a low count so that the query without index runs reasonably quickly
count = 50

# reviews has data for years 2009 to 2024
# to reduce benchmarking time we focus on [2009, 2010, 2011, 2012, 2013, 2014, 2017, 2019, 2023]
# The full set can be run in batches by editing `years` and re-running this cell, e.g.
#     [2009, 2010, 2011, 2012], then [2013, 2014, 2017], then [2019], then [2023]
years = [2023]
words = ['horrible', 'awesome', 'apartment']

# defined before the loops so the final summary below still works if nothing is run
updated_perf_summary = {}

for yr in years:
    print('\nEntering loop for yr = ', str(yr))
    start_date = str(yr) + '-01-01'
    end_date = str(yr) + '-12-31'
    for word in words:
        print('Entering sub-loop for word: ', word)
        q_vec = [word, start_date, end_date]
        profile_info = util.run_one_word_year_query_and_multi_index_specs_ts(db_eng, q_vec, all_indexes, i_spec_list, count)

        print('profile_info for the run for year "', yr, '" and word "', word, '" is:')
        pprint.pp(profile_info)
        # merge this run's timings into the data kept in perf_file
        updated_perf_summary = util.fetch_and_update_perf_data_deeper_merge(perf_file, profile_info)
        print('updated_perf_summary is:')
        pprint.pp(updated_perf_summary)


print('After running all years, the updated perf_summary info, in sorted order, is:')
# the sorted(...items()) returns list of ordered pairs;
#     for each pair, the first element is a key in the dictionary
#                    and the second element is the value associated with that key
pprint.pp(sorted(updated_perf_summary.items()))




Entering loop for yr =  2023
Entering sub-loop for word:  horrible
{'horrible_2023': {}}
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)']}
{'avg': 1.2341,
 'min': 1.1164,
 'max': 4.8064,
 'std': 0.5117,
 'exec_count': 50,
 'timestamp': '2024-05-12-17:15:51'}
{'avg': 0.0053,
 'min': 0.0046,
 'max': 0.0173,
 'std': 0.0018,
 'exec_count': 50,
 'timestamp': '2024-05-12-17:15:52'}
{'__': {'avg': 1.2341,
        'min': 1.1164,
        'max': 4.8064,
        'std': 0.5117,
        'exec_count': 50,
        'timestamp': '2024-05-12-17:15:51'},
 '__comments_tsv_in_reviews__': {'avg': 0.0053,
                                 'min': 0.0046,
                                 'max': 0.0173,
                                 'std': 0.0018,
                                 'exec_count': 50,
                                 'timestamp': '2024-05-12-17:15:52'}}
{'reviews': ['CREATE INDEX datetime_in_reviews ON new_york_city.reviews USING btree (datet