## <span style=color:blue>Patterns used in Programming Assignment 2 Part 2(version using util.py file)  </span>

In [1]:
# These are boiler plate imports that seem useful
# Perhaps cleaner would be to delete or comment out the ones that aren't used in this script...

import sys
import json
import csv
import yaml

import copy

import pandas as pd
import numpy as np

import matplotlib as mpl

import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

import pprint

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text

# the following is deprecated, it seems, so using the sqlalchemy
# from pyscopg2 import sqlio

# the file in benchmarking/util.py should hold utilities useful for your benchmarking exercise
# In this notebook we have commented out all mentions of util, so that you can run
#    this notebook before setting up your benchmarking/util.py file
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

In [2]:
# test that utils.py has been imported well
# util.hello_world()

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: this should be modified so that the user name/password are not included into the program. </span>

<span style=color:blue>E.g., see https://docs.sqlalchemy.org/en/20/core/engines.html for how to construct the URLs that the create_engine command uses.  Also, one should store the user/password into environment variables and read them in to populate the URL.  </span>

<span style=color:blue>E.g., see https://stackoverflow.com/questions/4906977/how-can-i-access-environment-variables-in-python for how to work with environment variables on mac, </span>

In [3]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

db_eng = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb',
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level = 'SERIALIZABLE')
#    , echo=True)
#    , echo_pool="debug")

print("Successfully created db engine.")

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database

# isolation_level SERIALIZABLE makes transactions happen in sequence, which is good 
#      for the benchmarking we will be doing

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


### <span style=color:blue>Working with btree indexing</span>

<span style=color:blue>This generates a json file 'perf_test_v01.json'.  </span>

In [4]:
# testing function util.run_multiple_tests(db_eng, all_indexes, q_list, i_spec, perf_file, count)

all_indexes = [['date','reviews'], ['date','calendar'], ['id','listings'], ['comments_tsv', 'reviews', 'gin']] 

q_dict = {}
# reviews has data for years 2009 to 2024
for yr in range(2009,2025):
    q_name = 'listings_join_review_' + str(yr)
    date_start = str(yr) + '-01-01'
    date_end = str(yr) + '-12-31'
    q_dict[q_name] = util.build_query_listings_join_reviews(date_start, date_end)
# pprint.pp(q_dict)

q_dict['listings_join_reviews_2019'] = util.build_query_listings_join_reviews('2019-01-01', '2019-12-31')
# note: The reviews table has 126,469 entries in 2019


i_spec = [['id','listings'], ['date','reviews']]

perf_file = 'perf_test_v01.json'

# setting count to 3 for now so that things run faster
count = 3

for q in q_dict:
    print('\n====>>> Now working on query:', q)
    perf_summary = util.run_one_query_and_multi_index_specs(db_eng, all_indexes, q, q_dict[q], i_spec, count)
    # print('\nThe perf_summary of running query', q, 'is as follows:') 
    # pprint.pp(perf_summary)
    updated_perf_summary = util.fetch_and_update_perf_data(perf_file, perf_summary)

print('\nThe value of updated perf_summary is:')
pprint.pp(updated_perf_summary, sort_dicts=True)



====>>> Now working on query: listings_join_review_2009

Now working on the i_spec: []
Current set of indexes in effect is:
{'reviews': ["CREATE INDEX comments_in_reviews ON new_york_city.reviews USING gin (to_tsvector('simple'::regconfig, (comments)::text))"],
 'calendar': [],
 'listings': []}

Now invoking run_one_query

Now working on the i_spec: [['id', 'listings']]
Current set of indexes in effect is:
{'reviews': ["CREATE INDEX comments_in_reviews ON new_york_city.reviews USING gin (to_tsvector('simple'::regconfig, (comments)::text))"],
 'calendar': [],
 'listings': ['CREATE INDEX id_in_listings ON new_york_city.listings USING btree (id)']}

Now invoking run_one_query

Now working on the i_spec: [['date', 'reviews']]
Current set of indexes in effect is:
{'reviews': ["CREATE INDEX comments_in_reviews ON new_york_city.reviews USING gin (to_tsvector('simple'::regconfig, (comments)::text))",
             'CREATE INDEX date_in_reviews ON new_york_city.reviews USING btree (date)'],
 'c

### <span style=color:blue>Working with text indexing</span>

<span style=color:blue>This generates the file 'ts_pef_test_v01.json'

In [5]:
# testing function util.run_multiple_tests(db_eng, all_indexes, q_list, i_spec, perf_file, count)

perf_file = 'ts_perf_test_v01.json'

# using a low count so that the query without index runs reasonably quickly
count = 3

word = 'awesome'

# reviews has data for years 2009 to 2024
for yr in range(2009,2025):
    print('\nEntering loop for yr = ', str(yr))
    q_name = 'listings_join_review_' + str(yr)
    date_start = str(yr) + '-01-01'
    date_end = str(yr) + '-12-31'
    perf_info = util.run_one_text_query_without_with_tsindex(db_eng, word, date_start, date_end, count)
    print('perf_file for the run for  is:')
    pprint.pp(perf_info)
    updated_perf_summary = util.fetch_and_update_ts_perf_data(perf_file, perf_info)

print('After running all years, the updated perf_summary file is:')
pprint.pp(updated_perf_summary)





Entering loop for yr =  2009
starting on query q1
Perf profile for no index is:
{'avg': 1.2116,
 'min': 1.1818,
 'max': 1.247,
 'std': 0.0269,
 'count': 3,
 'timestamp': '2024-05-06-23:07:35'}
starting on query q2
Perf profile for ts index is:
{'avg': 0.0726,
 'min': 0.0704,
 'max': 0.0768,
 'std': 0.003,
 'count': 3,
 'timestamp': '2024-05-06-23:07:35'}
perf_file for the run for  is:
{'awesome_2009': {'no_index': {'avg': 1.2116,
                               'min': 1.1818,
                               'max': 1.247,
                               'std': 0.0269,
                               'count': 3,
                               'timestamp': '2024-05-06-23:07:35'},
                  'with_ts_index': {'avg': 0.0726,
                                    'min': 0.0704,
                                    'max': 0.0768,
                                    'std': 0.003,
                                    'count': 3,
                                    'timestamp': '2024-05-06-23:07

In [7]:
# the variable all_indexes will hold all of the indexes involved in your testing.
#   For now there are 3 indexes, but there will be more.  set of all indexes will get bigger once we do more explorations
# Here, a pair ['col','table'] refers to an index on column 'col' in table 'table'
# (in an ideal world, we would keep a copy of this on disk, probably in your computer's file system,
#   and read it in when we want to use it and/or add to it.  For the full Programming Assignment 2
#   we will be working with 4 to 6 indexes)

all_indexes = [['date','reviews'], ['date','calendar'], ['id','listings']] 


# pull in performance summary from previous tests done
perf_summary = fetch_perf_data('perf_summary.json')

# we will use the same query as above, and call it 'listings_join_reviews_2015'
#   in perf_summary.json, info about different runs for this query are
#   held in perf_summary[<<query_name>>]

# q = q_dict[query_name]
q_listings_join_reviews_2015 = """
SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2015-01-01'
  AND r.date <= '2015-12-31'
ORDER BY l.id;
"""

query_name = 'listings_join_reviews_2015'


# here the spec is a listing of column-table pairs corresponding to indexes that are
#    to be included in the test
# I have run this jupyter cell on the 4 specs listed below
spec = [['id','listings'], ['date','reviews']]
# spec = [['date','reviews']]
# spec = [['id','listings']]
# spec = []

# count will hold the number of times we want to run the query
count = 50

print('Processing spec: ', str(spec), '\n')
for index in all_indexes:
    if index not in spec:
        mod_index = util.add_drop_index(db_eng, 'drop', index[0], index[1])
        print('\nAfter doing the drop for', str(index), 'the indexes on table "' + index[1] + '" are: ')
        print(mod_index)
        
for index in spec:
    mod_index = util.add_drop_index(db_eng, 'add', index[0], index[1])
    print('\nAfter doing the add for', str(index), 'the indexes on table "' + index[1] + '" are: ')
    print(mod_index)

time_list = []
for i in range(0,count):
    time_start = datetime.now()
    # Open new db connection for each execution of the query to avoid multithreading
    with db_eng.connect() as conn:
        df = pd.read_sql(q_listings_join_reviews_2015, con=conn)
    time_end = datetime.now()
    diff = time_diff(time_start, time_end)
    time_list.append(diff)
    
perf_profile = {}
perf_profile['avg'] = round(sum(time_list)/len(time_list), 4)
perf_profile['min'] = round(min(time_list), 4)
perf_profile['max'] = round(max(time_list), 4)
perf_profile['std'] = round(np.std(time_list), 4)

print('\nThe list of running times is as follows:')
pprint.pp(time_list)

print('\nThe statistics on the list of running times are as follows:')
pprint.pp(perf_profile)

# util.build_index_description_key() creates a listing of strings corresponding
#    to the entries in spec, and concatenates them in the ordering given by all_indexes
#    For example, the description_key associated with having indexes date_in_reviews and id_in_listings
#        would be __date_in_reviews__id_in_listings__'
#        (You probably want to use a uniform ordering of index names when you create these description_keys
key_value = util.build_index_description_key(all_indexes, spec)
print('\nThe new value for"' + key_value + '"will be', str(perf_profile))


# we may have run some other tests with the query q_listings_join_reviews_2015' and
#   we don't want to overwrite those.  So we need to get the full contents
#   of perf_summary['listings_join_reviews_2015'] and then
#   write (or overwrite) the value for the current list of indexes used

if query_name in perf_summary:
    perf_dict = perf_summary[query_name]
    print("\nBefore modifying perf_dict, the value of perf_summary[query_name] (if it existed) was: ")
    pprint.pp(perf_dict)
else:
    perf_dict = {}
    print("\nBefore modifying perf_dict, the value of perf_summary[query_name] had empty value")
print()
perf_dict[key_value] = perf_profile
perf_summary['listings_join_reviews_2015'] = perf_dict

print("\nAfter modifying perf_dict, the value of perf_summary[query_name] is: ")
pprint.pp(perf_summary[query_name])
print()

print('\nThe full value of perf_summary is:')
pprint.pp(perf_summary)

write_perf_data(perf_summary, 'perf_summary.json')


NameError: name 'fetch_perf_data' is not defined