In [1]:
# These are boilerplate imports that tend to be useful.
# It would be cleaner to delete or comment out the ones that are not used in this notebook.

import os
import sys
import json
import csv
from urllib.parse import quote_plus
import yaml

import copy

import pandas as pd
import numpy as np

import matplotlib as mpl

import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

import pprint

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text

# the following appears to be deprecated, so we use sqlalchemy instead
# from psycopg2 import sqlio

# the file benchmarking/util.py should hold utilities useful for your benchmarking exercise
# This notebook imports util below and calls it in the cells that follow, so
#    benchmarking/util.py must be in place before you run this notebook
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: this should be modified so that the user name and password are not hard-coded in the notebook. </span>

In [2]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

# Connection settings are read from the standard libpq environment variables so
# that credentials do not have to be stored in the notebook. The fallbacks
# reproduce the original local setup (postgres/postgres on localhost:5432,
# database "airbnb"), so the cell still runs unchanged on that setup.
db_user = os.environ.get('PGUSER', 'postgres')
db_password = os.environ.get('PGPASSWORD', 'postgres')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'airbnb')

# quote_plus escapes characters such as '@' or ':' in the user name / password
# that would otherwise corrupt the URL
db_url = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    quote_plus(db_user), quote_plus(db_password), db_host, db_port, db_name)

db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level = 'SERIALIZABLE')
# optional create_engine arguments for debugging:
#    , echo=True)
#    , echo_pool="debug")

print("Successfully created db engine.")

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database

# isolation_level SERIALIZABLE makes concurrent transactions behave as if they had
#      run one after another, which is good for the benchmarking we will be doing

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


### <span style=color:blue>The next segment focuses on the index datetime_in_reviews, including timing tests for updating the datetime attribute with and without that index </span>

In [3]:
# Preview the two UPDATE statements that util builds: one filtered by
# neighbourhood_group with a positive day offset, one filtered by
# neighbourhood with the offset negated.
ngroup, neigh = 'Manhattan', 'Bedford-Stuyvesant'
days = 5

q = util.build_update_ngroup_datetime_in_reviews(db_eng, ngroup, days)
print(q, end='\n\n')

# flip the sign so the second statement uses the opposite offset
days = -days
q1 = util.build_update_neigh_datetime_in_reviews(db_eng, neigh, days)
print(q1)

UPDATE reviews r
SET datetime = datetime + interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood_group = 'Manhattan'
RETURNING 'done';

UPDATE reviews r
SET datetime = datetime - interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'Bedford-Stuyvesant'
RETURNING 'done';


In [4]:
# Index specs are [column, table] pairs; every spec in all_indexes is dropped
# below so the benchmarks start from a known state.
# An earlier, wider list (which also named neighbourhood_group and a one-element
# 'comments_in_reviews' spec) was overwritten before it was ever used, so only
# the list that actually takes effect is kept here.
all_indexes = [['datetime','reviews'], ['id','listings'], ['neighbourhood', 'listings'],
               ['date','reviews'], ['price','listings']]

# tables touched by these specs are the ones reported at the end of the cell
i_spec_list = [['datetime','reviews'], ['neighbourhood', 'listings']]

for i_spec in all_indexes:
    result = util.add_drop_index(db_eng, 'drop', i_spec)
    pprint.pp(result)

print('\n\nCurrent set of indexes in effect is:')
indexes_in_effect = util.fetch_all_index_info(db_eng, i_spec_list)
pprint.pp(indexes_in_effect, width=150)

[('new_york_city', 'reviews', 'comments_tsv_in_reviews', None, 'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)')]
[('new_york_city', 'listings', 'neighbourhood_in_listings', None, 'CREATE INDEX neighbourhood_in_listings ON new_york_city.listings USING btree (neighbourhood)')]
[]
[('new_york_city', 'reviews', 'comments_tsv_in_reviews', None, 'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)')]
[]


Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'], 'listings': []}


In [6]:
# Benchmark the datetime UPDATE for one neighbourhood (or neighbourhood_group)
# under several index configurations, then merge the timings into a JSON file.
# Index specs are [column, table] pairs.
all_indexes = [['id','listings'],['neighbourhood', 'listings'], ['neighbourhood_group','listings'],['datetime','reviews'],['date','reviews'], ['price','listings'], ['comments','reviews']]
i_spec_nbhd_list = [['datetime','reviews'],['neighbourhood','listings']]
i_spec_ngroup_list = [['datetime','reviews'],['neighbourhood_group','listings']]

# Reference lists only (not used below); the numbers are review counts
neigh_list = ['Bedford-Stuyvesant',      # 99,705  about 20 vs 17.5
                'Long Island City',     # 10,895   about 2.58 vs 2.19
                'Fort Hamilton',        # 1,000           .72 vs .70
                'New Springville']       #  104

ngroup_list = ['Brooklyn',         # 423109
               'Manhattan',        # 341287
               'Queens',           # 173392
               'Bronx',            # 35296
               'Staten Island']    # 13726


neigh = 'New Springville'
# neigh = 'Fort Hamilton'           # done
# neigh = 'Long Island City'       # done
# neigh = 'Bedford-Stuyvesant'      # done
# neigh = 'Staten Island'         # done
# neigh = 'Bronx'               # done
# neigh = 'Queens'               # done
# neigh = 'Manhattan'           # done
# neigh = 'Brooklyn'    # will skip -- too big!!

days = 5
count = 50
# False: neigh names a neighbourhood; True: neigh names a neighbourhood_group
group = False

# pick the index specs that match the kind of area being updated
i_spec_list = i_spec_ngroup_list if group else i_spec_nbhd_list

# Pass the list selected above; the original always passed i_spec_ngroup_list,
# so the neighbourhood index was never exercised when group was False.
perf_dict = util.run_neigh_update_datetimes_multi_index_specs_ts(db_eng, group, neigh, days, all_indexes, i_spec_list, count)
print('\nThe perf profile after running on neighborhood', neigh, 'is:')
# pprint.pp(sorted(perf_dict.items()))
pprint.pp(perf_dict)

perf_file = 'update_datetimes_query_v02.json'
query_name = 'update_datetimes_neigh_' + neigh
new_perf_summary = {}
new_perf_summary[query_name] = perf_dict
# NOTE(review): the last recorded run of this cell stopped on the next line with
#   AttributeError: module 'util' has no attribute 'fetch_and_update_ts_perf_data_overwrite_for_query'
# Confirm the helper's actual name in benchmarking/util.py before re-running.
result = util.fetch_and_update_ts_perf_data_overwrite_for_query(perf_file, new_perf_summary)
print('\nContents of perf_file', perf_file, 'are now:')
pprint.pp(sorted(result.items()))
               


In neigh_updates routine, have set add_days to: True

Now working on the i_specs: []
Current set of indexes in effect is:
{'listings': [], 'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)']}
Will be running the query:
UPDATE reviews r
SET datetime = datetime + interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'New Springville'
RETURNING 'done';

Now invoking run_one_query

In neigh_updates routine, have set add_days to: False

Now working on the i_specs: [['datetime', 'reviews']]
Current set of indexes in effect is:
{'listings': [],
 'reviews': ['CREATE INDEX datetime_in_reviews ON new_york_city.reviews USING btree (datetime)',
             'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)']}
Will be running the query:
UPDATE reviews r
SET datetime = datetime - interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'New Springville'
RE

AttributeError: module 'util' has no attribute 'fetch_and_update_ts_perf_data_overwrite_for_query'

In [3]:
# Show the number of reviews per neighbourhood_group (query text comes from util).
q = util.query_neighbourhood_groups_listings_counts()

print(q)
print()

# Read the rows while the connection is still open; the original called
# fetchall() only after the `with` block had already released the connection.
with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    rows = result.fetchall()

pprint.pp(rows)

select count(*), neighbourhood_group
from listings l, reviews r 
where l.id = r.listing_id 
group by neighbourhood_group
order by count desc

[(423109, 'Brooklyn'),
 (341287, 'Manhattan'),
 (173392, 'Queens'),
 (35296, 'Bronx'),
 (13726, 'Staten Island')]


In [10]:
# Show the number of reviews per neighbourhood (query text comes from util).
q = util.query_neighbourhoods_listings_counts()

print(q)
print()

# Read the rows while the connection is still open; the original called
# fetchall() only after the `with` block had already released the connection.
with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    rows = result.fetchall()

pprint.pp(rows)

select count(*), neighbourhood, neighbourhood_group
from listings l, reviews r 
where l.id = r.listing_id 
group by neighbourhood, neighbourhood_group
order by count desc

[(99705, 'Bedford-Stuyvesant', 'Brooklyn'),
 (59240, 'Harlem', 'Manhattan'),
 (52541, 'Williamsburg', 'Brooklyn'),
 (35734, 'Crown Heights', 'Brooklyn'),
 (33856, 'Bushwick', 'Brooklyn'),
 (31521, "Hell's Kitchen", 'Manhattan'),
 (30218, 'Midtown', 'Manhattan'),
 (25166, 'East Village', 'Manhattan'),
 (22276, 'Upper West Side', 'Manhattan'),
 (20862, 'East Harlem', 'Manhattan'),
 (20373, 'Astoria', 'Queens'),
 (18856, 'Lower East Side', 'Manhattan'),
 (18470, 'Chelsea', 'Manhattan'),
 (17346, 'Upper East Side', 'Manhattan'),
 (16426, 'East Flatbush', 'Brooklyn'),
 (15913, 'Flushing', 'Queens'),
 (13909, 'Park Slope', 'Brooklyn'),
 (13526, 'East Elmhurst', 'Queens'),
 (13491, 'Flatbush', 'Brooklyn'),
 (12864, 'Greenpoint', 'Brooklyn'),
 (12727, 'Clinton Hill', 'Brooklyn'),
 (12560, 'Prospect-Lefferts Gardens', 'Brookl

In [None]:
# Time the +5 days update with the datetime index present, then the -5 days
# update after that index has been dropped (the neighbourhood index stays).

# neigh = 'Bedford-Stuyvesant'      # 99,705  about 20 vs 17.5
neigh = 'Long Island City'      # 10,895   about 2.58 vs 2.19
# neigh = 'Fort Hamilton'         # 1,000           .72 vs .70
# neigh = 'New Springville'       #  104
days = 5
neg_days = -days
count = 50

i_spec_neigh = ['neighbourhood', 'listings']
i_spec = ['datetime','reviews']

# for a neighbourhood_group, swap in util.build_update_ngroup_datetime_in_reviews
plus_5 = util.build_update_neigh_datetime_in_reviews(db_eng, neigh, days)
print(plus_5, end='\n\n')

minus_5 = util.build_update_neigh_datetime_in_reviews(db_eng, neigh, neg_days)
print(minus_5, end='\n\n')

# both indexes in place for the first measurement
util.add_drop_index(db_eng, 'add', i_spec_neigh)
util.add_drop_index(db_eng, 'add', i_spec)

print('entering run_one_query for plus_5')
plus_profile, time_listing = util.run_one_query(db_eng, plus_5, count)
print('\nPlus_profile is:')
pprint.pp(plus_profile)

# second measurement runs without the datetime index
util.add_drop_index(db_eng, 'drop', i_spec)

print('entering run_one_query for minus_5')
minus_profile, time_listing = util.run_one_query(db_eng, minus_5, count)
print('\nMinus_profile is:')
pprint.pp(minus_profile)

In [3]:
# Print the query util builds for a neighbourhood_group / year / search-word triple.
ngroup, year, sword = 'Brooklyn', 2023, 'awesome'
vec = [ngroup, year, sword]

# True selects the query variant that filters on comments_tsv
ts_index = True

q = util.build_ngroup_year_datetime_sword(vec, ts_index)
print(q)



SELECT count(*)
FROM listings l, reviews r
WHERE l.id = r.listing_id
  AND l.neighbourhood_group = 'Brooklyn'
  AND r.datetime >= '2023-01-01' AND r.datetime <= '2023-12-31'
  AND comments_tsv @@ to_tsquery('awesome');


In [4]:
# Run the neighbourhood_group / year / search-word benchmark over the given
# index specs and print the collected results sorted by key.
ngroup, year, sword = 'Brooklyn', 2016, 'awesome'
q_vec = [ngroup, year, sword]

specs = [['datetime','reviews'], ['neighbourhood_group','listings'], ['id','listings']]
count = 3

# label for this parameter combination
query_vec = util.q_vec_name(q_vec)
print(query_vec)

result = util.run_ngroup_year_sword_and_multi_index_specs_with_ts_ngroup(db_eng, q_vec, specs, count)

print('\n\nThe result of the run on', query_vec, 'with indexes', specs, 'is:')
pprint.pp(sorted(result.items()))

q_Brooklyn_2016_awesome

Now working on the i_spec: []
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)',
             "CREATE INDEX comments_in_reviews ON new_york_city.reviews USING gin (to_tsvector('simple'::regconfig, (comments)::text))",
             'CREATE INDEX datetime_in_reviews ON new_york_city.reviews USING btree (datetime)'],
 'listings': ['CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)',
              'CREATE INDEX id_in_listings ON new_york_city.listings USING btree (id)',
              'CREATE INDEX price_in_listings ON new_york_city.listings USING btree (price)']}

Now invoking run_one_query with ts index

Now invoking run_one_query no ts index

Now working on the i_spec: [['datetime', 'reviews']]
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comme

In [3]:
# Print the query util builds for a price-range / year / search-word triple.
prange, year, word = '000s', 2023, 'awesome'
vec = [prange, year, word]

# True selects the query variant that filters on comments_tsv
ts_index = True

q1 = util.build_price_range_year_word(vec, ts_index)
print(q1)

SELECT *
FROM listings l, reviews r
WHERE l.id = r.listing_id
  AND l.price >= 0 AND l.price < 100
  AND r.datetime >= '2023-01-01' AND r.datetime <= '2023-12-31'
  AND comments_tsv @@ to_tsquery('awesome');


In [4]:
# Show the dictionary keys util derives from a list of index specs, without and
# with the text-search index appended.
all_indexes = [['date','reviews'], ['id','listings'], ['price','listings'], ['comments_tsv', 'reviews', 'gin']]

# An earlier three-spec variant that also listed ['id','listings'] was
# overwritten before use; only the effective value is kept.
specs = [['date','reviews'], ['price','listings']]

print(util.build_index_description_key_no_ts_index(specs))
print()
print(util.build_index_description_key_with_ts_index(specs))
      

__date_in_reviews__price_in_listings__

__date_in_reviews__price_in_listings__comments_tsv_in_reviews__


In [8]:
# Run the price-range / year / search-word benchmark over the chosen index
# specs and print the collected results sorted by key.
prange = '300s'
year = 2016
word = 'awesome'

# note: the index comments_tsv_in_reviews will get special treatment, so it is
#       not included in the lists here
# NOTE(review): the last entry has one element, unlike the other [column, table]
#       specs -- confirm how util handles it
all_indexes = [['datetime','reviews'], ['price','listings'], ['id','listings'],['date','reviews'],['comments_in_reviews']]

# Only the final spec list takes effect. Earlier trials, overwritten before use:
#   [['datetime','reviews'], ['price','listings'], ['id','listings']]
#   [['datetime','reviews']]   -- not much diff to include or not
#   [['price','listings']]     -- not much diff to include or not
specs = [['id','listings']]

count = 50

q_vec = [prange, year, word]

# label for this parameter combination
query_vec = util.q_vec_name(q_vec)

print(query_vec)

result = util.run_prange_year_sword_and_multi_index_specs_with_ts(db_eng, q_vec, specs, count)

print('\n\nThe result of the run on', query_vec, 'with indexes', str(specs), 'is:')
pprint.pp(sorted(result.items()))

q_300s_2016_awesome

Now working on the i_spec: []
Current set of indexes in effect is:
{'listings': ['CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)',
              'CREATE INDEX id_in_listings ON new_york_city.listings USING btree (id)',
              'CREATE INDEX price_in_listings ON new_york_city.listings USING btree (price)']}

Now invoking run_one_query with ts index

Now invoking run_one_query no ts index

Now working on the i_spec: [['id', 'listings']]
Current set of indexes in effect is:
{'listings': ['CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)',
              'CREATE INDEX id_in_listings ON new_york_city.listings USING btree (id)',
              'CREATE INDEX price_in_listings ON new_york_city.listings USING btree (price)']}

Now invoking run_one_query with ts index

Now invoking run_one_query no ts index


The result of the run on q_300s_2016_awesome with 