In [1]:
# These are boiler plate imports that seem useful
# Perhaps cleaner would be to delete or comment out the ones that aren't used in this script...

# --- standard library ---
import copy
import csv
import getpass
import json
import os
import pprint
import sys
import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

# --- third-party ---
import matplotlib as mpl
import numpy as np
import pandas as pd
import yaml

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL

# the following is deprecated, it seems, so using the sqlalchemy
# from pyscopg2 import sqlio

# the file in benchmarking/util.py should hold utilities useful for your benchmarking exercise
# In this notebook we have commented out all mentions of util, so that you can run
#    this notebook before setting up your benchmarking/util.py file
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: the user name and password should not be hard-coded in the notebook; supply them from outside the program (e.g. environment variables or a prompt). </span>

In [2]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

# Credentials are taken from the environment instead of being written into the
# notebook.  PGUSER falls back to the previous default user 'postgres'; if
# PGPASSWORD is not set, the password is prompted for (and never echoed or
# stored in the notebook).
db_user = os.environ.get('PGUSER', 'postgres')
db_password = os.environ.get('PGPASSWORD') or getpass.getpass('Postgres password: ')

# URL.create escapes special characters in the user name / password, which a
# hand-formatted connection string would not.
db_url = URL.create(
    drivername='postgresql+psycopg2',
    username=db_user,
    password=db_password,
    host='localhost',
    port=5432,
    database='airbnb',
)

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database
#
# isolation_level SERIALIZABLE asks Postgres to make concurrent transactions behave
#      as if they had run one after another, which is good for the benchmarking
#      we will be doing
db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level='SERIALIZABLE')
#    add  echo=True          to log every SQL statement
#    add  echo_pool="debug"  to log connection-pool activity

# The password is not needed once the engine holds the URL; drop the plain-text copy.
del db_password

# Note: create_engine() is lazy -- no connection is opened until the engine is first used.
print("Successfully created db engine.")

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


<span style=color:blue>Now a couple of queries that give basic stats on the number of reviews associated with each neighbourhood_group and each neighbourhood.</span>

In [10]:
# Fetch the query text from util, show it, then run it and show the rows.
q = util.query_neighbourhood_groups_listings_counts()

print(q)
print()

# Read the rows while the connection is still open.  Once the `with` block
# exits, the connection is returned to the pool and the Result object should
# no longer be consumed.
with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    rows = result.fetchall()

pprint.pp(rows)

select count(*), neighbourhood_group
from listings l, reviews r 
where l.id = r.listing_id 
group by neighbourhood_group
order by count desc

[(423109, 'Brooklyn'),
 (341287, 'Manhattan'),
 (173392, 'Queens'),
 (35296, 'Bronx'),
 (13726, 'Staten Island')]


In [9]:
# Fetch the query text from util, show it, then run it and show the rows.
q = util.query_neighbourhoods_listings_counts()

print(q)
print()

# Read the rows while the connection is still open.  Once the `with` block
# exits, the connection is returned to the pool and the Result object should
# no longer be consumed.
with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    rows = result.fetchall()

pprint.pp(rows)

select count(*), neighbourhood, neighbourhood_group
from listings l, reviews r 
where l.id = r.listing_id 
group by neighbourhood, neighbourhood_group
order by count desc

[(99705, 'Bedford-Stuyvesant', 'Brooklyn'),
 (59240, 'Harlem', 'Manhattan'),
 (52541, 'Williamsburg', 'Brooklyn'),
 (35734, 'Crown Heights', 'Brooklyn'),
 (33856, 'Bushwick', 'Brooklyn'),
 (31521, "Hell's Kitchen", 'Manhattan'),
 (30218, 'Midtown', 'Manhattan'),
 (25166, 'East Village', 'Manhattan'),
 (22276, 'Upper West Side', 'Manhattan'),
 (20862, 'East Harlem', 'Manhattan'),
 (20373, 'Astoria', 'Queens'),
 (18856, 'Lower East Side', 'Manhattan'),
 (18470, 'Chelsea', 'Manhattan'),
 (17346, 'Upper East Side', 'Manhattan'),
 (16426, 'East Flatbush', 'Brooklyn'),
 (15913, 'Flushing', 'Queens'),
 (13909, 'Park Slope', 'Brooklyn'),
 (13526, 'East Elmhurst', 'Queens'),
 (13491, 'Flatbush', 'Brooklyn'),
 (12864, 'Greenpoint', 'Brooklyn'),
 (12727, 'Clinton Hill', 'Brooklyn'),
 (12560, 'Prospect-Lefferts Gardens', 'Brookl

### <span style=color:blue>The next segment is focused on the index datetime_in_reviews, including tests on time for updating the datetime attribute with/without that index </span>

<span style=color:blue>Here are some representative queries that we use</span>

In [3]:
# Build and display two sample UPDATE statements: one keyed on a
# neighbourhood_group with a positive day offset, one keyed on a single
# neighbourhood with the offset negated.
ngroup, days = 'Manhattan', 5
q = util.build_update_ngroup_datetime_in_reviews(db_eng, ngroup, days)
print(q, end='\n\n')

# Flip the sign of the offset for the second sample query.
neigh = 'Bedford-Stuyvesant'
days = -days
q1 = util.build_update_neigh_datetime_in_reviews(db_eng, neigh, days)
print(q1)

UPDATE reviews r
SET datetime = datetime + interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood_group = 'Manhattan'
RETURNING 'done';

UPDATE reviews r
SET datetime = datetime - interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'Bedford-Stuyvesant'
RETURNING 'done';


<span style=color:blue>Flushing out all indexes that might be lurking about. (But leaving the GIN index comments_tsv_in_reviews.)    </span>

In [4]:
# Each entry is [column, table]; these are the indexes the benchmarks may create.
all_indexes = [
    ['datetime', 'reviews'],
    ['id', 'listings'],
    ['neighbourhood', 'listings'],
    ['neighbourhood_group', 'listings'],
    ['price', 'listings'],                 # used when I was playing with price ranges
]

# Ask util to drop every index in the list and show what it reports per call.
# Note: add_drop_index is called multiple times for some tables.
# Also, this cell has been run multiple times while building the notebook,
# so the indexes are typically already flushed out.
loop_msg_prefix = 'Inside the loop, here is the result of calling add_drop_index on the table "'
for i_spec in all_indexes:
    result = util.add_drop_index(db_eng, 'drop', i_spec)
    print(loop_msg_prefix + str(i_spec[1]) + '"')
    pprint.pp(result)

# Finally, report which indexes remain.
print('\n\nCurrent set of indexes in effect is:')
indexes_in_effect = util.fetch_all_index_info(db_eng, all_indexes)
pprint.pp(indexes_in_effect, width=150)

Inside the loop, here is the result of calling add_drop_index on the table "reviews"
[('new_york_city', 'reviews', 'comments_tsv_in_reviews', None, 'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[('new_york_city', 'listings', 'neighbourhood_group_in_listings', None, 'CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[('new_york_city', 'listings', 'neighbourhood_group_in_listings', None, 'CREATE INDEX neighbourhood_group_in_listings ON new_york_city.listings USING btree (neighbourhood_group)')]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]
Inside the loop, here is the result of calling add_drop_index on the table "listings"
[]


Current set of indexes in effect is:
{'reviews'

In [8]:
# Benchmark driver: time the datetime UPDATE for one location under several
# index configurations, then merge the timings into a JSON perf file.

# Each entry is [column, table]; the full set of indexes the benchmark may touch.
all_indexes = [['datetime','reviews'], 
               ['id','listings'],
               ['neighbourhood','listings'], 
               ['neighbourhood_group','listings'],
               ['price','listings']]                 # used when I was playing with price ranges

# Index specs to vary when the location is a neighbourhood / a neighbourhood_group.
i_spec_nbhd_list = [['datetime','reviews'],['neighbourhood','listings']]
i_spec_ngroup_list = [['datetime','reviews'],['neighbourhood_group','listings']]

# Reference lists of locations (review counts, and rough timings seen, in the comments).
neigh_list = ['Bedford-Stuyvesant',      # 99,705  about 20 vs 17.5
                'Long Island City',     # 10,895   about 2.58 vs 2.19
                'Fort Hamilton',        # 1,000           .72 vs .70
                'New Springville']       #  104

ngroup_list = ['Brooklyn',         # 423109
               'Manhattan',        # 341287
               'Queens',           # 173392
               'Bronx',            # 35296
               'Staten Island']    # 13726


# I ran each of the following locations one at a time.  
#     Manhattan takes almost an hour, New Springville takes only minutes
#
# Exactly one assignment to `neigh` must be active.  It is set explicitly here
# so this cell does not depend on a value left behind by an earlier cell.

# neigh = 'New Springville'          # done
# neigh = 'Fort Hamilton'             # done    
# neigh = 'Long Island City'          # done
neigh = 'Bedford-Stuyvesant'     # done
# neigh = 'Staten Island'         # done
# neigh = 'Bronx'               # done
# neigh = 'Queens'               # done
# neigh = 'Manhattan'           # done
# neigh = 'Brooklyn'    # will skip -- too big!!

days = 5 
count = 50

# This is used as input to the function below.
#   If set to true, then the "neigh" value is a neighborhood_group
#   If set to false, then the "neigh" value is a neighborhood
group = False  # we are running a neighbourhood, not a neighbourhood_group


# Pick the index specs that match the kind of location being benchmarked.
if group:
    i_spec_list = i_spec_ngroup_list
else:
    i_spec_list = i_spec_nbhd_list

# Pass the selected i_spec_list (previously i_spec_ngroup_list was always passed,
# so the neighbourhood index was never exercised for neighbourhood runs).
perf_dict = util.run_neigh_update_datetimes_multi_index_specs_ts(db_eng, group, neigh, days, all_indexes, i_spec_list, count)
print('\nThe perf profile after running on neighborhood', neigh, 'is:')
# pprint.pp(sorted(perf_dict.items()))
pprint.pp(perf_dict)

# Store the new timings under a per-location key in the perf file.
perf_file = 'update_datetimes_query_v02.json'
query_name = 'update_datetimes_neigh_' + neigh
new_perf_summary = {}
new_perf_summary[query_name] = perf_dict
result = util.fetch_and_update_ts_perf_data_shallow_merge(perf_file, new_perf_summary)
print('\nContents of perf_file', perf_file, 'are now:')
pprint.pp(sorted(result.items()))
               


In neigh_updates routine, have set add_days to: True

Now working on the i_specs: []
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'], 'listings': []}
Will be running the query:
UPDATE reviews r
SET datetime = datetime + interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'Bedford-Stuyvesant'
RETURNING 'done';

Now invoking run_one_query

In neigh_updates routine, have set add_days to: False

Now working on the i_specs: [['datetime', 'reviews']]
Current set of indexes in effect is:
{'reviews': ['CREATE INDEX datetime_in_reviews ON new_york_city.reviews USING btree (datetime)',
             'CREATE INDEX comments_tsv_in_reviews ON new_york_city.reviews USING gin (comments_tsv)'],
 'listings': []}
Will be running the query:
UPDATE reviews r
SET datetime = datetime - interval '5 days'
FROM listings l
WHERE l.id = r.listing_id
  AND l.neighbourhood = 'Bedford-Stuyvesa