### <span style=color:blue> Loading Listings & Reviews data from postgresql into local MongoDB    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- seems like a better name
sys.path.append('helper_functions/')
import util

In [2]:
# test that utils.py has been imported well
util.hello_world()

hello world


<span style=color:blue>Getting PostgreSQL connection set up</span>

In [3]:
# Load the env file 

dotenv_path = 'env_variables.env'
load_dotenv(dotenv_path=dotenv_path)

# Import the env variables

load_dotenv()

schema = os.getenv('DISC_6_SCHEMA')
port = os.getenv('DISC_6_PORT')
host = os.getenv('DISC_6_HOST')
database = os.getenv('DISC_6_DB')
password = os.getenv('DISC_6_PASSWORD')
connection = os.getenv('DISC_6_CONNECTION')

# Create the db engine 

db_eng = create_engine(f"postgresql+psycopg2://{connection}:{password}@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


<span style=color:blue>Getting mongodb connection set up</span>

In [4]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

<span style=color:blue>Setting up collection "listings" in mongodb</span>

In [5]:
# I have (or will have) a database "airbnb"
db = client.airbnb

# inside the "airbnb" database, I have (or will have) a collection "listings"
listings = db.listings
print(db.list_collection_names())
# I have some other collections in my airbnb database...

['testing', 'listings_previously_built', 'listings_test', 'calendar_previously_built', 'listings']


### <span style=color:blue>As preparation for the next steps, I have a created table reviewm in my PostgreSQL using DBeaver, in which I dropped the comments_tsv column (because not needed), renamed column "id" to "review_id" (so that it is not repeating the "id" column of the listings table, and dropped the datetime column.</span>

<span style=color:blue>In the following I focus on the query q10, which fetches a left join based on all listing_ids with prefix '10'.  This is useful for doing testing.  For your assignment you should use the left join query that includes all listings.</span>

In [6]:
import importlib
import util
# using this in case I have added stuff to util.py
importlib.reload(util)

# some other queries I was experimenting with
# q = util.build_query_full_join_listings_reviewsm()
# q = util.build_query_left_join_listings_reviewsm_null_right()

q10 = util.build_query_left_join_listings_reviewsm_10()
q = util.build_query_left_join_listings_reviewsm()

print('We will be using the following queries, produced by functions I defined in util.py:\n')
print(q10)
print()
print(q)

We will be using the following queries, produced by functions I defined in util.py:

select *
from listings l left join reviewsm r 
        on l.id = r.listing_id
  where left(l.id,2) = '10'
-- this query fetches data for 3313 listings, useful for testing

select *
from listings l left join reviewsm r 
        on l.id = r.listing_id


In [7]:
with db_eng.connect() as conn:
    df_ljr10 = pd.read_sql(q10, con=conn)
    # df_ljr = pd.read_sql(q, con=conn)
    

In [8]:
print(df_ljr10.head())
# print(df_ljr.head())

                    id                                               name  \
0  1007898112628596835         Rental unit in Bronx ¬∑ 2 bedrooms ¬∑ 1 bath   
1  1032550134459701382  Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 ...   
2  1032550134459701382  Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 ...   
3  1032550134459701382  Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 ...   
4  1032550134459701382  Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 ...   

     host_id host_name neighbourhood_group       neighbourhood   latitude  \
0  481526228   Michael               Bronx  Westchester Square  40.842243   
1  231138233     Filiz              Queens           Ridgewood  40.704277   
2  231138233     Filiz              Queens           Ridgewood  40.704277   
3  231138233     Filiz              Queens           Ridgewood  40.704277   
4  231138233     Filiz              Queens           Ridgewood  40.704277   

   longitude        room_type  price  ...  calculate

In [9]:
# print(df_ljr.shape)
# should be 998,310 rows in df_ljr.  This is
#     number of records in listings whose id do not show up in reviews['listing_id'] =  11,500
#   + number of reviews                                                              = 986,810

print(df_ljr10.shape)
# you might want to check this number against what you expect based on what exploration
#    you do with DBeaver

(25103, 24)


### <span style=color:blue>The left outer join has between 0 and many records for each listing_id.  There is one record for each review about that listing.  We will now re-format this data into a list of dictionaries.  Each dictionary will have the data for one listing along with a list of all of the associated reviews. </span>

In [10]:
cols = df_ljr10.columns.tolist()
# cols = df_ljr.columns.tolist()
print(cols)

['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license', 'listing_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']


<span style=color:blue>As a first step, we build a list of dictionaries with just the listing data.  To do this we use pandas to create a new dataframe with the reviews-related columns dropped</span>

In [11]:
# to do a projection and remove duplicates
cols_of_listings = ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 
                    'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 
                    'minimum_nights', 'number_of_reviews', 'last_review', 
                    'reviews_per_month', 'calculated_host_listings_count', 
                    'availability_365', 'number_of_reviews_ltm', 'license']
cols_of_reviews = ['listing_id', 'review_id', 'date', 'reviewer_id', 
                   'reviewer_name', 'comments']

df_ljr10_new = df_ljr10.drop(cols_of_reviews, axis=1).drop_duplicates()


print(df_ljr10_new.shape)
# print(df_ljr_new.head(10))

# print(df_ljr_new.iloc[13870])

(3313, 18)


<span style=color:blue>Converting the dataframe into a list of dictionaries     </span>

In [12]:
dict_ljr10_new = df_ljr10_new.to_dict('records')
print(len(dict_ljr10_new))
pprint.pp(dict_ljr10_new[0])

3313
{'id': '1007898112628596835',
 'name': 'Rental unit in Bronx ¬∑ 2 bedrooms ¬∑ 1 bath',
 'host_id': '481526228',
 'host_name': 'Michael',
 'neighbourhood_group': 'Bronx',
 'neighbourhood': 'Westchester Square',
 'latitude': 40.842243,
 'longitude': -73.853096,
 'room_type': 'Entire home/apt',
 'price': 319.0,
 'minimum_nights': 2,
 'number_of_reviews': 0,
 'last_review': None,
 'reviews_per_month': nan,
 'calculated_host_listings_count': 1,
 'availability_365': 89,
 'number_of_reviews_ltm': 0,
 'license': 'OSE-STRREG-0001056'}


<span style=color:blue>Let's try loading what we have so far into MongoDB, into a temporary collection     </span>

In [13]:
# testing with a new, temporary collection
listings_test = db.listings_test

try:
    result = listings_test.insert_many(dict_ljr10_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('There was an error when loading the dictionary into MongoDB:')
    print(e)



There was an error when loading the dictionary into MongoDB:
cannot encode object: datetime.date(2024, 1, 29), of type: <class 'datetime.date'>


<span style=color:blue>MongoDB does not handle dates, only datetimes.  Here is a function to convert the dates into datetimes.  (An alternative would have been to convert the dates in our table reviewsm into datetimes.)

In [14]:
# This converts date to datetime.  It also converts various kinds of
#     null values into None, which loads into MongoDB without creating errors
def convert_date_to_datetime(dt):
    if pd.isnull(dt):           # tests whether dt is None, NaN, or DaT (not a date)
        return None
    elif type(dt) == pd._libs.tslibs.nattype.NaTType:  # including this, but see below
        return None
    else:
        temp = datetime(dt.year, dt.month, dt.day)
        ts = temp.timestamp()
        new_dt = datetime.fromtimestamp(ts)
        return new_dt

# testing various cases:
# Here are four dictionaries to test with
dict1 = {'foo':1, 'date': date(2023,1,2)}
dict2 = {'goo':2, 'date': math.nan}
dict3 = {'hoo':3, 'date': None}
dict4 = {'koo':4, 'date': pd.NaT}

if pd.isnull(dict3['date']):        # pd.isnull tests whether something is 
    print("dict4['date'] tested positive as NaT")    
else:
    print("dict4['date'] did not test positive as NaT")
    

print(dict1)
dict1['date'] = convert_date_to_datetime(dict1['date'])
print(dict1)

print()
print(dict2)
dict2['date'] = convert_date_to_datetime(dict2['date'])
print(dict2)

print()
print(dict3)
dict3['date'] = convert_date_to_datetime(dict3['date'])
print(dict3)

print()
print(dict4)
dict4['date'] = convert_date_to_datetime(dict4['date'])
print(dict4)

dict4['date'] tested positive as NaT
{'foo': 1, 'date': datetime.date(2023, 1, 2)}
{'foo': 1, 'date': datetime.datetime(2023, 1, 2, 0, 0)}

{'goo': 2, 'date': nan}
{'goo': 2, 'date': None}

{'hoo': 3, 'date': None}
{'hoo': 3, 'date': None}

{'koo': 4, 'date': NaT}
{'koo': 4, 'date': None}


<span style=color:blue>Use pandas to replace the dates in the "last_review" column with datetimes</span>

In [15]:
# trying to replace all dates by datetimes (or None)

# df_ljr10_new['last_review'] = df_ljr10_new['last_review'].apply(convert_date_to_datetime)

# could also have written
df_ljr10_new['last_review'] = df_ljr10_new['last_review'].apply(lambda x: convert_date_to_datetime(x))

In [16]:
print(df_ljr10_new.head())

                     id                                               name  \
0   1007898112628596835         Rental unit in Bronx ¬∑ 2 bedrooms ¬∑ 1 bath   
1   1032550134459701382  Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 ...   
11  1061632971843436364  Hotel in Queens ¬∑ ‚òÖ4.40 ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ ...   
16  1039753101132672547  Rental unit in New York ¬∑ ‚òÖNew ¬∑ Studio ¬∑ 1 be...   
17  1024485974616152538  Home in Staten Island ¬∑ ‚òÖNew ¬∑ 1 bedroom ¬∑ 2 b...   

      host_id   host_name neighbourhood_group       neighbourhood   latitude  \
0   481526228     Michael               Bronx  Westchester Square  40.842243   
1   231138233       Filiz              Queens           Ridgewood  40.704277   
11  544554359  John Hotel              Queens            Flushing  40.760290   
16  371213084       Nabel           Manhattan   Battery Park City  40.711452   
17  524771414     Shaimaa       Staten Island          St. George  40.640290   

    longitude        room_t

In [17]:
# As you can see in the result from the last cell,
#   somehow the NaT's are still there, in spite of the special case included in
#   the function convert_time_to_timestamp()
#   BTW, curiously, on very small dataframes the convert_time_to_timestamp() does convert NaT to None

# Happily, all of the actual dates have converted into datetimes, as illustrated by the following:
#    Using "iloc" because the index values in df_ljr10_new are not consecutive
print(type(df_ljr10_new.iloc[0, 12]))  # 12 is position of 'last_review'
print(df_ljr10_new.iloc[0,12])
print(type(df_ljr10_new.iloc[1, 12]))  
print(df_ljr10_new.iloc[1,12])
print(type(df_ljr10_new.iloc[2, 12]))  
print(df_ljr10_new.iloc[2,12])
print(type(df_ljr10_new.iloc[3, 12]))  
print(df_ljr10_new.iloc[3,12])

<class 'pandas._libs.tslibs.nattype.NaTType'>
NaT
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2024-01-29 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2024-01-20 00:00:00
<class 'pandas._libs.tslibs.nattype.NaTType'>
NaT


In [18]:
# recomputing dict_ljr10_new
dict_ljr10_new = df_ljr10_new.to_dict('records')
print(len(dict_ljr10_new))
pprint.pp(dict_ljr10_new[0:2])

3313
[{'id': '1007898112628596835',
  'name': 'Rental unit in Bronx ¬∑ 2 bedrooms ¬∑ 1 bath',
  'host_id': '481526228',
  'host_name': 'Michael',
  'neighbourhood_group': 'Bronx',
  'neighbourhood': 'Westchester Square',
  'latitude': 40.842243,
  'longitude': -73.853096,
  'room_type': 'Entire home/apt',
  'price': 319.0,
  'minimum_nights': 2,
  'number_of_reviews': 0,
  'last_review': NaT,
  'reviews_per_month': nan,
  'calculated_host_listings_count': 1,
  'availability_365': 89,
  'number_of_reviews_ltm': 0,
  'license': 'OSE-STRREG-0001056'},
 {'id': '1032550134459701382',
  'name': 'Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ 1 bath',
  'host_id': '231138233',
  'host_name': 'Filiz',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'Ridgewood',
  'latitude': 40.704277,
  'longitude': -73.89964,
  'room_type': 'Private room',
  'price': 53.0,
  'minimum_nights': 1,
  'number_of_reviews': 10,
  'last_review': Timestamp('2024-01-29 00:00:00'),
  'reviews_per_mont

In [19]:
# However, the load into MongoDB still fails, because of the NaT values
#    As noted above, the convert_time_to_timestamp did not convert the NaT values
try:
    result = listings_test.insert_many(dict_ljr10_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('\nThere was an error when loading the dictionary into MongoDB:')
    print(e)


There was an error when loading the dictionary into MongoDB:
NaTType does not support utcoffset


<span style=color:blue>OK, so let's convert the NaT's in the dictionary rather than in pandas  </span>

In [20]:
for doc in dict_ljr10_new:
    if pd.isnull(doc['last_review']): 
        doc['last_review'] = None

pprint.pp(dict_ljr10_new[0:10])

[{'id': '1007898112628596835',
  'name': 'Rental unit in Bronx ¬∑ 2 bedrooms ¬∑ 1 bath',
  'host_id': '481526228',
  'host_name': 'Michael',
  'neighbourhood_group': 'Bronx',
  'neighbourhood': 'Westchester Square',
  'latitude': 40.842243,
  'longitude': -73.853096,
  'room_type': 'Entire home/apt',
  'price': 319.0,
  'minimum_nights': 2,
  'number_of_reviews': 0,
  'last_review': None,
  'reviews_per_month': nan,
  'calculated_host_listings_count': 1,
  'availability_365': 89,
  'number_of_reviews_ltm': 0,
  'license': 'OSE-STRREG-0001056',
  '_id': ObjectId('6658d7d82d77bbd68b58f9d7')},
 {'id': '1032550134459701382',
  'name': 'Rental unit in Queens ¬∑ ‚òÖ4.70 ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ 1 bath',
  'host_id': '231138233',
  'host_name': 'Filiz',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'Ridgewood',
  'latitude': 40.704277,
  'longitude': -73.89964,
  'room_type': 'Private room',
  'price': 53.0,
  'minimum_nights': 1,
  'number_of_reviews': 10,
  'last_review': Timestamp(

<span style=color:blue>Now trying the load again    </span>

In [21]:
try:
    result = listings_test.insert_many(dict_ljr10_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('\nThere was an error when loading the dictionary into MongoDB:')
    print(e)


Last element of result for the last run was:
[ObjectId('6658d7d82d77bbd68b5906c7')]


<span style=color:blue>Now we add, for each listing, a list of all reviews for that listing     </span>

In [22]:
i = 0

# We will keep track of the time to do each 1000 listings
time1 = datetime.now()


for d in dict_ljr10_new:
    i += 1

    # building a df with just reviews info, and corresponding to the listing we are focusing on
    df_reviews_one_listing = df_ljr10.loc[df_ljr10['id'] == d['id']].drop(cols_of_listings, axis=1)

    # Note: This does not run super quickly.  As an alternative I tried pulling this 
    #    data with a query against PostgreSQL, but it was even slower

    # there are no null values in the 'date' column of reviews, so we can do the
    #    date to datetime conversion using pandas
    df_reviews_one_listing['date'] = df_reviews_one_listing['date'].apply(lambda x: convert_date_to_datetime(x))

    dicts_reviews_one_listing = df_reviews_one_listing.to_dict('records')

    # Need special handling for the case of no reviews 
    if len(dicts_reviews_one_listing) == 1 and dicts_reviews_one_listing[0]['review_id'] is None:
        d['reviews'] = {}
    else:
        d['reviews'] = dicts_reviews_one_listing

    if i % 1000 == 0:
        time2 = datetime.now()
        time_taken = util.time_diff(time1,time2)
        print('Have now completed step number:', str(i), 'and it took', str(time_taken), 'seconds' )
        time1 = datetime.now()

    # given the time it takes to do 1000 listings, how long will it take to do all of the listings?

print()
print(len(dict_ljr10_new))
print()
pprint.pp(dict_ljr10_new[-10:])

Have now completed step number: 1000 and it took 3.137363 seconds
Have now completed step number: 2000 and it took 3.194379 seconds
Have now completed step number: 3000 and it took 3.38387 seconds

3313

[{'id': '1051404502211155820',
  'name': 'Rental unit in Queens ¬∑ ‚òÖNew ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ 1 shared bath',
  'host_id': '490393628',
  'host_name': 'Veronica',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'Astoria',
  'latitude': 40.766,
  'longitude': -73.92658,
  'room_type': 'Private room',
  'price': 65.0,
  'minimum_nights': 30,
  'number_of_reviews': 0,
  'last_review': None,
  'reviews_per_month': nan,
  'calculated_host_listings_count': 2,
  'availability_365': 254,
  'number_of_reviews_ltm': 0,
  'license': '',
  '_id': ObjectId('6658d7d82d77bbd68b5906be'),
  'reviews': {}},
 {'id': '1025204455815288168',
  'name': 'Rental unit in New York ¬∑ 1 bedroom ¬∑ 1 bed ¬∑ 1 bath',
  'host_id': '407304997',
  'host_name': 'Boomerang',
  'neighbourhood_group': 'Manhatta

<span style=color:blue>Sanity check, that we did not lose any listings </span>

In [23]:
print(len(dict_ljr10_new))

3313


<span style=color:blue>Now loading dict_ljr10_new into mongodb.   </span>

<span style=color:blue>The loading is done 100 documents at a time, with a last small lot </span>

In [24]:
print(len(dict_ljr10_new) % 100)
print(len(dict_ljr10_new))

13
3313


In [25]:
# CAUTION: the first step here erases db.listing
#    I have kept this here during testing
db.listings.drop()


listings = db.listings

time0 = datetime.now()
time1 = datetime.now()

for i in range(0,33):
# for i in range(0,10):
    result = listings.insert_many(dict_ljr10_new[100*i:100*(i+1)])

    time2 = datetime.now()
    time_taken = util.time_diff(time1,time2)
    print('Have now completed step number:', str(i), 'and it took', str(time_taken), 'seconds' )
    time1 = datetime.now()
    
time3 = datetime.now()


print('\nThe last ObjectID in the collection is:')
print(result.inserted_ids[-1:])

# print('\nThe time to do the load of 39K documents into local mongodb, with a total of about 300MB was:')
print('\nThe time for this run was:')
print(util.time_diff(time0,time3))



# this is for the last 13 records in dict_ljr10_new, but built for arbitrary number of records
result = listings.insert_many(dict_ljr10_new[3300:])


print('\nThe total number of documents in the collection db.listings is now:')
print(listings.count_documents({}))

print('\nLast few ObjectIds of result for the last run was:')
print(result.inserted_ids[-5:])

print('\nThe last few documents of result for the last run was:')
# Curiously, the next line fetches only the object ids, not the full documents
# out = listings.find({'_id' : {'$in' : result.inserted_ids[-5:]} } )
# pprint.pp(out)
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(listings.find_one({ '_id': o}))
pprint.pp(outdocs)

Have now completed step number: 0 and it took 0.103784 seconds
Have now completed step number: 1 and it took 0.026726 seconds
Have now completed step number: 2 and it took 0.004255 seconds
Have now completed step number: 3 and it took 0.007909 seconds
Have now completed step number: 4 and it took 0.005486 seconds
Have now completed step number: 5 and it took 0.021856 seconds
Have now completed step number: 6 and it took 0.01311 seconds
Have now completed step number: 7 and it took 0.015675 seconds
Have now completed step number: 8 and it took 0.00939 seconds
Have now completed step number: 9 and it took 0.00479 seconds
Have now completed step number: 10 and it took 0.031557 seconds
Have now completed step number: 11 and it took 0.008377 seconds
Have now completed step number: 12 and it took 0.017266 seconds
Have now completed step number: 13 and it took 0.002597 seconds
Have now completed step number: 14 and it took 0.004067 seconds
Have now completed step number: 15 and it took 0.0034

<span style=color:blue>Here is a query testing against the 'last_review' values    </span>

In [26]:
cursor = listings.find( { 'last_review' : { '$lte' : datetime(2024,1,1,0,0,0,0)}})
l = list(cursor)
print(len(l))
# pprint.pp(l)

449


<span style=color:blue>Interestingly, you cannot write the dictionary we created out to a json file...     </span>

In [27]:
def write_dict_to_json(dict, filename):
    with open(filename, 'w') as fp:
        json.dump(dict, fp)

try:
    filename = 'listings_with_reviews_embedded__v01.json'
    write_dict_to_json(dict_ljr10_new, filename)
except Exception as e:
    print('\nThere was an error, as follows:')
    print(e)
    print()

# There are some suggestions at
#   https://stackoverflow.com/questions/50404559/python-error-typeerror-object-of-type-timestamp-is-not-json-serializable



There was an error, as follows:
Object of type ObjectId is not JSON serializable



<span style=color:blue>Query 1: How many listings have last_review between February 1, 2021, and March 15, 2023 </span>

<span style=color:blue>Note: to include March 15, 2023, you should use <= 2023-03-16 00:00:00</span>


In [28]:
cursor = listings.find( { '$and' : [{ 'last_review' : {'$gte' : datetime(2021,2,1,0,0,0,0)}},
                                    { 'last_review' : {'$lte' : datetime(2023,3,16,0,0,0,0)}}
                                    ]
                         })
                                        
l = list(cursor)
print(len(l))

31


<span style=color:blue>Query 2: How many listings have an array of reviews with length at least 50? </span>

<span style=color:blue>Can take inspiration from https://stackoverflow.com/questions/41918605/mongodb-find-array-length-greater-than-specified-size</span>

In [29]:
# counting starts at 0
cursor = listings.find( { 'reviews.49' : { '$exists': True} } )
l = list(cursor)
print(len(l))

108


<span style=color:blue> Query 3: Output is the number of listings that have a review containing the word "awesome" (case sensitive) OR a review containing the word "amazing" (case sensitive).  </span>

<span style=color:blue> Query 4: Output is the number of listings that have a review containing the word "awesome" (case insensitive) OR a review containing the word "amazing" (case insensitive).  </span>


In [30]:
cursor = listings.find( { '$or' : [ {'reviews.comments' : { '$regex':  '^.*awesome.*$' , '$options': 'i'  } } ,
                                    {'reviews.comments' :  { '$regex':  '^.*amazing.*$' , '$options': 'i'    } }
                                  ]
                        }
                      )
l = list(cursor)
print(len(l))
i = 0
for d in l[0:5]:
    i += 1
    print('\nReviews for listing number', i)
    for r in d['reviews']:
        pprint.pp(r['comments'])

384

Reviews for listing number 1
('Honestly Amazing Host Very Helpful And Responsive And Accommodating! The '
 'Apartment Was Clean And Included Toiletries/Amenities! Parking Was Very Easy '
 'To Find Which Was Great! I Would Definitely Recommend Booking Here!!')

Reviews for listing number 2
'A really nice place in a useful location.'
'An attractive apartment in a great location.'
('Beautiful appt., lovely neighbourhood. Everything as described, only better. '
 'Will be staying again hopefully!\r'
 '<br/>')
'Awesome! Recommended.'
'LOVE IT!!!'
'Very comfortable, characterful apartment in a great location in Greenpoint!'
('Had a great stay arranged at the last minute.  Very responsive host, stylish '
 'apartment, and great location.')
('Great!  Maya is very kind and accommodating. Her place was perfect for our '
 'stay, very cozy and well situated.  ')
('Accurately described place, stylish and in a great location. Maya was '
 'responsive and friendly. We had a great stay and would hig

In [31]:
# counting starts at 0
cursor = listings.find( { 'reviews.29.comments' : { '$regex':  '^.*awesome.*$'    } } )
l = list(cursor)
print(len(l))
pprint.pp(l[0:5])

4
[{'_id': ObjectId('6658d7d82d77bbd68b58fae7'),
  'id': '1043046',
  'name': 'Rental unit in New York ¬∑ ‚òÖ4.90 ¬∑ Studio ¬∑ 1 bed ¬∑ 1 bath',
  'host_id': '2335804',
  'host_name': 'Lindsay',
  'neighbourhood_group': 'Manhattan',
  'neighbourhood': 'Chelsea',
  'latitude': 40.74118,
  'longitude': -74.00071,
  'room_type': 'Entire home/apt',
  'price': nan,
  'minimum_nights': 30,
  'number_of_reviews': 72,
  'last_review': datetime.datetime(2020, 3, 15, 0, 0),
  'reviews_per_month': 0.62,
  'calculated_host_listings_count': 1,
  'availability_365': 0,
  'number_of_reviews_ltm': 0,
  'license': '',
  'reviews': [{'listing_id': '1043046',
               'review_id': '341292751',
               'date': datetime.datetime(2018, 10, 26, 0, 0),
               'reviewer_id': '1618825',
               'reviewer_name': 'Leron',
               'comments': 'Great place, great location!'},
              {'listing_id': '1043046',
               'review_id': '65891831',
               'date': dat