### <span style=color:blue> Loading Listings & Reviews data from postgresql into local MongoDB    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- seems like a better name
sys.path.append('helper_functions/')
import util

In [2]:
# test that utils.py has been imported well
util.hello_world()

hello world


<span style=color:blue>Getting PostgreSQL connection set up</span>

In [3]:
# Load the env file 

dotenv_path = 'env_variables.env'
load_dotenv(dotenv_path=dotenv_path)

# Import the env variables

load_dotenv()

schema = os.getenv('DISC_6_SCHEMA')
port = os.getenv('DISC_6_PORT')
host = os.getenv('DISC_6_HOST')
database = os.getenv('DISC_6_DB')
password = os.getenv('DISC_6_PASSWORD')
connection = os.getenv('DISC_6_CONNECTION')

# Create the db engine 

db_eng = create_engine(f"postgresql+psycopg2://{connection}:{password}@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


<span style=color:blue>Getting mongodb connection set up</span>

In [4]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

<span style=color:blue>Setting up collection "listings" in mongodb</span>

In [5]:
# I have (or will have) a database "airbnb"
db = client.airbnb

# inside the "airbnb" database, I have (or will have) a collection "listings"
listings = db.listings
print(db.list_collection_names())
# I have some other collections in my airbnb database...

['testing', 'listings_previously_built', 'listings_test', 'calendar_by_agg', 'calendar_previously_built', 'listings']


### <span style=color:blue>As preparation for the next steps, I have a created table reviewm in my PostgreSQL using DBeaver, in which I dropped the comments_tsv column (because not needed), renamed column "id" to "review_id" (so that it is not repeating the "id" column of the listings table, and dropped the datetime column.</span>

<span style=color:blue>In the following I focus on the query q10, which fetches a left join based on all listing_ids with prefix '10'.  This is useful for doing testing.  For your assignment you should use the left join query that includes all listings.</span>

In [6]:
import importlib
import util
# using this in case I have added stuff to util.py
importlib.reload(util)

# some other queries I was experimenting with
# q = util.build_query_full_join_listings_reviewsm()
# q = util.build_query_left_join_listings_reviewsm_null_right()

q10 = util.build_query_left_join_listings_reviewsm_10()
q = util.build_query_left_join_listings_reviewsm()

print('We will be using the following queries, produced by functions I defined in util.py:\n')
print(q10)
print()
print(q)

We will be using the following queries, produced by functions I defined in util.py:

select *
from listings l left join reviewsm r 
        on l.id = r.listing_id
  where left(l.id,2) = '10'
-- this query fetches data for 3313 listings, useful for testing

select *
from listings l left join reviewsm r 
        on l.id = r.listing_id


In [10]:
with db_eng.connect() as conn:
    df_ljr10 = pd.read_sql(q10, con=conn)
    df_ljr = pd.read_sql(q, con=conn)


    

In [11]:
print(df_ljr10.head())
# print(df_ljr.head())

                    id                                               name  \
0  1007898112628596835         Rental unit in Bronx · 2 bedrooms · 1 bath   
1  1032550134459701382  Rental unit in Queens · ★4.70 · 1 bedroom · 1 ...   
2  1032550134459701382  Rental unit in Queens · ★4.70 · 1 bedroom · 1 ...   
3  1032550134459701382  Rental unit in Queens · ★4.70 · 1 bedroom · 1 ...   
4  1032550134459701382  Rental unit in Queens · ★4.70 · 1 bedroom · 1 ...   

     host_id host_name neighbourhood_group       neighbourhood   latitude  \
0  481526228   Michael               Bronx  Westchester Square  40.842243   
1  231138233     Filiz              Queens           Ridgewood  40.704277   
2  231138233     Filiz              Queens           Ridgewood  40.704277   
3  231138233     Filiz              Queens           Ridgewood  40.704277   
4  231138233     Filiz              Queens           Ridgewood  40.704277   

   longitude        room_type  price  ...  calculated_host_listings_count 

In [12]:
# print(df_ljr.shape)
# should be 998,310 rows in df_ljr.  This is
#     number of records in listings whose id do not show up in reviews['listing_id'] =  11,500
#   + number of reviews                                                              = 986,810

print(df_ljr10.shape)
# you might want to check this number against what you expect based on what exploration
#    you do with DBeaver
print(df_ljr.shape)

(25103, 24)
(998310, 24)


### <span style=color:blue>The left outer join has between 0 and many records for each listing_id.  There is one record for each review about that listing.  We will now re-format this data into a list of dictionaries.  Each dictionary will have the data for one listing along with a list of all of the associated reviews. </span>

In [13]:
# cols = df_ljr10.columns.tolist()
cols = df_ljr.columns.tolist()
print(cols)

['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license', 'listing_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']


<span style=color:blue>As a first step, we build a list of dictionaries with just the listing data.  To do this we use pandas to create a new dataframe with the reviews-related columns dropped</span>

In [14]:
# to do a projection and remove duplicates
cols_of_listings = ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 
                    'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 
                    'minimum_nights', 'number_of_reviews', 'last_review', 
                    'reviews_per_month', 'calculated_host_listings_count', 
                    'availability_365', 'number_of_reviews_ltm', 'license']
cols_of_reviews = ['listing_id', 'review_id', 'date', 'reviewer_id', 
                   'reviewer_name', 'comments']

# df_ljr10_new = df_ljr10.drop(cols_of_reviews, axis=1).drop_duplicates()
df_ljr_new = df_ljr.drop(cols_of_reviews, axis=1).drop_duplicates()


# print(df_ljr10_new.shape)
print(df_ljr_new.head(10))

# print(df_ljr_new.iloc[13870])

                    id                                               name  \
0             51944693  Home in Queens · ★4.82 · 1 bedroom · 5 beds · ...   
8             52966412  Home in Queens · ★4.93 · 1 bedroom · 2 beds · ...   
9              4365276  Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...   
10            51620506  Rental unit in Queens · ★4.75 · 1 bedroom · 1 ...   
12            52013393  Condo in Queens · ★4.82 · 1 bedroom · 1 bed · ...   
14  761666718123905227  Home in Queens · ★5.0 · 1 bedroom · 1 bed · 1 ...   
15  689043579053364669  Rental unit in Queens · ★4.87 · 1 bedroom · 1 ...   
17            10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...   
21  760316567719628232  Rental unit in Queens · ★4.82 · 2 bedrooms · 3...   
25  709364398556175620  Home in Queens · ★5.0 · 1 bedroom · 1 bed · 1 ...   

      host_id host_name neighbourhood_group  neighbourhood   latitude  \
0    91646104       Pao              Queens       Woodside  40.743950   
8   39

<span style=color:blue>Converting the dataframe into a list of dictionaries     </span>

In [15]:
# dict_ljr10_new = df_ljr10_new.to_dict('records')
# print(len(dict_ljr10_new))
# pprint.pp(dict_ljr10_new[0])

dict_ljr_new = df_ljr_new.to_dict('records')
print(len(dict_ljr_new))
pprint.pp(dict_ljr_new[0])

39202
{'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.date(2023, 9, 24),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': ''}


<span style=color:blue>Let's try loading what we have so far into MongoDB, into a temporary collection     </span>

In [16]:
# testing with a new, temporary collection
listings_test = db.listings_test

try:
    # result = listings_test.insert_many(dict_ljr10_new)
    result = listings_test.insert_many(dict_ljr_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('There was an error when loading the dictionary into MongoDB:')
    print(e)



There was an error when loading the dictionary into MongoDB:
cannot encode object: datetime.date(2023, 9, 24), of type: <class 'datetime.date'>


<span style=color:blue>MongoDB does not handle dates, only datetimes.  Here is a function to convert the dates into datetimes.  (An alternative would have been to convert the dates in our table reviewsm into datetimes.)

In [17]:
# This converts date to datetime.  It also converts various kinds of
#     null values into None, which loads into MongoDB without creating errors
def convert_date_to_datetime(dt):
    if pd.isnull(dt):           # tests whether dt is None, NaN, or DaT (not a date)
        return None
    elif type(dt) == pd._libs.tslibs.nattype.NaTType:  # including this, but see below
        return None
    else:
        temp = datetime(dt.year, dt.month, dt.day)
        ts = temp.timestamp()
        new_dt = datetime.fromtimestamp(ts)
        return new_dt

# testing various cases:
# Here are four dictionaries to test with
dict1 = {'foo':1, 'date': date(2023,1,2)}
dict2 = {'goo':2, 'date': math.nan}
dict3 = {'hoo':3, 'date': None}
dict4 = {'koo':4, 'date': pd.NaT}

if pd.isnull(dict3['date']):        # pd.isnull tests whether something is 
    print("dict4['date'] tested positive as NaT")    
else:
    print("dict4['date'] did not test positive as NaT")
    

print(dict1)
dict1['date'] = convert_date_to_datetime(dict1['date'])
print(dict1)

print()
print(dict2)
dict2['date'] = convert_date_to_datetime(dict2['date'])
print(dict2)

print()
print(dict3)
dict3['date'] = convert_date_to_datetime(dict3['date'])
print(dict3)

print()
print(dict4)
dict4['date'] = convert_date_to_datetime(dict4['date'])
print(dict4)

dict4['date'] tested positive as NaT
{'foo': 1, 'date': datetime.date(2023, 1, 2)}
{'foo': 1, 'date': datetime.datetime(2023, 1, 2, 0, 0)}

{'goo': 2, 'date': nan}
{'goo': 2, 'date': None}

{'hoo': 3, 'date': None}
{'hoo': 3, 'date': None}

{'koo': 4, 'date': NaT}
{'koo': 4, 'date': None}


<span style=color:blue>Use pandas to replace the dates in the "last_review" column with datetimes</span>

In [19]:
# trying to replace all dates by datetimes (or None)

# df_ljr10_new['last_review'] = df_ljr10_new['last_review'].apply(convert_date_to_datetime)

# could also have written
# df_ljr10_new['last_review'] = df_ljr10_new['last_review'].apply(lambda x: convert_date_to_datetime(x))
df_ljr_new['last_review'] = df_ljr_new['last_review'].apply(lambda x: convert_date_to_datetime(x))

In [41]:
# print(df_ljr10_new.head())
print(df_ljr_new.shape)
print()
print(df_ljr_new.head())

(39202, 18)

          id                                               name    host_id  \
0   51944693  Home in Queens · ★4.82 · 1 bedroom · 5 beds · ...   91646104   
8   52966412  Home in Queens · ★4.93 · 1 bedroom · 2 beds · ...  396969684   
9    4365276  Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...    3081990   
10  51620506  Rental unit in Queens · ★4.75 · 1 bedroom · 1 ...  239139334   
12  52013393  Condo in Queens · ★4.82 · 1 bedroom · 1 bed · ...  420148935   

   host_name neighbourhood_group  neighbourhood  latitude  longitude  \
0        Pao              Queens       Woodside  40.74395  -73.90858   
8   Tafazzul              Queens  East Elmhurst  40.76668  -73.86583   
9     Amanda            Brooklyn   Williamsburg  40.70832  -73.95588   
10     Helen              Queens       Flushing  40.76350  -73.82608   
12    Nusrat              Queens        Jamaica  40.67344  -73.78200   

          room_type  price  minimum_nights  number_of_reviews last_review  \
0   Enti

In [42]:
# When the previous cell is run on df_ljr10_new, the first few entries included
#   NaT values, in spite of the special case included in
#   the function convert_time_to_timestamp()
#   BTW, curiously, on very small dataframes the convert_time_to_timestamp() does convert NaT to None

# Happily, all of the actual dates have converted into datetimes, as illustrated by the following:
#    Using "iloc" because the index values in df_ljr10_new are not consecutive
"""
print(type(df_ljr10_new.iloc[0, 12]))  # 12 is position of 'last_review'
print(df_ljr10_new.iloc[0,12])
print(type(df_ljr10_new.iloc[1, 12]))  
print(df_ljr10_new.iloc[1,12])
print(type(df_ljr10_new.iloc[2, 12]))  
print(df_ljr10_new.iloc[2,12])
print(type(df_ljr10_new.iloc[3, 12]))  
print(df_ljr10_new.iloc[3,12])
"""

df_ljr_NaT = df_ljr_new.loc[df_ljr_new['last_review'] != df_ljr_new['last_review']]
print(df_ljr_NaT.shape)

print()

print(type(df_ljr_new.iloc[0, 12]))  # 12 is position of 'last_review'
print(df_ljr_new.iloc[0,12])
print(type(df_ljr_new.iloc[1, 12]))  
print(df_ljr_new.iloc[1,12])

print()
print(type(df_ljr_NaT.iloc[0, 12]))  # 12 is position of 'last_review'
print(df_ljr_NaT.iloc[0,12])
print(type(df_ljr_NaT.iloc[1, 12]))  
print(df_ljr_NaT.iloc[1,12])

print()
print(df_ljr_NaT.head())

(11500, 18)

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-09-24 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-11-25 00:00:00

<class 'pandas._libs.tslibs.nattype.NaTType'>
NaT
<class 'pandas._libs.tslibs.nattype.NaTType'>
NaT

                         id  \
519326  1009467369984606523   
519327             34890712   
519328             30101694   
519329   899029977125048428   
519330   852516497115442763   

                                                     name    host_id  \
519326  Rental unit in New York · 2 bedrooms · 1 bed ·...  305240193   
519327  Home in Brooklyn · 1 bedroom · 1 bed · 2 share...  192890383   
519328     Condo in New York · 1 bedroom · 1 bed · 1 bath    4750332   
519329  Home in Queens · 1 bedroom · 1 bed · 1 shared ...   19303369   
519330  Rental unit in Brooklyn · 5 bedrooms · 1 bed ·...    3223938   

       host_name neighbourhood_group   neighbourhood   latitude  longitude  \
519326      June           Manhattan        G

In [51]:
"""
# recomputing dict_ljr10_new
dict_ljr10_new = df_ljr10_new.to_dict('records')
print(len(dict_ljr10_new))
pprint.pp(dict_ljr10_new[0:2])
"""

# recomputing dict_ljr10_new
dict_ljr_new = df_ljr_new.to_dict('records')
print(len(dict_ljr_new))

# pprint.pp(dict_ljr_new[0:2])

flag = True
for i in range(0,len(dict_ljr_new)):
    if flag and type(dict_ljr_new[i]['last_review']) == pd._libs.tslibs.nattype.NaTType:    # testing for a null value
        print('\nAt position', i, 'the doc in dict_ljr_new is:')
        print()
        pprint.pp(dict_ljr_new[i])
        flag = False
    if not flag:
        exit

if flag:
    print('No records were found in dict_ljr_new with "last_review" holding a NaT value')
        


39202

At position 13870 the doc in dict_ljr_new is:

{'id': '1009467369984606523',
 'name': 'Rental unit in New York · 2 bedrooms · 1 bed · 0 shared baths',
 'host_id': '305240193',
 'host_name': 'June',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'Gramercy',
 'latitude': 40.736534,
 'longitude': -73.98062,
 'room_type': 'Private room',
 'price': 113.0,
 'minimum_nights': 30,
 'number_of_reviews': 0,
 'last_review': NaT,
 'reviews_per_month': nan,
 'calculated_host_listings_count': 333,
 'availability_365': 342,
 'number_of_reviews_ltm': 0,
 'license': ''}


In [52]:
# However, the load into MongoDB still fails, because of the NaT values
#    As noted above, the convert_time_to_timestamp did not convert the NaT values
try:
    # result = listings_test.insert_many(dict_ljr10_new)
    result = listings_test.insert_many(dict_ljr_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('\nThere was an error when loading the dictionary into MongoDB:')
    print(e)


There was an error when loading the dictionary into MongoDB:
NaTType does not support utcoffset


<span style=color:blue>OK, so let's convert the NaT's in the dictionary rather than in pandas  </span>

In [54]:
"""
for doc in dict_ljr10_new:
    if pd.isnull(doc['last_review']): 
        doc['last_review'] = None

pprint.pp(dict_ljr10_new[0:10])
"""

for doc in dict_ljr_new:
    if pd.isnull(doc['last_review']): 
        doc['last_review'] = None

# pprint.pp(dict_ljr_new[0:10])

flag = True
for i in range(0,len(dict_ljr_new)):
    if flag and type(dict_ljr_new[i]['last_review']) == pd._libs.tslibs.nattype.NaTType:    # testing for a null value
        print('\nAt position', i, 'the doc in dict_ljr_new is:')
        print()
        pprint.pp(dict_ljr_new[i])
        flag = False
    if not flag:
        exit

if flag:
    print('No records were found in dict_ljr_new with "last_review" holding a NaT value')
        


No records were found in dict_ljr_new with "last_review" holding a NaT value


<span style=color:blue>Now trying the load again    </span>

In [56]:
try:
    # result = listings_test.insert_many(dict_ljr10_new)
    result = listings_test.insert_many(dict_ljr_new)
    print('\nLast element of result for the last run was:')
    print(result.inserted_ids[-1:])
except Exception as e:
    print('\nThere was an error when loading the dictionary into MongoDB:')
    print(e)


Last element of result for the last run was:
[ObjectId('665b4bfc545e9d9c2b03e646')]


<span style=color:blue>Now we add, for each listing, a list of all reviews for that listing     </span>

In [60]:
i = 0

# We will keep track of the time to do each 1000 listings
time0 = datetime.now()
time1 = datetime.now()


# for d in dict_ljr10_new:
for d in dict_ljr_new:
    i += 1

    # building a df with just reviews info, and corresponding to the listing we are focusing on
    # df_reviews_one_listing = df_ljr10.loc[df_ljr10['id'] == d['id']].drop(cols_of_listings, axis=1)
    df_reviews_one_listing = df_ljr.loc[df_ljr['id'] == d['id']].drop(cols_of_listings, axis=1)

    # Note: This does not run super quickly.  As an alternative I tried pulling this 
    #    data with a query against PostgreSQL, but it was even slower

    # there are no null values in the 'date' column of reviews, so we can do the
    #    date to datetime conversion using pandas
    df_reviews_one_listing['date'] = df_reviews_one_listing['date'].apply(lambda x: convert_date_to_datetime(x))

    dicts_reviews_one_listing = df_reviews_one_listing.to_dict('records')

    # Need special handling for the case of no reviews 
    if len(dicts_reviews_one_listing) == 1 and dicts_reviews_one_listing[0]['review_id'] is None:
        d['reviews'] = {}
    else:
        d['reviews'] = dicts_reviews_one_listing

    if i % 1000 == 0:
        time2 = datetime.now()
        time_taken = util.time_diff(time1,time2)
        print('Have now completed step number:', str(i), 'and it took', str(time_taken), 'seconds' )
        time1 = datetime.now()

    # given the time it takes to do 1000 listings, how long will it take to do all of the listings?

time3 = datetime.now()
full_time_taken = util.time_diff(time0,time3)
print(f'\nThe total time taken was {full_time_taken}.')
# about 4445 seconds = about 75 minutes


print()
# print(len(dict_ljr10_new))
print(len(dict_ljr_new))
print()
# pprint.pp(dict_ljr10_new[-10:])
pprint.pp(dict_ljr_new[-3:])

Have now completed step number: 1000 and it took 107.689356 seconds
Have now completed step number: 2000 and it took 111.258282 seconds
Have now completed step number: 3000 and it took 112.244499 seconds
Have now completed step number: 4000 and it took 110.543519 seconds
Have now completed step number: 5000 and it took 109.433707 seconds
Have now completed step number: 6000 and it took 109.461624 seconds
Have now completed step number: 7000 and it took 110.533374 seconds
Have now completed step number: 8000 and it took 114.208683 seconds
Have now completed step number: 9000 and it took 108.953158 seconds
Have now completed step number: 10000 and it took 121.684653 seconds
Have now completed step number: 11000 and it took 118.579797 seconds
Have now completed step number: 12000 and it took 110.965727 seconds
Have now completed step number: 13000 and it took 106.239165 seconds
Have now completed step number: 14000 and it took 118.120527 seconds
Have now completed step number: 15000 and i

<span style=color:blue>Sanity check, that we did not lose any listings, and checking a few listings </span>

In [66]:
print()
# print(len(dict_ljr10_new))
print(len(dict_ljr_new))
print()
pprint.pp(dict_ljr_new[0:3])
print()
pprint.pp(dict_ljr_new[-3:])


39202

[{'id': '51944693',
  'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
  'host_id': '91646104',
  'host_name': 'Pao',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'Woodside',
  'latitude': 40.74395,
  'longitude': -73.90858,
  'room_type': 'Entire home/apt',
  'price': 294.0,
  'minimum_nights': 30,
  'number_of_reviews': 57,
  'last_review': Timestamp('2023-09-24 00:00:00'),
  'reviews_per_month': 1.98,
  'calculated_host_listings_count': 4,
  'availability_365': 89,
  'number_of_reviews_ltm': 23,
  'license': '',
  '_id': ObjectId('665b4bfc545e9d9c2b034d25'),
  'reviews': [{'listing_id': '51944693',
               'review_id': '883354811516703393',
               'date': Timestamp('2023-05-03 00:00:00'),
               'reviewer_id': '78568329',
               'reviewer_name': 'Laaziz',
               'comments': 'Hôte très réactif et avenant. Logement très bien '
                           'desservi, idéal pour visiter New York. Toutefois '
          

<span style=color:blue>Now loading dict_ljr_new into mongodb.   </span>

<span style=color:blue>The loading is done 100 documents at a time, with a last small lot </span>

In [67]:
# print(len(dict_ljr10_new) % 100)
# print(len(dict_ljr10_new))

print(len(dict_ljr_new) % 1000)
print(len(dict_ljr_new))

202
39202


In [65]:
# CAUTION: the first step here erases db.listing
#    I have kept this here during testing
db.listings.drop()


listings = db.listings

time0 = datetime.now()
time1 = datetime.now()

# for i in range(0,len(dict_ljr10_new) // 1000):
for i in range(0,len(dict_ljr_new) // 1000):
    # result = listings.insert_many(dict_ljr10_new[1000*i:1000*(i+1)])
    result = listings.insert_many(dict_ljr_new[1000*i:1000*(i+1)])

    time2 = datetime.now()
    time_taken = util.time_diff(time1,time2)
    print(f'Have now completed step number: {i} (with 1000 entries each) and it took {str(time_taken)} seconds' )
    time1 = datetime.now()
    
time3 = datetime.now()


print('\nThe last ObjectID in the collection is:')
print(result.inserted_ids[-1:])

# print('\nThe time to do the load of 39K documents into local mongodb, with a total of about 300MB was:')
print('\nThe time for this run was:')
print(util.time_diff(time0,time3))



# this is for the remaining records in dict_ljr_new, but built for arbitrary number of records
# result = listings.insert_many(dict_ljr10_new[3300:])
result = listings.insert_many(dict_ljr_new[(len(dict_ljr_new) // 1000) * 1000:])


print('\nThe total number of documents in the collection db.listings is now:')
print(listings.count_documents({}))

print('\nLast few ObjectIds of result for the last run was:')
print(result.inserted_ids[-5:])

print('\nThe last few documents of result for the last run was:')
# Curiously, the next line fetches only the object ids, not the full documents
# out = listings.find({'_id' : {'$in' : result.inserted_ids[-5:]} } )
# pprint.pp(out)
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(listings.find_one({ '_id': o}))
pprint.pp(outdocs)

Have now completed step number: 0 (with 1000 entries each) and it took 0.897935 seconds
Have now completed step number: 1 (with 1000 entries each) and it took 0.686927 seconds
Have now completed step number: 2 (with 1000 entries each) and it took 0.654771 seconds
Have now completed step number: 3 (with 1000 entries each) and it took 0.667428 seconds
Have now completed step number: 4 (with 1000 entries each) and it took 0.647702 seconds
Have now completed step number: 5 (with 1000 entries each) and it took 0.45687 seconds
Have now completed step number: 6 (with 1000 entries each) and it took 0.237932 seconds
Have now completed step number: 7 (with 1000 entries each) and it took 0.215479 seconds
Have now completed step number: 8 (with 1000 entries each) and it took 0.199625 seconds
Have now completed step number: 9 (with 1000 entries each) and it took 0.1984 seconds
Have now completed step number: 10 (with 1000 entries each) and it took 0.201246 seconds
Have now completed step number: 11

<span style=color:blue>Sanity check on size of the collection listings   </span>

In [68]:
print(db.listings.count_documents({}))

39202


<span style=color:blue>Here is a query testing against the 'last_review' values    </span>

In [69]:
cursor = listings.find( { 'last_review' : { '$lte' : datetime(2024,1,1,0,0,0,0)}})
l = list(cursor)
print(len(l))
# pprint.pp(l)
# BTW, this matches answer if using PostgreSQL and the listings table

24534


<span style=color:blue>Interestingly, you cannot write the dictionary we created out to a json file...     </span>

In [70]:
def write_dict_to_json(dict, filename):
    with open(filename, 'w') as fp:
        json.dump(dict, fp)

try:
    filename = 'listings_with_reviews_embedded__v01.json'
    write_dict_to_json(dict_ljr10_new, filename)
except Exception as e:
    print('\nThere was an error, as follows:')
    print(e)
    print()

# There are some suggestions at
#   https://stackoverflow.com/questions/50404559/python-error-typeerror-object-of-type-timestamp-is-not-json-serializable



There was an error, as follows:
name 'dict_ljr10_new' is not defined



<span style=color:blue>Query 1: How many listings have last_review between February 1, 2021, and March 15, 2023, inclusive </span>

<span style=color:blue>Note: to include March 15, 2023, you could use < 2023-03-16 00:00:00.  However, also OK to use <= 2023-03-15 00:00:00, because the dates are converted to datetimes with the 00:00:00 extension.</span>


In [74]:
cursor = listings.find( { '$and' : [{ 'last_review' : {'$gte' : datetime(2021,2,1,0,0,0,0)}},
                                    { 'last_review' : {'$lt' : datetime(2023,3,16,0,0,0,0)}}
                                    ]
                         })
                                        
l = list(cursor)
print(len(l))

3685


<span style=color:blue>Query 2: How many listings have an array of reviews with length at least 50? </span>

<span style=color:blue>Can take inspiration from https://stackoverflow.com/questions/41918605/mongodb-find-array-length-greater-than-specified-size</span>

In [75]:
# counting starts at 0
cursor = listings.find( { 'reviews.49' : { '$exists': True} } )
l = list(cursor)
print(len(l))

5658


<span style=color:blue> Query 3: Output is the number of listings that have a review containing the word "awesome" (case sensitive) OR a review containing the word "amazing" (case sensitive).  </span>

<span style=color:blue> Query 4: Output is the number of listings that have a review containing the word "awesome" (case insensitive) OR a review containing the word "amazing" (case insensitive).  </span>


In [77]:
# query 3
cursor = listings.find( { '$or' : [ {'reviews.comments' : { '$regex':  '^.*awesome.*$'  } } ,
                                    {'reviews.comments' :  { '$regex':  '^.*amazing.*$'    } }
                                  ]
                        }
                      )
l = list(cursor)
print(len(l))
i = 0
for d in l[0:5]:
    i += 1
    print('\nReviews for listing number', i)
    for r in d['reviews']:
        pprint.pp(r['comments'])

15890

Reviews for listing number 1
('Hôte très réactif et avenant. Logement très bien desservi, idéal pour '
 'visiter New York. Toutefois il ne faut pas avoir peur du bruit (proximité de '
 'la ligne de métro aérienne). Odeur persistante dans l’escalier de l’entrée. '
 '<br/>Arrivée et départ autonome appréciable. Quartier calme et sécurisé')
('Nice place, comfy living room with friends. Pao is a great responsive host. '
 'We forget AirPod and he wait to return for us. Loudly with nearby high '
 'train. Overall is good for friend trip.')
('Pao’s place is a great place to stay- very close to the metro to take you '
 'into Manhattan. Note, the beds are mattresses on the floor and all in one '
 'shared space, and there is only one bathroom. Pao was very responsible '
 'whenever we needed anything.')
('It was such a hassle free stay! We felt at welcomed even before we arrived! '
 'Pao sent messages about our stay right after we booked. Check in and out '
 'were seemless! The place was cl

In [76]:
# query 4
cursor = listings.find( { '$or' : [ {'reviews.comments' : { '$regex':  '^.*awesome.*$' , '$options': 'i'  } } ,
                                    {'reviews.comments' :  { '$regex':  '^.*amazing.*$' , '$options': 'i'    } }
                                  ]
                        }
                      )
l = list(cursor)
print(len(l))
i = 0
for d in l[0:5]:
    i += 1
    print('\nReviews for listing number', i)
    for r in d['reviews']:
        pprint.pp(r['comments'])

17121

Reviews for listing number 1
('Hôte très réactif et avenant. Logement très bien desservi, idéal pour '
 'visiter New York. Toutefois il ne faut pas avoir peur du bruit (proximité de '
 'la ligne de métro aérienne). Odeur persistante dans l’escalier de l’entrée. '
 '<br/>Arrivée et départ autonome appréciable. Quartier calme et sécurisé')
('Nice place, comfy living room with friends. Pao is a great responsive host. '
 'We forget AirPod and he wait to return for us. Loudly with nearby high '
 'train. Overall is good for friend trip.')
('Pao’s place is a great place to stay- very close to the metro to take you '
 'into Manhattan. Note, the beds are mattresses on the floor and all in one '
 'shared space, and there is only one bathroom. Pao was very responsible '
 'whenever we needed anything.')
('It was such a hassle free stay! We felt at welcomed even before we arrived! '
 'Pao sent messages about our stay right after we booked. Check in and out '
 'were seemless! The place was cl

In [31]:
# counting starts at 0
cursor = listings.find( { 'reviews.29.comments' : { '$regex':  '^.*awesome.*$'    } } )
l = list(cursor)
print(len(l))
pprint.pp(l[0:5])

4
[{'_id': ObjectId('6658d7d82d77bbd68b58fae7'),
  'id': '1043046',
  'name': 'Rental unit in New York · ★4.90 · Studio · 1 bed · 1 bath',
  'host_id': '2335804',
  'host_name': 'Lindsay',
  'neighbourhood_group': 'Manhattan',
  'neighbourhood': 'Chelsea',
  'latitude': 40.74118,
  'longitude': -74.00071,
  'room_type': 'Entire home/apt',
  'price': nan,
  'minimum_nights': 30,
  'number_of_reviews': 72,
  'last_review': datetime.datetime(2020, 3, 15, 0, 0),
  'reviews_per_month': 0.62,
  'calculated_host_listings_count': 1,
  'availability_365': 0,
  'number_of_reviews_ltm': 0,
  'license': '',
  'reviews': [{'listing_id': '1043046',
               'review_id': '341292751',
               'date': datetime.datetime(2018, 10, 26, 0, 0),
               'reviewer_id': '1618825',
               'reviewer_name': 'Leron',
               'comments': 'Great place, great location!'},
              {'listing_id': '1043046',
               'review_id': '65891831',
               'date': datetime.