### <span style=color:blue> Joining listings_with_reviews and listings_with_calendar_dates    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- seems like a better name
sys.path.append('helper_functions/')
import util

In [2]:
# test that utils.py has been imported well
util.hello_world()

hello world


<span style=color:blue>Getting mongodb connection set up</span>

In [3]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

# I already have a database "airbnb"
db = client.airbnb

# checking collections in airbnb 
print(db.list_collection_names())

['listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'reviews_3', 'listings_3', 'listings_with_reviews', 'calendar_previously_built', 'listings', 'listings_with_reviews_m_3']


In [5]:
db.calendar_by_ag.drop()
print(db.list_collection_names())

['listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'reviews_3', 'listings_3', 'listings_with_reviews', 'calendar_previously_built', 'listings', 'listings_with_reviews_m_3']


In [6]:
print(f'Size of listings_with_reviews_m_3 is {db.listings_with_reviews_m_3.count_documents({})}.')
print(f'Size of listings_with_calendar is {db.listings_with_calendar.count_documents({})}.')

Size of listings_with_reviews_m_3 is 39202.
Size of listings_with_calendar is 39201.


In [8]:
pprint.pp(db.listings_with_reviews_m_3.find_one())

{'_id': ObjectId('665e91ad81a877ddad1a7549'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1981a877ddad19ca7b'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': datetime.datetime(2024, 1, 3, 0, 0),
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place

In [9]:
pprint.pp(db.listings_with_calendar.find_one())

{'_id': '10000070',
 'average_price': 85.0,
 'first_available_date': datetime.datetime(2024, 2, 6, 0, 0),
 'last_available_date': datetime.datetime(2025, 2, 4, 0, 0),
 'dates_list': [{'date': datetime.datetime(2024, 2, 6, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 7, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 8, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 9, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30

In [7]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$out' : 'listings_test1' }
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())

The time taken for this operation was 40.434621 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test1', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
  

In [8]:
pprint.pp(db.listings_test1.find_one({'id': '35384734'}))

{'_id': ObjectId('664be01238b2bd10b477193c'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'listing_id': '35384734',
              'review_id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of Chelsea '
                          '.<br/>Its a coz

In [9]:
# In the $unwind, using the option << "preserveNullAndEmptyArrays": true >>
#    so that we don't drop the listing with id = '35384734', which has empty array for cal_docs
# In general, if you do an $unwind, it converts a left outer join into a left (full) join,
#    because it removes documents that came from the left side, but have no matching records
#    from the right side.  By including the << "preserveNullAndEmptyArrays": true >> option,
#    you preserve the left join aspect of the $lookup
# This is following https://stackoverflow.com/questions/36725519/how-to-solve-empty-array-with-unwind

db.listings_test2.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$out' : 'listings_test2'}
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""
time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test2.count_documents({})
print(f'\nThe number of documents in listings_test2 is {size}.')

print()
pprint.pp(db.listings_test2.find_one())

The time taken for this operation was 36.539288 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test1', 'listings_test2', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test2 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '88335

In [10]:
pprint.pp(db.listings_test2.find_one({'id': '35384734'}))

{'_id': ObjectId('664be01238b2bd10b477193c'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'listing_id': '35384734',
              'review_id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of Chelsea '
                          '.<br/>Its a coz

In [11]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$out' : 'listings_test1'}
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())



The time taken for this operation was 64.452248 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test2', 'listings_with_reviews', 'listings_test1', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '88335

In [12]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$unset': 'cal_docs' }      
]
"""

    { '$out' : 'listings_with_reviews_and_cal'}
"""
"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())


The time taken for this operation was 0.193244 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test2', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 0.

None


In [10]:
db.listings_with_reviews_and_cal.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$unset': 'cal_docs' },     
    { '$out' : 'listings_with_reviews_and_cal'}
]

time1 = datetime.now()
result = db.listings_with_reviews_m_3.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')
# about 50 seconds, plus or minus

print()
print(db.list_collection_names())

size = db.listings_with_reviews_and_cal.count_documents({})
print(f'\nThe number of documents in listings_with_reviews_and_cal is {size}.')

print()
pprint.pp(db.listings_with_reviews_and_cal.find_one())


The time taken for this operation was 50.519691 seconds.

['calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'reviews_3', 'listings_3', 'listings_with_reviews', 'calendar_previously_built', 'listings', 'listings_with_reviews_m_3', 'listings_with_reviews_and_cal']

The number of documents in listings_with_reviews_and_cal is 39202.

{'_id': ObjectId('665e91ad81a877ddad1a7549'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1981a877ddad19ca7b

In [11]:
# the one listing that is not in listings_with_calendar
pprint.pp(db.listings_with_reviews_and_cal.find_one({'id': '35384734'}))

{'_id': ObjectId('665e91ad81a877ddad1b00ad'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1781a877ddad14062e'),
              'listing_id': '35384734',
              'id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of 

In [12]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^3538473.*$'}})
print(len(list(result)))

1


In [13]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^35384...$'}})
print(len(list(result)))

2


In [14]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^35384...$'}})
for doc in result:
    pprint.pp(doc)

{'_id': ObjectId('665e91ad81a877ddad1ad4ae'),
 'id': '35384123',
 'name': 'Home in Queens · ★4.78 · 1 bedroom · 1 bed · 1 shared bath',
 'host_id': '266360944',
 'host_name': 'Olga',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Springfield Gardens',
 'latitude': 40.6617,
 'longitude': -73.76261,
 'room_type': 'Private room',
 'price': 70.0,
 'minimum_nights': 30,
 'number_of_reviews': 96,
 'last_review': datetime.datetime(2023, 9, 3, 0, 0),
 'reviews_per_month': 1.7,
 'calculated_host_listings_count': 1,
 'availability_365': 89,
 'number_of_reviews_ltm': 17,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1781a877ddad1405b5'),
              'listing_id': '35384123',
              'id': '470846731',
              'date': datetime.datetime(2019, 6, 16, 0, 0),
              'reviewer_id': '12952513',
              'reviewer_name': 'Wendy',
              'comments': 'I was the first guest at Olga and her mom’s Airbnb. '
                          'A very sweet encounter! All ac

In [15]:
print(db.list_collection_names())

['calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'reviews_3', 'listings_3', 'listings_with_reviews', 'calendar_previously_built', 'listings', 'listings_with_reviews_m_3', 'listings_with_reviews_and_cal']


In [16]:
db.listings_test1.drop()
db.listings_test2.drop()
print(db.listings_test.count_documents({}))
print(db.list_collection_names())

0
['calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'reviews_3', 'listings_3', 'listings_with_reviews', 'calendar_previously_built', 'listings', 'listings_with_reviews_m_3', 'listings_with_reviews_and_cal']


### <span style=color:blue>Producing json output    </span>

In [17]:
size = db.listings_with_reviews_and_cal.count_documents({})
print(f'\nThe number of documents in listings_with_reviews_and_cal is {size}.')

print()
doc = db.listings_with_reviews_and_cal.find_one({})
pprint.pp(doc)


The number of documents in listings_with_reviews_and_cal is 39202.

{'_id': ObjectId('665e91ad81a877ddad1a7549'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1981a877ddad19ca7b'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': datetime.datetime(2024, 1, 3, 0, 0),
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you ge

In [19]:
# this function converts MongoDB docs in listings_with_reviews_and_cal into json storable
def convert_lwrc_to_json(doc):
    doc_new = {}
    # start by transferring all scalar keys over, then fix some of them
    for key in doc.keys():
        if key not in ['reviews', 'dates_list']:
            doc_new[key] = doc[key]
    # now fixing some possible issues
    doc_new['_id'] = str(doc['_id'])
    if doc['last_review'] == None:    # is null
        doc_new['last_review'] = None
    else:
        doc_new['last_review'] = doc['last_review'].strftime('%Y-%m-%d')
    if math.isnan(doc['price']):
        doc_new['price'] = None
    else:
        doc_new['price'] = doc['price']
    if math.isnan(doc['reviews_per_month']):
        doc_new['reviews_per_month'] = None
    else:
        doc_new['reviews_per_month'] = doc['reviews_per_month']
    # there is one document in the merger that has no calendar entries
    if 'first_available_date' in doc:
        if doc['first_available_date'] == None:
            doc_new['first_available_date'] = None
        else:
            doc_new['first_available_date'] = doc['first_available_date'].strftime('%Y-%m-%d')
    if 'last_available_date' in doc:
        if doc['last_available_date'] == None:
            doc_new['last_available_date'] = None
        else:
            doc_new['last_available_date'] = doc['last_available_date'].strftime('%Y-%m-%d')
    if 'average_price' in doc:
        doc_new['average_price'] = doc['average_price']
    
    # now dealing with the 'reviews' array
    rlist = []
    for r in doc['reviews']:
        r_new = {}
        r_new['_id'] = str(r['_id'])
        r_new['date'] = r['date'].strftime('%Y-%m-%d')
        for key in r.keys():
            # I am cheating, and will rename the 'listing_id' column back to 'id'
            if key not in ['date', '_id']:
                r_new[key] = r[key]
        rlist.append(r_new)
    doc_new['reviews'] = rlist

    # now dealing with the 'dates_list' array
    dlist = []
    for d in doc['dates_list']:
        d_new = {}
        d_new['date'] = d['date'].strftime('%Y-%m-%d')
        for key in ['price', 'minimum_nights', 'maximum_nights', 'available']:
            d_new[key] = d[key]
        dlist.append(d_new)
    doc_new['dates_list'] = dlist
    
    return doc_new

# pprint.pp(doc)

pprint.pp(convert_lwrc_to_json(doc))

{'_id': '665e91ad81a877ddad1a7549',
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'average_price': 30.0,
 'first_available_date': '2024-02-06',
 'last_available_date': '2025-02-04',
 'reviews': [{'_id': '665e7a1981a877ddad19ca7b',
              'date': '2024-01-03',
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
            

In [20]:
print(db.listings_with_reviews_and_cal.count_documents({}))

cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^111.*$'}})
    
l = list(cursor)
print(len(l))

39202
28


In [22]:
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^111.*$'}})

output = []

for doc in cursor:
    output.append(convert_lwrc_to_json(doc))

print(len(output))

28


In [23]:
dir = 'OUTPUTS'
filename = 'listings_with_reviews_and_cal_subset_111__v03.json'
util.write_dict_to_dir_json(output, dir, filename)

In [24]:
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^1000.*$'}})

output = []

for doc in cursor:
    output.append(convert_lwrc_to_json(doc))

print(len(output))

43


In [25]:
dir = 'OUTPUTS'
filename = 'listings_with_reviews_and_cal_subset_1000__v03.json'
util.write_dict_to_dir_json(output, dir, filename)