### <span style=color:blue> Joining listings_with_reviews and listings_with_calendar_dates    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- seems like a better name
sys.path.append('helper_functions/')
import util

In [2]:
# test that utils.py has been imported well
util.hello_world()

hello world


<span style=color:blue>Getting mongodb connection set up</span>

In [3]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

# I already have a database "airbnb"
db = client.airbnb

# checking collections in airbnb 
print(db.list_collection_names())

['calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'listings_with_reviews', 'listings_with_reviews_and_cal', 'calendar_previously_built', 'listings']


In [4]:
print(f'Size of listings_with_reviews is {db.listings_previously_built.count_documents({})}.')
print(f'Size of listings_with_calendar is {db.listings_with_calendar.count_documents({})}.')
print(f'Size of listings_with_calendar is {db.calendar_by_agg.count_documents({})}.')

Size of listings_with_reviews is 39202.
Size of listings_with_calendar is 39201.
Size of listings_with_calendar is 0.


In [5]:
pprint.pp(db.listings_with_reviews.find_one())

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
              'date': datetime.datetime(2023, 5, 3, 0, 0),
              'reviewer_id': '78568329',
              'reviewer_name': 'Laaziz',
              'comments': 'Hôte très réactif et avenant. Logement très bien '
                          'desservi, idéal pour visiter New York. Toutefois il '
                          'ne faut

In [6]:
pprint.pp(db.listings_with_calendar.find_one())

{'_id': '10000070',
 'average_price': 85.0,
 'first_available_date': datetime.datetime(2024, 2, 6, 0, 0),
 'last_available_date': datetime.datetime(2025, 2, 4, 0, 0),
 'dates_list': [{'date': datetime.datetime(2024, 2, 6, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 7, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 8, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 9, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30

In [7]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$out' : 'listings_test1' }
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())

The time taken for this operation was 40.434621 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test1', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
  

In [8]:
pprint.pp(db.listings_test1.find_one({'id': '35384734'}))

{'_id': ObjectId('664be01238b2bd10b477193c'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'listing_id': '35384734',
              'review_id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of Chelsea '
                          '.<br/>Its a coz

In [9]:
# In the $unwind, using the option << "preserveNullAndEmptyArrays": true >>
#    so that we don't drop the listing with id = '35384734', which has empty array for cal_docs
# In general, if you do an $unwind, it converts a left outer join into a left (full) join,
#    because it removes documents that came from the left side, but have no matching records
#    from the right side.  By including the << "preserveNullAndEmptyArrays": true >> option,
#    you preserve the left join aspect of the $lookup
# This is following https://stackoverflow.com/questions/36725519/how-to-solve-empty-array-with-unwind

db.listings_test2.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$out' : 'listings_test2'}
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""
time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test2.count_documents({})
print(f'\nThe number of documents in listings_test2 is {size}.')

print()
pprint.pp(db.listings_test2.find_one())

The time taken for this operation was 36.539288 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test1', 'listings_test2', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test2 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '88335

In [10]:
pprint.pp(db.listings_test2.find_one({'id': '35384734'}))

{'_id': ObjectId('664be01238b2bd10b477193c'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'listing_id': '35384734',
              'review_id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of Chelsea '
                          '.<br/>Its a coz

In [11]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$out' : 'listings_test1'}
]

"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())



The time taken for this operation was 64.452248 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test2', 'listings_with_reviews', 'listings_test1', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '88335

In [12]:
db.listings_test1.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$unset': 'cal_docs' }      
]
"""

    { '$out' : 'listings_with_reviews_and_cal'}
"""
"""
result = db.listings_with_reviews.aggregate(pipeline)

pprint.pp(result.next())
"""

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_test1.count_documents({})
print(f'\nThe number of documents in listings_test1 is {size}.')

print()
pprint.pp(db.listings_test1.find_one())


The time taken for this operation was 0.193244 seconds.

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test2', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built', 'listings_with_reviews_and_cal', 'listings']

The number of documents in listings_test1 is 0.

None


In [9]:
db.listings_with_reviews_and_cal.drop()

pipeline = [
    { '$lookup': {
          'from': 'listings_with_calendar',
          'localField' : 'id',
          'foreignField' : '_id',
          'as' : 'cal_docs' 
        }
    },
    { '$unwind': { 'path': '$cal_docs',  
                   'preserveNullAndEmptyArrays' : True
                 }
    },
    { '$addFields': 
          {'average_price': '$$ROOT.cal_docs.average_price',
           'first_available_date': '$$ROOT.cal_docs.first_available_date',
           'last_available_date':  '$$ROOT.cal_docs.last_available_date',
           'dates_list': '$$ROOT.cal_docs.dates_list'
          }
    },
    { '$unset': 'cal_docs' },     
    { '$out' : 'listings_with_reviews_and_cal'}
]

time1 = datetime.now()
result = db.listings_with_reviews.aggregate(pipeline)
time2 = datetime.now()
print(f'The time taken for this operation was {util.time_diff(time1,time2)} seconds.')

print()
print(db.list_collection_names())

size = db.listings_with_reviews_and_cal.count_documents({})
print(f'\nThe number of documents in listings_with_reviews_and_cal is {size}.')

print()
pprint.pp(db.listings_with_reviews_and_cal.find_one())


The time taken for this operation was 22.612153 seconds.

['listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'listings_with_reviews', 'calendar_previously_built', 'listings']

The number of documents in listings_with_reviews_and_cal is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
              'date': datetim

In [14]:
# the one listing that is not in listings_with_calendar
pprint.pp(db.listings_with_reviews_and_cal.find_one({'id': '35384734'}))

{'_id': ObjectId('664be01238b2bd10b477193c'),
 'id': '35384734',
 'name': 'Rental unit in New York · ★4.82 · Studio · 1 bed · 1 bath',
 'host_id': '266380288',
 'host_name': 'Rachel',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'West Village',
 'latitude': 40.73924,
 'longitude': -74.00366,
 'room_type': 'Entire home/apt',
 'price': nan,
 'minimum_nights': 30,
 'number_of_reviews': 22,
 'last_review': datetime.datetime(2023, 10, 1, 0, 0),
 'reviews_per_month': 0.41,
 'calculated_host_listings_count': 5,
 'availability_365': 0,
 'number_of_reviews_ltm': 2,
 'license': '',
 'reviews': [{'listing_id': '35384734',
              'review_id': '522113820',
              'date': datetime.datetime(2019, 9, 1, 0, 0),
              'reviewer_id': '69002790',
              'reviewer_name': 'Sanjay',
              'comments': 'The location is indeed spot on and very easily '
                          'approachable right in the heart of Chelsea '
                          '.<br/>Its a coz

In [15]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^3538473.*$'}})
print(len(list(result)))

1


In [16]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^35384...$'}})
print(len(list(result)))

2


In [17]:
result = db.listings_with_reviews_and_cal.find({'id': { '$regex': '^35384...$'}})
for doc in result:
    pprint.pp(doc)

{'_id': ObjectId('664b9cf038b2bd10b476ac6b'),
 'id': '35384123',
 'name': 'Home in Queens · ★4.78 · 1 bedroom · 1 bed · 1 shared bath',
 'host_id': '266360944',
 'host_name': 'Olga',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Springfield Gardens',
 'latitude': 40.6617,
 'longitude': -73.76261,
 'room_type': 'Private room',
 'price': 70.0,
 'minimum_nights': 30,
 'number_of_reviews': 96,
 'last_review': datetime.datetime(2023, 9, 3, 0, 0),
 'reviews_per_month': 1.7,
 'calculated_host_listings_count': 1,
 'availability_365': 89,
 'number_of_reviews_ltm': 17,
 'license': '',
 'reviews': [{'listing_id': '35384123',
              'review_id': '957331027272749353',
              'date': datetime.datetime(2023, 8, 13, 0, 0),
              'reviewer_id': '130415543',
              'reviewer_name': 'Rachel',
              'comments': 'Great stay, flexible check in, easy to contact.'},
             {'listing_id': '35384123',
              'review_id': '424587480734838861',
            

In [18]:
print(db.list_collection_names())

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_test2', 'listings_with_reviews', 'calendar_by_agg', 'listings_with_reviews_and_cal', 'calendar_previously_built', 'listings']


In [19]:
db.listings_test1.drop()
db.listings_test2.drop()
print(db.listings_test.count_documents({}))
print(db.list_collection_names())

72332
['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'listings_with_reviews', 'calendar_by_agg', 'listings_with_reviews_and_cal', 'calendar_previously_built', 'listings']


### <span style=color:blue>Producing json output    </span>

In [13]:
size = db.listings_with_reviews_and_cal.count_documents({})
print(f'\nThe number of documents in listings_with_reviews_and_cal is {size}.')

print()
doc = db.listings_with_reviews_and_cal.find_one({})
pprint.pp(doc)


The number of documents in listings_with_reviews_and_cal is 39202.

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
              'date': datetime.datetime(2023, 5, 3, 0, 0),
              'reviewer_id': '78568329',
              'reviewer_name': 'Laaziz',
              'comments': 'Hôte très réactif et avenant. Logement très bien '
                          'desservi, idéal pou

In [22]:
# this function is cheating, and converts "reviews_id" in the reviews lists back to "id"
def convert_lwrc_to_json_and_convert_listing_id_to_id(doc):
    doc_new = {}
    # start by transferring all scalar keys over, then fix some of them
    for key in doc.keys():
        if key not in ['reviews', 'dates_list']:
            doc_new[key] = doc[key]
    # now fixing some possible issues
    doc_new['_id'] = str(doc['_id'])
    if doc['last_review'] == None:    # is null
        doc_new['last_review'] = None
    else:
        doc_new['last_review'] = doc['last_review'].strftime('%Y-%m-%d')
    if math.isnan(doc['price']):
        doc_new['price'] = None
    else:
        doc_new['price'] = doc['price']
    if math.isnan(doc['reviews_per_month']):
        doc_new['reviews_per_month'] = None
    else:
        doc_new['reviews_per_month'] = doc['reviews_per_month']
    # there is one document in the merger that has no calendar entries
    if 'first_available_date' in doc:
        if doc['first_available_date'] == None:
            doc_new['first_available_date'] = None
        else:
            doc_new['first_available_date'] = doc['first_available_date'].strftime('%Y-%m-%d')
    if 'last_available_date' in doc:
        if doc['last_available_date'] == None:
            doc_new['last_available_date'] = None
        else:
            doc_new['last_available_date'] = doc['last_available_date'].strftime('%Y-%m-%d')
    if 'average_price' in doc:
        doc_new['average_price'] = doc['average_price']
    
    # now dealing with the 'reviews' array
    rlist = []
    for r in doc['reviews']:
        r_new = {}
        r_new['date'] = r['date'].strftime('%Y-%m-%d')
        r_new['id'] = r['review_id']
        for key in r.keys():
            # I am cheating, and will rename the 'listing_id' column back to 'id'
            if key not in ['date', 'review_id']:
                r_new[key] = r[key]
        rlist.append(r_new)
    doc_new['reviews'] = rlist

    # now dealing with the 'dates_list' array
    dlist = []
    for d in doc['dates_list']:
        d_new = {}
        d_new['date'] = d['date'].strftime('%Y-%m-%d')
        for key in ['price', 'minimum_nights', 'maximum_nights', 'available']:
            d_new[key] = d[key]
        dlist.append(d_new)
    doc_new['dates_list'] = dlist
    
    return doc_new

# pprint.pp(doc)

pprint.pp(convert_lwrc_to_json_and_convert_listing_id_to_id(doc))

{'_id': '664be01338b2bd10b477426e',
 'id': '1000128045386999700',
 'name': 'Rental unit in Queens · 1 bedroom · 1 bed · 1 shared bath',
 'host_id': '501499086',
 'host_name': 'CrossOver',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74052,
 'longitude': -73.89424,
 'room_type': 'Private room',
 'price': 81.0,
 'minimum_nights': 30,
 'number_of_reviews': 0,
 'last_review': None,
 'reviews_per_month': None,
 'calculated_host_listings_count': 162,
 'availability_365': 270,
 'number_of_reviews_ltm': 0,
 'license': '',
 'average_price': 81.0,
 'first_available_date': '2024-02-06',
 'last_available_date': '2025-02-04',
 'reviews': [],
 'dates_list': [{'date': '2024-02-06',
                 'price': 81.0,
                 'minimum_nights': 30,
                 'maximum_nights': 365,
                 'available': True},
                {'date': '2024-02-07',
                 'price': 81.0,
                 'minimum_nights': 30,
                 'maximum_night

In [23]:
print(db.listings_with_reviews_and_cal.count_documents({}))

cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^111.*$'}})
    
l = list(cursor)
print(len(l))

39202
28


In [24]:
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^111.*$'}})

output = []

for doc in cursor:
    output.append(convert_lwrc_to_json_and_convert_listing_id_to_id(doc))

print(len(output))

28


In [25]:
dir = 'OUTPUTS'
filename = 'listings_with_reviews_and_cal_subset_111.json'
util.write_dict_to_dir_json(output, dir, filename)

In [26]:
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^1000.*$'}})

output = []

for doc in cursor:
    output.append(convert_lwrc_to_json_and_convert_listing_id_to_id(doc))

print(len(output))

43


In [27]:
dir = 'OUTPUTS'
filename = 'listings_with_reviews_and_cal_subset_1000.json'
util.write_dict_to_dir_json(output, dir, filename)