In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Make the helper_functions/ folder importable so that the utilities module util.py can be imported
sys.path.append('helper_functions/')
# import util as util
import util

<span style=color:blue>Setting up the MongoDB connection and focusing on the airbnb database</span>

In [2]:
from pymongo import MongoClient

# Connect with the driver defaults; the explicit equivalents would be
#     MongoClient("localhost", 27017)   or   MongoClient("mongodb://localhost:27017/")
client = MongoClient()

# Handle on the 'airbnb' database used by the rest of the notebook.
db = client["airbnb"]

In [9]:
# Show which collections currently exist in the airbnb database.
collection_names = db.list_collection_names()
print(collection_names)

['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'calendar_by_agg', 'calendar_previously_built', 'listings']


<span style=color:blue>Copying the collection 'listings_previously_built' into 'listings_with_reviews'</span>

<span style=color:blue>Following https://stackoverflow.com/questions/39788664/how-can-i-copy-one-collection-from-mongodb-using-pymongo-and-paste-to-another-em</span>

In [13]:
# Start from a clean slate: remove any earlier copy of the target collection.
db.listings_with_reviews.drop()

# Copy pipeline: the empty $match passes every document through unchanged,
# and $out writes the result into the collection 'listings_with_reviews'.
match_all_stage = {'$match': {}}
write_out_stage = {'$out': 'listings_with_reviews'}
pipeline = [match_all_stage, write_out_stage]
db.listings_previously_built.aggregate(pipeline)

# Confirm the copy: the new collection should be listed and both sizes should agree.
print(db.list_collection_names())
n_previously_built = db.listings_previously_built.count_documents({})
print(f'Size of listings_previously_built is {n_previously_built}.')
n_with_reviews = db.listings_with_reviews.count_documents({})
print(f'Size of listings_with_reviews is {n_with_reviews}.')


['listings_with_calendar', 'testing', 'listings_previously_built', 'listings', 'listings_test', 'listings_with_reviews', 'calendar_by_agg', 'calendar_previously_built']
Size of listings_previously_built is 39202.
Size of listings_with_reviews is 39202.


In [16]:
# Pretty-print one sample document from each collection, separated by a divider line.
sample_reviews_doc = db.listings_with_reviews.find_one()
pprint.pp(sample_reviews_doc)

print('=======================================')

sample_calendar_doc = db.listings_with_calendar.find_one()
pprint.pp(sample_calendar_doc)

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
              'date': datetime.datetime(2023, 5, 3, 0, 0),
              'reviewer_id': '78568329',
              'reviewer_name': 'Laaziz',
              'comments': 'Hôte très réactif et avenant. Logement très bien '
                          'desservi, idéal pour visiter New York. Toutefois il '
                          'ne faut

In [44]:
# Left join of listings_with_reviews onto listings_with_calendar
# (reviews field 'id' against calendar field '_id'), followed by a filter that
# keeps only the documents for which the join found no calendar document.

# Stage 1: attach the matching calendar documents as the array field 'cal_docs'.
lookup_stage = {
    '$lookup': {
        'from': 'listings_with_calendar',
        'localField': 'id',
        'foreignField': '_id',
        'as': 'cal_docs',
    }
}

# Stage 2: an empty 'cal_docs' array means nothing matched in listings_with_calendar.
unmatched_stage = {'$match': {'cal_docs': {'$eq': []}}}

pipeline1 = [lookup_stage, unmatched_stage]

# NOTE(review): aggregate() hands back a cursor, so the timestamps below bracket
# the aggregate() call itself, not the later iteration over the results.
time1 = datetime.now()
test1 = db.listings_with_reviews.aggregate(pipeline1)
time2 = datetime.now()

elapsed = util.time_diff(time1, time2)
print(f'This operation took {elapsed} seconds.')

This operation took 5.653047 seconds.


In [45]:
# Report what kind of object the aggregation returned, then a blank line.
print(type(test1))
print()

# Pull every result out of test1 first, then show the listing 'id' of each one.
unmatched_docs = list(test1)
for unmatched in unmatched_docs:
    pprint.pp(unmatched['id'])


<class 'pymongo.command_cursor.CommandCursor'>

'35384734'


In [None]:
"""
BTW, here are 2 queries in SQL to find this "missing" listing_id. 
The first one takes forever to run - more than 40 minutes.
The second takes about 2 seconds.

-- optional for first query: create index on listing_id in calendar
create index if not exists listing_id_in_calendar
on calendar(listing_id)
-- took about 10 seconds

select id 
from listings l 
where id not in ( 
    select listing_id
    from calendar)
-- I gave up after about 40 minutes
    
-- in PostgreSQL, they use "EXCEPT" instead of "MINUS" (aargh!!!)
with ids_in_listings as
(select distinct id 
 from listings),
listing_ids_in_calendar as
(select distinct listing_id as id 
 from calendar)
(select id
 from ids_in_listings
 except
 select id 
 from listing_ids_in_calendar
)
-- takes about 2 seconds


"""