In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create a utilities file util.py (originally in a folder named benchmarking) and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- that seems like a better name
sys.path.append('helper_functions/')
import util

In [2]:
from pymongo import MongoClient

# No arguments: connect with the driver defaults.
# Equivalent spellings: MongoClient("localhost", 27017)
#                    or MongoClient("mongodb://localhost:27017/")
client = MongoClient()

# Handle on the airbnb database (subscript access is the same as client.airbnb)
db = client['airbnb']

print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'listings_with_reviews', 'calendar_previously_built', 'listings']


In [3]:
# Location of the New York City listings csv.
# NOTE(review): absolute, user-specific path -- the notebook only runs as-is where this path exists.
filenamel = '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/listings.csv'

# Using a partial list of dtypes, so that the identifier fields are interpreted as strings.
# The date field (last_review) is imported as a string and converted later in the data frame.
dtype = {"id": str,  "host_id": str }
# Not including these, because the null values make trouble:  , "minimum_nights": int, "maximum_nights": int}

# The csv has empty fields; keep_default_na=False makes pandas read them as empty strings instead of NaN,
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

df_listings = pd.read_csv(filenamel, dtype=dtype, keep_default_na=False)

In [4]:
# Sanity check: (rows, columns) of the listings frame just loaded
print(df_listings.shape)

(39202, 18)


In [5]:
# Location of the New York City reviews csv.
# NOTE(review): absolute, user-specific path -- the notebook only runs as-is where this path exists.
filenamer = '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/reviews.csv'

# Using a partial list of dtypes, so that the identifier fields are interpreted as strings.
# The date field is imported as a string and converted later in the data frame.
dtype = {"id": str, "listing_id": str, "reviewer_id": str}
# NOTE(review): the printed dtypes of df_reviews show no minimum_nights, maximum_nights or
#    adjusted_price columns, so the earlier remarks about those fields do not apply to this file.

# keep_default_na=False makes pandas read empty csv fields as empty strings instead of NaN,
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

df_reviews = pd.read_csv(filenamer, dtype=dtype, keep_default_na=False)

In [6]:
# Sanity check: (rows, columns) of the reviews frame just loaded
print(df_reviews.shape)

(986810, 6)


In [7]:
# also put this into util.py
# dt has format such as '1/3/24'
def convert_date_slash_to_datetime(dt):
    """Convert a month/day/year string such as '1/3/23' into a datetime.

    Parameters
    ----------
    dt : str or None
        Date text in month/day/year order. A two-digit year is taken to be
        in the 2000s ('23' -> 2023); a four-digit year is used as given.
        Leading and trailing whitespace is ignored.

    Returns
    -------
    datetime or None
        The parsed date at midnight, or None when `dt` is missing
        (None, NaN, NaT, or an empty / whitespace-only string).

    Raises
    ------
    ValueError
        If the text is not a valid month/day/year date.
    """
    # pd.isnull covers None, float NaN and pandas NaT in a single test
    if pd.isnull(dt):
        return None

    text = dt.strip()
    if text == '':
        return None

    # Only expand the year when it really has two digits; blindly inserting
    # '20' would turn '1/3/2023' into the unparseable '1/3/202023'.
    year = text.rsplit('/', 1)[-1]
    if len(year) == 2:
        text = text[:-2] + '20' + year

    return datetime.strptime(text, '%m/%d/%Y')

# Quick check on a sample value: '1/3/23' should print as 2023-01-03 00:00:00
print(convert_date_slash_to_datetime('1/3/23'))

2023-01-03 00:00:00


In [8]:
# Reload util so that functions added to util.py after the first import are picked up
importlib.reload(util)

# dates in listings.csv have format such as '1/3/23', so using function to convert that to datetime
# NOTE(review): this calls the copy of the function in util.py, not the one defined in this
#    notebook -- confirm the two agree.
df_listings['last_review'] = df_listings['last_review'].apply(util.convert_date_slash_to_datetime)

In [33]:
# price was read from the csv as text; convert it to numbers (it shows as float64 in the dtypes below)
# NOTE(review): this cell's execution count (33) is out of sequence with its neighbours.
df_listings['price'] = pd.to_numeric(df_listings['price'])

In [44]:
# reviews_per_month was read from the csv as text; convert it to numbers (float64 in the dtypes below)
df_listings['reviews_per_month'] = pd.to_numeric(df_listings['reviews_per_month'])

In [45]:
# Confirm the column types after the conversions (last_review, price, reviews_per_month)
print(df_listings.dtypes)

id                                        object
name                                      object
host_id                                   object
host_name                                 object
neighbourhood_group                       object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
number_of_reviews_ltm                      int64
license                                   object
dtype: object


In [9]:
# Convert the review dates from strings to datetimes with the helper from util.py
# (the column shows as datetime64[ns] in the dtypes printed below)
df_reviews['date'] = df_reviews['date'].apply(util.convert_date_str_to_datetime)

In [13]:
# Confirm the column types of the reviews frame after the date conversion
print(df_reviews.dtypes)

listing_id               object
id                       object
date             datetime64[ns]
reviewer_id              object
reviewer_name            object
comments                 object
dtype: object


In [46]:
# One dict per listing row, ready to be inserted into MongoDB as documents
dict_listings = df_listings.to_dict(orient='records')
print(len(dict_listings))

39202


In [16]:
# One dict per review row, ready to be inserted into MongoDB as documents
dict_reviews = df_reviews.to_dict(orient='records')
print(len(dict_reviews))

986810


In [47]:
# Replace every missing last_review (NaT) with None, in place, and report how many were replaced
i = 0
for d in dict_listings:
    if not pd.isnull(d['last_review']):
        continue
    d['last_review'] = None
    i += 1
print(i)

11500


In [48]:
# Drop any existing listings_3 collection first, so re-running the cell does not add duplicates
db.listings_3.drop()

# Time the bulk insert of all listing documents
time1 = datetime.now()
result = db.listings_3.insert_many(dict_listings)
time2 = datetime.now()
# util.time_diff comes from util.py; the message reports its result as seconds
print(f'Time to load into MongoDB was {util.time_diff(time1,time2)} seconds.')

Time to load into MongoDB was 0.605676 seconds.


In [49]:
# The empty filter counts every document; should equal the number of listing records inserted
print(db.listings_3.count_documents({}))

39202


In [22]:
# Drop any existing reviews_3 collection first, so re-running the cell does not add duplicates
db.reviews_3.drop()

# Time the bulk insert of all review documents
time1 = datetime.now()
result = db.reviews_3.insert_many(dict_reviews)
time2 = datetime.now()
# util.time_diff comes from util.py; the message reports its result as seconds
print(f'Time to load into MongoDB was {util.time_diff(time1,time2)} seconds.')
print()
# The empty filter counts every document; should equal the number of review records inserted
print(db.reviews_3.count_documents({}))

Time to load into MongoDB was 15.475313 seconds.

986810


<span style="color:blue">Adding the index</span>

In [25]:
# Index reviews_3 on listing_id, the field the $lookup stage further down uses as its
# foreignField, and time how long building the index takes.
time1 = datetime.now()
index_name = db.reviews_3.create_index('listing_id')
time2 = datetime.now()
print(f'The time taken to create the index was {util.time_diff(time1,time2)} seconds.')
# create_index returns the name of the index
print(index_name)

The time taken to create the index was 3.260526 seconds.
listing_id_1


In [51]:
# Drop any existing target collection, so the cell can be re-run
db.listings_with_reviews_m_3.drop()

# Two-stage aggregation, run against listings_3:
#   $lookup -- for each listing, collect the reviews_3 documents whose listing_id equals the
#              listing's id and store them in an array field named 'reviews'
#   $out    -- write the resulting documents to the collection listings_with_reviews_m_3
pipeline = [
    {
        '$lookup': {
            'from': 'reviews_3',
            'localField': 'id',
            'foreignField': 'listing_id',
            'as': 'reviews'
        }
    },
    {
        '$out': 'listings_with_reviews_m_3'
    }
]


# Time the aggregation; the start time is printed so progress is visible while it runs
time1 = datetime.now()
print(time1)
db.listings_3.aggregate(pipeline)
time2 = datetime.now()
# NOTE(review): what is timed here is the aggregation above, although the message says "load"
print(f'Time to load into MongoDB was {util.time_diff(time1,time2)} seconds.')

# Number of documents in the new collection
print(db.listings_with_reviews_m_3.count_documents({}))


2024-06-03 21:02:47.940114
Time to load into MongoDB was 6.741878 seconds.
39202


In [52]:
# Fetch a single document from the joined collection and pretty-print it to inspect its structure
doc = db.listings_with_reviews_m_3.find_one()
pprint.pp(doc)

{'_id': ObjectId('665e91ad81a877ddad1a7549'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('665e7a1981a877ddad19ca7b'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': datetime.datetime(2024, 1, 3, 0, 0),
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place

In [58]:
def _nan_to_none(value):
    """Return None for a missing number (None or NaN); return any other value unchanged."""
    if value is None:
        return None
    try:
        return None if math.isnan(value) else value
    except TypeError:
        # Not a real number (e.g. a string): nothing to normalise, pass it through
        return value


def _date_to_iso(value):
    """Format a datetime as 'YYYY-MM-DD'; a missing (None) date stays None."""
    return None if value is None else value.strftime('%Y-%m-%d')


def convert_lwr_to_json(doc):
    """Build a JSON-serialisable copy of a listings-with-reviews document.

    Parameters
    ----------
    doc : dict
        A listing document with the keys '_id', 'last_review', 'price',
        'reviews_per_month' and 'reviews'; 'reviews' is a list of dicts that
        each have '_id' and 'date'. Any other keys are copied as they are.

    Returns
    -------
    dict
        A new dict (the input is not modified) in which
        - every '_id' is replaced by its str() form,
        - 'last_review' and each review 'date' are 'YYYY-MM-DD' strings,
          or None when the date is missing,
        - 'price' and 'reviews_per_month' are None when they are NaN or None.
        The listing keeps the key order of `doc` with 'reviews' placed last;
        each review starts with '_id' and 'date', followed by its other keys.

    Raises
    ------
    KeyError
        If one of the required keys listed above is absent.
    """
    # start by transferring all scalar keys over, then fix some of them
    doc_new = {key: value for key, value in doc.items() if key != 'reviews'}

    doc_new['_id'] = str(doc['_id'])
    doc_new['last_review'] = _date_to_iso(doc['last_review'])
    doc_new['price'] = _nan_to_none(doc['price'])
    doc_new['reviews_per_month'] = _nan_to_none(doc['reviews_per_month'])

    # now dealing with the 'reviews' array
    reviews = []
    for review in doc['reviews']:
        review_new = {
            '_id': str(review['_id']),
            'date': _date_to_iso(review['date']),
        }
        for key, value in review.items():
            if key not in ('date', '_id'):
                review_new[key] = value
        reviews.append(review_new)
    doc_new['reviews'] = reviews
    return doc_new

# pprint.pp(doc)

# Show the converted form of one document.
# NOTE(review): `doc` is whatever an earlier cell last bound to that name. The output recorded
#    below has id '11162564', not the id printed by the find_one cell, so this was evidently
#    run after another cell had rebound `doc` -- a fresh top-to-bottom run will show a different
#    document.
pprint.pp(convert_lwr_to_json(doc))

{'_id': '665e91ad81a877ddad1b04b0',
 'id': '11162564',
 'name': 'Rental unit in New York · ★4.20 · Studio · 1 bed · 1 bath',
 'host_id': '2119276',
 'host_name': 'Urban Furnished',
 'neighbourhood_group': 'Manhattan',
 'neighbourhood': 'Gramercy',
 'latitude': 40.73323,
 'longitude': -73.9828,
 'room_type': 'Entire home/apt',
 'price': 111.0,
 'minimum_nights': 30,
 'number_of_reviews': 10,
 'last_review': '2022-09-15',
 'reviews_per_month': 0.11,
 'calculated_host_listings_count': 53,
 'availability_365': 342,
 'number_of_reviews_ltm': 0,
 'license': '',
 'reviews': [{'_id': '665e7a1681a877ddad0f1240',
              'date': '2016-08-09',
              'listing_id': '11162564',
              'id': '92592458',
              'reviewer_id': '14915683',
              'reviewer_name': 'Zen',
              'comments': 'The studio is very well located, right next to the '
                          'metro station. '},
             {'_id': '665e7a1681a877ddad0f1241',
              'date': '2017

In [68]:
# Total number of documents in the collection that is queried below
# (previously this counted the unrelated collection listings_with_reviews)
print(db.listings_with_reviews_m_3.count_documents({}))

# Documents whose id (stored as a string) starts with '111'
cursor = db.listings_with_reviews_m_3.find({'id' : {'$regex' : '^111.*$'}})

# Materialise the cursor to count the matches
l = list(cursor)
print(len(l))

39202
28


In [69]:
# Documents whose id (stored as a string) starts with '111'
cursor = db.listings_with_reviews_m_3.find({'id' : {'$regex' : '^111.*$'}})

# JSON-ready version of every matching document
output = []

# Note: `doc` stays bound to the last document after the loop
for doc in cursor:
    output.append(convert_lwr_to_json(doc))

print(len(output))

28


In [70]:
# Hand the converted documents in `output` to the util.py writer, together with the target
# directory and file name (presumably written as OUTPUTS/<filename> -- confirm in util.py).
# The directory lives in `output_dir` rather than `dir`, so the built-in dir() is not shadowed.
output_dir = 'OUTPUTS'
filename = 'listings_with_reviews_3_subset_111.json'
util.write_dict_to_dir_json(output, output_dir, filename)

In [72]:
# Hand the converted documents in `output` to the util.py writer under a second file name
# (presumably written as OUTPUTS/<filename> -- confirm in util.py).
# The directory lives in `output_dir` rather than `dir`, so the built-in dir() is not shadowed.
output_dir = 'OUTPUTS'
filename = 'listings_with_reviews_m_subset_111__v03.json'
util.write_dict_to_dir_json(output, output_dir, filename)

In [62]:
# Total number of documents in the collection that is queried below
# (previously this counted the unrelated collection listings_with_reviews)
print(db.listings_with_reviews_m_3.count_documents({}))

# Documents whose id (stored as a string) starts with '1000'
cursor = db.listings_with_reviews_m_3.find({'id' : {'$regex' : '^1000.*$'}})

# Materialise the cursor to count the matches
l = list(cursor)
print(len(l))

39202
43


In [63]:
# Documents whose id (stored as a string) starts with '1000'
cursor = db.listings_with_reviews_m_3.find({'id' : {'$regex' : '^1000.*$'}})

# JSON-ready version of every matching document
output = []

# Note: `doc` stays bound to the last document after the loop
for doc in cursor:
    output.append(convert_lwr_to_json(doc))

print(len(output))

43


In [67]:
# Hand the converted documents in `output` to the util.py writer, together with the target
# directory and file name (presumably written as OUTPUTS/<filename> -- confirm in util.py).
# The directory lives in `output_dir` rather than `dir`, so the built-in dir() is not shadowed.
output_dir = 'OUTPUTS'
filename = 'listings_with_reviews_m_subset_1000.json'
util.write_dict_to_dir_json(output, output_dir, filename)

In [73]:
# The empty filter selects every document in the joined collection
cursor = db.listings_with_reviews_m_3.find({})

# JSON-ready version of every document; the whole collection is held in memory at once
output = []

# Note: `doc` stays bound to the last document after the loop
for doc in cursor:
    output.append(convert_lwr_to_json(doc))

print(len(output))

39202


In [74]:
# Hand the full converted collection in `output` to the util.py writer, together with the target
# directory and file name (presumably written as OUTPUTS/<filename> -- confirm in util.py).
# The directory lives in `output_dir` rather than `dir`, so the built-in dir() is not shadowed.
output_dir = 'OUTPUTS'
filename = 'listings_with_reviews_m_all__v03.json'
util.write_dict_to_dir_json(output, output_dir, filename)