### <span style=color:blue> Loading calendar data from a CSV file into local MongoDB    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# The utilities file util.py lives in the folder helper_functions/; add that folder to the path and import it
sys.path.append('helper_functions/')
# import util as util
import util

<span style=color:blue>Getting mongodb connection set up</span>

In [2]:
from pymongo import MongoClient

# Connect using MongoClient's default arguments (no host or port is passed here)
client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

<span style=color:blue>Getting access to airbnb database, and setting up collection "cal" to hold the calendar data in mongodb</span>

In [3]:
# I have (or will have) a database "airbnb"
db = client.airbnb

# inside the "airbnb" database, I have (or will have) a collection "calendar_by_agg";
# `cal` is the handle used for it in the rest of the notebook
cal = db.calendar_by_agg

print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())
# Note: calendar_by_agg may not show up yet; it is created only when a first document is inserted into it

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['calendar_by_agg', 'testing', 'listings_previously_built', 'listings_test', 'calendar_previously_built', 'listings']


<span style=color:blue>FAILED ATTEMPT AT: Loading contents of calendar csv file into a dataframe</span>

In [4]:
"""
# This became a black hole!

filename = '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/calendar.csv'

# see https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options for
#    various info about including data types into the read_csv command
# see also https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-dtypes
# As for the date and available fields (intended as date type and boolean, respectively,
#    we import as strings and convert in the data frame
dtype = {"listing_id": str, "date": str, "available": str, 
        "price": str, "adjusted_price": str, "minimum_nights": int, "maximum_nights": int}

# the csv has nulls in "adjusted_price", which has type str,. so including keep_default_na=False, 
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

# there are 7000+ entries for minimum_nights and maximum_nights with NULL values
# so following this pattern:
#    df2 = pd.read_csv('test_int64.csv', converters={'nr_nan':lambda x: pd.NA if x == '' else int(x)})
#    df2.nr_nan = df2.nr_nan.astype('Int64')
# see https://stackoverflow.com/questions/70776957/pandas-read-csv-with-integer-columns-with-null-values-without-precision-loss
converters = {'nr_nan':lambda x: pd.NA if x == '' else int(x)}

df = pd.read_csv(filename, dtype=dtype, converters=converters, keep_default_na=False)
df.nr_nan = df.nr_nan.astype('Int64')

print('The datatypes for the fields of df are:')
print(df.types)

print('\nThe first few rows of df are:')
print(df.head())
"""
# Only this line executes in the cell: the earlier attempt is wrapped in a triple-quoted string
print('This was a black hole')

This was a black hole


<span style=color:blue>Loading contents of calendar csv file into a dataframe</span>

<span style=color:blue>The system will give a warning, but it appears safe to ignore it.</span>

In [5]:
filename = '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/calendar.csv'

# Using a partial list of dtypes, so that the first several fields are interpreted as strings
# As for the date and available fields (intended as date type and boolean, respectively),
#    we import them as strings and convert them in the data frame
dtype = {"listing_id": str, "date": str, "available": str, 
        "price": str, "adjusted_price": str}
# not including these, because the null values make trouble:  , "minimum_nights": int, "maximum_nights": int}
# NOTE(review): leaving those two columns without an explicit dtype is presumably what triggers
#    the warning pandas emits on the read below -- TODO confirm

# the csv has nulls in "adjusted_price", which has type str, so including keep_default_na=False;
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)

  df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)


In [6]:
# Show the dtype pandas assigned to each column, then a sample of the first rows
print('The datatypes for the fields of df are:')
print(df.dtypes)

print('\nThe first few rows of df are:')
print(df.head())

The datatypes for the fields of df are:
listing_id        object
date              object
available         object
price             object
adjusted_price    object
minimum_nights    object
maximum_nights    object
dtype: object

The first few rows of df are:
  listing_id        date available    price adjusted_price minimum_nights  \
0     144087  2024-02-10         t  $259.00                            30   
1     144087  2024-02-11         t  $259.00                            30   
2     144087  2024-02-12         t  $259.00                            30   
3     144087  2024-02-13         t  $259.00                            30   
4     144087  2024-02-14         t  $259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


<span style=color:blue>Function to convert dates into datetimes.  This is useful because MongoDB recognizes datetime types but not date types.  So, will convert all dates into datetimes for insertion into MongoDB (after which we can do date arithmetic).</span>

<span style=color:blue>It also has conditions that turn various kinds of null values into None.  (Note: curiously this works on small dataframes, but left some values of "NaT" when applied on very large dataframes.)</span>

In [7]:
# also converts NaT to None, because MongoDB does not recognize NaT
def convert_date_str_to_datetime(dt):
    """Convert a 'YYYY-MM-DD' date string into a naive datetime at midnight.

    Parameters
    ----------
    dt : str or null-like
        A string whose characters 0-3, 5-6 and 8-9 hold the year, month and
        day (for example '2024-05-23'), or a missing value.

    Returns
    -------
    datetime.datetime or None
        datetime(year, month, day) for a date string; None when dt is None,
        a pandas/NumPy null (NaN, pd.NA, pd.NaT) or the empty string.

    Raises
    ------
    ValueError
        If the sliced year/month/day pieces are not integers or do not form
        a valid calendar date.
    """
    # pd.isnull() is True for None, float NaN, pd.NA and pd.NaT, so a separate
    # `dt != dt` NaN test is not needed.
    if dt is None or pd.isnull(dt) or dt == '':
        return None

    year = int(dt[0:4])
    month = int(dt[5:7])
    day = int(dt[8:10])
    # Build the datetime directly. Round-tripping through a POSIX timestamp
    # depends on the local time zone and adds nothing for a naive midnight value.
    return datetime(year, month, day)

print(convert_date_str_to_datetime('2024-05-23'))


2024-05-23 00:00:00


<span style=color:blue>Function to convert the values of field "available" to booleans </span>

In [8]:
def convert_tf_to_boolean(val):
    """Map the flag 't' to True and 'f' to False; anything else gives None."""
    # Walk the known flags and return on the first equality match
    for flag, as_bool in (('t', True), ('f', False)):
        if val == flag:
            return as_bool
    return None

print(convert_tf_to_boolean('t'), convert_tf_to_boolean('f'), convert_tf_to_boolean('foo'))


True False None


<span style=color:blue>Cleaning up the values in df, to be more compatible with MongoDB  </span>

In [9]:
# Replace the string columns in place: 'date' through convert_date_str_to_datetime
# and 'available' through convert_tf_to_boolean.
# NOTE(review): because df is overwritten, re-running this cell feeds already-converted
#    values back into the converters -- presumably it should run once per load; confirm
df['date'] = df['date'].apply(convert_date_str_to_datetime)
df['available'] = df['available'].apply(convert_tf_to_boolean)

print(df.head())

  listing_id       date  available    price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  $259.00                            30   
1     144087 2024-02-11       True  $259.00                            30   
2     144087 2024-02-12       True  $259.00                            30   
3     144087 2024-02-13       True  $259.00                            30   
4     144087 2024-02-14       True  $259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


In [None]:
# need to strip the leading '$' from the price value, and remove commas
# (the lambda calls .replace on each value, so this must run while 'price' still holds strings,
#  i.e. before the pd.to_numeric conversion in the next cell)
df['price'] = df['price'].apply(lambda x:x.replace('$','').replace(',',''))

In [23]:
# Convert the cleaned price strings to numbers, then check the resulting element type
df['price'] = pd.to_numeric(df['price']) 
print(type(df.loc[0,'price']))
print(df.head())

<class 'numpy.float64'>
  listing_id       date  available  price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  259.0                            30   
1     144087 2024-02-11       True  259.0                            30   
2     144087 2024-02-12       True  259.0                            30   
3     144087 2024-02-13       True  259.0                            30   
4     144087 2024-02-14       True  259.0                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


### <span style=color:blue>Now working to fill calendar_by_agg with the dataframe df.  After that we will attempt to use an aggregation pipeline to produce a collection that holds one document per listing and, for each listing, an array of its calendar dates.</span>

<span style=color:blue>First step is to get a dataframe with just the listing_ids</span>


In [24]:
# Keep the first 1000 rows of df as a small sample for trial runs
df_small = df.iloc[:1000]
print(df_small.head())

  listing_id       date  available  price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  259.0                            30   
1     144087 2024-02-11       True  259.0                            30   
2     144087 2024-02-12       True  259.0                            30   
3     144087 2024-02-13       True  259.0                            30   
4     144087 2024-02-14       True  259.0                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


In [None]:
# dict_small = df_small.to_dict('records')
# One dict per row of df (column name -> value); this list is what gets passed to insert_many
dict_full = df.to_dict('records')

In [56]:
# Report how many records will be loaded, and that count integer-divided by 100
print(len(dict_full))
print(len(dict_full)//100)

# Drop the target collection so that the load starts from an empty calendar_by_agg
db.calendar_by_agg.drop()
# result = cal.insert_many(dict_small)
# result = cal.insert_many(dict_full)

# The bare string below is disabled code; being the last expression of the cell,
# its text is echoed as the cell's output
"""
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(cal.find_one({ '_id': o}))
pprint.pp(outdocs)
"""

14299870
142998


"\noutdocs = []\nfor o in result.inserted_ids[-5:]:\n    outdocs.append(cal.find_one({ '_id': o}))\npprint.pp(outdocs)\n"

In [57]:
# Load dict_full into the collection in full batches of 1000 documents.
# Only complete batches are inserted here; the remaining len(dict_full) % 1000
# documents are inserted by the following cell.
time0 = datetime.now()   # start of the whole load
time1 = datetime.now()   # start of the current progress interval
for i in range(0, len(dict_full)//1000):
    result = cal.insert_many(dict_full[1000*i:1000*(i+1)])
    # Report the elapsed time once every 100 batches, then restart the interval timer
    if i % 100 == 0:
        time2 = datetime.now()
        diff = util.time_diff(time1, time2)
        print('Have now performed step', i, '(or 1000 inserts each), and it took', format(diff, '.4f'), 'seconds' )
        time1 = datetime.now()

time3 = datetime.now()

print('\nThe last ObjectID in the collection is:')
print(result.inserted_ids[-1:])

# print('\nThe time to do the load of 39K documents into local mongodb, with a total of about 300MB was:')
print('\nThe time for this loading into local mongodb was:')
# The '.4f' spec belongs inside format(); previously it was passed to print() as a
# second argument, so the unformatted value was printed followed by the text ".4f".
print(format(util.time_diff(time0, time3), '.4f'))

Have now performed step 0 (or 1000 inserts each), and it took 0.1420 seconds
Have now performed step 100 (or 1000 inserts each), and it took 0.6580 seconds
Have now performed step 200 (or 1000 inserts each), and it took 0.6742 seconds
Have now performed step 300 (or 1000 inserts each), and it took 0.6864 seconds
Have now performed step 400 (or 1000 inserts each), and it took 0.6824 seconds
Have now performed step 500 (or 1000 inserts each), and it took 0.6674 seconds
Have now performed step 600 (or 1000 inserts each), and it took 0.6715 seconds
Have now performed step 700 (or 1000 inserts each), and it took 0.7060 seconds
Have now performed step 800 (or 1000 inserts each), and it took 0.6672 seconds
Have now performed step 900 (or 1000 inserts each), and it took 0.6770 seconds
Have now performed step 1000 (or 1000 inserts each), and it took 0.7040 seconds
Have now performed step 1100 (or 1000 inserts each), and it took 0.6658 seconds
Have now performed step 1200 (or 1000 inserts each),

In [58]:
# Insert the records of dict_full left over after the full batches of 1000,
# i.e. everything from index (len(dict_full)//1000)*1000 onward.
tail_docs = dict_full[(len(dict_full)//1000)*1000:]

# insert_many rejects an empty list, which is what the slice yields when the
# record count is an exact multiple of 1000; in that case keep the previous `result`.
if tail_docs:
    result = cal.insert_many(tail_docs)
print('\nLast element of result for the last run was:')
print(result.inserted_ids[-1:])


print('\nThe total number of documents in the collection db.calendar is now:')
print(cal.count_documents({}))


Last element of result for the last run was:
[ObjectId('665a7fe4b7e82b000ffe5685')]

The total number of documents in the collection db.calendar is now:
14299870


In [65]:
# Start from a clean slate: remove any earlier listings_with_calendar collection
db.listings_with_calendar.drop()

# Shape of each element pushed onto a listing's dates_list
calendar_entry = {
    'date': '$date',
    'available': '$available',
    'price': '$price',
    'minimum_nights': '$minimum_nights',
    'maximum_nights': '$maximum_nights',
}

# Group the calendar documents by listing_id. For each listing compute the
# average price, the min and max date, and an array with one entry per document.
# There is no $match stage, so these are computed over every calendar document
# of the listing, whatever its 'available' value.
group_stage = {
    '$group': {
        '_id': '$listing_id',
        'average_price': {'$avg': '$price'},
        'earliest_available_date': {'$min': '$date'},
        'last_available_date': {'$max': '$date'},
        'dates_list': {'$push': calendar_entry},
    }
}

# Write the grouped documents to the listings_with_calendar collection
out_stage = {'$out': 'listings_with_calendar'}

pipeline = [group_stage, out_stage]

# Time only the aggregate call itself
time1 = datetime.now()
test1 = cal.aggregate(pipeline)
time2 = datetime.now()
diff = util.time_diff(time1, time2)

print('\nTime it took was:', format(diff, '.4f'), '.')

print(db.list_collection_names())

print("test1:")

# print(len(list(test1)))

print(type(test1))




Time it took was: 68.2444 .
['listings_with_calendar', 'testing', 'listings_previously_built', 'listings_test', 'calendar_by_agg', 'calendar_previously_built', 'listings']
test1:
<class 'pymongo.command_cursor.CommandCursor'>


In [79]:
# Count the documents now in listings_with_calendar
count = db.listings_with_calendar.count_documents({})
print(count)

# a slower alternative approach (kept as an inert string; it is not executed):
"""
result = db.listings_with_calendar.find()
# by using clone(), the cursor is not consumed
print(len(list(result.clone())))
"""
print()

39201

