### <span style=color:blue> Loading Calendar data from a CSV file into local MongoDB    </span>

In [9]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create a utilities file util.py in the folder helper_functions/ and import it
sys.path.append('helper_functions/')
# import util as util
import util

<span style=color:blue>Getting mongodb connection set up</span>

In [2]:
from pymongo import MongoClient

# Connect to the MongoDB server running on this machine.  The host and port are
# written out here; they are the same values a bare MongoClient() call uses,
# and MongoClient("mongodb://localhost:27017/") would be a third equivalent form.
client = MongoClient("localhost", 27017)

<span style=color:blue>Getting access to airbnb database, and setting up collection "cal" to hold the calendar data in mongodb</span>

In [6]:
# Handle to the "airbnb" database (it exists already, or will once data is written)
db = client["airbnb"]

# Handle to the "calendar" collection inside "airbnb"; `cal` is the short name for it
cal = db["calendar"]

# Show what the server currently holds, first at database level ...
print('The list of all databases currently in the MongoDB client is:')
database_names = client.list_database_names()
print(database_names)

# ... and then at collection level within "airbnb"
print('\nThe list of all collections in the airbnb database is:')
collection_names = db.list_collection_names()
print(collection_names)
# Note: calendar may not show up yet; it is created only when a first document is inserted into it

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['testing', 'listings']


<span style=color:blue>FAILED ATTEMPT AT: Loading contents of calendar csv file into a dataframe</span>

In [19]:
# Record of an abandoned approach, kept for reference only.  Everything between
# the triple quotes is a string literal, so none of it executes; the only thing
# this cell does at run time is print the message at the bottom.
# NOTE(review), for anyone reviving it: 'nr_nan' is the placeholder column name
# from the linked StackOverflow example rather than a column of calendar.csv,
# and `df.types` looks like a typo for `df.dtypes` (used in a later cell).
"""
# This became a black hole!

filename = '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/calendar.csv'

# see https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options for
#    various info about including data types into the read_csv command
# see also https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-dtypes
# As for the date and available fields (intended as date type and boolean, respectively,
#    we import as strings and convert in the data frame
dtype = {"listing_id": str, "date": str, "available": str, 
        "price": str, "adjusted_price": str, "minimum_nights": int, "maximum_nights": int}

# the csv has nulls in "adjusted_price", which has type str,. so including keep_default_na=False, 
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

# there are 7000+ entries for minimum_nights and maximum_nights with NULL values
# so following this pattern:
#    df2 = pd.read_csv('test_int64.csv', converters={'nr_nan':lambda x: pd.NA if x == '' else int(x)})
#    df2.nr_nan = df2.nr_nan.astype('Int64')
# see https://stackoverflow.com/questions/70776957/pandas-read-csv-with-integer-columns-with-null-values-without-precision-loss
converters = {'nr_nan':lambda x: pd.NA if x == '' else int(x)}

df = pd.read_csv(filename, dtype=dtype, converters=converters, keep_default_na=False)
df.nr_nan = df.nr_nan.astype('Int64')

print('The datatypes for the fields of df are:')
print(df.types)

print('\nThe first few rows of df are:')
print(df.head())
"""
# Placeholder output so the cell still produces something visible when run.
print('This was a black hole')

This was a black hole


<span style=color:blue>Loading contents of calendar csv file into a dataframe</span>

<span style=color:blue>The system will give a warning, but it appears safe to ignore it.</span>

In [23]:
# Location of the calendar CSV.  Set the AIRBNB_CALENDAR_CSV environment variable
# to point at your own copy of the file; when it is not set, the original
# author's local path is used, so existing runs behave exactly as before.
filename = os.environ.get(
    'AIRBNB_CALENDAR_CSV',
    '/Users/rick/DM-for-DS--Davis-Spring-2024/DATA-SETS/AirBNB/New-York-City/calendar.csv',
)

# Using partial list of dtypes, so that first several fields are interpreted as strings.
# As for the date and available fields (intended as date type and boolean, respectively),
#    we import as strings and convert in the data frame.
dtype = {"listing_id": str, "date": str, "available": str,
        "price": str, "adjusted_price": str}
# NOT including these, because the null values make trouble:  "minimum_nights": int, "maximum_nights": int
# NOTE(review): the warning pandas emits on this read is presumably about mixed
#    types in those two untyped columns -- confirm before relying on their types.

# The csv has nulls in "adjusted_price", which has type str, so keep_default_na=False
#    is passed to read empty fields as empty strings instead of NaN;
#    see https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)

  df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)


In [24]:
# Quick look at what read_csv produced: column dtypes first, then a sample of rows.
print('The datatypes for the fields of df are:', df.dtypes, sep='\n')

print('\nThe first few rows of df are:', df.head(), sep='\n')

The datatypes for the fields of df are:
listing_id        object
date              object
available         object
price             object
adjusted_price    object
minimum_nights    object
maximum_nights    object
dtype: object

The first few rows of df are:
  listing_id        date available    price adjusted_price minimum_nights  \
0     144087  2024-02-10         t  $259.00                            30   
1     144087  2024-02-11         t  $259.00                            30   
2     144087  2024-02-12         t  $259.00                            30   
3     144087  2024-02-13         t  $259.00                            30   
4     144087  2024-02-14         t  $259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


<span style=color:blue>Function to convert dates into datetimes.  This is useful because MongoDB recognizes datetime types but not date types.  So, will convert all dates into datetimes for insertion into MongoDB (after which we can do date arithmetic).</span>

<span style=color:blue>It also has conditions that turn various kinds of null values into None.  (Note: curiously this works on small dataframes, but left some values of "NaT" when applied on very large dataframes.)</span>

In [42]:
# Also converts null-like values (None, NaN, NaT, '') to None, because MongoDB
# does not recognize NaT.
def convert_date_str_to_datetime(dt):
    """Convert a 'YYYY-MM-DD' style string into a naive datetime at midnight.

    Parameters
    ----------
    dt : str or null-like
        A date string whose characters 0-3, 5-6 and 8-9 hold the year, month
        and day (any separator character is accepted at positions 4 and 7),
        or one of None, NaN, pandas NaT, or the empty string.

    Returns
    -------
    datetime.datetime or None
        Midnight of the given calendar day, or None for a null-like input.

    Raises
    ------
    ValueError
        If the sliced year/month/day parts are not integers or do not form a
        valid calendar date.
    """
    # pd.isnull is True for None, NaN and NaT, so a separate `dt != dt` NaN test
    # is not needed; the explicit None check just short-circuits the common case.
    if dt is None or pd.isnull(dt) or dt == '':
        return None

    # Build the datetime directly from its parts.  The earlier version went
    # through .timestamp() and datetime.fromtimestamp(), which interprets the
    # value in the machine's local timezone: on a day where local midnight does
    # not exist (daylight-saving change at 00:00) the result came back shifted
    # by an hour, so the stored value depended on where the notebook was run.
    return datetime(int(dt[0:4]), int(dt[5:7]), int(dt[8:10]))

print(convert_date_str_to_datetime('2024-05-23'))


2024-05-23 00:00:00


<span style=color:blue>Function to convert the values of field "available" to booleans </span>

In [39]:
def convert_tf_to_boolean(val):
    """Translate the CSV flags 't' / 'f' into True / False.

    Any other value (including None and the empty string) yields None.
    """
    if val == 't':
        return True
    # Only 'f' maps to False; everything else is treated as "unknown".
    return False if val == 'f' else None

print(convert_tf_to_boolean('t'), convert_tf_to_boolean('f'), convert_tf_to_boolean('foo'))


True False None


<span style=color:blue>Cleaning up the values in df, to be more compatible with MongoDB  </span>

In [None]:
# Replace the raw string columns with MongoDB-friendly values, one column at a time.
# Run this cell only once per load of df: the converters expect the original strings,
# and convert_tf_to_boolean returns None for anything other than 't' / 'f', so a
# second run would blank out the already-converted "available" column.
for column_name, converter in (
    ('date', convert_date_str_to_datetime),
    ('available', convert_tf_to_boolean),
):
    df[column_name] = df[column_name].apply(converter)

print(df.head())

In [54]:
# Remove every literal '$' from the price strings (regex=False: plain text match).
# The values stay strings; only the currency symbol is dropped.
df['price'] = df['price'].str.replace('$', '', regex=False)

print(df.head())

  listing_id       date  available   price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  259.00                            30   
1     144087 2024-02-11       True  259.00                            30   
2     144087 2024-02-12       True  259.00                            30   
3     144087 2024-02-13       True  259.00                            30   
4     144087 2024-02-14       True  259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


### <span style=color:blue>Now working to create a list of documents, where each document has a listing_id and then a list of dictionaries about available dates for that listing  </span>

<span style=color:blue>First step is to get a dataframe with just the listing_ids</span>

In [45]:
# Column names of df, as a plain Python list
cols = list(df.columns)
print(cols)

['listing_id', 'date', 'available', 'price', 'adjusted_price', 'minimum_nights', 'maximum_nights']


In [46]:
# Every column except listing_id; dropping them leaves a one-column frame of ids
other_cols = [
    'date',
    'available',
    'price',
    'adjusted_price',
    'minimum_nights',
    'maximum_nights',
]

# One row per distinct listing_id (first occurrence kept, original index retained)
df_ids_only = df.drop(columns=other_cols)
df_new = df_ids_only.drop_duplicates()

# Compare sizes: all calendar rows vs. distinct listings
print(df.shape)
print(df_new.shape)
print(df_new.head())


(14299870, 7)
(39201, 1)
     listing_id
0        144087
152        2595
322       43105
507      144148
1310      44229


<span style=color:blue>Next, convert df_new into a list of dictionaries</span>

In [48]:
# Despite the name, "dict_new" is a list: one {'listing_id': ...} dictionary per row of df_new
dict_new = df_new.to_dict(orient='records')

# Peek at the first ten entries
pprint.pp(dict_new[:10])

[{'listing_id': '144087'},
 {'listing_id': '2595'},
 {'listing_id': '43105'},
 {'listing_id': '144148'},
 {'listing_id': '44229'},
 {'listing_id': '5121'},
 {'listing_id': '146726'},
 {'listing_id': '6848'},
 {'listing_id': '44973'},
 {'listing_id': '6872'}]


<span style=color:blue>Now adding in all of the dates for each listing.  This might take an hour or more to run so plan accordingly</span>

In [56]:
# Attach to every listing document the list of its calendar rows, under 'dates_info'.
#
# The row positions of each listing are computed with ONE grouping pass over df.
# The earlier version filtered the whole frame with a boolean mask once per
# listing (tens of thousands of scans of a multi-million-row frame), which is
# why it needed an hour or more.  The per-listing rows, their order and the
# resulting dictionaries are the same as before.
rows_by_listing = df.groupby('listing_id', sort=False).indices

i = 0   # stays 0 if dict_new is empty

time1 = datetime.now()

# for i, d in enumerate(dict_new[0:2], start=1):   # handy for a quick trial run
for i, d in enumerate(dict_new, start=1):

    # Rows of this listing, selected by position, without the (redundant) id column
    df_cal_one_listing = df.iloc[rows_by_listing[d['listing_id']]].drop(['listing_id'], axis=1)

    # One dictionary per calendar day, stored inside the listing's document
    d['dates_info'] = df_cal_one_listing.to_dict('records')

    # Progress report every 50 listings, timing just that batch of 50
    if i % 50 == 0:
        time2 = datetime.now()
        time_taken = util.time_diff(time1,time2)
        print('Have now completed step number:', str(i), 'and it took', str(time_taken), 'seconds' )
        time1 = datetime.now()

# print(len(dict_new))
# NOTE: the slice [-5:-1] shows four documents and leaves out the very last one.
pprint.pp(dict_new[-5:-1])


Have now completed step number: 50
Have now completed step number: 100
Have now completed step number: 150
Have now completed step number: 200
Have now completed step number: 250
Have now completed step number: 300
Have now completed step number: 350
Have now completed step number: 400
Have now completed step number: 450
Have now completed step number: 500
Have now completed step number: 550
Have now completed step number: 600
Have now completed step number: 650
Have now completed step number: 700
Have now completed step number: 750
Have now completed step number: 800
Have now completed step number: 850
Have now completed step number: 900
Have now completed step number: 950
Have now completed step number: 1000
Have now completed step number: 1050
Have now completed step number: 1100
Have now completed step number: 1150
Have now completed step number: 1200
Have now completed step number: 1250
Have now completed step number: 1300
Have now completed step number: 1350
Have now completed st

<span style=color:blue>Now loading into MongoDB</span>

In [None]:
# CAUTION: the first step here erases the existing db.calendar collection
db.calendar.drop()
# print(db.list_collection_names())

cal = db.calendar

# Number of documents sent to MongoDB per insert_many call
BATCH_SIZE = 100

# Insert ALL of dict_new into the calendar collection, batch by batch.
# Earlier revisions of this cell inserted through a `listings` handle that is
# never defined in this notebook, loaded only the first batch, and finished with
# a tail section written for a different dataset (dict_ljr_new / 'last_review').
# Stepping the range by BATCH_SIZE makes the final, shorter batch fall out of
# the same loop, so no special-case code for the remainder is needed.
# NOTE(review): if any 'date' inside 'dates_info' ended up as NaT, it may need
# to be replaced by None before inserting -- confirm against the data.
time1 = datetime.now()
result = None   # remains None when dict_new is empty (insert_many rejects an empty list)
for start in range(0, len(dict_new), BATCH_SIZE):
    result = cal.insert_many(dict_new[start:start + BATCH_SIZE])
    # print('Last element of result for the batch starting at', str(start), 'was:')
    # print(result.inserted_ids[-1:])

time2 = datetime.now()

if result is not None:
    print('\nThe last ObjectID in the collection is:')
    print(result.inserted_ids[-1:])

print('\nThe time to do the load of 39K documents into local mongodb, with a total of about 300MB was:')
print(util.time_diff(time1,time2))

print('\nThe total number of documents in the collection db.calendar is now:')
print(cal.count_documents({}))
