### <span style=color:blue> Loading data from postgresql into mongodb (local)    </span>

In [23]:
import sys
import json
import csv
import yaml

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

import time
import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('helper_functions/')
import util as util

In [2]:
# test that utils.py has been imported well
util.hello_world()

hello world


<span style=color:blue>Getting PostgreSQL connection set up</span>

In [3]:
# Load the env file 

dotenv_path = 'env_variables.env'
load_dotenv(dotenv_path=dotenv_path)

# Import the env variables

load_dotenv()

schema = os.getenv('DISC_6_SCHEMA')
port = os.getenv('DISC_6_PORT')
host = os.getenv('DISC_6_HOST')
database = os.getenv('DISC_6_DB')
password = os.getenv('DISC_6_PASSWORD')
connection = os.getenv('DISC_6_CONNECTION')

# Create the db engine 

db_eng = create_engine(f"postgresql+psycopg2://{connection}:{password}@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


<span style=color:blue>Getting mongodb connection set up</span>

In [4]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

<span style=color:blue>Setting up collection "listings" in mongodb</span>

In [7]:
# I have (or will have) a database "airbnb"
db = client.airbnb

# inside the "airbnb" database, I have (or will have) a collection "listings"
listings = db.listings

<span style=color:blue>Installing all indexes I can think of</span>

In [28]:
testing = db.testing

dict = {'date': datetime.datetime(2023, 1, 1,0,0,0)}

ts = dict['date'].timestamp()
print(ts)
db.testing.insert_one({'date': datetime.datetime.fromtimestamp(ts)})

1672560000.0


InsertOneResult(ObjectId('664987068b80b91420d2b513'), acknowledged=True)

### <span style=color:blue>As preparation for this, I have a table reviewm (for review_mongodb) in which I dropped the comments_tsv column (because not needed) and renamed column "id" to "review_id" (so that it is not repeating the "id" column of the listings table</span>

In [6]:
q = util.build_query_full_join_listings_reviewsm_100()

with db_eng.connect() as conn:
    df_ljr100 = pd.read_sql(q, con=conn)

print(df_ljr100.head())


         id                                               name   host_id  \
0  10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...  51688993   
1  10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...  51688993   
2  10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...  51688993   
3  10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...  51688993   
4  10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...  51688993   

  host_name neighbourhood_group neighbourhood  latitude  longitude  \
0     Taran              Queens       Jamaica  40.68826  -73.80384   
1     Taran              Queens       Jamaica  40.68826  -73.80384   
2     Taran              Queens       Jamaica  40.68826  -73.80384   
3     Taran              Queens       Jamaica  40.68826  -73.80384   
4     Taran              Queens       Jamaica  40.68826  -73.80384   

      room_type  price  ...  availability_365  number_of_reviews_ltm license  \
0  Private room  200.0  ...               

In [8]:
cols = df_ljr100.columns.tolist()
print(cols)

['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license', 'listing_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments', 'datetime']


In [9]:
# to do a projection and remove duplicates
cols_of_listings = ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 
                    'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 
                    'minimum_nights', 'number_of_reviews', 'last_review', 
                    'reviews_per_month', 'calculated_host_listings_count', 
                    'availability_365', 'number_of_reviews_ltm', 'license']
cols_of_reviews = ['listing_id', 'review_id', 'date', 'reviewer_id', 
                   'reviewer_name', 'comments', 'datetime']

df_ljr100_left = df_ljr100.drop(cols_of_reviews, axis=1).drop_duplicates()


print(df_ljr100_left.shape)
print(df_ljr100_left.head(10))

(169, 18)
                      id                                               name  \
0               10073940  Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1...   
91   1004466409355496588  Bed and breakfast in Queens · ★4.61 · 1 bedroo...   
92   1001759466923816441  Rental unit in Queens · 1 bedroom · 2 beds · 1...   
93   1006068540323632978  Home in Queens · ★4.93 · 1 bedroom · 1 bed · 1...   
94   1005207590507549831  Rental unit in Queens · ★4.85 · 1 bedroom · 1 ...   
108  1005170940682176047  Home in Queens · ★4.75 · 1 bedroom · 1 bed · 1...   
120  1005215166382279445  Hotel in Queens · 1 bedroom · 1 bed · 1 privat...   
121  1005236888864691555  Hotel in Queens · ★3.80 · 1 bedroom · 1 bed · ...   
123  1001730138556048837  Home in Queens · ★5.0 · 3 bedrooms · 3 beds · ...   
128  1002217051026941997  Hotel in Queens · ★4.25 · 1 bedroom · 1 bed · ...   

       host_id     host_name neighbourhood_group     neighbourhood   latitude  \
0     51688993         Taran           

In [10]:
dict_ljr100_left = df_ljr100_left.to_dict('records')
print(len(dict_ljr100_left))
pprint.pp(dict_ljr100_left)

169
[{'id': '10073940',
  'name': 'Home in Queens · ★4.77 · 1 bedroom · 1 bed · 1 private bath',
  'host_id': '51688993',
  'host_name': 'Taran',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'Jamaica',
  'latitude': 40.68826,
  'longitude': -73.80384,
  'room_type': 'Private room',
  'price': 200.0,
  'minimum_nights': 30,
  'number_of_reviews': 184,
  'last_review': datetime.date(2023, 9, 17),
  'reviews_per_month': 1.95,
  'calculated_host_listings_count': 1,
  'availability_365': 365,
  'number_of_reviews_ltm': 18,
  'license': ''},
 {'id': '1004466409355496588',
  'name': 'Bed and breakfast in Queens · ★4.61 · 1 bedroom · 1 bed · 1 bath',
  'host_id': '539071710',
  'host_name': 'Cesar',
  'neighbourhood_group': 'Queens',
  'neighbourhood': 'East Elmhurst',
  'latitude': 40.76701,
  'longitude': -73.86704,
  'room_type': 'Private room',
  'price': 105.0,
  'minimum_nights': 1,
  'number_of_reviews': 23,
  'last_review': datetime.date(2024, 1, 30),
  'reviews_per_month': 6.

In [13]:
for d in dict_ljr100_left:

# selection of df that has df['id'] = d['id']

# selection by columnm value:
#       df.loc[df['column_name'] == some_value]
    dicts_reviews_one_listing = df_ljr100.loc[df_ljr100['id'] == d['id']].drop(cols_of_listings, axis=1).to_dict('records')

    print('The length of dicts_reviews_one_listing for', d['id'], 'is:', len(dicts_reviews_one_listing))

    # dict3 = df3.to_dict('records')
    # print(dict3[0:10])

    d['reviews'] = dicts_reviews_one_listing

pprint.pp(dict_ljr100_left)


    

The length of dicts_reviews_one_listing for 10073940 is: 184
The length of dicts_reviews_one_listing for 1004466409355496588 is: 23
The length of dicts_reviews_one_listing for 1001759466923816441 is: 2
The length of dicts_reviews_one_listing for 1006068540323632978 is: 14
The length of dicts_reviews_one_listing for 1005207590507549831 is: 13
The length of dicts_reviews_one_listing for 1005170940682176047 is: 8
The length of dicts_reviews_one_listing for 1005215166382279445 is: 2
The length of dicts_reviews_one_listing for 1005236888864691555 is: 5
The length of dicts_reviews_one_listing for 1001730138556048837 is: 9
The length of dicts_reviews_one_listing for 1002217051026941997 is: 4
The length of dicts_reviews_one_listing for 1006526346791543478 is: 10
The length of dicts_reviews_one_listing for 1006786464665372746 is: 13
The length of dicts_reviews_one_listing for 1008761034690239341 is: 18
The length of dicts_reviews_one_listing for 1008301599260572123 is: 15
The length of dicts_re

<span style=color:blue> Doing a bulk insert    </span>

In [24]:
foo = datetime.date(2023, 6, 10)
print(foo)
goo = foo.replace(microsecond=0)
print(goo)

2023-06-10


TypeError: 'microsecond' is an invalid keyword argument for replace()

In [15]:
result = listings.insert_many(dict_ljr100_left)
print(result.inserted_ids)

InvalidDocument: cannot encode object: datetime.date(2023, 9, 17), of type: <class 'datetime.date'>