### <span style=color:blue> Identifying "good" listings   </span>

<span style=color:blue>This notebook starts with a simple but crude way of identify listings that are "good", based on the presence of a handful of superlative words (e.g. 'awesome', 'splendid') in a given percentage of the reviews</span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import math

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
# NOTE: I moved my util.py to the directory "helper_functions" -- seems like a better name
sys.path.append('helper_functions/')
import util

<span style=color:blue>Getting mongodb connection set up, and getting some info about the airbnb database</span>

In [2]:
from pymongo import MongoClient

client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

# I already have a database "airbnb"
db = client.airbnb

# checking collections in airbnb 
print(db.list_collection_names())

['listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'testing', 'listings_previously_built', 'listings_with_reviews', 'calendar_previously_built', 'listings']


In [3]:
print(f'Size of listings_with_reviews is {db.listings_with_reviews.count_documents({})}.')
print(f'Size of listings_with_calendar is {db.listings_with_calendar.count_documents({})}.')
print(f'Size of listings_with_reviews_and_cal is {db.listings_with_reviews_and_cal.count_documents({})}.')

Size of listings_with_reviews is 39202.
Size of listings_with_calendar is 39201.
Size of listings_with_reviews_and_cal is 39202.


<span style=color:blue>Looking at shape of typical elements in listings_with_reviews_and_cal</span>

In [4]:
pprint.pp(db.listings_with_reviews_and_cal.find_one())

{'_id': ObjectId('6651189238b2bd10b4774432'),
 'id': '51944693',
 'name': 'Home in Queens · ★4.82 · 1 bedroom · 5 beds · 1 bath',
 'host_id': '91646104',
 'host_name': 'Pao',
 'neighbourhood_group': 'Queens',
 'neighbourhood': 'Woodside',
 'latitude': 40.74395,
 'longitude': -73.90858,
 'room_type': 'Entire home/apt',
 'price': 294.0,
 'minimum_nights': 30,
 'number_of_reviews': 57,
 'last_review': datetime.datetime(2023, 9, 24, 0, 0),
 'reviews_per_month': 1.98,
 'calculated_host_listings_count': 4,
 'availability_365': 89,
 'number_of_reviews_ltm': 23,
 'license': '',
 'reviews': [{'listing_id': '51944693',
              'review_id': '883354811516703393',
              'date': datetime.datetime(2023, 5, 3, 0, 0),
              'reviewer_id': '78568329',
              'reviewer_name': 'Laaziz',
              'comments': 'Hôte très réactif et avenant. Logement très bien '
                          'desservi, idéal pour visiter New York. Toutefois il '
                          'ne faut

In [5]:
superlative_words = ['astounding',
                     'amazing',
                     'awesome',
                     'excellent',
                     'exceptional',
                     'extraordinary',
                     'fantastic',
                     'great',
                     'magnificent',
                     'splendid',
                     'wonderful']

In [39]:
super_negative_words = ['aweful',
                        'horrible',
                        'terrible']

In [7]:
condition = { '$or' : [ {'reviews.comments' : { '$regex':  '^.*awesome.*$' , '$options': 'i' } } ,
                        {'reviews.comments' : { '$regex':  '^.*amazing.*$' , '$options': 'i' } }
                      ]
            }
                        
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000117 seconds.

The time taken to create the list was 23.422925 seconds.

17121


In [9]:
def build_word_cond(word):
    dict = {}
    dict['reviews.comments'] = { '$regex' : '^.*' + word + '.*$' , '$options': 'i' } 
    return dict

pprint.pp(build_word_cond('awesome'))

{'reviews.comments': {'$regex': '^.*awesome.*$', '$options': 'i'}}


In [30]:
regex_positive_list = []
for word in superlative_words:
    regex_positive_list.append(build_word_cond(word))

pprint.pp(regex_positive_list)

print()
regex_negative_list = []
for word in super_negative_words:
    regex_negative_list.append(build_word_cond(word))

pprint.pp(regex_negative_list)

[{'reviews.comments': {'$regex': '^.*astounding.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*amazing.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*awesome.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*excellent.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*exceptional.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*extraordinary.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*fantastic.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*great.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*magnificent.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*splendid.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*wonderful.*$', '$options': 'i'}}]

[{'reviews.comments': {'$regex': '^.*aweful.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*horrible.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*terrible.*$', '$options': 'i'}}]


In [11]:
condition_pos = { '$or' : regex_positive_list }

time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition_pos)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000165 seconds.

The time taken to create the list was 41.951336 seconds.

25196


In [31]:
condition_neg = { '$or' : regex_negative_list }

time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition_neg)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000823 seconds.

The time taken to create the list was 33.521738 seconds.

1672


<span style=color:blue>Adding in a condition about dates and prices</span>

In [32]:
condition1_pos = { '$or' : regex_positive_list, 
               'last_available_date' : { '$gte' : datetime(2025,2,1,0,0,0,0)},
               'average_price' : { '$lte' : 200 }
             }

time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition1_pos)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000145 seconds.

The time taken to create the list was 13.490646 seconds.

19228


In [33]:
condition1_neg = { '$or' : regex_negative_list, 
               'last_available_date' : { '$gte' : datetime(2025,2,1,0,0,0,0)},
               'average_price' : { '$lte' : 200 }
             }

time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition1_neg)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000123 seconds.

The time taken to create the list was 16.489125 seconds.

1281


#### <span style=color:blue>Now setting up a text index on reviews.comments</span>

<span style=color:blue>Following https://www.mongodb.com/docs/manual/core/indexes/index-types/index-text/</span>

<span style=color:blue>Note: text search indexes are case (and diacritic) insensitive. </span>

In [19]:
db.listings_with_reviews_and_cal.drop_indexes()

time1 = datetime.now()
index_name = db.listings_with_reviews_and_cal.create_index( { 'reviews.comments' : 'text' } )
time2 = datetime.now()
print(f'The time taken to create the index was {util.time_diff(time1,time2)} seconds.')
print(index_name)

The time taken to create the index was 38.73205 seconds.
reviews.comments_text


In [23]:
# db.listings_with_reviews_and_cal.drop_indexes()

In [24]:
cursor = db.listings_with_reviews_and_cal.index_information()
cursor1 = db.listings_with_reviews_and_cal.list_indexes()

for i in cursor:
    print(i)
print()
for i in cursor1:
    print(i)

_id_
reviews.comments_text

SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')])
SON([('v', 2), ('key', SON([('_fts', 'text'), ('_ftsx', 1)])), ('name', 'reviews.comments_text'), ('weights', SON([('reviews.comments', 1)])), ('default_language', 'english'), ('language_override', 'language'), ('textIndexVersion', 3)])


In [25]:
# recall that in MongoDB, text indexes are case insensitive

# search for "or" of multiple words by listing them with a space between
condition_ind = { '$text' : { '$search': 'awesome amazing' } 
                }
                        
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition_ind)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.00839 seconds.

The time taken to create the list was 31.853036 seconds.

17132


In [34]:
pos_word_list_blanks = ''
for word in superlative_words:
    pos_word_list_blanks = pos_word_list_blanks + word + ' '
pos_word_list_blanks = pos_word_list_blanks[0:-1]
print(pos_word_list_blanks + 'THIS MARKS END OF STRING')

astounding amazing awesome excellent exceptional extraordinary fantastic great magnificent splendid wonderfulTHIS MARKS END OF STRING


In [36]:
neg_word_list_blanks = ''
for word in super_negative_words:
    neg_word_list_blanks = neg_word_list_blanks + word + ' '
neg_word_list_blanks = neg_word_list_blanks[0:-1]
print(neg_word_list_blanks + 'THIS MARKS END OF STRING')

aweful horrible terribleTHIS MARKS END OF STRING


In [37]:
# recall that in MongoDB, text indexes are case insensitive

# search for "or" of multiple words by listing them with a space between
condition_ind_pos = { '$text' : { '$search': pos_word_list_blanks } 
                }
                        
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition_ind_pos)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000149 seconds.

The time taken to create the list was 15.177099 seconds.

25197


In [37]:
print(db.listings_with_reviews_and_cal.count_documents({}))

39202


In [38]:
# recall that in MongoDB, text indexes are case insensitive

# search for "or" of multiple words by listing them with a space between
condition_ind_neg = { '$text' : { '$search': neg_word_list_blanks } 
                }
                        
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(condition_ind_neg)
time2 = datetime.now()
print(f'The time taken for the selection was {util.time_diff(time1,time2)} seconds.')

time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f'\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.')

# this step takes about 5.4e-05 seconds
print()
print(len(l))

The time taken for the selection was 0.000126 seconds.

The time taken to create the list was 7.209305 seconds.

1930


In [17]:
condition1 = { '$or' : regex_cond_list, 
               'last_available_date' : { '$gte' : datetime(2025,2,1,0,0,0,0)},
               'average_price' : { '$lte' : 200 }
             }


In [None]:
{'reviews.comments': {'$regex': '^.*aweful.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*horrible.*$', '$options': 'i'}},
 {'reviews.comments': {'$regex': '^.*terrible.*$', '$options': 'i'}}]

In [40]:
result = db.listings_with_reviews_and_cal.find({'reviews.comments': {'$regex': '^.*aweful.*$', '$options': 'i'}})
l = list(result)
print(len(l))
result = db.listings_with_reviews_and_cal.find({'reviews.comments': {'$regex': '^.*horrible.*$', '$options': 'i'}})
l = list(result)
print(len(l))
result = db.listings_with_reviews_and_cal.find({'reviews.comments': {'$regex': '^.*terrible.*$', '$options': 'i'}})
l = list(result)
print(len(l))

2
728
1107


In [42]:
print(neg_word_list_blanks)  awefulhorrible


aweful horrible terrible
