## Import Data and Connect to Database

In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase

from modules.database_population import create_nodes
from modules.database_population import create_relationships_by_property, create_relationships_by_id
from modules.database_population import create_index_by_node, create_index_by_relation

In [2]:
# Establish connection to the neo4j database.
# Connection settings are read from environment variables so real credentials never need
# to be committed with the notebook. The fallbacks are the original local-development
# values, so an unconfigured local setup behaves exactly as before.
uri = os.environ.get('NEO4J_URI', 'neo4j://localhost:7687')
auth = (os.environ.get('NEO4J_USERNAME', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'password'))
database_name = os.environ.get('NEO4J_DATABASE', 'neo4j')

# One driver is shared by every population/index helper below; it is closed in the last cell.
driver = GraphDatabase.driver(uri, auth = auth)

In [3]:
# Read the three cleaned, xz-compressed CSV extracts in a fixed order:
# listings, hosts, then the per-review rows (stored in reviewers.csv.xz).
listings, hosts, reviews = (
    pd.read_csv(csv_path, compression = 'xz')
    for csv_path in ('../data/cleaned/listings.csv.xz',
                     '../data/cleaned/hosts.csv.xz',
                     '../data/cleaned/reviewers.csv.xz')
)

# Guests are derived from the reviews: one row per distinct (reviewer_id, reviewer_name)
# pair, re-indexed from zero.
guests = (
    reviews[['reviewer_id', 'reviewer_name']]
    .drop_duplicates()
    .reset_index(drop = True)
)

## Creating Nodes

In [4]:
# Create nodes for listings
# Would pass the whole `listings` frame to create_nodes under the 'Listing' node type.
# NOTE(review): the call is left commented out, presumably so that re-running the notebook
# does not insert the Listing nodes a second time — confirm before enabling.
# create_nodes(driver, node_type = 'Listing', df = listings, database_name = database_name)

In [5]:
# Create nodes for hosts
# Would pass the whole `hosts` frame to create_nodes under the 'Host' node type.
# NOTE(review): disabled, presumably because the database is already populated — confirm.
# create_nodes(driver, node_type = 'Host', df = hosts, database_name = database_name)

In [6]:
# Create nodes for guests
# Would pass the de-duplicated `guests` frame (columns reviewer_id, reviewer_name) to
# create_nodes under the 'Guest' node type.
# NOTE(review): disabled, presumably because the database is already populated — confirm.
# create_nodes(driver, node_type = 'Guest', df = guests, database_name = database_name)

## Creating Relationships

In [7]:
# Distinct reviewer ids, as a plain Python list
unique_guests = guests['reviewer_id'].unique().tolist()

# Skeleton of the STAYS relationship spec, keyed by guest id. Every guest gets its own
# dict (not a shared one) because the 'end_nodes' entry is filled in per guest later.
# NOTE(review): the `guests` frame names this column 'reviewer_id', while the start node
# is matched on 'guest_id' — confirm the Guest nodes actually carry a 'guest_id' property.
guest_stays = {
    guest_key: {'start_node': {'type': 'Guest', 'field': 'guest_id'}}
    for guest_key in unique_guests
}

Populate the `STAYS` relationships from each guest to the listings they stayed at (744,193 relationships in total; this took about 14 hours to run).

In [8]:
# For each guest, find all of the listings they have stayed at, then create a relationship
# from that guest to that listing with the date of stay as a relationship property
#
# Each guest's entry in `guest_stays` gains an 'end_nodes' list with one dict per review row
# (listing id as 'key', review date under 'props'), and is then handed to
# create_relationships_by_id in its own call.
#
# NOTE(review): the boolean mask `reviews.reviewer_id == guest` scans the whole reviews frame
# once per guest; grouping once (e.g. `reviews.groupby('reviewer_id')`) would avoid the
# repeated scans. Whether this or the per-guest database call dominates the run time cannot
# be told from this notebook — profile before enabling.
# NOTE(review): the nested commented-out print below is leftover debugging output.

# for guest in list(guest_stays.keys()):
#     stays = reviews.loc[reviews.reviewer_id == guest][['listing_id', 'date']]
#     locations = list(stays['listing_id'])
#     dates = list(stays['date'])
#     guest_stays[guest]['end_nodes'] = [{'type': 'Listing', 'field': 'listing_id',
#                                         'key': locations[i],
#                                         'props': {'date': dates[i]}}
#                                         for i in range(len(stays.index))]
# #     print(guest_stays[guest])
#     create_relationships_by_id(driver, relationship_type = 'STAYS', id_ = guest,
#                                relation_dict = guest_stays[guest], database_name = database_name)

In [9]:
# Create the relationships from Hosts to Listing
# Would create 'OWNS' relationships from Host nodes to Listing nodes, joined on 'host_id'.
# NOTE(review): presumably matches nodes whose 'host_id' property is equal — confirm against
# create_relationships_by_property. Disabled like the other population steps.
# create_relationships_by_property(driver, relationship_type = 'OWNS',
#                                  node_from = 'Host', node_to = 'Listing',
#                                  on = 'host_id', database_name = database_name)

## Creating Indexes

Create some indexes on common properties to speed up queries.

In [10]:
# Listings are the subject of the bulk of queries, so several of their properties are indexed.
# Each entry is an (index name, Listing property) pair.
listing_indexes = [
    ('listing_host_index', 'host_id'),
    ('listing_id_index', 'listing_id'),
    ('listing_borough_index', 'borough'),
    ('listing_beds_index', 'beds'),
    ('listing_price_index', 'price'),
    ('listing_reviews_index', 'review_count'),
    ('listing_rating_index', 'review_rating'),
    ('listing_neighborhood_index', 'neighborhood'),
]

# Disabled: one create_index_by_node call per pair above.
# for name, key in listing_indexes:
#     create_index_by_node(driver, index_name = name, node_type = 'Listing',
#                          on = key, database_name = database_name)

In [11]:
# Create indexes on the ids for hosts and guests
# Would add one index on Host.host_id and one on Guest.guest_id.
# NOTE(review): the `guests` frame loaded earlier names its id column 'reviewer_id' — confirm
# that Guest nodes really expose a 'guest_id' property before enabling the second call.
# create_index_by_node(driver, index_name = 'host_id_index', node_type = 'Host',
#                      on = 'host_id', database_name = database_name)
# create_index_by_node(driver, index_name = 'guest_id_index', node_type = 'Guest',
#                      on = 'guest_id', database_name = database_name)

In [12]:
# Create index on date of guest stay for ease of searching and aggregation
# Would add a relationship index named 'date_stay_index' on the 'date' property of STAYS.
# NOTE(review): disabled like the other index steps, presumably because it has already been
# created in the target database — confirm.
# create_index_by_relation(driver, index_name = 'date_stay_index', relation_type = 'STAYS',
#                          on = 'date', database_name = database_name)

In [13]:
# Close connection
# Run this last: every population and index helper in this notebook takes `driver` as its
# first argument, so none of them should be called after the driver has been closed.
driver.close()