# Neo4j Data Import
This file will load all of the data files into memory and query the Neo4j Database to populate the database with information. **Note: You must adjust the settings of the Neo4j database to have 3G of heap space.** 
This file will take a few hours to populate the database entirely.

In [None]:
# Install package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install neo4j-driver
!{sys.executable} -m pip install tqdm

# imports
from neo4j import GraphDatabase, basic_auth
from tqdm import tqdm
import json
import csv

In [None]:
# Connect to DB
#
# The connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so credentials do not have to live in
# the notebook. The fallbacks are the values this notebook was written with.
import os

driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=basic_auth(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "915997582"),
    ),
)
session = driver.session()

# Connectivity probe that always yields exactly one row. Probing with
# `MATCH (n) ... LIMIT 1` yields no rows on an empty database, so nothing
# would be printed even though the connection works.
cypher_query = "RETURN 1 AS ok"

results = session.run(cypher_query,
  parameters={})

for x in results:
    print('Verified')

In [None]:
# clean the DB
# These are plain strings on purpose: the literal `{}` arguments of
# apoc.schema.assert are empty replacement fields inside an f-string, which is
# a SyntaxError and stops the whole cell from running.
query = """
MATCH (n)
DETACH DELETE n
"""
session.run(query)

# clean out indexes
# apoc.schema.assert({}, {}, true) is called with empty index and constraint
# maps and dropExisting=true.
query = """
CALL apoc.schema.assert({},{},true) YIELD label, key
RETURN *
"""
session.run(query)

In [None]:
# Read every source data file into memory.


def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# populated systems
data = _read_json('../data/systems_populated.json')

# commodities
commodities_data = _read_json('../data/commodities.json')

# listings, grouped by the id of the station each row belongs to
station_listings = {}
with open('../data/listings.csv') as listings_file:
    for listing in csv.DictReader(listings_file):
        station_listings.setdefault(listing['station_id'], []).append(listing)

# stations
stations_data = _read_json('../data/stations.json')

# factions
factions_data = _read_json('../data/factions.json')

# modules
modules_data = _read_json('../data/modules.json')

## Module Nodes
Create the module nodes

In [None]:
#Create module nodes
# Values are sent as query parameters instead of being spliced into the Cypher
# text: an ed_symbol containing a quote or backslash can no longer break the
# statement, and the same query text is reused for every module.
query = """
        MERGE (:Module{id: $id, ed_symbol: $ed_symbol})
        """
for module in tqdm(modules_data, desc="Importing module nodes"):
    session.run(query, parameters={
        'id': module['id'],
        # str() mirrors the earlier string interpolation of this value, so the
        # stored property is the same text as before.
        'ed_symbol': str(module['ed_symbol']),
    })
#Create an index on module ids so they are easy to link with the systems
index = """CREATE INDEX module_id FOR (m:Module) ON (m.id)"""
session.run(index)

## Faction Nodes
Creates the faction nodes

In [None]:
#Create factions nodes
# Values are sent as query parameters instead of being spliced into the Cypher
# text, so special characters in a name (e.g. a backslash) cannot break the
# statement and the same query text is reused for every faction.
query = """
        MERGE (:Faction{id: $id, name: $name})
        """
for faction in tqdm(factions_data, desc="Importing faction nodes"):
    # Apostrophes are still stripped so the stored names stay identical to
    # what this import has always written.
    faction['name'] = faction['name'].replace("'", "")
    session.run(query, parameters={'id': faction['id'], 'name': faction['name']})
#Create index for the faction id so linking factions below is faster
index = """CREATE INDEX faction_id FOR (f:Faction) ON (f.id)"""
session.run(index)

## System and Station Nodes
The following block loads all of the System and Station nodes. Additionally, it creates a relationship between a Faction and each System that it controls, and a relationship between each System and the Stations it contains.

In [None]:
def create_service_list(station):
    """Return the names of the services that ``station`` offers.

    ``station`` is a mapping that must contain a ``has_<service>`` entry for
    each service checked below. A service name is included in the result when
    its entry is truthy; the result keeps the order in which the services are
    listed here.
    """
    service_names = (
        'blackmarket',
        'market',
        'refuel',
        'repair',
        'rearm',
        'outfitting',
        'shipyard',
        'docking',
        'commodities',
    )
    return [name for name in service_names if station['has_' + name]]

In [None]:
# Import all system nodes with positional information, and create a CONTROLS
# relationship from the controlling faction to the system when there is one.
#
# Values are sent as query parameters instead of being spliced into the Cypher
# text, so special characters cannot break a statement and each query text is
# reused for every row.
create_system = """
            MERGE (s:System{id: $id, name: $name, x: $x, y: $y, z: $z,
            allegiance: $allegiance, population: $population, needs_permit: $needs_permit,
            power_state: $power_state })
            """
link_controlling_faction = """WITH s
            MATCH (f:Faction {id: $faction_id})
            MERGE (f)-[:CONTROLS]->(s)"""

for x in tqdm(data, desc="Importing all system nodes"):
    # Apostrophes are still stripped so stored names match earlier imports.
    x['name'] = x['name'].replace("'", "")
    faction_id = x['controlling_minor_faction_id']
    params = {
        'id': x['id'],
        'name': x['name'],
        'x': x['x'],
        'y': x['y'],
        'z': x['z'],
        # str() mirrors the earlier string interpolation of these two fields,
        # so a missing (None) value is still stored as the text 'None'.
        'allegiance': str(x['allegiance']),
        'population': x['population'],
        'needs_permit': x['needs_permit'],
        'power_state': str(x['power_state']),
    }
    query = create_system
    if faction_id:
        query = query + link_controlling_faction
        params['faction_id'] = faction_id
    session.run(query,
      parameters=params)
#Create index for the system id so linking systems is faster
index = """CREATE INDEX system_id FOR (s:System) ON(s.id)"""
session.run(index)

#Import all station nodes and connect them to their system
create_station = """
             MERGE (s:Station{id: $id, name: $name, services: $services})"""
link_containing_system = """
             WITH s
             MATCH (sys:System{id: $system_id})
             MERGE (sys)-[:CONTAINS]->(s)"""

for station in tqdm(stations_data, desc="Importing all station nodes"):
    station['name'] = station['name'].replace("'", "")
    system_id = station['system_id']
    services = create_service_list(station)
    params = {'id': station['id'], 'name': station['name'], 'services': services}
    query = create_station
    if system_id:
        query = query + link_containing_system
        params['system_id'] = system_id
    session.run(query, parameters=params)
# Create index for the station id so linking stations is faster
index = """CREATE INDEX station_id FOR (s:Station) ON(s.id)"""
session.run(index)

## Modules sold at Stations Relationship
The query below imports all the module-SOLD_AT->station relationships. The 113 module listings chunk files must be in the import folder of the database. The chunk files were needed to reduce the import time from days to just a few minutes. **Note: The database must not already contain any SOLD_AT relationships, otherwise this query will create duplicates.** 

In [None]:
# The module listings are split across 113 chunk files for import.
def _module_chunk_query(i):
    """Return the LOAD CSV statement for module listing chunk number ``i``.

    For every row, the statement matches the Station by id, splits the
    bracketed ``modules`` column into individual ids and creates one SOLD_AT
    relationship from each matching Module to that Station.
    """
    return f"""LOAD CSV WITH HEADERS FROM 'file:///module_listings/modules_listing{i}.csv' as row
                MATCH (s:Station {{id: toInteger(row.station_id)}})
                UNWIND split(replace(replace(row.modules, "[", ""), "]", ""), ",") as module_id
                WITH s, module_id
                MATCH (m: Module {{id: toInteger(lTrim(module_id))}})
                CREATE (m)-[:SOLD_AT]->(s)"""


num_chunks = 113
for chunk_number in tqdm(range(num_chunks), desc="Importing module relationships"):
    session.run(_module_chunk_query(chunk_number))

## Commodity Nodes
Create the commodity nodes

In [None]:
#Import all commodity nodes
# Each commodity is merged on its id and the remaining properties are applied
# with SET. Putting a null price inside the MERGE pattern is rejected by
# Neo4j, so missing/zero prices are left off the node instead. Values are sent
# as query parameters rather than spliced into the Cypher text.
price_fields = (
    'average_price',
    'max_buy_price',
    'max_sell_price',
    'min_buy_price',
    'min_sell_price',
    'buy_price_lower_average',
    'sell_price_upper_average',
)
query = """
         MERGE (c:Commodity{id: $id})
         SET c += $props
         """
for commodity in tqdm(commodities_data, desc="Importing all commodity nodes"):
    # Apostrophes are still stripped so stored names match earlier imports.
    commodity['name'] = commodity['name'].replace("'", "")
    props = {'name': commodity['name']}
    # Falsy prices (None or 0) are omitted, matching the earlier
    # `value or "null"` handling.
    props.update(
        (field, commodity[field]) for field in price_fields if commodity[field]
    )
    session.run(query, parameters={'id': commodity['id'], 'props': props})
# Create the index on commodity ids. This must run `index`; running `query`
# here would re-send the last MERGE and never create the index.
index = """CREATE INDEX commodity_id FOR (c:Commodity) ON (c.id)"""
session.run(index)

## Buy/Sell Listings Data
In order for the buy and sell listings to be populated, ensure the files listings_stripped0.csv through listings_stripped15.csv are placed in the Neo4j import folder for the database. The files needed to be stripped, otherwise Neo4j would run out of memory when trying to do the import. Running one query per listing was also very slow, which is why loading from CSV was the preferred choice.

In [None]:
#Create all the buy/sell relations
# The chunk files are numbered 0..num_file_chunks inclusive, hence the +1.
num_file_chunks = 15

# SELL_AT: one relationship per listing row with positive demand.
# The query string is closed before session.run so the statement is actually
# executed for each chunk.
for i in tqdm(range(num_file_chunks+1), desc="Creating listings with sell relationships for each commodity"):
    query = f"""LOAD CSV WITH HEADERS FROM 'file:///listing_chunks/listings_stripped{i}.csv' AS row
            MATCH (s:Station {{id: toInteger(row.station_id)}}), (c:Commodity {{id: toInteger(row.commodity_id)}})
            WHERE toInteger(row.demand) > 0
            MERGE (c)-[:SELL_AT {{sell_price: toInteger(row.sell_price), demand: toInteger(row.demand)}}]->(s)"""
    session.run(query)

# BUY_AT: one relationship per listing row with positive supply.
for i in tqdm(range(num_file_chunks+1), desc="Creating listings with buy relationships for each commodity"):
    query = f"""LOAD CSV WITH HEADERS FROM 'file:///listing_chunks/listings_stripped{i}.csv' AS row
            MATCH (s:Station {{id: toInteger(row.station_id)}}), (c:Commodity {{id: toInteger(row.commodity_id)}})
            WHERE toInteger(row.supply) > 0
            MERGE (c)-[:BUY_AT {{buy_price: toInteger(row.buy_price), supply: toInteger(row.supply)}}]->(s)"""
    session.run(query)

## System Connecting Relationship
The code below connects the systems to each other and records the distance between them. Two systems are connected only if the distance between them is less than 42 light years, because the best explorer ship has a maximum jump range of 42 light years.

In [None]:
#Connect the systems whose distance from each other is < 42 lightyears.
#42 is the max jump range for an upgraded Diamondback Explorer(best Explorer).
#(For comparison, 8 is the max range on a sidewinder(starting vessel).)
MAX_JUMP_RANGE_LY = 42

#Queries need to be chunked otherwise the heap will run out of memory and the query will fail / freeze the database
# One query is run per system. The system id and the range are sent as query
# parameters, so the same query text is reused on every iteration.
# NOTE(review): x1/x2, y1/y2 and z1/z2 are swapped when a pair is visited from
# its other system, so the second visit appears to create a second DISTANCE
# relationship for the pair - confirm this is intended.
query = """
        MATCH (s1:System {id: $id}), (s2:System)
        WHERE s1.id <> s2.id
        WITH s1, s2, (s1.x - s2.x)^2 + (s1.y - s2.y)^2 + (s1.z - s2.z)^2 as disSquared
        WHERE disSquared < $max_range ^ 2
        MERGE (s1)<-[:DISTANCE{lightYears: sqrt(disSquared), x1: s1.x, x2: s2.x, y1: s1.y, y2: s2.y, z1: s1.z, z2: s2.z}]->(s2)
        """
for system in tqdm(data, desc="Connecting Systems..."):
    session.run(query, parameters={'id': system['id'], 'max_range': MAX_JUMP_RANGE_LY})

In [None]:
# # Hard reset

# for x in tqdm(data, desc="Deleting systems"):
#     name = x['name'].replace("'", "")
#     query = """
#     MATCH (s:System)-[d:DISTANCE]-()
#     WHERE s.name = '{name}'
#     DETACH DELETE s
#     """.format(name=name)
#     session.run(query,
#       parameters={})