# Imports

In [1]:
import shutil
from glob import glob

from neo4j import GraphDatabase
import pandas as pd

# Config

In [2]:
# Directory the CSV files are copied into. The generated LOAD CSV statements
# refer to the copies by bare file name ('file:///<name>').
DB_PATH = '/media/mateusz-destroyer/sdb3/dev_sandbox/dev_sandbox_neo4j_import'

# Source directories of the exported CSV data.
DDL_COMP_DETAILS = './data/company_details/ddl'
DDL_NEWS_PRICE = './data/news/ddl'
DDL_STOCK_PRICE = './data/stock_price/ddl'
DDL_PATH_files = './data/_file_transforms'

# Node label -> [source directory, CSV name].
NODES_FILES = {
    # Company details
    'Industry': [DDL_COMP_DETAILS, 'industry_ddl.csv'],
    'Sector': [DDL_COMP_DETAILS, 'sector_ddl.csv'],
    'City': [DDL_COMP_DETAILS, 'city_ddl.csv'],
    'State': [DDL_COMP_DETAILS, 'state_ddl.csv'],
    'Country': [DDL_COMP_DETAILS, 'country_ddl.csv'],
    'Company': [DDL_COMP_DETAILS, 'comp_details_ddl.csv'],
    'Shareholder': [DDL_COMP_DETAILS, 'shareholders_ddl.csv'],
    # News and stock prices
    'Publisher': [DDL_NEWS_PRICE, 'publisher_ddl.csv'],
    'News': [DDL_NEWS_PRICE, 'news_ddl.csv'],
    'Date': [DDL_NEWS_PRICE, 'date_ddl.csv'],
    'Stock': [DDL_STOCK_PRICE, 'stock_ddl.csv'],
    # Files / transforms / apps
    'File': [DDL_PATH_files, 'files_ddl.csv'],
    'Transform': [DDL_PATH_files, 'transforms_ddl.csv'],
    'App': [DDL_PATH_files, 'apps_ddl.csv'],
}

# Relationship key -> [source directory, CSV name].
# 'IS_IN_2' is a second CSV for the IS_IN relationship type; dict keys must be
# unique, so it carries a suffix that is mapped back to 'IS_IN' at load time.
RELATIONS_FILES = {
    # Company details
    'IN_INDUSTRY': [DDL_COMP_DETAILS, 'company_IN_INDUSTRY_industry_ddl.csv'],
    'MEMBER_OF': [DDL_COMP_DETAILS, 'industry_MEMBER_OF_sector_ddl.csv'],
    'LOCALIZED_IN': [DDL_COMP_DETAILS, 'company_LOCALIZED_IN_city_ddl.csv'],
    'IS_IN': [DDL_COMP_DETAILS, 'city_IS_IN_state_ddl.csv'],
    'IS_IN_2': [DDL_COMP_DETAILS, 'city_IS_IN_country_ddl.csv'],
    'PART_OF': [DDL_COMP_DETAILS, 'state_PART_OF_country_ddl.csv'],
    'HAS_SHARES_IN': [DDL_COMP_DETAILS, 'shareholder_HAS_SHARES_IN_company_ddl.csv'],
    # News and stock prices
    'IS_VALUED_FOR': [DDL_STOCK_PRICE, 'stock_IS_VALUED_FOR_company_ddl.csv'],
    'PUBLISHES': [DDL_NEWS_PRICE, 'publisher_PUBLISHES_news_ddl.csv'],
    'CONCERNS': [DDL_NEWS_PRICE, 'news_CONCERNS_company_ddl.csv'],
    'IS_ISSUED_ON': [DDL_NEWS_PRICE, 'news_IS_ISSUED_ON_date_ddl.csv'],
    'IS_VALUED_ON': [DDL_STOCK_PRICE, 'stock_IS_VALUED_ON_date_ddl.csv'],
    # Files / transforms / apps
    'INPUT': [DDL_PATH_files, 'file_INPUT_transform_ddl.csv'],
    'OUTPUT': [DDL_PATH_files, 'file_OUTPUT_transform_ddl.csv'],
    'METADATA': [DDL_PATH_files, 'file_METADATA_transform_ddl.csv'],
    'STAGE_OF': [DDL_PATH_files, 'transform_STAGE_OF_app_ddl.csv'],
}

In [3]:
# Display the relationship key -> [source directory, CSV name] mapping as a sanity check.
RELATIONS_FILES

{'IN_INDUSTRY': ['./data/company_details/ddl',
  'company_IN_INDUSTRY_industry_ddl.csv'],
 'MEMBER_OF': ['./data/company_details/ddl',
  'industry_MEMBER_OF_sector_ddl.csv'],
 'LOCALIZED_IN': ['./data/company_details/ddl',
  'company_LOCALIZED_IN_city_ddl.csv'],
 'IS_IN': ['./data/company_details/ddl', 'city_IS_IN_state_ddl.csv'],
 'IS_IN_2': ['./data/company_details/ddl', 'city_IS_IN_country_ddl.csv'],
 'PART_OF': ['./data/company_details/ddl', 'state_PART_OF_country_ddl.csv'],
 'HAS_SHARES_IN': ['./data/company_details/ddl',
  'shareholder_HAS_SHARES_IN_company_ddl.csv'],
 'IS_VALUED_FOR': ['./data/stock_price/ddl',
  'stock_IS_VALUED_FOR_company_ddl.csv'],
 'PUBLISHES': ['./data/news/ddl', 'publisher_PUBLISHES_news_ddl.csv'],
 'CONCERNS': ['./data/news/ddl', 'news_CONCERNS_company_ddl.csv'],
 'IS_ISSUED_ON': ['./data/news/ddl', 'news_IS_ISSUED_ON_date_ddl.csv'],
 'IS_VALUED_ON': ['./data/stock_price/ddl', 'stock_IS_VALUED_ON_date_ddl.csv'],
 'INPUT': ['./data/_file_transforms', 'fil

In [4]:
# Display the node label -> [source directory, CSV name] mapping as a sanity check.
NODES_FILES

{'Industry': ['./data/company_details/ddl', 'industry_ddl.csv'],
 'Sector': ['./data/company_details/ddl', 'sector_ddl.csv'],
 'City': ['./data/company_details/ddl', 'city_ddl.csv'],
 'State': ['./data/company_details/ddl', 'state_ddl.csv'],
 'Country': ['./data/company_details/ddl', 'country_ddl.csv'],
 'Company': ['./data/company_details/ddl', 'comp_details_ddl.csv'],
 'Shareholder': ['./data/company_details/ddl', 'shareholders_ddl.csv'],
 'Publisher': ['./data/news/ddl', 'publisher_ddl.csv'],
 'News': ['./data/news/ddl', 'news_ddl.csv'],
 'Date': ['./data/news/ddl', 'date_ddl.csv'],
 'Stock': ['./data/stock_price/ddl', 'stock_ddl.csv'],
 'File': ['./data/_file_transforms', 'files_ddl.csv'],
 'Transform': ['./data/_file_transforms', 'transforms_ddl.csv'],
 'App': ['./data/_file_transforms', 'apps_ddl.csv']}

# Ingest

In [5]:
# Copy one CSV per node label into DB_PATH under the flat name from NODES_FILES.
# Each source "<name>.csv" is a directory holding the CSV file(s) to copy from;
# only a single file per entry is copied.
for key in NODES_FILES:
    source_dir, file_name = NODES_FILES[key]

    # glob() returns matches in arbitrary order; sorting makes the choice
    # of file reproducible between runs.
    candidates = sorted(glob(f'{source_dir}/{file_name}/*.csv'))
    if not candidates:
        # Name the offending path instead of failing with a bare IndexError.
        raise FileNotFoundError(
            f"No CSV file found for node '{key}' in {source_dir}/{file_name}/"
        )

    shutil.copyfile(candidates[0], f'{DB_PATH}/{file_name}')

In [6]:
# Copy one CSV per relationship key into DB_PATH under the flat name from
# RELATIONS_FILES. Each source "<name>.csv" is a directory holding the CSV
# file(s) to copy from; only a single file per entry is copied.
for key in RELATIONS_FILES:
    print(key)
    source_dir, file_name = RELATIONS_FILES[key]

    # glob() returns matches in arbitrary order; sorting makes the choice
    # of file reproducible between runs.
    candidates = sorted(glob(f'{source_dir}/{file_name}/*.csv'))
    if not candidates:
        # Name the offending path instead of failing with a bare IndexError.
        raise FileNotFoundError(
            f"No CSV file found for relation '{key}' in {source_dir}/{file_name}/"
        )

    shutil.copyfile(candidates[0], f'{DB_PATH}/{file_name}')

IN_INDUSTRY
MEMBER_OF
LOCALIZED_IN
IS_IN
IS_IN_2
PART_OF
HAS_SHARES_IN
IS_VALUED_FOR
PUBLISHES
CONCERNS
IS_ISSUED_ON
IS_VALUED_ON
INPUT
OUTPUT
METADATA
STAGE_OF


# Load CSV

In [7]:
import csv

class QueryBuilder:
    """Builds Cypher statements for a single CSV file.

    The header row of the CSV is read once at construction time; the column
    names determine the properties and identifier columns that appear in the
    generated statements.
    """

    def __init__(self, DB_PATH, class_name, file_name):
        """
        Args:
            DB_PATH: directory that contains ``file_name``.
            class_name: node label (node/index statements) or relationship
                type (``create_relation``).
            file_name: name of the CSV file inside ``DB_PATH``; also used
                verbatim in the ``file:///`` URL of LOAD CSV statements.
        """
        self._DB_PATH = DB_PATH
        self._class_name = class_name
        self._file_name = file_name

        self._load_fieldnames()

    def _load_fieldnames(self):
        """Read the CSV header row into ``self._fieldnames`` (a list)."""
        path = f'{self._DB_PATH}/{self._file_name}'
        # The csv module asks for files to be opened with newline=''.
        with open(path, 'r', newline='') as file:
            reader = csv.DictReader(file, delimiter=',')
            # DictReader.fieldnames is None for an empty file; keep a list so
            # later checks can report the problem clearly.
            self._fieldnames = list(reader.fieldnames or [])

    def _require_fields(self, count):
        """Return the header columns, raising ValueError if fewer than ``count`` exist."""
        fields = self._fieldnames or []
        if len(fields) < count:
            raise ValueError(
                f"'{self._file_name}' needs at least {count} header column(s), "
                f"found {len(fields)}"
            )
        return fields

    @staticmethod
    def _label_from_id(column):
        """Derive a node label from an id column, e.g. ``city_id`` -> ``City``."""
        # partition() keeps the whole name when it has no underscore; slicing
        # with find() == -1 would silently drop the last character.
        return column.partition('_')[0].capitalize()

    def create_node(self):
        """Return a LOAD CSV + MERGE statement using every header column as a property.

        e.g.
        LOAD CSV WITH HEADERS FROM 'file:///cities_ddl.csv' AS row
        MERGE (c:City {city_id: row.city_id, city_name: row.city_name})

        Raises:
            ValueError: if the CSV has no header columns.
        """
        fields = self._require_fields(1)
        properties = ", ".join(f"{field}: row.{field}" for field in fields)

        return (
            f"LOAD CSV WITH HEADERS FROM 'file:///{self._file_name}' AS row "
            f"MERGE (c:{self._class_name} {{{properties}}})"
        )

    def create_index(self):
        """Return a CREATE INDEX statement on the first header column.

        Raises:
            ValueError: if the CSV has no header columns.
        """
        # NOTE(review): "CREATE INDEX ON :Label(prop)" is the older index
        # syntax -- confirm the target Neo4j version still accepts it.
        fieldname = self._require_fields(1)[0]

        return f"CREATE INDEX ON :{self._class_name}({fieldname})"

    def create_relation(self):
        """Return a LOAD CSV + MATCH + CREATE statement for a relationship file.

        The first two header columns identify the source and destination
        nodes; each node label is the capitalised column-name prefix before
        the first underscore.

        e.g.
        LOAD CSV WITH HEADERS FROM 'file:///city_LOCATED_IN_country_ddl.csv' AS row
        MATCH (c1:City {city_id: row.city_id}), (c2:Country {country_id: row.country_id})
        CREATE (c1)-[:LOCATED_IN]->(c2)

        Raises:
            ValueError: if the CSV has fewer than two header columns.
        """
        fields = self._require_fields(2)
        src_id, dest_id = fields[0], fields[1]
        src = self._label_from_id(src_id)
        dest = self._label_from_id(dest_id)

        return (
            f"LOAD CSV WITH HEADERS FROM 'file:///{self._file_name}' AS row "
            f"MATCH (c1:{src}{{{src_id}: row.{src_id}}}), "
            f"(c2:{dest} {{{dest_id}: row.{dest_id}}}) "
            f"CREATE (c1)-[:{self._class_name}]->(c2)"
        )

    def drop_index(self):
        """Return a DROP INDEX statement on the first header column.

        Raises:
            ValueError: if the CSV has no header columns.
        """
        fieldname = self._require_fields(1)[0]

        return f"DROP INDEX ON :{self._class_name}({fieldname})"

    @staticmethod
    def drop_relations():
        """Return the statement that deletes every relationship."""
        return "MATCH (n)-[r]-() DELETE r"

    @staticmethod
    def drop_nodes():
        """Return the statement that deletes every node (relationships must be gone first)."""
        return "MATCH (n) DELETE n"

In [8]:
class Neo4jConnector:
    """Thin wrapper around a Neo4j driver that runs one statement per write transaction."""

    def __init__(self, uri, user, password):
        """Create the driver for ``uri``, authenticating with ``(user, password)``."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def execute_query(self, query):
        """Run ``query`` in a write transaction and print the record it yields.

        Nothing is returned; the record (or ``None``) is only printed.
        """
        with self.driver.session() as db_session:
            record = db_session.write_transaction(self._query, query)
            print(record)

    @staticmethod
    def _query(tx, query):
        """Transaction function: run ``query`` on ``tx`` and return ``single()`` of the result."""
        return tx.run(query).single()


In [9]:

    
def queries_create():
    """Return the load statements: one per node CSV, then one per relationship CSV.

    The 'IS_IN_2' key of RELATIONS_FILES is loaded as relationship type 'IS_IN'.
    """
    node_queries = [
        QueryBuilder(DB_PATH, label, location[1]).create_node()
        for label, location in NODES_FILES.items()
    ]

    relation_queries = []
    for rel_key, location in RELATIONS_FILES.items():
        # 'IS_IN_2' only exists because dict keys must be unique.
        rel_type = 'IS_IN' if rel_key == 'IS_IN_2' else rel_key
        builder = QueryBuilder(DB_PATH, rel_type, location[1])
        relation_queries.append(builder.create_relation())

    return node_queries + relation_queries

def queries_drop():
    """Return the statements that empty the graph: relationships first, then nodes.

    Indexes are not dropped.
    """
    # Both statements come from static methods, so no QueryBuilder instance is
    # created: instantiating one opens a CSV file, which would make clearing
    # the graph fail whenever a node CSV is missing or NODES_FILES is empty.
    return [QueryBuilder.drop_relations(), QueryBuilder.drop_nodes()]

def refactor_graph():
    """Return the post-load statements that reshape the graph.

    The first three statements add an extra label ('Dataset', 'Exceptions',
    'Schema') to every File node whose ``type`` property equals that value,
    via ``apoc.create.addLabels``. The last one creates a NEXT relationship
    between two different App nodes when a Transform of the first has an
    OUTPUT Dataset that is the INPUT of a Transform of the second.
    """
    add_label_template = (
        "\n"
        "    MATCH (n:File) \n"
        "    WHERE n.type='{label}' \n"
        "    WITH n\n"
        "    CALL apoc.create.addLabels(n, ['{label}'])\n"
        "    YIELD node\n"
        "    RETURN count(node)\n"
        "    "
    )
    statements = [
        add_label_template.format(label=file_type)
        for file_type in ('Dataset', 'Exceptions', 'Schema')
    ]

    statements.append(
        "\n"
        "    MATCH (n1:App)-[r1:STAGE_OF]-(n2:Transform)-[r2:OUTPUT]-(n3:Dataset)"
        "-[r3:INPUT]-(n4:Transform)-[r4:STAGE_OF]-(n5:App)\n"
        "    WHERE n1 <> n5\n"
        "    WITH n1, n5\n"
        "    CREATE (n1)-[:NEXT]->(n5)\n"
        "    "
    )

    return statements

def query(queries):
    """Execute every statement produced by ``queries`` against Neo4j.

    Args:
        queries: zero-argument callable returning an iterable of Cypher
            strings (e.g. ``queries_drop``). It is called after the
            connection has been created.

    Connection settings default to the local development instance and can be
    overridden with the NEO4J_URI, NEO4J_USER and NEO4J_PASSWORD environment
    variables, so real credentials need not be stored in the notebook.
    """
    import os  # local import: only this function needs it

    neo4j = Neo4jConnector(
        os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "passwd"),
    )
    try:
        for statement in queries():
            neo4j.execute_query(statement)
    finally:
        # Close the driver even when building or running a statement fails.
        neo4j.close()

In [10]:
# Rebuild the graph from scratch. Order matters: clear the graph, load nodes
# and relationships from the CSV files, then apply the post-load refactoring.
for build_queries in (queries_drop, queries_create, refactor_graph):
    query(build_queries)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
<Record count(node)=48>
<Record count(node)=11>
<Record count(node)=41>
None
