In [None]:
import py2neo
from py2neo import Graph
import pandas as pd
import numpy as np
import os
import dropbox
import config
from titlecase import titlecase
# Dropbox client, authenticated with the API key held in the local ``config`` module.
dbx = dropbox.Dropbox(config.dropbox_api_key)

In [None]:
##Define helper functions
def get_cypher_from_columns(columns_list):
    """Build the body of a Cypher ``SET`` clause from a list of column names.

    The first entry of ``columns_list`` is skipped. Every remaining name
    ``col`` contributes the fragment ``n.col = line.col``; fragments are
    joined with ``', '``.

    Parameters
    ----------
    columns_list : sliceable sequence of str
        Column names, e.g. the header of a CSV file.

    Returns
    -------
    str
        The comma-separated assignments, or ``''`` when there are fewer than
        two names.
    """
    return ', '.join(
        'n.' + column + ' = line.' + column
        for column in columns_list[1:]
    )
def get_label(x):
    """Map a file name or link to the Neo4j node label it should be loaded as.

    Keywords are tested in the order listed below and the first one contained
    in ``x`` wins, so the order is significant.

    Parameters
    ----------
    x : str
        File name, path or URL to inspect.

    Returns
    -------
    str or None
        The matching label, or ``None`` when no keyword is found.
    """
    keyword_labels = (
        ('company', 'Company'),
        ('companies', 'Company'),
        ('human', 'Person'),
        ('super_secure', 'SuperSecurePerson'),
        ('legal', 'LegalPerson'),
        ('exemption', 'Exemption'),
        ('statement', 'Statement'),
        ('address', 'Postcode'),
        ('politician', 'Legislature'),
    )
    for keyword, label in keyword_labels:
        if keyword in x:
            return label
    return None
def create_target_company_uid(x):
    """Derive an identifier for a corporate PSC record.

    Parameters
    ----------
    x : mapping (for example a DataFrame row)
        Must provide string values under ``'name'``,
        ``'identification.place_registered'`` and
        ``'identification.registration_number'``.

    Returns
    -------
    str
        * UK place of registration and a registration number: the number,
          upper-cased.
        * UK place of registration and an empty number: ``name_place``.
        * Anything else: ``name_place_NUMBER``.

        Spaces in the place of registration are replaced by underscores.
    """
    uk_markers = ('Companies House', 'England', 'Wales', 'United Kingdom', 'Scotland')
    place = x['identification.place_registered']
    number = x['identification.registration_number']
    registered_in_uk = any(marker in place for marker in uk_markers)

    if registered_in_uk and number != '':
        return number.upper()

    name_and_place = x['name'] + '_' + place.replace(' ', '_')
    if registered_in_uk:
        return name_and_place
    return name_and_place + '_' + number.upper()

In [None]:
##Load the pre-prepared CSV extracts (see the GitHub repo for the initial transformation steps)
all_records_psc = pd.read_csv('data/outputs/all_records_psc.csv')
active_companies = pd.read_csv('data/outputs/active_companies.csv')
psc_records = pd.read_csv('data/outputs/psc_records.csv')
psc_statements = pd.read_csv('data/outputs/psc_statements.csv')
psc_controls = pd.read_csv('data/outputs/psc_controls.csv')
active_officers = pd.read_csv(
    'data/outputs/active_officers.csv',
    parse_dates=['partial_date_of_birth'],
    dtype={'company_number': str}
)
##Keep only officers whose company appears in the active_companies dataset
known_company_numbers = active_companies.CompanyNumber.unique()
active_officers = active_officers[active_officers.company_number.isin(known_company_numbers)].copy()
##This file is read without a header row; its first column becomes a plain list
secret_jurisdictions = pd.read_csv('data/outputs/secret_jurisdictions.csv',header=None)[0].tolist()


def _without_ceased(frame):
    """Return a copy of the rows of ``frame`` whose ``ceased_on`` value is null."""
    return frame[pd.isnull(frame.ceased_on)].copy()


##Create DataFrames for active PSC records, statements and controls
active_psc_all_records = _without_ceased(all_records_psc)
active_psc_records = _without_ceased(psc_records)
active_psc_statements = _without_ceased(psc_statements)
##Controls are kept when their company has at least one non-ceased entry in all_records_psc
active_psc_controls = psc_controls[psc_controls.company_number.isin(active_psc_all_records.company_number)].copy()

In [None]:
##Create CSV files for nodes and edges that will be loaded into Neo4J
##Create company nodes for filing companies
##Right join: every active PSC record is kept, whether or not its company is in active_companies
active_filing_company_psc = active_companies.merge(
    active_psc_all_records,
    how='right',
    left_on='CompanyNumber',
    right_on='company_number'
)
active_filing_company_psc['uid'] = active_filing_company_psc.company_number
active_filing_company_psc = active_filing_company_psc.fillna('')
##(source column, exported column) pairs, in output order
filing_company_columns = [
    ('uid', 'uid'),
    ('company_number', 'company_number'),
    ('RegAddress.AddressLine1', 'address_line_1'),
    ('RegAddress.AddressLine2', 'address_line_2'),
    ('RegAddress.PostTown', 'town'),
    ('RegAddress.County', 'county'),
    ('RegAddress.Country', 'country'),
    ('RegAddress.PostCode', 'postcode'),
    ('CompanyCategory', 'company_category'),
    ('CountryOfOrigin', 'country_of_origin'),
    ('DissolutionDate', 'dissolution_date'),
    ('IncorporationDate', 'incorporation_date'),
    ('CompanyName', 'name'),
]
active_filing_company_psc_nodes = active_filing_company_psc[
    [source for source, _ in filing_company_columns]
].drop_duplicates()
active_filing_company_psc_nodes.columns = [exported for _, exported in filing_company_columns]
active_filing_company_psc_nodes.to_csv('data/neo4j/imports/active_filing_company_psc_nodes.csv')
print('Created active company nodes CSV')
##Create company nodes for target companies (corporate-entity PSC records)
target_company_source_columns = [
    'company_number',
    'address.address_line_1',
    'address.address_line_2',
    'address.country',
    'address.postal_code',
    'exemptions_count',
    'generated_at',
    'identification.country_registered',
    'identification.legal_authority',
    'identification.legal_form',
    'identification.place_registered',
    'identification.registration_number',
    'name',
    'natures_of_control',
    'secret_base',
]
is_corporate_psc = active_psc_records.kind == 'corporate-entity-person-with-significant-control'
corporate_psc = active_psc_records[is_corporate_psc]
active_target_company_psc = corporate_psc[target_company_source_columns].copy()
active_target_company_psc = active_target_company_psc.fillna('')
##The uid is derived row by row by create_target_company_uid
active_target_company_psc['uid'] = active_target_company_psc.apply(create_target_company_uid,axis=1)
active_target_company_psc_nodes = active_target_company_psc.drop_duplicates()
##In the export, ``company_number`` is the PSC's own registration number; the filing
##company's number moves to ``company_number_of_filing_company``
active_target_company_psc_nodes.columns = [
    'company_number_of_filing_company',
    'address_line_1',
    'address_line_2',
    'country',
    'postcode',
    'exemptions_count',
    'generated_at',
    'country_registered',
    'legal_authority',
    'legal_form',
    'place_registered',
    'company_number',
    'name',
    'natures_of_control',
    'secret_base',
    'uid',
]
active_target_company_psc_nodes.to_csv('data/neo4j/imports/active_target_company_psc_nodes.csv')
print('Created target company nodes CSV')
##Create individual nodes
is_individual_psc = active_psc_records.kind == 'individual-person-with-significant-control'
active_human_psc = active_psc_records[is_individual_psc].copy()
active_human_psc = active_human_psc.fillna('')
##uid = forename_surname_month-year-of-birth_postcode
human_uid = active_human_psc['name_elements.forename'] + '_' + active_human_psc['name_elements.surname']
human_uid = human_uid + '_' + active_human_psc['month_year_birth'].astype(str)
human_uid = human_uid + '_' + active_human_psc['address.postal_code']
active_human_psc['uid'] = human_uid
##(source column, exported column) pairs, in output order
human_node_columns = [
    ('uid', 'uid'),
    ('name', 'name'),
    ('address.address_line_1', 'address_line_1'),
    ('address.address_line_2', 'address_line_2'),
    ('address.care_of', 'care_of'),
    ('address.country', 'country'),
    ('address.locality', 'locality'),
    ('address.po_box', 'po_box'),
    ('address.postal_code', 'post_code'),
    ('nationality', 'nationality'),
    ('month_year_birth', 'month_year_birth'),
    ('country_of_residence_normal', 'country_of_residence_normal'),
    ('address_country_normal', 'address_country_normal'),
    ('secret_base', 'secret_base'),
    ('join_id', 'join_id'),
]
active_human_psc_nodes = active_human_psc[[source for source, _ in human_node_columns]].drop_duplicates()
active_human_psc_nodes.columns = [exported for _, exported in human_node_columns]
active_human_psc_nodes.to_csv('data/neo4j/imports/active_human_psc_nodes.csv')
print('Created human nodes CSV')
##Create legal person nodes
active_legal_psc = active_psc_records[active_psc_records.kind == 'legal-person-person-with-significant-control'].copy()
##The entity name doubles as the node identifier
active_legal_psc['uid'] = active_legal_psc['name'].copy()
active_legal_psc_nodes = active_legal_psc[['name','address.address_line_1', 'address.address_line_2',
       'address.care_of', 'address.country', 'address.locality',
       'address.po_box', 'address.postal_code','nationality','month_year_birth','country_of_residence_normal',
       'address_country_normal','uid']].copy()
active_legal_psc_nodes.drop_duplicates(inplace=True)
##Exported names must not contain dots: get_cypher_from_columns turns each header into
##``n.<column> = line.<column>``, so ``address.locality`` would be read as nested property access.
##``locality`` also matches the name used for the individual PSC nodes.
active_legal_psc_nodes.columns = ['name','address_line_1', 'address_line_2',
       'care_of', 'country', 'locality',
       'po_box', 'post_code','nationality','month_year_birth','country_of_residence_normal',
       'address_country_normal','uid']
active_legal_psc_nodes.to_csv('data/neo4j/imports/active_legal_psc_nodes.csv')
##Create super secure nodes (identified by the record's etag)
is_super_secure_psc = active_psc_records.kind == 'super-secure-person-with-significant-control'
active_super_secure_psc = active_psc_records[is_super_secure_psc].copy()
active_super_secure_psc['uid'] = active_super_secure_psc['etag'].copy()
active_super_secure_psc = active_super_secure_psc.fillna('')
active_super_secure_psc_nodes = active_super_secure_psc[['uid']].drop_duplicates()
active_super_secure_psc_nodes.to_csv('data/neo4j/imports/active_super_secure_psc_nodes.csv')
##Create exemption nodes (identified by the record's etag)
is_exemption_record = active_psc_records.kind == 'exemptions'
active_exemptions_psc = active_psc_records[is_exemption_record].copy()
active_exemptions_psc['uid'] = active_exemptions_psc['etag'].copy()
active_exemptions_psc = active_exemptions_psc.fillna('')
active_exemptions_psc_nodes = active_exemptions_psc[['uid']].copy()
active_exemptions_psc_nodes.drop_duplicates(inplace=True)
active_exemptions_psc_nodes.to_csv('data/neo4j/imports/active_exemptions_psc_nodes.csv')
##Create statement nodes (identified by the statement's etag)
active_psc_statements = (
    active_psc_statements
    .assign(uid=active_psc_statements['etag'])
    .drop_duplicates()
)
statement_node_columns = ['statement','uid']
active_psc_statements_nodes = active_psc_statements[statement_node_columns].drop_duplicates()
active_psc_statements_nodes.to_csv('data/neo4j/imports/active_psc_statement_nodes.csv')
print('Created statement nodes CSV')
##Create address nodes: one node per registered postcode
##After melting, ``value`` holds the company number and ``RegAddress.PostCode`` its postcode
active_addresses = pd.melt(
    active_companies,
    id_vars=['RegAddress.PostCode'],
    value_vars=['CompanyNumber']
)
active_addresses['uid'] = active_addresses['RegAddress.PostCode'].copy()
##Rows without a postcode cannot be attached to a node
active_addresses = active_addresses.dropna(subset=['uid'])
active_address_nodes = active_addresses[['RegAddress.PostCode','uid']].drop_duplicates()
active_address_nodes.columns = ['postcode','uid']
active_address_nodes.to_csv('data/neo4j/imports/active_address_nodes.csv')
print('Created postcode nodes')
##Create edge CSVs
def _export_edges(source, columns, path):
    """Write ``columns`` of ``source`` to ``path`` as CSV and return the written frame.

    Missing values are replaced by empty strings before writing. ``source``
    itself is not modified.
    """
    edges = source[columns].copy()
    edges.fillna('',inplace=True)
    edges.to_csv(path)
    return edges


##Columns shared by every PSC "controls" edge file
psc_edge_columns = ['company_number','uid','natures_of_control']

##Each status message is printed only after its file has been written
##Create human edges
human_edges = _export_edges(active_human_psc, psc_edge_columns, 'data/neo4j/imports/psc_human_edges.csv')
print('Created human edges')
##Create company edges
company_edges = _export_edges(active_target_company_psc, psc_edge_columns, 'data/neo4j/imports/psc_company_edges.csv')
print('Created company edges')
##Create super secure person edges
super_secure_edges = _export_edges(active_super_secure_psc, psc_edge_columns, 'data/neo4j/imports/super_secure_edges.csv')
print('Created super secure person edges')
##Create legal persons edges
legal_person_edges = _export_edges(active_legal_psc, psc_edge_columns, 'data/neo4j/imports/legal_person_edges.csv')
print('Created legal person edges')
##Create exemptions edges
exemptions_edges = _export_edges(active_exemptions_psc, psc_edge_columns, 'data/neo4j/imports/exemption_edges.csv')
print('Created exemptions edges')
##Statement edges
statement_edges = _export_edges(active_psc_statements, ['company_number','uid'], 'data/neo4j/imports/statement_edges.csv')
##Create address edges: ``value`` is the company number, ``uid`` the postcode
active_addresses_edges = active_addresses[['value','uid']].drop_duplicates()
active_addresses_edges.to_csv('data/neo4j/imports/address_edges.csv')
print('Created address edges and nodes')
##Human officers
active_officers_humans = active_officers[active_officers.type == 'Person'].copy()
active_officers_humans.fillna('',inplace=True)
##Only the first space-separated token of first_name is used in the uid
single_first_series = active_officers_humans['first_name'].apply(lambda x: x.split(' ')[0])
##Remove backslashes from the free-text address fields
for address_column in ['address.in_full', 'address.street_address', 'address.postal_code']:
    active_officers_humans[address_column] = active_officers_humans[address_column].str.replace('\\','')
##uid = First_Last_<partial_date_of_birth>-01_
##NOTE(review): this concatenation needs partial_date_of_birth to hold strings, yet the loading
##cell passes the column to ``parse_dates`` -- confirm what read_csv actually returns for it
active_officers_humans['uid'] = (
    single_first_series.str.title() + '_' + active_officers_humans['last_name'].str.title() + '_' +
    active_officers_humans['partial_date_of_birth'] + '-01_'
)
##Explicit copy so that relabelling the columns below never touches active_officers_humans
active_officers_humans_nodes = active_officers_humans[['name', 'title',
       'first_name', 'last_name','person_number',
       'person_uid', 'current_status', 'occupation', 'nationality',
       'country_of_residence', 'partial_date_of_birth', 'type',
       'address.in_full', 'address.street_address', 'address.locality',
       'address.region', 'address.postal_code', 'address.country',
       'retrieved_at', 'source_url', 'country_of_residence_normal',
       'address_country_normal', 'secret_base', 'uid', 'join_id']].copy()
##Dots become underscores in the exported column names
active_officers_humans_nodes.columns = [x.replace('.','_') for x in active_officers_humans_nodes.columns]
##Create edges table
active_officers_humans_edges = active_officers_humans[['uid','company_number', 'position', 'start_date', 'end_date']]
##Export
active_officers_humans_nodes.to_csv('data/neo4j/imports/active_officers_humans_nodes.csv')
active_officers_humans_edges.to_csv('data/neo4j/imports/active_officers_humans_edges.csv')
print('Created human officers nodes and edges')
##Company officers
active_officers_companies = active_officers[active_officers.type == 'Company'].copy()
active_officers_companies.fillna('',inplace=True)
##Get company IDs from active_company file using name and postcode
active_companies['fake_id'] = active_companies['CompanyName'] + '_' + active_companies['RegAddress.PostCode']
fake_id_map = pd.Series(active_companies['CompanyNumber'].values,index=active_companies['fake_id'])
##NOTE(review): drop_duplicates acts on the Series *values* (company numbers), not on the
##fake_id index, so a repeated name+postcode key is not removed here -- confirm this is intended
fake_id_map = fake_id_map.drop_duplicates(keep=False).to_dict()
active_officers_companies['fake_id'] = active_officers_companies['name'] + '_' + active_officers_companies['address.postal_code']


def _corporate_officer_uid(row):
    """Return the mapped company number for ``row``, else a ``Name_Country`` fallback.

    The lookup key is ``row['fake_id']``. The fallback title-cases the name and the
    normalised address country (spaces replaced by underscores).
    """
    if row['fake_id'] in fake_id_map:
        return fake_id_map[row['fake_id']]
    return titlecase(row['name']) + '_' + titlecase(row['address_country_normal'].replace(' ','_'))


active_officers_companies['uid'] = active_officers_companies.apply(_corporate_officer_uid,axis=1)
##Create nodes: one row per uid
active_officers_companies_nodes = active_officers_companies[['name','nationality',
       'country_of_residence', 'type',
       'address.in_full', 'address.street_address', 'address.locality',
       'address.region', 'address.postal_code', 'address.country',
       'retrieved_at', 'source_url', 'country_of_residence_normal',
       'address_country_normal', 'secret_base', 'uid']].drop_duplicates(subset=['uid'])
##Dots become underscores in the exported column names
active_officers_companies_nodes.columns = [x.replace('.','_') for x in active_officers_companies_nodes.columns]
##Create edges
active_officers_companies_edges = active_officers_companies[['company_number','uid','start_date', 'end_date','position']]
##Export
active_officers_companies_nodes.to_csv('data/neo4j/imports/active_officers_companies_nodes.csv')
active_officers_companies_edges.to_csv('data/neo4j/imports/active_officers_companies_edges.csv')
print('Created corporate officers edges and nodes')

In [None]:
##Create edges between PSC individuals and the postcode-free form of their identifier
##probable_id = forename_surname_month-year-of-birth_ (the uid without its trailing postcode)
name_and_birth = active_human_psc['name_elements.forename'] + '_' + active_human_psc['name_elements.surname']
name_and_birth = name_and_birth + '_' + active_human_psc['month_year_birth'].astype(str)
active_human_psc['probable_id'] = name_and_birth + '_'
probable_id_edges = active_human_psc[['uid','probable_id']].dropna()
##A row whose uid already equals its probable_id would only link a node to itself
points_elsewhere = probable_id_edges.uid != probable_id_edges.probable_id
probable_id_edges = probable_id_edges[points_elsewhere]
probable_id_edges.to_csv('data/neo4j/imports/probable_id_edges.csv')
print("Created probable match edges")

In [None]:
##Create legislatures and links
every_politician = pd.read_csv('data/outputs/every_politician.csv')
##Keep only politicians whose join_id occurs among the officers or the PSC records
is_known_officer = every_politician.join_id.isin(active_officers.join_id)
is_known_psc = every_politician.join_id.isin(active_psc_records.join_id)
every_politician_merged = every_politician[is_known_officer | is_known_psc].copy()

In [None]:
##Drop every column that contains a missing value
every_politician_merged = every_politician_merged.dropna(axis=1).copy()

In [None]:
##One legislature node per distinct country; the country is also the node uid
legislature_countries = every_politician_merged.country.unique()
every_politician_nodes = pd.DataFrame(legislature_countries,columns=['country'])
every_politician_nodes = every_politician_nodes.assign(uid=every_politician_nodes.country)
every_politician_nodes.to_csv('data/neo4j/imports/every_politician_nodes.csv')
print("Created every_politician nodes")

In [None]:
##Politician-to-legislature edges; this file is written without the index column
politician_edge_columns = ['join_id','country']
every_politician_edges = every_politician_merged[politician_edge_columns].copy()
every_politician_edges.to_csv('data/neo4j/imports/every_politician_edges.csv',index=False)
print("Created every_politician edges")

In [None]:
##Upload every CSV in the import folder to the root of the Dropbox account.
##Files larger than CHUNK_SIZE go through an upload session, one chunk at a time.
CHUNK_SIZE = 8 * 1024 * 1024
files = [
    os.path.join("data/neo4j/imports/", file)
    for file in os.listdir("data/neo4j/imports/")
    if file.endswith(".csv")
]
for item in files:
    print(item)
    ##Destination is the file name at the Dropbox root, e.g. '/some_nodes.csv'
    dropbox_path = item.replace('data/neo4j/imports','')
    file_size = os.path.getsize(item)
    with open(item, 'rb') as f:
        if file_size <= CHUNK_SIZE:
            ##Overwrite, like the chunked branch below, so that re-running the
            ##notebook with changed data does not fail on an existing file
            print(dbx.files_upload(f.read(), dropbox_path, mode=dropbox.files.WriteMode.overwrite))
            continue
        upload_session_start_result = dbx.files_upload_session_start(f.read(CHUNK_SIZE))
        cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id,
                                                   offset=f.tell())
        commit = dropbox.files.CommitInfo(path=dropbox_path,mode=dropbox.files.WriteMode.overwrite)
        while f.tell() < file_size:
            if (file_size - f.tell()) <= CHUNK_SIZE:
                ##Last chunk: finishing the session commits the file
                print(dbx.files_upload_session_finish(f.read(CHUNK_SIZE),
                                                      cursor,
                                                      commit))
            else:
                dbx.files_upload_session_append(f.read(CHUNK_SIZE),
                                                cursor.session_id,
                                                cursor.offset)
                ##Keep the cursor in step with the number of bytes sent so far
                cursor.offset = f.tell()

In [None]:
##Connect to graph
graph = Graph(config.neo4j_auth_graph_url)
##One uniqueness constraint on ``uid`` per node label, created in the order listed:
##(Cypher variable, node label)
unique_uid_labels = [
    ('company', 'Company'),
    ('person', 'Person'),
    ('legal_person', 'LegalPerson'),
    ('super_secure_person', 'SuperSecurePerson'),
    ('exemption', 'Exemption'),
    ('statement', 'Statement'),
    ('postcode', 'Postcode'),
    ('legislature', 'Legislature'),
]
for variable, label in unique_uid_labels:
    graph.run("CREATE CONSTRAINT ON ({0}:{1}) ASSERT {0}.uid IS UNIQUE".format(variable, label))

In [None]:
##Create a shared link for every entry in the Dropbox root and sort the links
##into node files and edge files
shared_node_links = []
shared_edges_links = []
folder_listing = dbx.files_list_folder('')
for entry in folder_listing.entries:
    shared = dbx.sharing_create_shared_link('/' + entry.name, short_url=False, pending_upload=None)
    ##Rewrite the dl=0 query flag in the shared URL to dl=1
    link = shared.url.replace('dl=0','dl=1')
    if 'nodes' in link:
        bucket = shared_node_links
    elif 'edges' in link:
        bucket = shared_edges_links
    else:
        ##Neither a node nor an edge file: not collected
        continue
    bucket.append(link)

In [None]:
##Create nodes in graph: one MERGE-by-uid LOAD CSV statement per node file
for file in shared_node_links:
    ##Only the header is needed to build the SET clause
    header = pd.read_csv(file,nrows=1).columns
    cypher = get_cypher_from_columns(header)
    source = "'" + file + "'"
    node_pattern = 'n:' + get_label(file) + ' {uid: line.uid}'
    q = "USING PERIODIC COMMIT 100000 LOAD CSV WITH HEADERS FROM {} AS line MERGE ({}) ON CREATE SET {} ON MATCH SET {}".format(source, node_pattern, cypher, cypher)
    print(q)
    graph.run(q)

In [None]:
##Create edges in graph
##Each edge file is matched to one LOAD CSV statement by keywords in its link.
##Branches are tested top to bottom, so their order is significant.
uid_match = '{uid: line.uid}'
company_match = '{uid: line.company_number}'
for file in shared_edges_links:
    source = "'" + file + "'"
    ##Optional statement to run after the main query
    follow_up = None
    #Need to make explicit re: PSC when officers re-inserted
    if 'human' in file and 'psc' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Person {}),(c:Company {}) MERGE (n)-[r:CONTROLS]->(c) ON CREATE SET r.natures_of_control = line.natures_of_control".format(source, uid_match, company_match)
    #Need to make explicit re: PSC when officers re-inserted
    elif 'company' in file and 'psc' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Company {}),(c:Company {}) MERGE (n)-[r:CONTROLS]->(c) ON CREATE SET r.natures_of_control = line.natures_of_control".format(source, uid_match, company_match)
    elif 'super_secure' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:SuperSecurePerson {}),(c:Company {}) MERGE (n)-[r:CONTROLS]->(c) ON CREATE SET r.natures_of_control = line.natures_of_control".format(source, uid_match, company_match)
    elif 'exemption' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Exemption {}),(c:Company {}) MERGE (n)<-[r:IS_EXEMPT]-(c) ON CREATE SET r.natures_of_control = line.natures_of_control".format(source, uid_match, company_match)
    elif 'legal_person' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:LegalPerson {}),(c:Company {}) MERGE (n)-[r:CONTROLS]->(c) ON CREATE SET r.natures_of_control = line.natures_of_control".format(source, uid_match, company_match)
    elif 'address_edges' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Postcode {}),(c:Company {}) MERGE (c)-[r:REGISTERED]->(n)".format(source, uid_match, '{uid: line.value}')
    elif 'statement_edges' in file:
        q = "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Statement {}),(c:Company {}) MERGE (c)-[r:STATES]->(n)".format(source, uid_match, company_match)
    ##Both keywords must be tested explicitly: ``'officers' and 'companies' in file``
    ##only checks for 'companies' because a non-empty string is always truthy
    elif 'officers' in file and 'companies' in file:
        q = 'USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Company {}),(c:Company {}) MERGE (n)-[r:OFFICER_OF]->(c) ON CREATE SET r.position = line.position, r.start_date = line.start_date, r.end_date = line.end_date'.format(source, uid_match, company_match)
    elif 'officers' in file and 'humans' in file:
        q = 'USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n:Person {}),(c:Company {}) MERGE (n)-[r:OFFICER_OF]->(c) ON CREATE SET r.position = line.position, r.start_date = line.start_date, r.end_date = line.end_date'.format(source, uid_match, company_match)
    elif 'probable' in file and 'id' in file:
        q = 'USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n1:Person {}),(n2:Person {}) MERGE (n1)-[r:PROBABLY_SAME_PERSON]->(n2)'.format(source, uid_match, '{uid: line.probable_id}')
        ##Afterwards remove any relationship that links a person to itself
        follow_up = 'MATCH (p:Person)-[r:PROBABLY_SAME_PERSON]->(p:Person) DELETE r'
    elif 'politician' in file:
        ##Politicians are matched to Person nodes on join_id rather than uid
        q = 'USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM {} AS line MATCH (n1:Person {}),(n2:Legislature {}) MERGE (n1)-[r:POLITICIAN_IN]->(n2)'.format(source, '{join_id: line.join_id}', '{uid: line.country}')
    else:
        print("Couldn't find the right query for " + file)
        continue
    print(q)
    graph.run(q)
    if follow_up is not None:
        graph.run(follow_up)
        print(follow_up)