In [1]:
import json
from itertools import repeat, chain
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import os
import time

In [2]:
def count_file_lines(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read)) + 1
    return count

In [3]:
fname = "dane/openownership/statements.latest.jsonl"

file_length = count_file_lines(fname)

In [4]:
print(f'dlugosc pliku: {file_length}')

dlugosc pliku: 30378854


In [5]:
# delete person.txt file 
try:
    filew = "person.txt"
    os.remove(f"dane/{filew}")
except Exception as e:
    print(e)

In [6]:
# read person entities and save to .txt file

filew = "person.txt"
end = False
startline = 0

while not end:
    filer = "statements.latest.jsonl"
    
    nread = file_length # read file in smaller parts (nread = part_size)
    
    statementID_l = ['' for i in range(nread)]
    exception_lines = []

    with open(f'dane/openownership/{filer}', encoding='utf-8') as fd:
        i = 0
        
        line = startline
        raw_data = '0'

        offset = 0
        n = 0

        l = fd.readline()
        while l and n < startline:   
            offset = fd.tell()
            n+=1
            l = fd.readline()

        fd.seek(0)
        fd.seek(offset)

        for it in tqdm(range(nread)):
            if raw_data == '':
                break
            line += 1
            try:
                raw_data = fd.readline()
                data = json.loads(raw_data)
                
                if data['statementType'] == 'personStatement':                
                    statementID_l[i] = data['statementID']
                    i += 1

            except Exception as e:
                print(f"exception {e}")
                print(type(e))
                exception_lines.append(raw_data)
    
    if raw_data == '':
        end = True
        
    n = len([a for a in statementID_l if a != ''])

    with open(f"dane/{filew}", 'a', encoding='utf-8') as fd:
        fd.writelines('\n'.join(statementID_l[:n]) + '\n')
    
    startline = line

100%|██████████████████████████████████████████████████████████████████| 30378854/30378854 [03:13<00:00, 157251.41it/s]


exception Expecting value: line 1 column 1 (char 0)
<class 'json.decoder.JSONDecodeError'>


In [7]:
# delete relations_p.txt, relations_e.txt, relations_u.txt files 
try:
    file = "relations_p.txt"
    os.remove(f"dane/{file}")
    
    file = "relations_e.txt"
    os.remove(f"dane/{file}")
    
    file = "relations_u.txt"
    os.remove(f"dane/{file}")
    
except Exception as e:
    print(e)

In [8]:
# read relations and write to 3 separated files:
# relations_p.txt : entity - person (used )
# relations_e.txt : entity - entity
# relations_u.txt : entity - unknown

end = False
startline = 0

while not end:
    file = "statements.latest.jsonl"
    
    nread = file_length # read file in smaller parts (nread = part_size)
    
    statementID_l = ['' for i in range(nread)]
    subjectID_l = ['' for i in range(nread)]
    interestedPersonID_l = ['' for i in range(nread)]
    interestedEntityID_l = ['' for i in range(nread)]
    interestedUnspecifiedID_l = ['' for i in range(nread)]
    
#     person_list = ['' for i in range(2000000)]
    
    
    exception_lines = []

    with open(f'dane/openownership/{file}', encoding='utf-8') as fd:
        i = 0
        
        line = startline
        raw_data = '0'

        offset = 0
        n = 0

        l = fd.readline()
        while l and n < startline:   
            offset = fd.tell()
            n+=1
            l = fd.readline()

        fd.seek(0)
        fd.seek(offset)

#         while raw_data != '' and i < len(person_list):
        for it in tqdm(range(nread)):
            if raw_data == '':
                break
            line += 1
            try:
                raw_data = fd.readline()
                data = json.loads(raw_data)
                
                if data['statementType'] == 'ownershipOrControlStatement':
                    
                    statementID = data['statementID']
                    subjectID = data['subject']
                    interestedPartyID = data['interestedParty']
                    
                    statementID_l[i] = statementID
                    
                    try:
                        key = list(subjectID.keys())[0]
                        
                        subjectID_l[i] = subjectID['describedByEntityStatement']
                    except Exception as e:
                        print(e)
                        pass
                    
                    try:
                        key = list(interestedPartyID.keys())[0]
                        
                        if key == 'describedByPersonStatement':
                            interestedPersonID_l[i] = interestedPartyID['describedByPersonStatement']
                        elif key == 'describedByEntityStatement':
                            interestedEntityID_l[i] = interestedPartyID['describedByEntityStatement']
                        else:
                            interestedUnspecifiedID_l[i] = interestedPartyID['unspecified']
                    except Exception as e:
                        print(e)
                        pass
                    
                    i += 1
                        
            except Exception as e:
                print(f"exception {e}")
                print(type(e))
                exception_lines.append(raw_data)

    
    if raw_data == '':
        end = True
        
    n = len([a for a in statementID_l if a != ''])
    
    file = "relations"
    
    fdp = open(f"dane/{file}_p.txt", 'a', encoding='utf-8')
    fde = open(f"dane/{file}_e.txt", 'a', encoding='utf-8')
    fdu = open(f"dane/{file}_u.txt", 'a', encoding='utf-8')

    for i in range(n):
        
        if interestedPersonID_l[i] != '':
            fdp.write(f"{subjectID_l[i]} {interestedPersonID_l[i]}\n")
        elif interestedEntityID_l[i] != '':
            fde.write(f"{subjectID_l[i]} {interestedEntityID_l[i]}\n")
        else:
            fdu.write(f"{subjectID_l[i]} {interestedUnspecifiedID_l[i]}\n")

    fdp.close()
    fde.close()
    fdu.close()
    
    startline = line

100%|██████████████████████████████████████████████████████████████████| 30378854/30378854 [03:32<00:00, 143186.61it/s]


exception Expecting value: line 1 column 1 (char 0)
<class 'json.decoder.JSONDecodeError'>


In [5]:
filepath = "dane/person.txt"

person_length = count_file_lines(filepath) - 1  # last line is empty
print(person_length)

9501956


In [6]:
# create list of person ids
person_list = ['' for i in range(person_length)]

with open(filepath, 'r', encoding='utf-8') as fd:
    i = 0
    for line in fd.readlines():
        person_list[i] = line.strip()
        i += 1

In [7]:
len(person_list)

9501956

In [8]:
# entity-id person-id
filepath = "dane/relations_p.txt"

relation_length = count_file_lines(filepath) - 1 # last line is empty

print(relation_length)

10927656


In [9]:
relation_list = ['' for i in range(relation_length)]

with open(filepath, 'r', encoding='utf-8') as fd:
    i = 0
    for line in fd.readlines():
        line = line.strip()
        relation = line.split()
        relation_list[i] = relation
        i += 1

In [10]:
relation_list[-1]

['openownership-register-6853622222214472120',
 'openownership-register-13328578030926230791']

In [11]:
# prepare dictionary: 
# keys - business having relation to person
# values - list of person having relation to business

person_entity = {}

for i in tqdm(range(relation_length)):
    person_entity[relation_list[i][0]] = []

100%|█████████████████████████████████████████████████████████████████| 10927656/10927656 [00:06<00:00, 1591771.12it/s]


In [12]:
len(person_entity)

7706672

In [13]:
# fill the list of person entities that are related to business entity
for i in tqdm(range(relation_length)):
    person_entity[relation_list[i][0]].append(relation_list[i][1])

100%|█████████████████████████████████████████████████████████████████| 10927656/10927656 [00:04<00:00, 2414329.37it/s]


In [14]:
len(person_entity)

7706672

In [15]:
sum([len(v) for k,v in person_entity.items()])

10927656

In [16]:
# print some keys and some person_entity data
keys = list(person_entity.keys())
keys[:3]

['openownership-register-6239283121540420441',
 'openownership-register-16892812539957769356',
 'openownership-register-4878504546140740426']

In [17]:
print(person_entity[keys[0]])
print(person_entity[keys[1]])
print(person_entity[keys[2]])

['openownership-register-8023559688350113290', 'openownership-register-9575124590016937100', 'openownership-register-16662231360960106699', 'openownership-register-5910404311545781136', 'openownership-register-6239560525977119673', 'openownership-register-15662336450571132063', 'openownership-register-17110834677531099056', 'openownership-register-6877792680484479073', 'openownership-register-18297509384758155298', 'openownership-register-6537106832811831952', 'openownership-register-1835582134193563737', 'openownership-register-641637797850175208', 'openownership-register-8368801879004216013', 'openownership-register-14698556745985536925', 'openownership-register-11449044425500392919', 'openownership-register-1195534800658956224', 'openownership-register-16915615315947413893', 'openownership-register-5448936139624469396', 'openownership-register-12061044289259583156', 'openownership-register-8145741226558019624', 'openownership-register-17880138175184417341', 'openownership-register-1

In [28]:
# file with edges:
# knownPerson1 knownPerson2

file = "person_graph.txt"

with open(f"dane/{file}", 'w', encoding='utf-8') as fd:
    
    for entity, person_l in person_entity.items():
        i = 0
        for person in person_l:
            j = i + 1
            while j < len(person_l):
                fd.write(f"{person} {person_l[j]}\n")
                j += 1
            i+=1

In [37]:
# read graph edges
file = "dane/person_graph.txt"
relations_length = count_file_lines(file) -1 
print(relations_length)

5882680


In [38]:
person_relations = ['' for i in range(relations_length)]

with open(file, 'r', encoding='utf-8') as fd:
    i = 0
    for line in fd.readlines():
        person_relations[i] = tuple(line.split())
        i += 1

In [39]:
len(person_relations)

5882680

In [42]:
# create graph

In [43]:
# load nodes - person_list

g = nx.MultiGraph()
g.add_nodes_from(person_list)

In [44]:
# number of nodes
print(f"number of nodes {g.number_of_nodes()}")
print(f"length list of nodes {len(person_list)}")

number of nodes 9501956
length of list of nodes 9501956


In [45]:
# add edges to graph
r = g.add_edges_from(person_relations)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [46]:
# edges
print(f"number of edges {g.number_of_edges()}")
print(f"length list of edges {len(person_relations)}")

number of edges 5882680
length list of edges 5882680
