In [1]:
import pandas as pd
import rdflib 
from rdflib import URIRef, BNode, Literal, RDF, RDFS, Graph
from rdflib.namespace import FOAF, XSD
from datetime import datetime
import numpy as np
import os

In [2]:
# --- Input files (names are relative to the data folder) ---------------------
relation_file = 'panama_papers.edges.csv'

# One CSV per node type; every file follows the same naming pattern.
nodes = {
    node_type: f'panama_papers.nodes.{node_type}.csv'
    for node_type in ('address', 'entity', 'intermediary', 'officer')
}

# --- Output ------------------------------------------------------------------
output_file = './output/result.ttl'

# --- Column labels used by the data frames below ------------------------------
lbl_blank_node, lbl_node_id = 'blank_node_id', 'node_id'
lbl_start_node, lbl_end_node = 'START_ID', 'END_ID'
lbl_relation_type = 'TYPE'
lbl_index = 'index'

# False: each relation is written as its own resource (InRelationTo / PointsTo).
# True: relations are written as direct triples annotated with << ... >>.
result2RDFStar = False

In [3]:
# Folder holding the input CSV files. The loading cells build paths with
# `data_folder + <file name>`, so the trailing slash is required.
data_folder = './input/'

In [4]:
# Remember when the run started so that later cells can report the elapsed time.
start_time = datetime.now()
print(start_time)

2022-04-28 15:05:41.029903


In [5]:
def getRDFHeader(file_manager):
    """Write the Turtle prefix declarations to ``file_manager``.

    Emits the ``ns1:`` and ``rdfs:`` prefix lines, the second one followed by
    two blank lines. ``file_manager`` is any object exposing ``write(str)``;
    nothing is returned.
    """
    header_lines = (
        '@prefix ns1: <http://example.org/> .\n',
        '@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n\n\n',
    )
    for line in header_lines:
        file_manager.write(line)

In [6]:
# Node data frames keyed by node type (the keys of `nodes`); filled by the
# loading cell below.
dfs = {}

In [7]:
%%capture --no-display
# Load every node table as text and strip double quotes and backslashes from
# all values; the writer cell wraps values in plain "..." string literals.
for n, file_name in nodes.items():
    dfs[n] = pd.read_csv(os.path.join(data_folder, file_name), sep=',', dtype='str')
    for c in dfs[n]:
        # regex=False makes the literal replacement explicit: the default of
        # `regex` differs between pandas versions, and a lone backslash is not
        # a valid regular expression.
        dfs[n][c] = (
            dfs[n][c]
            .str.replace('"', '', regex=False)
            .str.replace('\\', '', regex=False)
        )

In [None]:
%%capture --no-display
# Load the relation (edge) table as text and strip double quotes and
# backslashes from all values, exactly as is done for the node tables.
df_relations = pd.read_csv(os.path.join(data_folder, relation_file), sep=',', dtype='str')
for c in df_relations:
    # regex=False makes the literal replacement explicit: the default of
    # `regex` differs between pandas versions, and a lone backslash is not a
    # valid regular expression.
    df_relations[c] = (
        df_relations[c]
        .str.replace('"', '', regex=False)
        .str.replace('\\', '', regex=False)
    )

In [None]:
# Turn the row position into an explicit 'index' column; together with
# START_ID and END_ID it forms the key that identifies a relation (see edge_dic).
# NOTE(review): this cell is not idempotent -- running it a second time adds a
# further index column to df_relations. Re-run the loading cell first.
df_relations = df_relations.reset_index()

In [None]:
%%capture --no-display
# Start the identifier table: one row per node, taken from every node frame.
# The empty template frame comes first so that the result has all id columns,
# in this order, even though the node frames only provide `node_id`.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x; a single concat also avoids re-copying the growing
# frame on every loop iteration.
id_columns = [lbl_blank_node, lbl_node_id, 'index', lbl_start_node, lbl_end_node]
df_ids = pd.concat(
    [pd.DataFrame(columns=id_columns, dtype=str)]
    + [dfs[n][[lbl_node_id]] for n in dfs]
)

In [None]:
%%capture --no-display
# Add one row per relation below the node rows. Relation rows carry the
# ('index', START_ID, END_ID) key and leave `node_id` empty.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x.
# NOTE: re-running this cell adds the relation rows a second time.
df_ids = pd.concat([df_ids, df_relations[['index', lbl_start_node, lbl_end_node]]])

In [None]:
# Renumber the rows 0..N-1: the frames stacked above each brought their own row
# labels, and the blank-node labels created next are derived from this index.
df_ids=df_ids.reset_index(drop=True)

In [None]:
# Give every row (node or relation) a blank-node label of the form `_:x<row number>`.
row_numbers = df_ids.index.astype(str)
df_ids[lbl_blank_node] = '_:x' + row_numbers

In [None]:
# For every column of every node frame add a companion '<column>literal' column
# holding the same values wrapped in rdflib Literal objects.
for node_frame in dfs.values():
    # Take the column names up front: only the original columns get a companion.
    original_columns = list(node_frame.columns)
    for column in original_columns:
        node_frame[column + 'literal'] = list(map(Literal, node_frame[column].values))

In [None]:
# Same as for the node frames: add a '<column>literal' companion, holding
# rdflib Literal objects, for each column of the relation table.
relation_columns = list(df_relations.columns)
for column in relation_columns:
    wrapped_values = [Literal(value) for value in df_relations[column].values]
    df_relations[column + 'literal'] = wrapped_values

In [None]:
# Map every node_id to its blank-node label.
# Relation rows have no node_id, so they are filtered out up front. Building
# the dict from all rows and popping the NaN key afterwards only works while
# every missing value is the very same NaN object, and raises KeyError when no
# such row exists. zip over the two columns also avoids the slow iterrows().
node_rows = df_ids.dropna(subset=[lbl_node_id])
node_dic = dict(zip(node_rows[lbl_node_id], node_rows[lbl_blank_node]))

In [None]:
#df_relations.head(2)

In [None]:
# Map every relation key ('index', START_ID, END_ID) to its blank-node label.
# Node rows have none of the three key columns set and are filtered out up
# front (how='all' keeps any row with at least one key value). Popping a
# (nan, nan, nan) key afterwards only works while every missing value is the
# very same NaN object, and raises KeyError when no such row exists.
edge_key_columns = ['index', lbl_start_node, lbl_end_node]
edge_rows = df_ids.dropna(subset=edge_key_columns, how='all')
edge_dic = dict(zip(
    zip(*(edge_rows[column] for column in edge_key_columns)),
    edge_rows[lbl_blank_node],
))

In [None]:
# Time spent so far (loading the CSV files and building the id lookups).
print(datetime.now()-start_time)

# Writing the RDF (Turtle) output

In [None]:
%%capture --no-display
def _turtle_string(value):
    """Return ``value`` as a double-quoted Turtle string literal.

    Backslashes, double quotes, line feeds and carriage returns are escaped,
    because a "..." Turtle string may not contain them verbatim.
    """
    text = str(value)
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return '"' + text + '"'


# open() does not create missing folders, so make sure the target one exists.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Turtle documents are UTF-8; do not depend on the platform default encoding.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('@prefix ns1: <http://example.org/> .\n')
    f.write('@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n\n\n')
    # A prefix declaration is `@prefix name: <iri> .`; a stray `>` after the
    # IRI would make the whole file unparsable.
    f.write('@prefix node: <http://nodeLabel.org/> .\n\n\n')
    f.write('@prefix edge: <http://edgeLabel.org/> .\n\n\n')

    # ---- Nodes: one `node:label` triple plus one triple per non-empty property.
    skipped_node_columns = {lbl_blank_node, lbl_start_node, lbl_end_node, lbl_index}
    for n in dfs:
        print(n)

        # Attach the blank-node label of every node; the relation-only 'index'
        # column brought in by df_ids is not needed here.
        df = dfs[n].merge(df_ids, on=[lbl_node_id]).drop(columns=[lbl_index])
        property_columns = [
            c for c in df.columns
            if not c.endswith('literal') and c not in skipped_node_columns
        ]
        node_label = _turtle_string(n)

        for idx, row in df.iterrows():
            subject = row[lbl_blank_node]
            f.write(subject + ' node:label ' + node_label + ' .\n')
            for c in property_columns:
                value = str(row[c + 'literal'])
                # Missing values show up as the string 'nan' and are skipped.
                if value != 'nan':
                    f.write(subject + ' ns1:' + str(c) + ' ' + _turtle_string(value) + ' .\n')

    # ---- Relations.
    skipped_edge_columns = {lbl_start_node, lbl_relation_type, lbl_end_node, lbl_index}
    edge_property_columns = [
        c for c in df_relations.columns
        if not c.endswith('literal') and c not in skipped_edge_columns
    ]
    for idx, row in df_relations.iterrows():
        source_node = str(node_dic[row[lbl_start_node]])
        destination_node = str(node_dic[row[lbl_end_node]])

        if result2RDFStar:
            # RDF-star: the relation is a direct triple that gets annotated below.
            triple = source_node + ' ' + 'ns1:' + row[lbl_relation_type] + ' ' + destination_node
            f.write(triple + ' .\n')
        else:
            # Plain RDF: the relation is its own blank node between both ends.
            relation = str(edge_dic[(row[lbl_index], row[lbl_start_node], row[lbl_end_node])])
            relation_label = _turtle_string(row[lbl_relation_type + 'literal'])
            f.write(source_node + ' ns1:InRelationTo ' + relation + ' .\n')
            f.write(relation + ' ns1:PointsTo ' + destination_node + ' .\n')
            f.write(relation + ' edge:label ' + relation_label + ' .\n')

        for c in edge_property_columns:
            if str(row[c]) == 'nan':
                continue
            value = _turtle_string(row[c + 'literal'])
            if result2RDFStar:
                f.write('<<' + triple + '>> ns1:' + str(c) + ' ' + value + ' .\n')
            else:
                f.write(relation + ' ns1:' + str(c) + ' ' + value + ' .\n')


In [None]:
#print(datetime.now()-start_time)