In [1]:
import pandas as pd
import numpy as np

In [2]:
nodes_df = pd.read_csv('5tables/NodeLabels.tsv', sep='\t')
nodes_df.head()

Unnamed: 0,term,label
0,Microsoft,ORG
1,ISBN,ORG
2,China,GPE
3,Facebook,ORG
4,Apple,ORG


In [3]:
node_props_df = pd.read_csv('5tables/NodeProperty.tsv', sep='\t')
node_props_df.head()

Unnamed: 0,NodeID,Property,Value
0,linkedin,current status,active
1,linkedin,founded,"december 28, 2002 16 years ago (2002-12-28)mou..."
2,linkedin,headquarters,"sunnyvale, california, u.s."
3,linkedin,alexarank,58 (november 2019update)1
4,linkedin,employees,"15,000 (2019)"


In [4]:
connectivity_df = pd.read_csv('5tables/Connectivity.tsv', sep='\t')
connectivity_df.head()

Unnamed: 0,EdgeID,FromNode,ToNode,EdgeLabel
0,5,adam pisoni,yammer,founded
1,6,adam somlai-fischer,prezi,founded
2,7,adeyemi ajao,tuenti,founded
3,9,allen blue,linkedin,founded
4,13,apple inc.,apple mail,developed by


In [5]:
max_edge_id = np.max(connectivity_df.EdgeID.values)
max_edge_id

1541

In [6]:
naics_df = pd.read_csv('naics_codes.tsv', sep='\t')
naics_df.head()

Unnamed: 0,Seq. No.,2017 NAICS US Code,2017 NAICS US Title,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,1.0,11.0,"Agriculture, Forestry, Fishing and Hunting",,,
2,2.0,111.0,Crop Production,,,
3,3.0,1111.0,Oilseed and Grain Farming,,,
4,4.0,11111.0,Soybean Farming,,,


In [7]:
naics_df = naics_df.drop(columns=['Seq. No.', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']).iloc[1:, :]
naics_df.head()

Unnamed: 0,2017 NAICS US Code,2017 NAICS US Title
1,11,"Agriculture, Forestry, Fishing and Hunting"
2,111,Crop Production
3,1111,Oilseed and Grain Farming
4,11111,Soybean Farming
5,111110,Soybean Farming


In [8]:
naics_nodes_df = pd.DataFrame(data={
    "term": naics_df.loc[:, '2017 NAICS US   Code'],
    "label": "NAICS",
})
naics_nodes_df.head()

Unnamed: 0,term,label
1,11,NAICS
2,111,NAICS
3,1111,NAICS
4,11111,NAICS
5,111110,NAICS


In [9]:
naics_node_props_df = pd.DataFrame(data={
    "NodeID": naics_df.loc[:, '2017 NAICS US   Code'],
    "Property": "title",
    "Value": naics_df.loc[:, '2017 NAICS US Title'],
})
naics_node_props_df.head()

Unnamed: 0,NodeID,Property,Value
1,11,title,"Agriculture, Forestry, Fishing and Hunting"
2,111,title,Crop Production
3,1111,title,Oilseed and Grain Farming
4,11111,title,Soybean Farming
5,111110,title,Soybean Farming


In [10]:
from_nodes = []
to_nodes = []
for i in range(2, 6):
    mask = naics_nodes_df.term.str.len() == i
    for prefix in naics_nodes_df[mask].term.values:
        sw_mask = naics_nodes_df.term.str.startswith(prefix) & (naics_nodes_df.term.str.len() == (i + 1))
        tmp_df = naics_nodes_df[sw_mask]
        N = len(tmp_df)
        from_nodes.extend([prefix] * N)
        to_nodes.extend(tmp_df.term.values.tolist())
naics_connectivity_df = pd.DataFrame(data={
    "EdgeID": range(max_edge_id + 1, max_edge_id + 1 + len(from_nodes)),
    "FromNode": from_nodes,
    "ToNode": to_nodes,
    "EdgeLabel": "contains",
})
print(naics_connectivity_df.shape)
naics_connectivity_df.head().append(naics_connectivity_df.sample(5))

(2132, 4)


Unnamed: 0,EdgeID,FromNode,ToNode,EdgeLabel
0,1542,11,111,contains
1,1543,11,112,contains
2,1544,11,113,contains
3,1545,11,114,contains
4,1546,11,115,contains
1543,3085,33699,336999,contains
1939,3481,56171,561710,contains
1911,3453,55111,551111,contains
1544,3086,33711,337110,contains
1942,3484,56174,561740,contains


In [11]:
mask = nodes_df.term.isin(naics_nodes_df.term.values) & (nodes_df.label == 'NAICS')
nodes_df = nodes_df[~mask]
updated_nodes_df = nodes_df.append(naics_nodes_df)
updated_nodes_df.head().append(updated_nodes_df.tail())

Unnamed: 0,term,label
0,Microsoft,ORG
1,ISBN,ORG
2,China,GPE
3,Facebook,ORG
4,Apple,ORG
2192,9281,NAICS
2193,92811,NAICS
2194,928110,NAICS
2195,92812,NAICS
2196,928120,NAICS


In [12]:
updated_nodes_df.to_csv('5tables/NodeLabels.tsv', sep='\t', index=False)

In [13]:
updated_node_props_df = node_props_df.append(naics_node_props_df)
updated_node_props_df.head().append(updated_node_props_df.tail())

Unnamed: 0,NodeID,Property,Value
0,linkedin,current status,active
1,linkedin,founded,"december 28, 2002 16 years ago (2002-12-28)mou..."
2,linkedin,headquarters,"sunnyvale, california, u.s."
3,linkedin,alexarank,58 (november 2019update)1
4,linkedin,employees,"15,000 (2019)"
2192,9281,title,National Security and International Affairs
2193,92811,title,National Security
2194,928110,title,National Security
2195,92812,title,International Affairs
2196,928120,title,International Affairs


In [14]:
updated_node_props_df.to_csv('5tables/NodeProperty.tsv', sep='\t', index=False)

In [15]:
updated_connectivity_df = connectivity_df.append(naics_connectivity_df)
updated_connectivity_df.head().append(updated_connectivity_df.tail())

Unnamed: 0,EdgeID,FromNode,ToNode,EdgeLabel
0,5,adam pisoni,yammer,founded
1,6,adam somlai-fischer,prezi,founded
2,7,adeyemi ajao,tuenti,founded
3,9,allen blue,linkedin,founded
4,13,apple inc.,apple mail,developed by
2127,3669,92614,926140,contains
2128,3670,92615,926150,contains
2129,3671,92711,927110,contains
2130,3672,92811,928110,contains
2131,3673,92812,928120,contains


In [16]:
updated_connectivity_df.to_csv('5tables/Connectivity.tsv', sep='\t', index=False)