In [1]:
%reset -f

In [2]:
from pathlib import Path
import re
import tarfile

import pandas as pd

In [3]:
# Merged-KG tarball, located relative to the parent of the current working directory.
MERGED_TAR = Path.cwd().parent.joinpath("data/merged/merged-kg.tar.gz")

In [4]:
# Read the node and edge tables directly out of the merged-KG tarball.
# The `with` block guarantees the archive is closed even if parsing fails.
df_nodes = None
df_edges = None
with tarfile.open(MERGED_TAR, "r:gz") as tar:
    for member in tar.getmembers():
        is_nodes = '_nodes.tsv' in member.name
        is_edges = not is_nodes and '_edges.tsv' in member.name
        if not (is_nodes or is_edges):
            continue  # unrelated member: do not open it at all
        f = tar.extractfile(member)
        if f is None:
            continue  # extractfile() returns None for non-file members
        table = pd.read_csv(f, sep='\t', low_memory=False)
        if is_nodes:
            df_nodes = table
        else:
            df_edges = table

# Fail here with a clear message rather than with a NameError in a later cell.
if df_nodes is None or df_edges is None:
    raise FileNotFoundError(
        f"Expected '*_nodes.tsv' and '*_edges.tsv' members in {MERGED_TAR}"
    )

In [5]:
# Preview the first five rows of the node table.
df_nodes.head(n=5)

Unnamed: 0,id,category,name,description,xref,provided_by,synonym,iri,object,predicate,relation,same_as,subject,subsets
0,IAO:0000115,biolink:OntologyClass,definition,,,Graph,,http://purl.obolibrary.org/obo/IAO_0000115|htt...,,,,,,
1,NCBITaxon:1,biolink:OrganismalEntity,root,,GC_ID:1|PMID:30365038|PMID:32761142,Graph,all,http://purl.obolibrary.org/obo/NCBITaxon_1,,,,,,
2,NCBITaxon:10,biolink:OrganismalEntity,Cellvibrio,,GC_ID:11|PMID:12710603|PMID:24105943,Graph,,http://purl.obolibrary.org/obo/NCBITaxon_10,,,,,,
3,NCBITaxon:100,biolink:OrganismTaxon|biolink:OrganismalEntity,Ancylobacter aquaticus,,GC_ID:11,Graph,Microcyclus aquaticus,http://purl.obolibrary.org/obo/NCBITaxon_100,,,,,,
4,NCBITaxon:100000,biolink:OrganismalEntity,Herbaspirillum sp. BA12,,GC_ID:11,Graph,Herbispirillum sp. BA12,http://purl.obolibrary.org/obo/NCBITaxon_100000,,,,,,


In [6]:
# Preview the first five rows of the edge table.
df_edges.head(n=5)

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
0,urn:uuid:c672b249-cb5c-4ed6-b6f6-944bbf690d10,NCBITaxon:10,biolink:subclass_of,NCBITaxon:1706371,rdfs:subClassOf,,Graph
1,urn:uuid:a9b6d704-f957-483d-a02f-5b0dda8b2aca,NCBITaxon:100,biolink:subclass_of,NCBITaxon:99,rdfs:subClassOf,,Graph
2,urn:uuid:0f757f39-b6d6-4dae-a76d-96be13967e3f,NCBITaxon:100,biolink:occurs_in,mediadive.medium:7,BAO:0002924,,Graph
3,urn:uuid:d0adb6aa-777a-4bf7-a608-3a75ea59173a,NCBITaxon:100,biolink:capable_of,traits.pathways:hydrogen_oxidation_dark,RO:0002215,,Graph
4,urn:uuid:3389a8ad-1c31-4f61-a1b2-0b0266e4d212,NCBITaxon:100,biolink:capable_of,traits.pathways:aerobic_chemo_heterotrophy,RO:0002215,,Graph


In [7]:
# Edges whose object contains the text "ingredient" anywhere in its value.
# regex=False: the needle is plain text, not a pattern.
# na=False: a missing object counts as a non-match instead of producing a
# NaN entry in the mask, which .loc cannot index with.
all_ingredients_obj = df_edges.loc[
    df_edges['object'].str.contains("ingredient", regex=False, na=False)
]

In [8]:
# Distinct prefixes (text before the first ":") among the subjects of the
# ingredient-object edges.
ingredient_subject_prefixes = all_ingredients_obj['subject'].str.split(":").str.get(0)
ingredient_subject_prefixes.unique()

array(['mediadive.medium', 'mediadive.solution'], dtype=object)

In [9]:
# Distinct prefixes (text before the first ":") among all edge subjects.
edge_subject_prefixes = df_edges['subject'].str.split(":").str.get(0)
edge_subject_prefixes.unique()

array(['NCBITaxon', 'OBO', 'CHEBI', 'mediadive.medium',
       'mediadive.solution'], dtype=object)

In [10]:
# Distinct prefixes (text before the first ":") among all edge objects.
edge_object_prefixes = df_edges['object'].str.split(":").str.get(0)
edge_object_prefixes.unique()


array(['NCBITaxon', 'mediadive.medium', 'traits.pathways', 'ENVO',
       'ECOCORE', 'traits.cell_shape_enum', 'UBERON',
       'traits.data_source', 'CHEBI', 'GO', ' ENVO',
       'traits.carbon_substrates', 'PO', 'FOODON', 'OBO',
       'mediadive.solution', 'mediadive.ingredient', 'CAS-RN', 'PubChem',
       'KEGG'], dtype=object)

In [11]:
# All edges whose subject is exactly "mediadive.medium:1a".
is_medium_1a = df_edges['subject'].eq("mediadive.medium:1a")
df_edges.loc[is_medium_1a]

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
1239035,urn:uuid:585e79a6-e793-4e1a-a58f-1f20e47fe106,mediadive.medium:1a,biolink:has_part,mediadive.solution:3,BFO:0000051,,Graph
1239036,urn:uuid:b5cb5889-7d7c-41bb-aabf-02a9de65bbdb,mediadive.medium:1a,biolink:has_part,mediadive.ingredient:2,BFO:0000051,,Graph


In [12]:
# All edges whose subject is exactly "mediadive.solution:3".
is_solution_3 = df_edges['subject'].eq("mediadive.solution:3")
df_edges.loc[is_solution_3]

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
1249641,urn:uuid:c91d45a3-e9c3-49ab-8b40-f47b0e73e9e4,mediadive.solution:3,biolink:has_part,CAS-RN:73049-73-7,BFO:0000051,,Graph
1249642,urn:uuid:33df28a0-5f04-41e7-aee0-d4ea5575f104,mediadive.solution:3,biolink:has_part,mediadive.ingredient:2,BFO:0000051,,Graph
1249643,urn:uuid:ce34dc43-15f6-4a4a-ae3f-dc9f91147499,mediadive.solution:3,biolink:has_part,CHEBI:2509,BFO:0000051,,Graph
1249644,urn:uuid:60bb08e5-2f2e-414f-b977-e4f6ad8f6124,mediadive.solution:3,biolink:has_part,CHEBI:15377,BFO:0000051,,Graph


In [13]:
# Select MediaDive composition edges: the subject mentions a medium or
# solution prefix and the object mentions a solution or ingredient prefix.
subj_pattern = ['mediadive.medium:', 'mediadive.solution:']
obj_pattern = ['mediadive.solution:', 'mediadive.ingredient:']

# The prefixes are literal text, so each is escaped before being joined into
# a regex alternation; unescaped, "." would match any character.
# na=False makes a missing subject/object a non-match rather than a NaN mask entry.
condition_1 = df_edges['subject'].str.contains('|'.join(map(re.escape, subj_pattern)), na=False)
condition_2 = df_edges['object'].str.contains('|'.join(map(re.escape, obj_pattern)), na=False)

df_mediadive = df_edges[condition_1 & condition_2]
df_mediadive

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
1238696,urn:uuid:c42c2fe3-9311-427f-9065-a6a51d559fbd,mediadive.medium:1669,biolink:has_part,mediadive.solution:3464,BFO:0000051,,Graph
1238697,urn:uuid:7c80c163-becc-4caf-888f-b43b2bd12eed,mediadive.medium:1669,biolink:has_part,mediadive.ingredient:156,BFO:0000051,,Graph
1238698,urn:uuid:cf686fd5-c286-4112-bac7-fc9ec67f2be4,mediadive.medium:92,biolink:has_part,mediadive.solution:161,BFO:0000051,,Graph
1238699,urn:uuid:3d68198c-eac5-41dd-b35c-95cbd862af9a,mediadive.medium:92,biolink:has_part,mediadive.ingredient:99,BFO:0000051,,Graph
1238700,urn:uuid:d6162fa5-f52a-44d0-81b6-e6491a5c52c4,mediadive.medium:1609,biolink:has_part,mediadive.solution:3351,BFO:0000051,,Graph
...,...,...,...,...,...,...,...
1340837,urn:uuid:60b08e4e-410d-4bc9-ad97-883486d586b3,mediadive.medium:C109,biolink:has_part,mediadive.solution:6100,BFO:0000051,,Graph
1340838,urn:uuid:eac11dab-9a49-43d1-9360-3ea8a3907cf6,mediadive.medium:C110,biolink:has_part,mediadive.solution:6101,BFO:0000051,,Graph
1340839,urn:uuid:14c685ac-f3ad-43e8-8490-80eec5eeb036,mediadive.medium:C110,biolink:has_part,mediadive.solution:6248,BFO:0000051,,Graph
1340848,urn:uuid:7d168e7c-65ba-4b2b-842c-b7fb162259ab,mediadive.solution:6101,biolink:has_part,mediadive.solution:6248,BFO:0000051,,Graph


In [14]:
# Medium -> solution edges: subject starts with "mediadive.medium:" and
# object starts with "mediadive.solution:".
mediadive_subjects = df_mediadive["subject"]
mediadive_objects = df_mediadive["object"]
medium_subject_condition = mediadive_subjects.str.startswith("mediadive.medium:")
solution_object_condition = mediadive_objects.str.startswith("mediadive.solution:")

medium_solution_df = df_mediadive.loc[medium_subject_condition & solution_object_condition]
medium_solution_df

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
1238696,urn:uuid:c42c2fe3-9311-427f-9065-a6a51d559fbd,mediadive.medium:1669,biolink:has_part,mediadive.solution:3464,BFO:0000051,,Graph
1238698,urn:uuid:cf686fd5-c286-4112-bac7-fc9ec67f2be4,mediadive.medium:92,biolink:has_part,mediadive.solution:161,BFO:0000051,,Graph
1238700,urn:uuid:d6162fa5-f52a-44d0-81b6-e6491a5c52c4,mediadive.medium:1609,biolink:has_part,mediadive.solution:3351,BFO:0000051,,Graph
1238703,urn:uuid:1702102b-37d7-4c03-a53a-cc36f1f604bf,mediadive.medium:830,biolink:has_part,mediadive.solution:1685,BFO:0000051,,Graph
1238707,urn:uuid:85b3b579-b9d5-4f65-9470-670f6d572c2d,mediadive.medium:65,biolink:has_part,mediadive.solution:106,BFO:0000051,,Graph
...,...,...,...,...,...,...,...
1340795,urn:uuid:bd4f0b42-29a3-4460-affd-dd1d46ae20b7,mediadive.medium:C108,biolink:has_part,mediadive.solution:6160,BFO:0000051,,Graph
1340796,urn:uuid:2b53712c-9e53-4231-911b-b2dbddc8343a,mediadive.medium:C108,biolink:has_part,mediadive.solution:6246,BFO:0000051,,Graph
1340837,urn:uuid:60b08e4e-410d-4bc9-ad97-883486d586b3,mediadive.medium:C109,biolink:has_part,mediadive.solution:6100,BFO:0000051,,Graph
1340838,urn:uuid:eac11dab-9a49-43d1-9360-3ea8a3907cf6,mediadive.medium:C110,biolink:has_part,mediadive.solution:6101,BFO:0000051,,Graph


In [15]:
# Solution -> ingredient edges: subject starts with "mediadive.solution:" and
# object starts with "mediadive.ingredient:".
mediadive_subjects = df_mediadive["subject"]
mediadive_objects = df_mediadive["object"]
solution_subject_condition = mediadive_subjects.str.startswith("mediadive.solution:")
ingredient_object_condition = mediadive_objects.str.startswith("mediadive.ingredient:")

solution_ingredient_df = df_mediadive.loc[solution_subject_condition & ingredient_object_condition]
solution_ingredient_df

Unnamed: 0,id,subject,predicate,object,relation,provided_by,knowledge_source
1249568,urn:uuid:653df055-46ac-4a5a-b881-b8e8c4804d9b,mediadive.solution:1,biolink:has_part,mediadive.ingredient:2,BFO:0000051,,Graph
1249573,urn:uuid:1d669c12-6bcc-49fd-989c-8436ca92721d,mediadive.solution:1,biolink:has_part,mediadive.ingredient:2111,BFO:0000051,,Graph
1249574,urn:uuid:66116aa2-411a-4ca3-be36-6b30014ed72a,mediadive.solution:1,biolink:has_part,mediadive.ingredient:724,BFO:0000051,,Graph
1249575,urn:uuid:20396f7b-2db6-4061-9e0c-e97b236f7513,mediadive.solution:1,biolink:has_part,mediadive.ingredient:2110,BFO:0000051,,Graph
1249577,urn:uuid:ad93aa8e-920c-474e-a114-39110088c83e,mediadive.solution:1,biolink:has_part,mediadive.ingredient:2112,BFO:0000051,,Graph
...,...,...,...,...,...,...,...
1340686,urn:uuid:013ef9c3-c5d7-4301-bb2c-cdc4374a3b73,mediadive.solution:6241,biolink:has_part,mediadive.ingredient:286,BFO:0000051,,Graph
1340700,urn:uuid:486fe651-41c2-4fb4-bcae-6b8bf36befb3,mediadive.solution:6242,biolink:has_part,mediadive.ingredient:286,BFO:0000051,,Graph
1340803,urn:uuid:868448b2-db0e-48c7-988b-1ba8557bf0b2,mediadive.solution:6247,biolink:has_part,mediadive.ingredient:286,BFO:0000051,,Graph
1340809,urn:uuid:b61b5cd7-1312-43fc-8c7a-f9bec69d1904,mediadive.solution:6099,biolink:has_part,mediadive.ingredient:286,BFO:0000051,,Graph
