In [None]:
#!/usr/bin/env conda run -n ct_extract_env python

import pandas as pd
import os
import json

def parse_edges(data_folder):
    """Generate one nested edge document per row of the ClinTrials KG edges file.

    The file ``ClinTrials_KG_edges_v01_3.csv`` is read from ``data_folder`` as
    tab-separated text. Each row's ``subject`` is emitted with type
    ``"Disease"`` and its ``object`` with type ``"Treatment"``.

    Args:
        data_folder: Directory that contains the edges file.

    Yields:
        dict: A document with the keys ``_id``, ``subject``, ``association``,
        ``object`` and ``source``.
    """
    edges_path = os.path.join(data_folder, "ClinTrials_KG_edges_v01_3.csv")
    edge_table = pd.read_csv(edges_path, sep='\t')

    for _, edge in edge_table.iterrows():
        # The document id joins the digits of the NCT id with the local ids
        # (the part after the first ':') of the subject and object CURIEs.
        trial_number = edge["nctid"].split("NCT")[1]
        subject_parts = edge["subject"].split(':')
        subject_local_id = subject_parts[1]
        object_parts = edge["object"].split(':')
        object_local_id = object_parts[1]
        document_id = "_".join((trial_number, subject_local_id, object_local_id))

        # The CURIE prefix (e.g. "UMLS") becomes the key holding the local id.
        subject_node = {
            subject_parts[0]: subject_local_id,
            "name": edge["subject_name"],
            "type": "Disease",
        }

        predicate_name = edge["predicate"].split(':')[1]
        edge_attributes = [
            {"attribute_type_id": attribute_type, "value": attribute_value}
            for attribute_type, attribute_value in (
                ("clinicaltrials_id", edge["nctid"]),
                ("biolink:aggregator_knowledge_source", "infores:biothings-multiomics-clinicaltrials"),
                ("biolink:primary_knowledge_source", "infores:aact"),
                ("biolink:supporting_data_source", "infores:clinicaltrials"),
            )
        ]

        object_node = {
            object_parts[0]: object_local_id,
            "name": edge["object_name"],
            "type": "Treatment",
        }

        edge_sources = [
            {"resource_id": resource_id, "resource_role": resource_role}
            for resource_id, resource_role in (
                ("infores:biothings-multiomics-clinicaltrials", "aggregator_knowledge_source"),
                ("infores:aact", "primary_knowledge_source"),
                ("infores:clinicaltrials", "supporting_data_source"),
            )
        ]

        # Hand the documents to the caller one at a time.
        yield {
            "_id": document_id,
            "subject": subject_node,
            "association": {
                "predicate": predicate_name,
                "edge_attributes": edge_attributes,
            },
            "object": object_node,
            "source": {"edge_sources": edge_sources},
        }

# data_folder = "../outputs" # uncomment for testing
# parse_edges(data_folder) # uncomment for testing




In [1]:
#!/usr/bin/env conda run -n ct_extract_env python

import pandas as pd
import os
import json

In [21]:
# Read the tab-separated edges file and preview its first rows.
data_folder = "../outputs/version_1"
filename = "ClinTrials_KG_edges_v01_3.csv"
filepath = os.path.join(data_folder, filename)

edges = pd.read_csv(
    filepath,
    sep='\t',
)
edges.head()

Unnamed: 0,subject,predicate,object,subject_name,object_name,category,nctid,nctid_curie
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606


In [22]:
# Swap the subject/object roles (and their name columns) so the intervention
# becomes the subject and the disease becomes the object. `rename` looks up
# each existing label in the mapping, so a direct swap needs no placeholder
# column names.
# NOTE: re-running this cell swaps the columns back again.
edges = edges.rename(
    columns={
        'subject': 'object',
        'object': 'subject',
        'subject_name': 'object_name',
        'object_name': 'subject_name',
    }
)
edges.head()

Unnamed: 0,object,predicate,subject,object_name,subject_name,category,nctid,nctid_curie
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606


In [37]:
# Build the nested edge document for each of the first 10 rows of `edges` and
# print it as JSON for inspection. This runs after the rename cell above, so
# `subject` holds the intervention and `object` holds the disease.
# A bare `yield` is only legal inside a function, so the documents are printed
# here instead of yielded.
for index, row in edges.head(10).iterrows():
    # _id: digits of the NCT id + local id of the subject + local id of the object
    subject_prefix, subject_id = row["subject"].split(':')[0], row["subject"].split(':')[1]
    object_prefix, object_id = row["object"].split(':')[0], row["object"].split(':')[1]

    id_dict = {
        "_id": "{}_{}_{}".format(row["nctid"].split("NCT")[1], subject_id, object_id)
    }

    # The CURIE prefix (e.g. "UMLS") is used as the key for the local id.
    subject_dict = {
        subject_prefix: subject_id,
        "name": row["subject_name"],
        "type": "Treatment",
    }

    association_dict = {
        "predicate": row["predicate"].split(':')[1],
        "edge_attributes": [
            {"attribute_type_id": "clinicaltrials_id",
             "value": row["nctid"]},
            {"attribute_type_id": "biolink:aggregator_knowledge_source",
             "value": "infores:aact"},
            {"attribute_type_id": "biolink:primary_knowledge_source",
             "value": "infores:clinicaltrials"},
            {"attribute_type_id": "biolink:supporting_data_source",
             "value": "infores:biothings-multiomics-clinicaltrials"},
        ],
    }

    object_dict = {
        object_prefix: object_id,
        "name": row["object_name"],
        "type": "Disease",
    }

    source_dict = {
        "edge_sources": [
            {
                "resource_id": "infores:biothings-multiomics-clinicaltrials",
                "resource_role": "supporting_data_source"
            },
            {
                "resource_id": "infores:aact",
                "resource_role": "aggregator_knowledge_source"
            },
            {
                "resource_id": "infores:clinicaltrials",
                "resource_role": "primary_knowledge_source"
            },
        ]
    }

    id_dict["subject"] = subject_dict
    id_dict["association"] = association_dict
    id_dict["object"] = object_dict
    id_dict["source"] = source_dict

    print(json.dumps(id_dict, sort_keys=True, indent=2))
    

{
  "_id": "02460354_C0025598_C0011848",
  "association": {
    "edge_attributes": [
      {
        "attribute_type_id": "clinicaltrials_id",
        "value": "NCT02460354"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:aact"
      },
      {
        "attribute_type_id": "biolink:primary_knowledge_source",
        "value": "infores:clinicaltrials"
      },
      {
        "attribute_type_id": "biolink:supporting_data_source",
        "value": "infores:biothings-multiomics-clinicaltrials"
      }
    ],
    "predicate": "related_to"
  },
  "object": {
    "UMLS": "C0011848",
    "name": "diabetes insipidus",
    "type": "Disease"
  },
  "source": {
    "edge_sources": [
      {
        "resource_id": "infores:biothings-multiomics-clinicaltrials",
        "resource_role": "supporting_data_source"
      },
      {
        "resource_id": "infores:aact",
        "resource_role": "aggregator_knowledge_source"
      },
      {
 

In [41]:
def parse_edges(data_folder, limit=None):
    """Yield one nested edge document per row of the ClinTrials KG edges file.

    The file ``ClinTrials_KG_edges_v01_3.csv`` is read from ``data_folder`` as
    tab-separated text. The file's ``subject``/``object`` columns (and their
    ``*_name`` columns) are swapped before the documents are built, so the
    file's ``object`` is emitted as the document subject with type
    ``"Treatment"`` and the file's ``subject`` as the document object with
    type ``"Disease"``.

    Args:
        data_folder: Directory that contains the edges file.
        limit: Optional maximum number of rows to read (handy for a quick
            preview). ``None`` (the default) processes every row.

    Yields:
        dict: A document with the keys ``_id``, ``subject``, ``association``,
        ``object`` and ``source``.
    """
    filename = "ClinTrials_KG_edges_v01_3.csv"
    filepath = os.path.join(data_folder, filename)

    # nrows=None reads the whole file; an integer stops after that many rows.
    edges = pd.read_csv(filepath, sep='\t', nrows=limit)

    # `rename` looks up each existing label in the mapping, so the swap needs
    # no intermediate placeholder column names.
    edges = edges.rename(
        columns={
            'subject': 'object',
            'object': 'subject',
            'subject_name': 'object_name',
            'object_name': 'subject_name',
        }
    )

    for _, row in edges.iterrows():
        subject_parts = row["subject"].split(':')
        object_parts = row["object"].split(':')

        # _id: digits of the NCT id + local id of the subject + local id of the object
        doc_id = "{}_{}_{}".format(
            row["nctid"].split("NCT")[1], subject_parts[1], object_parts[1]
        )

        # The CURIE prefix (e.g. "UMLS") is used as the key for the local id.
        subject_dict = {
            subject_parts[0]: subject_parts[1],
            "name": row["subject_name"],
            "type": "Treatment",
        }

        association_dict = {
            "predicate": row["predicate"].split(':')[1],
            "edge_attributes": [
                {"attribute_type_id": "clinicaltrials_id",
                 "value": row["nctid"]},
                {"attribute_type_id": "biolink:aggregator_knowledge_source",
                 "value": "infores:aact"},
                {"attribute_type_id": "biolink:primary_knowledge_source",
                 "value": "infores:clinicaltrials"},
                {"attribute_type_id": "biolink:supporting_data_source",
                 "value": "infores:biothings-multiomics-clinicaltrials"},
            ],
        }

        object_dict = {
            object_parts[0]: object_parts[1],
            "name": row["object_name"],
            "type": "Disease",
        }

        source_dict = {
            "edge_sources": [
                {
                    "resource_id": "infores:biothings-multiomics-clinicaltrials",
                    "resource_role": "supporting_data_source"
                },
                {
                    "resource_id": "infores:aact",
                    "resource_role": "aggregator_knowledge_source"
                },
                {
                    "resource_id": "infores:clinicaltrials",
                    "resource_role": "primary_knowledge_source"
                },
            ]
        }

        # yield the documents one by one
        yield {
            "_id": doc_id,
            "subject": subject_dict,
            "association": association_dict,
            "object": object_dict,
            "source": source_dict,
        }
        
def main():
    """Print a short JSON preview of the documents produced by ``parse_edges``.

    ``parse_edges`` is a generator function, so calling it only creates the
    generator; the documents are built when it is iterated.
    """
    from itertools import islice  # local import keeps this cell self-contained

    data_folder = "../outputs/version_1"
    preview_count = 10  # cap the printed output

    for record in islice(parse_edges(data_folder), preview_count):
        print(json.dumps(record, sort_keys=True, indent=2))


if __name__ == "__main__":
    main()


{
  "_id": "02460354_C0025598_C0011848",
  "association": {
    "edge_attributes": [
      {
        "attribute_type_id": "clinicaltrials_id",
        "value": "NCT02460354"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:aact"
      },
      {
        "attribute_type_id": "biolink:primary_knowledge_source",
        "value": "infores:clinicaltrials"
      },
      {
        "attribute_type_id": "biolink:supporting_data_source",
        "value": "infores:biothings-multiomics-clinicaltrials"
      }
    ],
    "predicate": "related_to"
  },
  "object": {
    "UMLS": "C0011848",
    "name": "diabetes insipidus",
    "type": "Disease"
  },
  "source": {
    "edge_sources": [
      {
        "resource_id": "infores:biothings-multiomics-clinicaltrials",
        "resource_role": "supporting_data_source"
      },
      {
        "resource_id": "infores:aact",
        "resource_role": "aggregator_knowledge_source"
      },
      {
 