In [1]:
# display cells to maximum width 
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# lets you preint multiple outputs per cell, not just last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#!/usr/bin/env conda run -n ct_extract_env python

import pandas as pd
import os
import json

def parse_edges(data_folder):
    filename = "ClinTrials_KG_edges_v01_3.csv"
    filepath = os.path.join(data_folder, filename)

    edges = pd.read_csv(filepath, sep='\t')
    for index, row in edges.iterrows():
        id_dict = {}
        subject_dict = {}
        association_dict = {}
        object_dict = {}
        source_dict = {}

        # id generated by concatenating the following: numbers from nctid, CUI of subject, CUI of object
        id_dict["_id"] = "{}_{}_{}".format(row["nctid"].split("NCT")[1], row["subject"].split(':')[1], row["object"].split(':')[1])

        subject_dict["{}".format(row["subject"].split(':')[0])] = "{}".format(row["subject"].split(':')[1])
        subject_dict["name"] = row["subject_name"]
        # subject_dict["{}_semantic_types".format(row["subject"].split(':')[0])] = "TBD" # fix in next version
        subject_dict["type"] = "Disease"

        association_dict["predicate"] = "{}".format(row["predicate"].split(':')[1])
        association_dict["edge_attributes"] = []
        association_dict["edge_attributes"].append(
            {"attribute_type_id":"clinicaltrials_id",
             "value":row["nctid"]
            }
        )
        association_dict["edge_attributes"].append(
            {"attribute_type_id":"biolink:aggregator_knowledge_source",
             "value":"infores:biothings-multiomics-clinicaltrials"}
        )
        association_dict["edge_attributes"].append(
            {"attribute_type_id": "biolink:primary_knowledge_source",
             "value": "infores:aact"}
        )
        association_dict["edge_attributes"].append(
            {"attribute_type_id": "biolink:supporting_data_source",
             "value": "infores:clinicaltrials"})

        object_dict["{}".format(row["object"].split(':')[0])] = "{}".format(row["object"].split(':')[1])
        object_dict["name"] = row["object_name"]
        object_dict["type"] = "Treatment"
        # object_dict["{}_semantic_types".format(row["object"].split(':')[0])] = "TBD" # fix in next version
        
        source_dict["edge_sources"] = []
        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:biothings-multiomics-clinicaltrials",
                "resource_role": "aggregator_knowledge_source"
            }
        )
        
        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:aact",
                "resource_role": "primary_knowledge_source"
            }
        )
        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:clinicaltrials",
                "resource_role": "supporting_data_source"
            }
        )

        id_dict["subject"] = subject_dict
        id_dict["association"] = association_dict
        id_dict["object"] = object_dict 
        id_dict["source"] = source_dict
        
        # print(json.dumps(id_dict, indent=2)) # uncomment for testing

        # yield the JSON one by one
        yield id_dict # comment for testing

# data_folder = "../outputs" # uncomment for testing
# parse_edges(data_folder) # uncomment for testing




In [1]:
#!/usr/bin/env conda run -n ct_extract_env python

import pandas as pd
import os
import json

In [21]:
data_folder = "./outputs/version_1" # uncomment for testing
filename = "ClinTrials_KG_edges_v01_3.csv"
filepath = os.path.join(data_folder, filename)

edges = pd.read_csv(filepath, sep='\t')
edges.head()

Unnamed: 0,subject,predicate,object,subject_name,object_name,category,nctid,nctid_curie
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606


In [22]:
edges.rename(columns={'subject': 'disease', 'object': 'intervention', 'subject_name': 'disease_name', 'object_name': 'intervention_name'}, inplace=True)
edges.rename(columns={'disease': 'object', 'intervention': 'subject', 'disease_name': 'object_name', 'intervention_name': 'subject_name'}, inplace=True)
edges.head()

Unnamed: 0,object,predicate,subject,object_name,subject_name,category,nctid,nctid_curie
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606


In [46]:
for index, row in edges[:10].iterrows():
    id_dict = {}
    subject_dict = {}
    association_dict = {}
    object_dict = {}
    source_dict = {}
    
    id_dict["_id"] = "{}_{}_{}".format(row["nctid"].split("NCT")[1], row["subject"].split(':')[1], row["object"].split(':')[1])
#     print(id_dict)
    
    subject_dict["{}".format(row["subject"].split(':')[0])] = "{}".format(row["subject"].split(':')[1])
    subject_dict["name"] = row["subject_name"]
    # subject_dict["{}_semantic_types".format(row["subject"].split(':')[0])] = "TBD" # fix in next version
    subject_dict["type"] = "biolink:Treatment"
#     print(subject_dict)
    
    association_dict["predicate"] = "{}".format(row["predicate"].split(':')[1])
    association_dict["edge_attributes"] = []
    association_dict["edge_attributes"].append(
        {"attribute_type_id":"clinicaltrials_id",
         "value":row["nctid"]
        }
    )
    association_dict["edge_attributes"].append(
        {"attribute_type_id":"biolink:aggregator_knowledge_source",
         "value":"infores:aact"}
        )
    association_dict["edge_attributes"].append(
        {"attribute_type_id": "biolink:primary_knowledge_source",
         "value": "infores:clinicaltrials"}
    )
    association_dict["edge_attributes"].append(
    {"attribute_type_id": "biolink:aggregator_knowledge_source",
     "value": "infores:biothings-multiomics-clinicaltrials"})
#     print(association_dict)
    
    object_dict["{}".format(row["object"].split(':')[0])] = "{}".format(row["object"].split(':')[1])
    object_dict["name"] = row["object_name"]
    object_dict["type"] = "biolink:DiseaseorPhenotypicFeature"
    # object_dict["{}_semantic_types".format(row["object"].split(':')[0])] = "TBD" # fix in next version
#     print(object_dict)

    source_dict["edge_sources"] = []
    source_dict["edge_sources"].append(
        {
            "resource_id": "infores:biothings-multiomics-clinicaltrials",
            "resource_role": "aggregator_knowledge_source"
        }
    )

    source_dict["edge_sources"].append(
        {
            "resource_id": "infores:aact",
            "resource_role": "aggregator_knowledge_source"
        }
    )
    source_dict["edge_sources"].append(
        {
            "resource_id": "infores:clinicaltrials",
            "resource_role": "primary_knowledge_source"
        }
    )
    
    id_dict["subject"] = subject_dict
    id_dict["association"] = association_dict
    id_dict["object"] = object_dict 
    id_dict["source"] = source_dict
    
    print(json.dumps(id_dict,sort_keys=True, indent=2))

    # yield the JSON one by one
#     yield id_dict # comment for testing
    
    
data_folder = "./outputs" # uncomment for testing
parse_edges(data_folder) # uncomment for testing  
    

{
  "_id": "02460354_C0025598_C0011848",
  "association": {
    "edge_attributes": [
      {
        "attribute_type_id": "clinicaltrials_id",
        "value": "NCT02460354"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:aact"
      },
      {
        "attribute_type_id": "biolink:primary_knowledge_source",
        "value": "infores:clinicaltrials"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:biothings-multiomics-clinicaltrials"
      }
    ],
    "predicate": "related_to"
  },
  "object": {
    "UMLS": "C0011848",
    "name": "diabetes insipidus",
    "type": "biolink:DiseaseorPhenotypicFeature"
  },
  "source": {
    "edge_sources": [
      {
        "resource_id": "infores:biothings-multiomics-clinicaltrials",
        "resource_role": "aggregator_knowledge_source"
      },
      {
        "resource_id": "infores:aact",
        "resource_role": "aggregator

FileNotFoundError: [Errno 2] File ../outputs/ClinTrials_KG_edges_v01_3.csv does not exist: '../outputs/ClinTrials_KG_edges_v01_3.csv'

In [47]:
def parse_edges(data_folder):
    filename = "ClinTrials_KG_edges_v01_3.csv"
    filepath = os.path.join(data_folder, filename)

    edges = pd.read_csv(filepath, sep='\t')
    edges.rename(columns={'subject': 'disease', 'object': 'intervention', 'subject_name': 'disease_name', 'object_name': 'intervention_name'}, inplace=True)
    edges.rename(columns={'disease': 'object', 'intervention': 'subject', 'disease_name': 'object_name', 'intervention_name': 'subject_name'}, inplace=True)

    for index, row in edges[:10].iterrows():
        id_dict = {}
        subject_dict = {}
        association_dict = {}
        object_dict = {}
        source_dict = {}

        id_dict["_id"] = "{}_{}_{}".format(row["nctid"].split("NCT")[1], row["subject"].split(':')[1], row["object"].split(':')[1])

        subject_dict["{}".format(row["subject"].split(':')[0])] = "{}".format(row["subject"].split(':')[1])
        subject_dict["name"] = row["subject_name"]
        # subject_dict["{}_semantic_types".format(row["subject"].split(':')[0])] = "TBD" # fix in next version
        subject_dict["type"] = "biolink:Treatment"

        association_dict["predicate"] = "{}".format(row["predicate"].split(':')[1])
        association_dict["edge_attributes"] = []
        association_dict["edge_attributes"].append(
            {"attribute_type_id":"clinicaltrials_id",
             "value":row["nctid"]
            }
        )
        association_dict["edge_attributes"].append(
            {"attribute_type_id":"biolink:aggregator_knowledge_source",
             "value":"infores:aact"}
            )
        association_dict["edge_attributes"].append(
            {"attribute_type_id": "biolink:primary_knowledge_source",
             "value": "infores:clinicaltrials"}
        )
        association_dict["edge_attributes"].append(
        {"attribute_type_id": "biolink:aggregator_knowledge_source",
         "value": "infores:biothings-multiomics-clinicaltrials"})

        object_dict["{}".format(row["object"].split(':')[0])] = "{}".format(row["object"].split(':')[1])
        object_dict["name"] = row["object_name"]
        object_dict["type"] = "DiseaseorPhenotypicFeature"
        # object_dict["{}_semantic_types".format(row["object"].split(':')[0])] = "TBD" # fix in next version

        source_dict["edge_sources"] = []
        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:biothings-multiomics-clinicaltrials",
                "resource_role": "aggregator_knowledge_source"
            }
        )

        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:aact",
                "resource_role": "aggregator_knowledge_source"
            }
        )
        source_dict["edge_sources"].append(
            {
                "resource_id": "infores:clinicaltrials",
                "resource_role": "primary_knowledge_source"
            }
        )

        id_dict["subject"] = subject_dict
        id_dict["association"] = association_dict
        id_dict["object"] = object_dict 
        id_dict["source"] = source_dict

        print(json.dumps(id_dict,sort_keys=True, indent=2))

        # yield the JSON one by one
#         yield id_dict # comment for testing
        
def main():
    data_folder = "../outputs/version_1" # uncomment for testing
    parse_edges(data_folder) # uncomment for testing

if __name__ == "__main__":
    main()


{
  "_id": "02460354_C0025598_C0011848",
  "association": {
    "edge_attributes": [
      {
        "attribute_type_id": "clinicaltrials_id",
        "value": "NCT02460354"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:aact"
      },
      {
        "attribute_type_id": "biolink:primary_knowledge_source",
        "value": "infores:clinicaltrials"
      },
      {
        "attribute_type_id": "biolink:aggregator_knowledge_source",
        "value": "infores:biothings-multiomics-clinicaltrials"
      }
    ],
    "predicate": "related_to"
  },
  "object": {
    "UMLS": "C0011848",
    "name": "diabetes insipidus",
    "type": "DiseaseorPhenotypicFeature"
  },
  "source": {
    "edge_sources": [
      {
        "resource_id": "infores:biothings-multiomics-clinicaltrials",
        "resource_role": "aggregator_knowledge_source"
      },
      {
        "resource_id": "infores:aact",
        "resource_role": "aggregator_knowled

# Section is to tie Adverse Events and Phase to Version 1.3 TSVs

In [2]:
import pandas as pd
import os
import json

In [3]:
data_folder = "./outputs/version_1" # uncomment for testing
filename = "ClinTrials_KG_edges_v01_3.csv"
filepath = os.path.join(data_folder, filename)

edges = pd.read_csv(filepath, sep='\t')
edges

Unnamed: 0,subject,predicate,object,subject_name,object_name,category,nctid,nctid_curie
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606
...,...,...,...,...,...,...,...,...
55308,UMLS:C4288754,biolink:related_to,UMLS:C2826354,metastatic urothelial carcinoma,ixazomib citrate,biolink:Association,NCT02420847,clinicaltrials:NCT02420847
55309,UMLS:C4721698,biolink:related_to,UMLS:C1516615,metastatic renal cell carcinoma,clinical management,biolink:Association,NCT04467021,clinicaltrials:NCT04467021
55310,UMLS:C0410808,biolink:related_to,UMLS:C4507320,prosthetic joint infection,tnp-2092,biolink:Association,NCT04294862,clinicaltrials:NCT04294862
55311,UMLS:C0017638,biolink:related_to,UMLS:C5207035,glioma,fitbit,biolink:Association,NCT04186832,clinicaltrials:NCT04186832


In [4]:
data_extracted = "/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted"
reported_event_totals = pd.read_csv(data_extracted + '/reported_event_totals.txt', sep='|', index_col=False, header=0)
studies = pd.read_csv(data_extracted + '/studies.txt', sep='|', index_col=False, header=0)
reported_event_totals
with pd.option_context('display.max_rows', 20, 'display.max_columns', None): 
    display(studies)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,nct_id,ctgov_group_code,event_type,classification,subjects_affected,subjects_at_risk,created_at,updated_at
0,28806901,NCT00045942,EG000,serious,"Total, serious adverse events",12.0,20.0,2023-08-18 18:18:16.215445,2023-08-18 18:18:16.215445
1,28806902,NCT00045942,EG000,other,"Total, other adverse events",20.0,20.0,2023-08-18 18:18:16.215445,2023-08-18 18:18:16.215445
2,28806903,NCT00045942,EG000,deaths,"Total, all-cause mortality",,,2023-08-18 18:18:16.215445,2023-08-18 18:18:16.215445
3,28806904,NCT00045942,EG001,serious,"Total, serious adverse events",16.0,18.0,2023-08-18 18:18:16.215445,2023-08-18 18:18:16.215445
4,28806905,NCT00045942,EG001,other,"Total, other adverse events",17.0,18.0,2023-08-18 18:18:16.215445,2023-08-18 18:18:16.215445
...,...,...,...,...,...,...,...,...,...
434338,28980863,NCT01059357,EG000,other,"Total, other adverse events",7.0,54.0,2023-08-20 19:01:55.927501,2023-08-20 19:01:55.927501
434339,28980864,NCT01059357,EG000,deaths,"Total, all-cause mortality",0.0,54.0,2023-08-20 19:01:55.927501,2023-08-20 19:01:55.927501
434340,28980865,NCT01360398,EG000,serious,"Total, serious adverse events",0.0,127.0,2023-08-20 19:01:58.615882,2023-08-20 19:01:58.615882
434341,28980866,NCT01360398,EG000,other,"Total, other adverse events",0.0,0.0,2023-08-20 19:01:58.615882,2023-08-20 19:01:58.615882


Unnamed: 0,nct_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,source_class,delayed_posting,expanded_access_nctid,expanded_access_status_for_nctid,fdaaa801_violation,baseline_type_units_analyzed
0,NCT03971825,,2019-05-31,,,2021-08-24,2019-05-31,2019-06-03,Actual,,,,,,,2021-08-24,2021-08-30,Actual,"July 24, 2018",Actual,2018-07-24,August 2021,2021-08-31,"August 5, 2021",Actual,2021-08-05,"August 5, 2021",Actual,2021-08-05,,Interventional,,,A Safety Study of CC-92252 in Healthy Adult Su...,"A Phase 1, Randomized, 3-Part Study to Evaluat...",Terminated,,Phase 1,131.0,Actual,Celgene,,2.0,,Did not meet progression criteria,f,,,,f,t,f,,,t,,,See Plan Description,See Plan Description,https://www.celgene.com/research-development/c...,Yes,Information relating to our policy on data sha...,2023-08-18 18:38:13.337008,2023-08-18 18:38:13.337008,INDUSTRY,,,,,
1,NCT04856488,,2021-04-06,,,2022-01-18,2021-04-19,2021-04-23,Actual,,,,,,,2022-01-18,2022-01-19,Actual,"November 18, 2021",Actual,2021-11-18,January 2022,2022-01-31,February 2024,Anticipated,2024-02-29,August 2023,Anticipated,2023-08-31,,Interventional,,,Preoperative Lugol's Solution in Graves' Disea...,Preoperative Lugol's Solution in Graves' Disea...,Recruiting,,Phase 3,182.0,Anticipated,Karolinska University Hospital,,2.0,,,f,,,,,f,f,,,,,,The original contributions presented in the st...,Further inquiries can be directed to the corre...,,Yes,The original contributions presented in the st...,2023-08-18 18:38:14.03975,2023-08-18 18:38:14.03975,OTHER,,,,,
2,NCT01126554,,2010-05-17,,,2014-04-28,2010-05-18,2010-05-19,Estimate,,,,,,,2014-04-28,2014-04-29,Estimate,July 2010,,2010-07-31,April 2014,2014-04-30,February 2011,Actual,2011-02-28,December 2010,Actual,2010-12-31,,Observational,Greenpep,,ICG- Liver Test Versus New Biomarkers as Progn...,ICG- Liver Test Versus New Biomarkers as Progn...,Completed,,,110.0,Anticipated,University of Zurich,,,1.0,,f,,,,t,,,,,,,,,,,,,2023-08-18 18:38:14.723441,2023-08-18 18:38:14.723441,OTHER,,,,,
3,NCT01072019,,2010-02-18,,,2018-06-25,2010-02-18,2010-02-19,Estimate,,,,,,,2018-06-25,2018-06-27,Actual,March 2010,,2010-03-31,June 2018,2018-06-30,May 2014,Actual,2014-05-31,May 2014,Actual,2014-05-31,,Interventional,Cutting Block,,A Randomized Prospective Trial of Total Knee A...,Comparative Clinical Study of the Vanguard® Kn...,Completed,,Not Applicable,25.0,Actual,Washington University School of Medicine,,2.0,,,f,,,,t,,,,,,,,,,,,,2023-08-18 18:38:15.338961,2023-08-18 18:38:15.338961,OTHER,,,,,
4,NCT01126632,,2010-05-15,,,2016-01-27,2010-05-18,2010-05-20,Estimate,,,,,,,2016-01-27,2016-01-28,Estimate,June 2010,,2010-06-30,January 2016,2016-01-31,January 2012,Actual,2012-01-31,January 2012,Actual,2012-01-31,,Observational,CAC,,Will Cap-Assisted Colonoscopy Improve Performa...,Will Cap-Assisted Colonoscopy Improve Performa...,Withdrawn,,,0.0,Actual,Nova Scotia Health Authority,,,2.0,Caps became unavailable by manufacturer,f,,,,f,,,,,,,,,,,No,Study was terminated,2023-08-18 18:38:15.961115,2023-08-18 18:38:15.961115,OTHER,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462810,NCT03351387,,2017-11-20,,,2021-12-15,2017-11-20,2017-11-22,Actual,,,,,,,2021-12-15,2021-12-17,Actual,"October 20, 2017",Actual,2017-10-20,December 2021,2021-12-31,"April 18, 2018",Actual,2018-04-18,"April 18, 2018",Actual,2018-04-18,,Observational,,,Wound Necrosis in Lower Extremity Surgery,Assessment of Wound Necrosis in Lower Extremit...,Completed,,,10.0,Actual,Mayo Clinic,,,1.0,,f,,,,f,f,t,,,f,,,,,,No,,2023-08-19 17:16:46.976535,2023-08-19 17:16:46.976535,OTHER,,,,,
462811,NCT05165589,,2021-10-15,,,2021-12-14,2021-12-14,2021-12-21,Actual,,,,,,,2021-12-14,2021-12-21,Actual,"October 15, 2021",Actual,2021-10-15,December 2021,2021-12-31,April 2022,Anticipated,2022-04-30,January 2022,Anticipated,2022-01-31,,Interventional,,,Validation of Active-Insights Device to Measur...,Pilot & Exploratory - Validation of Active-Ins...,Recruiting,,Not Applicable,13.0,Anticipated,BioGaia AB,,2.0,,,f,,,,,f,f,,,,,,,,,No,,2023-08-19 17:16:47.599338,2023-08-19 17:16:47.599338,INDUSTRY,,,,,
462812,NCT01376674,,2011-06-16,,,2013-08-15,2011-06-16,2011-06-20,Estimate,,,,,,,2013-08-15,2013-08-16,Estimate,March 2011,,2011-03-31,August 2013,2013-08-31,,,,,,,,Observational,,,T-cell-immunity During Standard Radiotherapy i...,Changes of T-cell-immune-status During Curativ...,Completed,,,22.0,Actual,University Hospital Tuebingen,,,,,f,,,,f,,,,,,Samples With DNA,Peripheral blood mononuclear cells and serum a...,,,,,,2023-08-18 18:38:11.032433,2023-08-18 18:38:11.032433,OTHER,,,,,
462813,NCT01126476,,2010-05-17,,,2020-04-23,2010-05-18,2010-05-19,Estimate,,,,,,,2020-04-23,2020-04-24,Actual,February 2010,Actual,2010-02-28,April 2020,2020-04-30,April 2020,Actual,2020-04-30,February 2017,Actual,2017-02-28,,Interventional,,,Proton Radiotherapy for Recurrent Tumors,Retreatment of Recurrent Tumors Using Proton T...,Completed,,Not Applicable,24.0,Anticipated,Abramson Cancer Center at Penn Medicine,,2.0,,,f,,,,t,,,,,,,,,,,,,2023-08-18 18:38:11.639866,2023-08-18 18:38:11.639866,OTHER,,,,,


In [5]:
edges_phase = pd.merge(edges, studies[["nct_id", "study_first_posted_date", "updated_at", "overall_status", "phase", "enrollment", "number_of_arms",]], how='left', left_on=['nctid'], right_on = ['nct_id'])
edges_phase

Unnamed: 0,subject,predicate,object,subject_name,object_name,category,nctid,nctid_curie,nct_id,study_first_posted_date,updated_at,overall_status,phase,enrollment,number_of_arms
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354,NCT02460354,2015-06-02,2023-08-20 15:40:47.06955,Terminated,Phase 1,2.0,1.0
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606,NCT00892606,2009-05-04,2023-08-20 13:43:38.732535,Completed,Phase 4,75.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55308,UMLS:C4288754,biolink:related_to,UMLS:C2826354,metastatic urothelial carcinoma,ixazomib citrate,biolink:Association,NCT02420847,clinicaltrials:NCT02420847,NCT02420847,2015-04-20,2023-08-19 20:09:24.997057,"Active, not recruiting",Phase 1/Phase 2,57.0,1.0
55309,UMLS:C4721698,biolink:related_to,UMLS:C1516615,metastatic renal cell carcinoma,clinical management,biolink:Association,NCT04467021,clinicaltrials:NCT04467021,NCT04467021,2020-07-10,2023-08-20 06:10:43.735458,Recruiting,Not Applicable,60.0,2.0
55310,UMLS:C0410808,biolink:related_to,UMLS:C4507320,prosthetic joint infection,tnp-2092,biolink:Association,NCT04294862,clinicaltrials:NCT04294862,NCT04294862,2020-03-04,2023-08-21 00:35:27.952944,Completed,Phase 1,13.0,1.0
55311,UMLS:C0017638,biolink:related_to,UMLS:C5207035,glioma,fitbit,biolink:Association,NCT04186832,clinicaltrials:NCT04186832,NCT04186832,2019-12-05,2023-08-18 18:17:52.108033,Recruiting,Not Applicable,50.0,2.0


In [101]:
reported_event_totals.loc[reported_event_totals['nct_id'] == 'NCT00000125']

test = reported_event_totals.pivot(index=['nct_id', 'id'], columns='event_type', values=['subjects_affected', 'subjects_at_risk'])
test

Unnamed: 0,id,nct_id,ctgov_group_code,event_type,classification,subjects_affected,subjects_at_risk,created_at,updated_at
219226,28868500,NCT00000125,EG000,serious,"Total, serious adverse events",12.0,819.0,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051
219227,28868501,NCT00000125,EG000,other,"Total, other adverse events",481.0,819.0,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051
219228,28868502,NCT00000125,EG000,deaths,"Total, all-cause mortality",,,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051
219229,28868503,NCT00000125,EG001,serious,"Total, serious adverse events",13.0,817.0,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051
219230,28868504,NCT00000125,EG001,other,"Total, other adverse events",570.0,817.0,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051
219231,28868505,NCT00000125,EG001,deaths,"Total, all-cause mortality",,,2023-08-19 17:37:46.39051,2023-08-19 17:37:46.39051


Unnamed: 0_level_0,Unnamed: 1_level_0,subjects_affected,subjects_affected,subjects_affected,subjects_at_risk,subjects_at_risk,subjects_at_risk
Unnamed: 0_level_1,event_type,deaths,other,serious,deaths,other,serious
nct_id,id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
NCT00000125,28868500,,,12.0,,,819.0
NCT00000125,28868501,,481.0,,,819.0,
NCT00000125,28868502,,,,,,
NCT00000125,28868503,,,13.0,,,817.0
NCT00000125,28868504,,570.0,,,817.0,
...,...,...,...,...,...,...,...
NCT05890586,28732502,,0.0,,,583.0,
NCT05890586,28732503,0.0,,,583.0,,
NCT05940077,28891636,,,0.0,,,20.0
NCT05940077,28891637,,0.0,,,20.0,


In [102]:
# test.columns = test.columns.map('|'.join).str.strip('|')
test.columns = test.columns.map(lambda x: f'{x[0]} - {x[1]}')
test = test.reset_index(level=1, drop=True)
test = test.reset_index()
test=test.groupby('nct_id').sum().reset_index()


test

Unnamed: 0,nct_id,subjects_affected - deaths,subjects_affected - other,subjects_affected - serious,subjects_at_risk - deaths,subjects_at_risk - other,subjects_at_risk - serious
0,NCT00000125,0.0,1051.0,25.0,0.0,1636.0,1636.0
1,NCT00000134,0.0,24.0,124.0,0.0,274.0,274.0
2,NCT00000135,0.0,0.0,0.0,0.0,209.0,209.0
3,NCT00000136,0.0,0.0,133.0,0.0,234.0,234.0
4,NCT00000142,0.0,4.0,24.0,0.0,64.0,64.0
...,...,...,...,...,...,...,...
59451,NCT05838456,57.0,34.0,9.0,243.0,243.0,243.0
59452,NCT05867342,0.0,0.0,0.0,80.0,80.0,80.0
59453,NCT05887388,1.0,0.0,2.0,38.0,38.0,38.0
59454,NCT05890586,0.0,0.0,9.0,1229.0,1229.0,1229.0


In [77]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,subjects_affected - deaths,subjects_affected - other,subjects_affected - serious,subjects_at_risk - deaths,subjects_at_risk - other,subjects_at_risk - serious
nct_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT00000125,28868500,,,12.0,,,819.0
NCT00000125,28868501,,481.0,,,819.0,
NCT00000125,28868502,,,,,,
NCT00000125,28868503,,,13.0,,,817.0
NCT00000125,28868504,,570.0,,,817.0,
...,...,...,...,...,...,...,...
NCT05890586,28732502,,0.0,,,583.0,
NCT05890586,28732503,0.0,,,583.0,,
NCT05940077,28891636,,,0.0,,,20.0
NCT05940077,28891637,,0.0,,,20.0,


In [26]:
test.reset_index(level = "nct_id", inplace=True)
test.columns = test.columns.map('|'.join).str.strip('|')
test.reset_index(drop=True)
# test = test.drop('id', axis=1)

test

KeyError: 'Requested level (nct_id) does not match index name (id)'

In [18]:
edges_phase_adv = pd.merge(edges_phase, reported_event_totals[["nct_id",  "event_type", "subjects_affected", "subjects_at_risk"]], how='left', left_on=['nct_id'], right_on = ['nct_id'])

edges_phase_adv


Unnamed: 0,subject,predicate,object,subject_name,object_name,category,nctid,nctid_curie,nct_id,study_first_posted_date,updated_at,overall_status,phase,enrollment,number_of_arms,event_type,subjects_affected,subjects_at_risk
0,UMLS:C0011848,biolink:related_to,UMLS:C0025598,diabetes insipidus,metformin,biolink:Association,NCT02460354,clinicaltrials:NCT02460354,NCT02460354,2015-06-02,2023-08-20 15:40:47.06955,Terminated,Phase 1,2.0,1.0,,,
1,UMLS:C1527344,biolink:related_to,UMLS:C1660599,dysphonia,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,,,,
2,UMLS:C0238441,biolink:related_to,UMLS:C1660599,subglottic stenosis,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,,,,
3,UMLS:C0042940,biolink:related_to,UMLS:C1660599,voice disorders,videofluoroscopy,biolink:Association,NCT04584658,clinicaltrials:NCT04584658,NCT04584658,2020-10-14,2023-08-17 06:36:34.213937,Unknown status,,36.0,,,,
4,UMLS:C0016658,biolink:related_to,UMLS:C0025605,fracture,methadone,biolink:Association,NCT00892606,clinicaltrials:NCT00892606,NCT00892606,2009-05-04,2023-08-20 13:43:38.732535,Completed,Phase 4,75.0,2.0,serious,0.0,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132193,UMLS:C4288754,biolink:related_to,UMLS:C2826354,metastatic urothelial carcinoma,ixazomib citrate,biolink:Association,NCT02420847,clinicaltrials:NCT02420847,NCT02420847,2015-04-20,2023-08-19 20:09:24.997057,"Active, not recruiting",Phase 1/Phase 2,57.0,1.0,,,
132194,UMLS:C4721698,biolink:related_to,UMLS:C1516615,metastatic renal cell carcinoma,clinical management,biolink:Association,NCT04467021,clinicaltrials:NCT04467021,NCT04467021,2020-07-10,2023-08-20 06:10:43.735458,Recruiting,Not Applicable,60.0,2.0,,,
132195,UMLS:C0410808,biolink:related_to,UMLS:C4507320,prosthetic joint infection,tnp-2092,biolink:Association,NCT04294862,clinicaltrials:NCT04294862,NCT04294862,2020-03-04,2023-08-21 00:35:27.952944,Completed,Phase 1,13.0,1.0,,,
132196,UMLS:C0017638,biolink:related_to,UMLS:C5207035,glioma,fitbit,biolink:Association,NCT04186832,clinicaltrials:NCT04186832,NCT04186832,2019-12-05,2023-08-18 18:17:52.108033,Recruiting,Not Applicable,50.0,2.0,,,


In [None]:
clinicaltrials.gov <--- primary KS
aact <--- aggregate KS
multiomics clinical trials <--- aggregate KS