In [1]:
!carrot run py --help

Usage: carrot run py [OPTIONS] COMMAND [ARGS]...

  Commands for using python configurations to run the ETL transformation.

Options:
  --help  Show this message and exit.

Commands:
  list      List all the python classes there are available to run
  make      Generate a python class from the OMOP mapping json
  map       Perform OMOP Mapping given a python configuration file.
  register  Register a python class with the tool
  remove    remove a registered class


In [2]:
!carrot run py make --name ExampleDataset ../data/rules.json

Recreating file /Users/calummacdonald/Usher/CO-CONNECT/docs/docs/CaRROT-CDM/notebooks/ExampleDataset.py


This automatically creates a file that looks like this:

In [3]:
# %load ExampleDataset.py
from carrot.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from carrot.cdm import CommonDataModel
import json

class ExampleDataset(CommonDataModel):
    """
    CDM definition for the example dataset, generated from the OMOP mapping json.

    Every decorated method below describes one CDM object: it first copies
    source columns from an input CSV into destination fields (``.series``),
    then applies field operations and finally a term mapping that turns one
    source term into an OMOP concept id. There is one method per
    (source column, term) pair, which is why several methods read the same
    columns and differ only in the mapped term.

    NOTE(review): the ``define_*`` decorators come from carrot.cdm; inside a
    decorated method ``self`` exposes the destination fields, ``inputs`` and
    ``tools`` - presumably it is the destination table object rather than
    this class; confirm against carrot.
    NOTE(review): if ``.series`` is a pandas Series, ``.map(dict)`` turns
    every value that is not a key of the dict into NaN - confirm that this is
    how rows with other terms are excluded from each object.
    """

    def __init__(self,**kwargs):
        """
        Forward all keyword arguments unchanged to CommonDataModel.__init__.
        """
        super().__init__(**kwargs)


    @define_person
    def person_0(self):
        """
        Create CDM object for person from Demographics.csv, term "Male".

        Columns used: Age (birth_datetime), Sex (gender fields), ID (person_id).
        The term "Female" is handled by person_1.
        """
        self.birth_datetime.series = self.inputs["Demographics.csv"]["Age"]
        self.gender_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_value.series = self.inputs["Demographics.csv"]["Sex"]
        self.person_id.series = self.inputs["Demographics.csv"]["ID"]

        # --- insert field operations ---
        # The Age column is passed through the carrot helper before being stored
        # (presumably converts an age into a birth datetime - confirm).
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping ---
        # Source term "Male" -> concept id 8507, applied to both concept fields.
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Male": 8507
            }
        )
        self.gender_source_concept_id.series = self.gender_source_concept_id.series.map(
            {
                "Male": 8507
            }
        )

    @define_person
    def person_1(self):
        """
        Create CDM object for person from Demographics.csv, term "Female".

        Columns used: Age (birth_datetime), Sex (gender fields), ID (person_id).
        The term "Male" is handled by person_0.
        """
        self.birth_datetime.series = self.inputs["Demographics.csv"]["Age"]
        self.gender_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_value.series = self.inputs["Demographics.csv"]["Sex"]
        self.person_id.series = self.inputs["Demographics.csv"]["ID"]

        # --- insert field operations ---
        # Same Age handling as person_0.
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping ---
        # Source term "Female" -> concept id 8532, applied to both concept fields.
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Female": 8532
            }
        )
        self.gender_source_concept_id.series = self.gender_source_concept_id.series.map(
            {
                "Female": 8532
            }
        )

    @define_observation
    def observation_0(self):
        """
        Create CDM object for observation from the IgG column of Serology.csv.

        Columns used: IgG (concept fields and source value), Date
        (observation_datetime), ID (person_id).
        """
        self.observation_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.observation_datetime.series = self.inputs["Serology.csv"]["Date"]
        self.observation_source_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.observation_source_value.series = self.inputs["Serology.csv"]["IgG"]
        self.person_id.series = self.inputs["Serology.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # No per-term dict here: both concept fields go through make_scalar with
        # the constant 4288455 (presumably every value is replaced by that
        # concept id - confirm against carrot tools).
        self.observation_concept_id.series = self.tools.make_scalar(self.observation_concept_id.series,4288455)
        self.observation_source_concept_id.series = self.tools.make_scalar(self.observation_source_concept_id.series,4288455)

    @define_observation
    def observation_1(self):
        """
        Create CDM object for observation from Hospital_Visit.csv, term "Heart Attack".

        Columns used: reason (concept fields and source value), admission_date
        (observation_datetime), ID (person_id).
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Heart Attack" -> concept id 4059317.
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "Heart Attack": 4059317
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "Heart Attack": 4059317
            }
        )

    @define_observation
    def observation_2(self):
        """
        Create CDM object for observation from Hospital_Visit.csv, term "COVID-19".

        Columns used: reason (concept fields and source value), admission_date
        (observation_datetime), ID (person_id).
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "COVID-19" -> concept id 37311065.
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "COVID-19": 37311065
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "COVID-19": 37311065
            }
        )

    @define_observation
    def observation_3(self):
        """
        Create CDM object for observation from Hospital_Visit.csv, term "Cancer".

        Columns used: reason (concept fields and source value), admission_date
        (observation_datetime), ID (person_id).
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Cancer" -> concept id 40757663.
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "Cancer": 40757663
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "Cancer": 40757663
            }
        )

    @define_condition_occurrence
    def condition_occurrence_0(self):
        """
        Create CDM object for condition_occurrence from the Headache column of Symptoms.csv.

        Columns used: Headache (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Headache == "Yes" -> concept id 378253.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 378253
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 378253
            }
        )

    @define_condition_occurrence
    def condition_occurrence_1(self):
        """
        Create CDM object for condition_occurrence from the Fatigue column of Symptoms.csv.

        Columns used: Fatigue (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Fatigue == "Yes" -> concept id 4223659.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 4223659
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 4223659
            }
        )

    @define_condition_occurrence
    def condition_occurrence_2(self):
        """
        Create CDM object for condition_occurrence from the Dizzy column of Symptoms.csv.

        Columns used: Dizzy (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Dizzy == "Yes" -> concept id 4223938.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 4223938
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 4223938
            }
        )

    @define_condition_occurrence
    def condition_occurrence_3(self):
        """
        Create CDM object for condition_occurrence from the Cough column of Symptoms.csv.

        Columns used: Cough (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Cough == "Yes" -> concept id 254761.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 254761
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 254761
            }
        )

    @define_condition_occurrence
    def condition_occurrence_4(self):
        """
        Create CDM object for condition_occurrence from the Fever column of Symptoms.csv.

        Columns used: Fever (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Fever == "Yes" -> concept id 437663.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 437663
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 437663
            }
        )

    @define_condition_occurrence
    def condition_occurrence_5(self):
        """
        Create CDM object for condition_occurrence from the Muscle_Pain column of Symptoms.csv.

        Columns used: Muscle_Pain (concept fields and source value),
        date_occurrence (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Muscle_Pain == "Yes" -> concept id 442752.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 442752
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 442752
            }
        )

    @define_condition_occurrence
    def condition_occurrence_6(self):
        """
        Create CDM object for condition_occurrence from Hospital_Visit.csv, term "Pneumonia".

        Columns used: reason (concept fields and source value),
        admission_date (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_end_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.condition_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_start_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Pneumonia" -> concept id 255848.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Pneumonia": 255848
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Pneumonia": 255848
            }
        )

    @define_condition_occurrence
    def condition_occurrence_7(self):
        """
        Create CDM object for condition_occurrence from GP_Records.csv, term "Mental Health".

        Columns used: comorbidity (concept fields and source value),
        date_of_visit (both start and end datetime), ID (person_id).
        The same term is mapped to a second concept id in condition_occurrence_8.
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Mental Health" -> concept id 4131548.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Mental Health": 4131548
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Mental Health": 4131548
            }
        )

    @define_condition_occurrence
    def condition_occurrence_8(self):
        """
        Create CDM object for condition_occurrence from GP_Records.csv, term "Mental Health".

        Columns used: comorbidity (concept fields and source value),
        date_of_visit (both start and end datetime), ID (person_id).
        Second mapping of the term also handled by condition_occurrence_7,
        this time to a different concept id.
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Mental Health" -> concept id 432586.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Mental Health": 432586
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Mental Health": 432586
            }
        )

    @define_condition_occurrence
    def condition_occurrence_9(self):
        """
        Create CDM object for condition_occurrence from GP_Records.csv, term "Diabetes Type-II".

        Columns used: comorbidity (concept fields and source value),
        date_of_visit (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Diabetes Type-II" -> concept id 201826.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Diabetes Type-II": 201826
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Diabetes Type-II": 201826
            }
        )

    @define_condition_occurrence
    def condition_occurrence_10(self):
        """
        Create CDM object for condition_occurrence from GP_Records.csv, term "Heart Condition".

        Columns used: comorbidity (concept fields and source value),
        date_of_visit (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Heart Condition" -> concept id 4185932.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Heart Condition": 4185932
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Heart Condition": 4185932
            }
        )

    @define_condition_occurrence
    def condition_occurrence_11(self):
        """
        Create CDM object for condition_occurrence from GP_Records.csv, term "High Blood Pressure".

        Columns used: comorbidity (concept fields and source value),
        date_of_visit (both start and end datetime), ID (person_id).
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "High Blood Pressure" -> concept id 316866.
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "High Blood Pressure": 316866
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "High Blood Pressure": 316866
            }
        )

    @define_drug_exposure
    def drug_exposure_0(self):
        """
        Create CDM object for drug_exposure from Vaccinations.csv, term "Moderna".

        Columns used: type (concept fields and source value),
        date_of_vaccination (both start and end datetime), ID (person_id).
        "Moderna" is mapped to a second concept id in drug_exposure_3.
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Moderna" -> concept id 35894915 (the same id is used for
        # "AstraZenica" and "Pfizer" in drug_exposure_1 / drug_exposure_2).
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Moderna": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Moderna": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_1(self):
        """
        Create CDM object for drug_exposure from Vaccinations.csv, term "AstraZenica".

        Columns used: type (concept fields and source value),
        date_of_vaccination (both start and end datetime), ID (person_id).
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # The key must match the spelling used in the source data ("AstraZenica").
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "AstraZenica": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "AstraZenica": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_2(self):
        """
        Create CDM object for drug_exposure from Vaccinations.csv, term "Pfizer".

        Columns used: type (concept fields and source value),
        date_of_vaccination (both start and end datetime), ID (person_id).
        "Pfizer" is mapped to a second concept id in drug_exposure_4.
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Pfizer" -> concept id 35894915.
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Pfizer": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Pfizer": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_3(self):
        """
        Create CDM object for drug_exposure from Vaccinations.csv, term "Moderna".

        Columns used: type (concept fields and source value),
        date_of_vaccination (both start and end datetime), ID (person_id).
        Second mapping of "Moderna" (see drug_exposure_0), to a different concept id.
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Moderna" -> concept id 37003518.
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Moderna": 37003518
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Moderna": 37003518
            }
        )

    @define_drug_exposure
    def drug_exposure_4(self):
        """
        Create CDM object for drug_exposure from Vaccinations.csv, term "Pfizer".

        Columns used: type (concept fields and source value),
        date_of_vaccination (both start and end datetime), ID (person_id).
        Second mapping of "Pfizer" (see drug_exposure_2), to a different concept id.
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations ---

        # --- insert term mapping ---
        # Source term "Pfizer" -> concept id 37003436.
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Pfizer": 37003436
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Pfizer": 37003436
            }
        )



Load some inputs:

In [4]:
import carrot
import glob
# Hand every path matched under ../data/part1/ to carrot's CSV loader.
inputs = carrot.tools.load_csv(glob.glob('../data/part1/*'))
# Bare last expression so the notebook displays the returned collection.
inputs

[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - DataCollection Object Created
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Blood_Test.csv [<carrot.io.common.DataBrick object at 0x111f1be50>]
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Demographics.csv [<carrot.io.common.DataBrick object at 0x111fd15b0>]
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  GP_Records.csv [<carrot.io.common.DataBrick object at 0x111fd12e0>]
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Hospital_Visit.csv [<carrot.io.common.DataBrick object at 0x111f56b80>]
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Serology.csv [<carrot.io.common.DataBrick object at 0x111f56730>]
[32m2022-06-17 15:17:54[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - R

<carrot.io.plugins.local.LocalDataCollection at 0x111fd1f40>

A new instance can be created from the generated Python class:

In [5]:
# Build the data model from the loaded inputs; processing is started
# explicitly with instance.process() in the next cell.
instance = ExampleDataset(inputs=inputs)
# Bare last expression so the notebook displays the object.
instance

[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - CommonDataModel (5.3.1) created with co-connect-tools version 0.0.0
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Running with an DataCollection object
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Turning on automatic cdm column filling
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_0 of type condition_occurrence
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_1 of type condition_occurrence
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_10 of type condition_occurrence
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_11 of type condition_occurrence
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occu

<__main__.ExampleDataset at 0x115f20f40>

In [6]:
# Run the ETL over all registered objects (the log below lists the table order used).
instance.process()

[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Starting processing in order: ['person', 'condition_occurrence', 'drug_exposure', 'observation']
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Number of objects to process for each table...
{
      "condition_occurrence": 12,
      "drug_exposure": 5,
      "observation": 4,
      "person": 2
}
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - for person: found 2 objects
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - working on person
[32m2022-06-17 15:17:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on person_0
[32m2022-06-17 15:17:55[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Retrieving initial dataframe for 'Demographics.csv' for the first time
[32m2022-06-17 15:17:55[0m - [34mPerson[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-06-17 15:17:55[0m - [34mPerson[0m - [

could not convert string to float: 'na'
could not convert string to float: 'na'


[32m2022-06-17 15:17:55[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finalised person on iteration 0 producing 996 rows from 2 tables
[32m2022-06-17 15:17:55[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Getting next chunk of data
[32m2022-06-17 15:17:55[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - All input files for this object have now been used.
[32m2022-06-17 15:17:55[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - resetting used bricks
[32m2022-06-17 15:17:55[0m - [34mExampleDataset[0m - [1;37mINFO[0m - for condition_occurrence: found 12 objects
[32m2022-06-17 15:17:55[0m - [34mExampleDataset[0m - [1;37mINFO[0m - working on condition_occurrence
[32m2022-06-17 15:17:55[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on condition_occurrence_0
[32m2022-06-17 15:17:55[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Retrieving initial dataframe for 'Symptoms.csv' for the first time
[32m2022-06-17 15:17:55[0m - [34mC

[32m2022-06-17 15:17:56[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-06-17 15:17:56[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - created df (0x1162f2850)[condition_occurrence_6]
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished condition_occurrence_6 (0x1162f2850) ... 9/12 completed, 171 rows
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on condition_occurrence_7
[32m2022-06-17 15:17:56[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-06-17 15:17:56[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - created df (0x1162f2c70)[condition_occurrence_7]
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished condition_occurrence_7 (0x1162f2c70) ... 10/12 completed, 444 rows
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mThere are pe

[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished drug_exposure_1 (0x11634ab20) ... 2/5 completed, 225 rows
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mThere are person_ids in this table that are not in the output person table![0m
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mEither they are not in the original data, or while creating the person table, [0m
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mstudies have been removed due to lack of required fields, such as birthdate.[0m
[32m2022-06-17 15:17:56[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31m224/225 were good, 1 studies are removed.[0m
[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on drug_exposure_2
[32m2022-06-17 15:17:57[0m - [34mDrugExposure[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-06-17 15:17:57[0m 

[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mstudies have been removed due to lack of required fields, such as birthdate.[0m
[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31m176/177 were good, 1 studies are removed.[0m
[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on observation_3
[32m2022-06-17 15:17:57[0m - [34mObservation[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-06-17 15:17:57[0m - [34mObservation[0m - [1;37mINFO[0m - created df (0x111fd1d00)[observation_3]
[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished observation_3 (0x111fd1d00) ... 4/4 completed, 349 rows
[32m2022-06-17 15:17:57[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mRemoved 1 row(s) due to duplicates found when merging observation[0m
observation_id                                                       
440                 110.0 

In [7]:
# List the names of the tables available on the processed model.
instance.keys()

dict_keys(['person', 'condition_occurrence', 'drug_exposure', 'observation'])

In [8]:
# Display the observation table; dropna(axis=1) removes every column that
# contains a missing value (assuming the table is a pandas DataFrame).
instance['observation'].dropna(axis=1)

Unnamed: 0_level_0,person_id,observation_concept_id,observation_date,observation_datetime,observation_source_value,observation_source_concept_id
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,357,4288455,2020-10-03,2020-10-03 00:00:00.000000,17.172114692899758,4288455
2,258,4288455,2020-11-02,2020-11-02 00:00:00.000000,201.93861878809216,4288455
4,556,4288455,2021-07-26,2021-07-26 00:00:00.000000,11.506250956970998,4288455
5,380,4288455,2021-10-29,2021-10-29 00:00:00.000000,2.6594057121417487,4288455
6,415,4288455,2021-09-07,2021-09-07 00:00:00.000000,40.844873593089126,4288455
...,...,...,...,...,...,...
1193,988,40757663,2020-07-21,2020-07-21 00:00:00.000000,Cancer,40757663
1194,555,40757663,2020-10-03,2020-10-03 00:00:00.000000,Cancer,40757663
1195,992,40757663,2021-06-20,2021-06-20 00:00:00.000000,Cancer,40757663
1196,992,40757663,2019-05-13,2019-05-13 00:00:00.000000,Cancer,40757663


## Manually edited 

After generating a Python class from the rules file, you can manually edit the Python file to set up the I/O and to make changes to the various tables. Once done, it can simply be run as a Python file:
```
python  ExampleDatasetModified.py
```


In [9]:
# %load ExampleDatasetModified.py
from carrot.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from carrot.cdm import CommonDataModel
from carrot.tools import load_csv,create_csv_store
import json
import glob
import pandas as pd

class ExampleDatasetModified(CommonDataModel):
    """
    Manually edited CDM definition that configures its own inputs/outputs,
    starts processing on construction and adds a custom IgG conversion to
    the observation table.
    """
    
    def __init__(self,**kwargs):
        """ 
        initialise the inputs and outputs, then run the processing

        Unless the caller passes its own ``inputs`` / ``outputs`` keyword,
        every file matching ``../data/part1/*`` is loaded as input and a
        tab-separated csv store writing to ``./data_tests/`` is used as output.

        Args:
            **kwargs: forwarded unchanged to ``CommonDataModel.__init__``
        """
        if "inputs" not in kwargs:
            # sorted() so the order files are handed over does not depend on the filesystem
            kwargs["inputs"] = load_csv(sorted(glob.glob('../data/part1/*')))
        if "outputs" not in kwargs:
            kwargs["outputs"] = create_csv_store(output_folder="./data_tests/",
                                                 sep="\t",
                                                 write_separate=True,
                                                 write_mode='w')
        
        super().__init__(**kwargs)
        # start the processing straight away, so constructing the class is enough to run it
        self.process()
    
    @define_person
    def person_0(self):
        """
        Create CDM object for person
        """
        demographics = self.inputs["Demographics.csv"]

        self.birth_datetime.series = demographics["Age"]
        self.gender_concept_id.series = demographics["Sex"]
        self.gender_source_concept_id.series = demographics["Sex"]
        self.gender_source_value.series = demographics["Sex"]
        self.person_id.series = demographics["ID"]
        
        # --- insert field operations --- 
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)
        
        # --- insert term mapping --- 
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Male": 8507,
                "Female": 8532
            }
        )

    @define_observation
    def observation_0(self):
        """
        Create CDM object for observation

        The raw IgG value is kept as the source value, while a rescaled
        value (see ``convert_igg``) is stored as ``value_as_number`` with
        ``unit_source_value`` set to 'g/L'. The input table itself is left
        unmodified, so running this method again gives the same result.
        """
        # concept id written to both the concept and source-concept fields
        igg_concept_id = 4288455

        def convert_igg(x):
            """
            A custom function to convert the IgG into g/L

            Args:
                x: a row providing 'IgG' and a datetime-like 'Date'

            Returns:
                the rescaled IgG as a float, or None when the IgG entry
                cannot be interpreted as a number
            """
            try:
                igg = float(x['IgG'])
            except (TypeError, ValueError):
                # only conversion failures are expected here; anything else should surface
                return None
            #example of a dataset where the assay has been recalibrated after a certain date
            #therefore you might need to do some conversion based upon the date
            factor = 1.2 if x['Date'].year < 2021 else 1
            
            #apply a factor to convert to g/L
            factor = factor * 10
            
            #return the modified IgG value
            return igg*factor
        
        serology = self.inputs["Serology.csv"]

        #save the source value of the IgG
        self.observation_source_value.series = serology["IgG"]

        #convert the date into a datetime object (on a copy, the input is not touched)
        converted = serology.assign(
            Date=pd.to_datetime(serology["Date"], errors='coerce')
        )
        
        #recalculate the IgG based upon a custom function and set the output units
        converted = converted.assign(
            IgG=converted.apply(convert_igg, axis=1),
            Units='g/L'
        )
        
        #set additional columns we did not have before...
        self.unit_source_value.series = converted["Units"]
        self.value_as_number.series = converted["IgG"]

        
        self.observation_concept_id.series = converted["IgG"]
        self.observation_datetime.series = converted["Date"]
        self.observation_source_concept_id.series = converted["IgG"]
        self.person_id.series = serology["ID"]

        
        # --- insert term mapping --- 
        self.observation_concept_id.series = self.tools.make_scalar(self.observation_concept_id.series,igg_concept_id)
        self.observation_source_concept_id.series = self.tools.make_scalar(self.observation_source_concept_id.series,igg_concept_id)
        


In [10]:
# constructing the class is enough to run it: __init__ sets up the inputs/outputs and calls self.process()
instance = ExampleDatasetModified()
instance

[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - DataCollection Object Created
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Blood_Test.csv [<carrot.io.common.DataBrick object at 0x116322730>]
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Demographics.csv [<carrot.io.common.DataBrick object at 0x111f1ba90>]
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  GP_Records.csv [<carrot.io.common.DataBrick object at 0x111f56d60>]
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Hospital_Visit.csv [<carrot.io.common.DataBrick object at 0x111f56c10>]
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Serology.csv [<carrot.io.common.DataBrick object at 0x116458a00>]
[32m2022-06-17 15:17:57[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - R

could not convert string to float: 'na'


[32m2022-06-17 15:17:58[0m - [34mExampleDatasetModified[0m - [1;37mINFO[0m - finished person_0 (0x11654bcd0) ... 1/1 completed, 996 rows
[32m2022-06-17 15:17:58[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - saving person_ids.0x1164a5d30.2022-06-17T141758 to ./data_tests//person_ids.0x1164a5d30.2022-06-17T141758.tsv
[32m2022-06-17 15:17:58[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - finished save to file
[32m2022-06-17 15:17:58[0m - [34mExampleDatasetModified[0m - [1;37mINFO[0m - saving dataframe (0x1164852e0) to <carrot.io.plugins.local.LocalDataCollection object at 0x1164585e0>
[32m2022-06-17 15:17:58[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - saving person.person_0.0x1164852e0.2022-06-17T141758 to ./data_tests//person.person_0.0x1164852e0.2022-06-17T141758.tsv
[32m2022-06-17 15:17:58[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - finished save to file
[32m2022-06-17 15:17:58[0m - [34mExampleDatasetModified[0m - [1;37mINFO[

<__main__.ExampleDatasetModified at 0x116458b20>

In [11]:
# list the names of the tables held by the instance created from the modified class
instance.keys()

dict_keys(['person', 'observation'])

In [12]:
# view the observation table from the modified class; dropna(axis=1) hides every column that contains a missing value
instance['observation'].dropna(axis=1)

Unnamed: 0_level_0,person_id,observation_concept_id,observation_date,observation_datetime,observation_source_value,observation_source_concept_id,unit_source_value
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,650,4288455,2020-10-03,2020-10-03 00:00:00.000000,17.172114692899758,4288455,g/L
2,457,4288455,2020-11-02,2020-11-02 00:00:00.000000,201.93861878809216,4288455,g/L
3,983,4288455,2021-07-26,2021-07-26 00:00:00.000000,11.506250956970998,4288455,g/L
4,696,4288455,2021-10-29,2021-10-29 00:00:00.000000,2.6594057121417487,4288455,g/L
5,751,4288455,2021-09-07,2021-09-07 00:00:00.000000,40.844873593089126,4288455,g/L
...,...,...,...,...,...,...,...
409,187,4288455,2022-11-07,2022-11-07 00:00:00.000000,51.77573831029082,4288455,g/L
410,886,4288455,2022-09-07,2022-09-07 00:00:00.000000,57.11515081936336,4288455,g/L
411,50,4288455,2022-11-07,2022-11-07 00:00:00.000000,15.264660709568151,4288455,g/L
412,260,4288455,2019-11-13,2019-11-13 00:00:00.000000,26.051354325968106,4288455,g/L
