In [1]:
%load_ext autoreload
%autoreload 2

# Example: Mapping PHS-EPCC dataset

These are notes on how to use the pycarrot mapper to OMOP map a dataset

It requires the following:
- `pycarrot` folder containing the source code in the running directory
- an [OMOP database](https://athena.ohdsi.org/vocabulary/list) running locally with some from of SQL (MySQL, PostGres,..)

## Setup

### OMOP database

Create an engine from sqlalchemy for the OMOP database

In [2]:
import json
from sqlalchemy import create_engine
username="admin"
password="test!"
hostname="localhost"
port="5432"
database_name="omop"

connection_string = f"postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{database_name}"
engine = create_engine(connection_string)


### PyCarrot Concept Finder

Load the concept finder

In [3]:
from pycarrot import ConceptFinder

cfinder = ConceptFinder(engine)

Test retrieving information for a concept code for MALE

In [4]:
cfinder.find_concept(8507)

[{'concept_id': 8507,
  'concept_name': 'MALE',
  'domain_id': 'Gender',
  'vocabulary_id': 'Gender',
  'concept_class_id': 'Gender',
  'standard_concept': 'S',
  'concept_code': 'M',
  'valid_start_date': 19700101,
  'valid_end_date': 20991231,
  'invalid_reason': None}]

Test looking up based on ICD10CM codes

In [5]:
cfinder.find('A01.1')

{'A01.1': [{'original_code': 'A01.1',
   'concept_id': 195460,
   'concept_name': 'Paratyphoid A fever',
   'domain_id': 'Condition',
   'vocabulary_id': 'SNOMED',
   'concept_class_id': 'Disorder',
   'standard_concept': 'S',
   'concept_code': '76623002',
   'valid_start_date': 20020131,
   'valid_end_date': 20991231,
   'invalid_reason': None}]}

In [6]:
print (json.dumps(cfinder.find(['C72.5','F28']),indent=6))

{
      "F28": [
            {
                  "original_code": "F28",
                  "concept_id": 436073,
                  "concept_name": "Psychotic disorder",
                  "domain_id": "Condition",
                  "vocabulary_id": "SNOMED",
                  "concept_class_id": "Disorder",
                  "standard_concept": "S",
                  "concept_code": "69322001",
                  "valid_start_date": 20020131,
                  "valid_end_date": 20991231,
                  "invalid_reason": null
            }
      ],
      "C72.5": [
            {
                  "original_code": "C72.5",
                  "concept_id": 433975,
                  "concept_name": "Primary malignant neoplasm of cranial nerve",
                  "domain_id": "Condition",
                  "vocabulary_id": "SNOMED",
                  "concept_class_id": "Disorder",
                  "standard_concept": "S",
                  "concept_code": "93767009",
                  "va

### PyCarrot Mapper

Create an instance of the concept mapper that uses the finder 

In [7]:
import json
import pandas as pd
from pycarrot import ConceptMapper
cmapper = ConceptMapper(cfinder) 
cmapper

<pycarrot.concept_mapper.ConceptMapper at 0x11a211d60>

## WhiteRabbit Scan Reports

Load a scan report

In [8]:
df_smr01 = pd.read_excel("./ScanReports/SMR01_ScanReport.xlsx", sheet_name=None)

In [9]:
df_smr01.keys()

dict_keys(['Field Overview', 'Table Overview', 'smr01_operations.csv', 'smr01_conditions.csv', 'demographics.csv', '_'])

## Map SMR01

Set the scan report on the cmapper instance

In [10]:
cmapper.set_scan_report(df_smr01)

### Manually mapping demographics 

Manually map the demographics table with concept codes for male and female (this has been done by hand)

In [11]:
demo = cmapper.map(
    "demographics.csv",
    {"sex": {"1": "8507", "2": "8532"}},
    "encrypted_id",
    "dob",
)
print(json.dumps(demo, indent=6))

{
      "person": {
            "person_id": {
                  "source_table": "demographics.csv",
                  "source_field": "encrypted_id"
            },
            "birth_datetime": {
                  "source_table": "demographics.csv",
                  "source_field": "dob"
            },
            "gender_concept_id": {
                  "source_table": "demographics.csv",
                  "source_field": "sex",
                  "term_mapping": {
                        "1": 8507,
                        "2": 8532
                  }
            },
            "gender_source_value": {
                  "source_table": "demographics.csv",
                  "source_field": "sex"
            }
      }
}


### Automatic mapping of Conditions

Automatically map the conditions table. The column `newconditon` contains all the ICD10CM codes that are in this dataset

In [12]:
df_smr01['smr01_conditions.csv']['newcondition']

0        A00.0
1        A00.9
2        A01.0
3        A01.1
4        A01.2
         ...  
15176    Z99.2
15177    Z99.3
15178    Z99.4
15179    Z99.8
15180    Z99.9
Name: newcondition, Length: 15181, dtype: object

Run the cmapper to map this column of this table 

In [13]:
tab_1 = cmapper.map(
    source_table="smr01_conditions.csv",
    source_field="newcondition",
    person_id="encrypted_id", 
    date_event="admission_date"
)
print(json.dumps(tab_1, indent=6)[:1000]+".....")

{
      "condition_occurrence": {
            "person_id": {
                  "source_table": "smr01_conditions.csv",
                  "source_field": "encrypted_id"
            },
            "condition_start_datetime": {
                  "source_table": "smr01_conditions.csv",
                  "source_field": "admission_date"
            },
            "condition_concept_id": {
                  "source_table": "smr01_conditions.csv",
                  "source_field": "newcondition",
                  "term_mapping": {
                        "A00.0": 4344638,
                        "A00.9": 198677,
                        "A01.0": 192819,
                        "A01.1": 195460,
                        "A01.2": 193953,
                        "A01.3": 442291,
                        "A01.4": 195177,
                        "A02.0": 196328,
                        "A02.1": 40493039,
                        "A02.2": 141209,
                        "A02.8": 133685,
               

### Automatic mapping of Operations

Next we map the operations table

In [14]:
tab_2 = cmapper.map(
    "smr01_operations.csv", "operation_new", "encrypted_id", "date_operation"
)
print(json.dumps(tab_2, indent=6)[:1000]+".....")

{
      "condition_occurrence": {
            "person_id": {
                  "source_table": "smr01_operations.csv",
                  "source_field": "encrypted_id"
            },
            "condition_start_datetime": {
                  "source_table": "smr01_operations.csv",
                  "source_field": "date_operation"
            },
            "condition_concept_id": {
                  "source_table": "smr01_operations.csv",
                  "source_field": "operation_new",
                  "term_mapping": {
                        "A01.1": 195460,
                        "A01.2": 193953,
                        "A01.3": 442291,
                        "A02.1": 40493039,
                        "A02.2": 141209,
                        "A02.8": 133685,
                        "A02.9": 133685,
                        "A03.1": 4145763,
                        "A03.2": 4185509,
                        "A03.3": 4321384,
                        "A03.8": 440938,
            

### Output Rules JSON

Finally we construct a rules file that is compatitble with the carrot-cdm

In [15]:
rules = {
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(demo.items())},
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(tab_1.items())},
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(tab_2.items())},
}

rules = {"cdm": rules, "metadata": {"dataset": "PHS_SMR01"}}
with open("rules_smr01.json", "w") as f:
    json.dump(rules, f, indent=6)

print(json.dumps(rules, indent=6)[:600]+".....")

{
      "cdm": {
            "person": {
                  "person_0": {
                        "person_id": {
                              "source_table": "demographics.csv",
                              "source_field": "encrypted_id"
                        },
                        "birth_datetime": {
                              "source_table": "demographics.csv",
                              "source_field": "dob"
                        },
                        "gender_concept_id": {
                              "source_table": "demographics.csv",
                              "s.....


### Alternative Rules

Create some alternative rules with a rule per concept (i.e. maximum one concept mapped in each term mapping)

In [16]:
demo_alt_f = cmapper.map(
    "demographics.csv",
    {"sex": {"1": "8507"}},
    "encrypted_id",
    "dob",
)

demo_alt_m = cmapper.map(
    "demographics.csv",
    {"sex": {"2": "8532"}},
    "encrypted_id",
    "dob",
)

tab_1_alt = cmapper.map(
    "smr01_conditions.csv", "newcondition", "encrypted_id", "admission_date",one_to_one=True
)

tab_2_alt = cmapper.map(
    "smr01_operations.csv", "operation_new", "encrypted_id", "date_operation",one_to_one=True
)


rules_alt = {
    **{"person": {
        "female": demo_alt_f['person'],
        "male": demo_alt_m['person']
    }},
    **{
        k: { 
            f"{k}_{i}": obj
            for i,obj in enumerate(v)
        }
        for k, v in tab_1_alt.items()
    },
        **{
        k: { 
            f"{k}_{i}": obj
            for i,obj in enumerate(v)
        }
        for k, v in tab_2_alt.items()
    }
}

rules_alt = {"cdm": rules_alt, "metadata": {"dataset": "PHS_SMR01"}}
with open("rules_smr01_alternative.json", "w") as f:
    json.dump(rules_alt, f, indent=6)

print(json.dumps(rules_alt, indent=6)[:600]+".....")


{
      "cdm": {
            "person": {
                  "female": {
                        "person_id": {
                              "source_table": "demographics.csv",
                              "source_field": "encrypted_id"
                        },
                        "birth_datetime": {
                              "source_table": "demographics.csv",
                              "source_field": "dob"
                        },
                        "gender_concept_id": {
                              "source_table": "demographics.csv",
                              "sou.....


## Map NRS Deaths

Moving onto mapping the NRS Deaths scan report

First we load the new scan report and set it in the pycarrot mapper instance

In [17]:
df_deaths = pd.read_excel("./ScanReports/Deaths_ScanReport.xlsx", sheet_name=None)
cmapper.set_scan_report(df_deaths)

### Build the default rules

In [18]:
demo = cmapper.map(
    "demographics.csv",
    {"sex": {"1": "8507", "2": "8532"}},
    "encrypted_id",
    "dob",
)

tab_1 = cmapper.map("deaths_a.csv", "causedeath", "encrypted_id", "date_of_death")
tab_2 = cmapper.map(
    "deaths_b.csv", {"death": {"1": "4306655"}}, "encrypted_id", "date_of_death"
)

rules = {
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(demo.items())},
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(tab_1.items())},
    **{k: {f"{k}_{i}": v} for i, (k, v) in enumerate(tab_2.items())},
}

rules = {"cdm": rules, "metadata": {"dataset": "PHS_deaths"}}

with open("rules_deaths.json", "w") as f:
    json.dump(rules, f, indent=6)



print(json.dumps(rules, indent=6)[:1000]+".....")



{
      "cdm": {
            "person": {
                  "person_0": {
                        "person_id": {
                              "source_table": "demographics.csv",
                              "source_field": "encrypted_id"
                        },
                        "birth_datetime": {
                              "source_table": "demographics.csv",
                              "source_field": "dob"
                        },
                        "gender_concept_id": {
                              "source_table": "demographics.csv",
                              "source_field": "sex",
                              "term_mapping": {
                                    "1": 8507,
                                    "2": 8532
                              }
                        },
                        "gender_source_value": {
                              "source_table": "demographics.csv",
                              "source_field": "sex"
            

### Alternative rules

As above, build rules with one-to-one concept mapping

In [19]:
demo_alt_f = cmapper.map(
    "demographics.csv",
    {"sex": {"1": "8507"}},
    "encrypted_id",
    "dob",
)

demo_alt_m = cmapper.map(
    "demographics.csv",
    {"sex": {"2": "8532"}},
    "encrypted_id",
    "dob",
)

tab_1_alt = cmapper.map("deaths_a.csv", "causedeath", "encrypted_id", "date_of_death",one_to_one=True)
tab_2_alt = cmapper.map(
    "deaths_b.csv", {"death": {"1": "4306655"}}, "encrypted_id", "date_of_death",one_to_one=True
)


rules_alt = {
    **{"person": {
        "female": demo_alt_f['person'],
        "male": demo_alt_m['person']
    }},
    **{
        k: { 
            f"{k}_{i}": obj
            for i,obj in enumerate(v)
        }
        for k, v in tab_1_alt.items()
    },
        **{
        k: { 
            f"{k}_{i}": obj
            for i,obj in enumerate(v)
        }
        for k, v in tab_2_alt.items()
    }
}

rules_alt = {"cdm": rules_alt, "metadata": {"dataset": "PHS_deaths"}}
with open("rules_deaths_alternative.json", "w") as f:
    json.dump(rules_alt, f, indent=6)

print(json.dumps(rules_alt, indent=6)[:1000]+".....")


{
      "cdm": {
            "person": {
                  "female": {
                        "person_id": {
                              "source_table": "demographics.csv",
                              "source_field": "encrypted_id"
                        },
                        "birth_datetime": {
                              "source_table": "demographics.csv",
                              "source_field": "dob"
                        },
                        "gender_concept_id": {
                              "source_table": "demographics.csv",
                              "source_field": "sex",
                              "term_mapping": {
                                    "1": 8507
                              }
                        },
                        "gender_source_value": {
                              "source_table": "demographics.csv",
                              "source_field": "sex"
                        }
                  },
              