In [36]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Fixed seed so that Restart & Run All regenerates the same synthetic records
# (the simulation was previously unseeded, so every run produced new data).
SIMULATION_SEED = 42

# Common drugs (OMOP concept_id and names)
# NOTE(review): these concept IDs have not been verified against the OMOP
# vocabulary here -- confirm that each ID really maps to the named drug.
drug_concepts = [
    {"concept_id": 19019073, "concept_name": "Acetaminophen 325 MG"},
    {"concept_id": 1125315, "concept_name": "Ibuprofen 200 MG"},
    {"concept_id": 1516766, "concept_name": "Metformin 500 MG"},
]


def simulate_medications(source, priority, earliest_start, rng, patient_ids=range(1, 6)):
    """Build one synthetic medication record per patient for a single source.

    Parameters
    ----------
    source : str
        Label stored in each record's ``'source'`` field.
    priority : int
        Value stored in each record's ``'priority'`` field.
    earliest_start : datetime
        Earliest possible start; each record starts 0-60 days after it.
    rng : random.Random
        Random generator used for every draw, so a seeded generator gives
        reproducible records.
    patient_ids : iterable of int, optional
        Patient identifiers to generate records for (default: 1 through 5).

    Returns
    -------
    list of dict
        One record per patient with the keys ``patient_id``, ``concept_id``,
        ``concept_name``, ``start_date``, ``end_date``, ``dosage``, ``route``,
        ``source`` and ``priority``. Dates are ``datetime.date`` objects and
        each exposure lasts 5-20 days.

    Notes
    -----
    ``dosage`` is drawn independently of the drug, so it does not necessarily
    agree with the strength mentioned in ``concept_name``.
    """
    records = []
    for patient_id in patient_ids:
        drug = rng.choice(drug_concepts)
        start = earliest_start + timedelta(days=rng.randint(0, 60))
        end = start + timedelta(days=rng.randint(5, 20))
        records.append({
            'patient_id': patient_id,
            'concept_id': drug['concept_id'],
            'concept_name': drug['concept_name'],
            'start_date': start.date(),
            'end_date': end.date(),
            'dosage': rng.choice([100, 250, 500]),
            'route': rng.choice(['oral', 'intravenous']),
            'source': source,
            'priority': priority,
        })
    return records


# One generator shared by both sources; re-running this cell restarts the
# sequence, which keeps the cell idempotent.
simulation_rng = random.Random(SIMULATION_SEED)

# Simulate openEHR medications
openEHR_records = simulate_medications('openEHR', 1, datetime(2024, 1, 1), simulation_rng)
openEHR_df = pd.DataFrame(openEHR_records)

# Simulate externalEHR medications (possibly conflicting)
externalEHR_records = simulate_medications('externalEHR', 2, datetime(2024, 1, 15), simulation_rng)
externalEHR_df = pd.DataFrame(externalEHR_records)


In [38]:
# Columns that identify "the same drug for the same patient".
dedup_keys = ['patient_id', 'concept_id']

# Within each patient/drug pair, rows are ranked by source priority (lower
# value first) and then by start date.
ranking_keys = dedup_keys + ['priority', 'start_date']

# Stack both sources, turn the date columns into pandas datetimes and rank.
combined_df = (
    pd.concat([openEHR_df, externalEHR_df], ignore_index=True)
    .assign(
        start_date=lambda frame: pd.to_datetime(frame['start_date']),
        end_date=lambda frame: pd.to_datetime(frame['end_date']),
    )
    .sort_values(by=ranking_keys)
)

# Keep only the top-ranked row of every patient/drug pair.
dedup_df = combined_df.drop_duplicates(subset=dedup_keys, keep='first')

report_columns = [
    'patient_id', 'concept_id', 'concept_name',
    'start_date', 'end_date', 'dosage', 'route', 'source',
]
print("Deduplicated records ready for OMOP `drug_exposure`")
print(dedup_df[report_columns])


Deduplicated records ready for OMOP `drug_exposure`
   patient_id  concept_id          concept_name start_date   end_date  dosage  \
0           1     1125315      Ibuprofen 200 MG 2024-02-17 2024-03-05     500   
5           1     1516766      Metformin 500 MG 2024-02-13 2024-02-24     250   
1           2     1125315      Ibuprofen 200 MG 2024-01-20 2024-02-04     250   
6           2     1516766      Metformin 500 MG 2024-02-04 2024-02-18     100   
2           3     1125315      Ibuprofen 200 MG 2024-02-10 2024-02-18     100   
3           4    19019073  Acetaminophen 325 MG 2024-02-26 2024-03-09     500   
4           5    19019073  Acetaminophen 325 MG 2024-02-13 2024-02-22     100   

         route       source  
0         oral      openEHR  
5         oral  externalEHR  
1  intravenous      openEHR  
6         oral  externalEHR  
2  intravenous      openEHR  
3         oral      openEHR  
4         oral      openEHR  


In [40]:
# Renumber the rows from 0 and derive synthetic identifiers from the position:
# drug_exposure_id counts from 1, visit_occurrence_id counts from 100001.
# NOTE(review): 32817 was previously described as the "default for
# prescription"; confirm against the OMOP Type Concept vocabulary that this
# is the intended drug_type_concept_id.
dedup_df = (
    dedup_df
    .reset_index(drop=True)
    .assign(
        drug_exposure_id=lambda frame: frame.index + 1,
        visit_occurrence_id=lambda frame: 100000 + frame.index + 1,
        drug_type_concept_id=32817,
    )
)

# Source column -> OMOP drug_exposure column.
omop_column_names = {
    'patient_id': 'person_id',
    'concept_id': 'drug_concept_id',
    'start_date': 'drug_exposure_start_date',
    'end_date': 'drug_exposure_end_date',
}

# Final column order of the exported table (after renaming).
omop_column_order = [
    'drug_exposure_id',
    'person_id',
    'drug_concept_id',
    'drug_exposure_start_date',
    'drug_exposure_end_date',
    'drug_type_concept_id',
    'visit_occurrence_id',
    'dosage',
    'route',
    'source',
]

drug_exposure_omop = dedup_df.rename(columns=omop_column_names)[omop_column_order]

# Write the table to disk without the positional index.
drug_exposure_omop.to_csv("merged_drug_exposure.csv", index=False)
print("Saved to merged_drug_exposure.csv")


Saved to merged_drug_exposure.csv
