# How to clean Concomitant Meds

This notebook demonstrates how to load and clean the concomitant medication table from the PPMI dataset. It finishes by building a pivot table of comorbidities with one row per patient and one 0/1 column per labelled indication, where 1 means the patient was ever recorded on a medication for that indication.

In [1]:
import pandas as pd
from pie.data_loader import DataLoader
from pie.data_preprocessor import DataPreprocessor

In [2]:
# Load the PPMI data from ../PPMI and pick out the Concomitant Meds table.
# Automatic cleaning is switched off on purpose (clean_data=False) so that the
# cleaning step can be demonstrated explicitly later in the notebook.
data = DataLoader.load("../PPMI", clean_data=False)
medical_history = data[DataLoader.MEDICAL_HISTORY]
cmeds_df = medical_history["Concomitant_Medication"]
print(cmeds_df.shape)
cmeds_df.head()

2025-05-13 14:58:40 data_loader.py [INFO] Biospecimen modality requested. Exclusion list: []
2025-05-13 14:58:40 data_loader.py [INFO] Loading subject_characteristics data...
2025-05-13 14:58:40 data_loader.py [INFO] Loading medical_history data...


  df_temp = pd.read_csv(csv_file)


2025-05-13 14:58:40 data_loader.py [INFO] Loaded 5 medical_history tables
2025-05-13 14:58:49 data_loader.py [INFO] Loading motor_assessments data...
2025-05-13 14:58:49 data_loader.py [INFO] Loading non_motor_assessments data...
2025-05-13 14:58:49 data_loader.py [INFO] Loading biospecimen data...
(45267, 22)


Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,CMTRT,CMDOSE,CMDOSU,CMDOSFRQ,ROUTE,STARTDT,...,CMINDC,CMINDC_TEXT,TOTDDOSE,RECNO,SEQNO1,SEQNO2,WHODRUG,EXCLMED,ORIG_ENTRY,LAST_UPDATE
0,IA9331,3000,ED,CONMED,Lisinopril,,,,,01/2016,...,14.0,,,,,,,,06/2021,2022-11-07 00:00:00.0
1,412285501,3000,LOG,CMED,MELATONIN,3.0,MG,PRN,3.0,04/2013,...,,INSOMNIA,,12432.0,1.0,1.0,MELATONIN,N,09/2013,2020-06-25 16:02:20.0
2,627733901,3000,LOG,CMED,PNEUMOVAX,25.0,MCG,ONCE,2.0,02/2017,...,,AT RISK POPULATION,,4966.0,1.0,2.0,PNEUMOVAX,N,04/2017,2020-06-25 16:02:20.0
3,269586901,3000,LOG,CMED,LISINOPRIL,20.0,MG,QD,3.0,01/2006,...,,HYPERTENSION,,8940.0,1.0,1.0,LISINOPRIL,N,01/2011,2022-09-13 07:29:02.0
4,269587001,3000,LOG,CMED,HYDROCHLOROTHIAZIDE,25.0,MG,QD,3.0,01/2006,...,,HYPERTENSION,,220.0,1.0,1.0,HYDROCHLOROTHIAZIDE,N,01/2011,2022-09-13 07:29:02.0


In [3]:
# Baseline before cleaning: report the dtype of the start and stop date columns
for date_col in ("STARTDT", "STOPDT"):
    print(cmeds_df[date_col].dtype)

object
object


In [4]:
# Baseline before cleaning: count the rows that have no indication code, then
# show how the rows that do have one are distributed across the codes.
raw_codes = cmeds_df["CMINDC"]
n_raw_missing = raw_codes.isnull().sum()
print(f"There are {n_raw_missing} nulls")
raw_codes.value_counts()

There are 19514 nulls


CMINDC
25.0    8728
24.0    2636
14.0    2558
22.0    2020
13.0    1679
10.0    1168
17.0     821
12.0     747
1.0      686
11.0     685
3.0      648
23.0     602
15.0     563
18.0     528
6.0      477
7.0      441
20.0     220
2.0      207
19.0     119
4.0       69
5.0       69
16.0      39
9.0       24
8.0       14
21.0       5
Name: count, dtype: int64

In [5]:
# Clean up the dates, and map the indication codes and reason text.
# The result is stored under a new name, clean_df, which every later cell works from;
# the cells below check the date dtypes and the indication codes on it.
# NOTE(review): confirm that clean_concomitant_meds does not modify cmeds_df in place;
# if it does, re-running the "before cleaning" cells would show cleaned values.
clean_df = DataPreprocessor.clean_concomitant_meds(cmeds_df)

2025-05-13 14:58:49 data_preprocessor.py [INFO] There are 78 concomitant medication entries with no start date.
2025-05-13 14:58:49 data_preprocessor.py [INFO] There are 21696 concomitant medication entries with no stop date.


In [6]:
# Same check as before cleaning, now on the cleaned frame: both date columns
# should report a datetime dtype.
for date_col in ("STARTDT", "STOPDT"):
    print(clean_df[date_col].dtype)

datetime64[ns]
datetime64[ns]


In [7]:
# Same check as before cleaning, now on the cleaned frame: count the rows that
# still lack an indication code, then show the distribution of the codes.
clean_codes = clean_df["CMINDC"]
n_clean_missing = clean_codes.isnull().sum()
print(f"There are {n_clean_missing} nulls")
clean_codes.value_counts()

There are 0 nulls


CMINDC
25    13142
24     5080
22     4877
14     4110
13     2588
17     2554
10     2001
23     1456
12     1264
1      1166
3      1131
15     1083
11      987
6       896
18      686
7       648
20      419
2       361
4       199
19      171
8       130
16      129
5        90
9        84
21       15
Name: count, dtype: int64

In [8]:
# The same distribution, keyed by the human-readable indication label
# instead of the numeric code
indication_labels = clean_df["CMINDC_TEXT"]
indication_labels.value_counts()

CMINDC_TEXT
Other                                                         13142
Vitamins / Coenzymes                                           5080
Supplements / Homeopathic Medication                           4877
Hypertension                                                   4110
Hyperlipidemia                                                 2588
Pain                                                           2554
Depression                                                     2001
Thyroid Disorder                                               1456
GERD                                                           1264
Anxiety                                                        1166
Benign Prostatic Hypertrophy / Overactive Bladder              1131
Insomnia                                                       1083
Diabetes                                                        987
Constipation                                                    896
REM-Behavior Disorder               

In [9]:
# Build a patient-by-indication presence table: one row per PATNO, one column
# per CMINDC_TEXT label, holding 1 where the patient has at least one medication
# entry with that label and 0 where they have none.
def _ever_recorded(_codes):
    """Collapse any group of entries for a (patient, label) pair to the flag 1."""
    return 1

comorbs = pd.pivot_table(
    clean_df,
    index="PATNO",
    columns="CMINDC_TEXT",
    values="CMINDC",
    aggfunc=_ever_recorded,
    fill_value=0,  # (patient, label) pairs with no entries become 0
)
print(comorbs.shape)
comorbs.head()

(4422, 25)


CMINDC_TEXT,Anxiety,Atrial Fibrillation / Arrhythmias,Benign Prostatic Hypertrophy / Overactive Bladder,Cognitive Dysfunction,Congestive Heart Failure,Constipation,"Coronary Artery Disease, Peripheral Artery Disease, Stroke",Daytime Sleepiness,"Delusions, Hallucination, Psychosis",Depression,...,Nausea,Other,Pain,REM-Behavior Disorder,Restless Leg Syndrome,Sexual Dysfunction,Sialorrhea / Drooling,Supplements / Homeopathic Medication,Thyroid Disorder,Vitamins / Coenzymes
PATNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3001,0,0,1,0,0,1,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
3002,0,0,0,0,0,1,0,0,0,0,...,0,1,1,0,0,0,0,1,1,1
3003,1,0,1,0,0,1,0,0,0,0,...,0,1,1,0,0,0,0,1,1,1
3004,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,1


In [10]:
# Count the patients flagged for Cognitive Dysfunction. Each row of comorbs is one
# patient and each flag is 0 or 1, so the column sum is a patient count. It is
# smaller than the matching value_counts() figure, which counts medication entries:
# a patient can have several entries for the same comorbidity.
cognitive_flags = comorbs["Cognitive Dysfunction"]
cognitive_flags.sum()

np.int64(112)