In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the two raw CSV exports used throughout the notebook: the MDS-UPDRS
# Part III motor table and the biospecimen analysis results table.
data_motor = pd.read_csv('/Users/larsheijnen/Thesis/data/motor/MDS-UPDRS_Part_III_21Mar2025.csv')

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the warning echoed under this cell looks like pandas'
# mixed-type DtypeWarning for this read -- confirm it is gone after re-running.
data_blood = pd.read_csv(
    '/Users/larsheijnen/Thesis/data/blood/Current_Biospecimen_Analysis_Results_29Mar2025.csv',
    low_memory=False,
)

  data_blood = pd.read_csv('/Users/larsheijnen/Thesis/data/blood/Current_Biospecimen_Analysis_Results_29Mar2025.csv')


# Data Filtering Documentation

## Original List
The starting list contains the following fields:
['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'PDTRTMNT', 'PDSTATE', 'HRPOSTMED', 'HRDBSON', 'HRDBSOFF', 'PDMEDYN', 'DBSYN', 'ONOFFORDER', 'OFFEXAM', 'OFFNORSN', 'DBSOFFTM', 'ONEXAM', 'ONNORSN', 'HIFUYN', 'DBSONTM', 'PDMEDDT', 'PDMEDTM', 'EXAMDT', 'EXAMTM', 'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON', 'NP3TOT', 'DYSKPRES', 'DYSKIRAT', 'NHY', 'ORIG_ENTRY', 'LAST_UPDATE']


## Filtering Steps
The filtering process follows these steps:

1. **Keep Relevant Patient/Record Info:**
   - REC_ID
   - PATNO
   - EVENT_ID
   - PAG_NAME
   - ORIG_ENTRY
   - LAST_UPDATE

2. **Keep Examination Date/Time:**
   - EXAMDT
   - EXAMTM
   - INFODT (often the assessment date)

3. **Keep Specified Motor Score:**
   - NP3TOT

4. **Keep Treatment/Status Context (Relevant for interpreting NP3TOT):**
   - PDTRTMNT
   - PDSTATE
   - HRPOSTMED
   - HRDBSON
   - HRDBSOFF
   - PDMEDYN
   - DBSYN
   - ONOFFORDER
   - OFFEXAM
   - OFFNORSN
   - DBSOFFTM
   - ONEXAM
   - ONNORSN
   - HIFUYN
   - DBSONTM
   - PDMEDDT
   - PDMEDTM

5. **Remove Other Motor Scores:**
   - All other NP3... items
   - DYSKPRES
   - DYSKIRAT
   - NHY

## Resulting Filtered List
The final filtered list contains:
| Field Name   | Description                                           |
|--------------|-------------------------------------------------------|
| REC_ID       | Record Identifier                                     |
| PATNO        | Participant Identifier (Relevant patient info)        |
| EVENT_ID     | Visit Identifier                                      |
| PAG_NAME     | Page Name (Metadata)                                  |
| INFODT       | Assessment Date (Often the primary date)             |
| PDTRTMNT     | On treatment status (Context)                         |
| PDSTATE      | ON/OFF state (Context)                               |
| HRPOSTMED    | Hours post medication (Context)                       |
| HRDBSON      | Hours DBS On (Context)                                |
| HRDBSOFF     | Hours DBS Off (Context)                               |
| PDMEDYN      | On PD Medication? (Context)                          |
| DBSYN        | Has DBS? (Context)                                   |
| ONOFFORDER   | Order of ON/OFF exam (Context)                        |
| OFFEXAM      | OFF Exam Performed? (Context)                        |
| OFFNORSN     | Reason OFF Exam Not Performed (Context)               |
| DBSOFFTM     | Time DBS Turned Off (Context)                         |
| ONEXAM       | ON Exam Performed? (Context)                         |
| ONNORSN      | Reason ON Exam Not Performed (Context)                |
| HIFUYN       | Had HIFU? (Context)                                  |
| DBSONTM      | Time DBS Turned On (Context)                          |
| PDMEDDT      | Date of Last PD Med Dose (Context)                    |
| PDMEDTM      | Time of Last PD Med Dose (Context)                    |
| EXAMDT       | Examination Date (Specifically requested)            |
| EXAMTM       | Examination Time (Specifically requested)            |
| NP3TOT       | MDS-UPDRS Part III Total Score (Specifically requested to keep) |
| ORIG_ENTRY   | Original Entry Date (Metadata)                        |
| LAST_UPDATE  | Last Update Date (Metadata)                           |

In [2]:
# Columns retained from the motor table, grouped by the role they play.
# The groups are concatenated below into the single list `columns_to_keep`.

# Record / visit identification and the assessment date.
_id_cols = [
    'REC_ID',      # Record Identifier
    'PATNO',       # Participant Identifier (Relevant patient info)
    'EVENT_ID',    # Visit Identifier
    'PAG_NAME',    # Page Name (Metadata)
    'INFODT',      # Assessment Date (Often the primary date)
]

# Treatment / ON-OFF status context for interpreting the total score.
_context_cols = [
    'PDTRTMNT',    # On treatment status
    'PDSTATE',     # ON/OFF state
    'HRPOSTMED',   # Hours post medication
    'HRDBSON',     # Hours DBS On
    'HRDBSOFF',    # Hours DBS Off
    'PDMEDYN',     # On PD Medication?
    'DBSYN',       # Has DBS?
    'ONOFFORDER',  # Order of ON/OFF exam
    'OFFEXAM',     # OFF Exam Performed?
    'OFFNORSN',    # Reason OFF Exam Not Performed
    'DBSOFFTM',    # Time DBS Turned Off
    'ONEXAM',      # ON Exam Performed?
    'ONNORSN',     # Reason ON Exam Not Performed
    'HIFUYN',      # Had HIFU?
    'DBSONTM',     # Time DBS Turned On
    'PDMEDDT',     # Date of Last PD Med Dose
    'PDMEDTM',     # Time of Last PD Med Dose
]

# Examination timestamp and the one motor score that is kept.
_exam_cols = [
    'EXAMDT',      # Examination Date (Specifically requested)
    'EXAMTM',      # Examination Time (Specifically requested)
    'NP3TOT',      # MDS-UPDRS Part III Total Score (Specifically requested to keep)
]

# Record bookkeeping metadata.
_meta_cols = [
    'ORIG_ENTRY',   # Original Entry Date (Metadata)
    'LAST_UPDATE',  # Last Update Date (Metadata)
]

columns_to_keep = [*_id_cols, *_context_cols, *_exam_cols, *_meta_cols]

In [3]:
# Take an independent copy of just the selected columns so later edits to
# `filtered_data` cannot touch `data_motor`.
filtered_data = data_motor.loc[:, columns_to_keep].copy()

# Optional export of the filtered table (kept disabled):
# filtered_data.to_csv('/Users/larsheijnen/Thesis/data/MDS-UPDRS_Patient_Data.csv', index=False)

In [4]:
# Preview the filtered table; the trailing expression is rendered as the
# cell's output (5 rows, which is also the default for head()).
print("\nFiltered DataFrame head:")
filtered_data.head(n=5)


Filtered DataFrame head:


Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,ONNORSN,HIFUYN,DBSONTM,PDMEDDT,PDMEDTM,EXAMDT,EXAMTM,NP3TOT,ORIG_ENTRY,LAST_UPDATE
0,272451901,3000,BL,NUPDRS3,02/2011,,,,,,...,,,,,,02/2011,13:17:00,4.0,02/2011,2020-06-25 16:02:19.0
1,338703101,3000,V04,NUPDRS3,03/2012,,,,,,...,,,,,,03/2012,13:47:00,1.0,03/2012,2020-06-25 16:02:22.0
2,385009801,3000,V06,NUPDRS3,02/2013,,,,,,...,,,,,,02/2013,12:22:00,4.0,02/2013,2020-06-25 16:02:22.0
3,437131401,3000,V08,NUPDRS3,03/2014,,,,,,...,,,,,,03/2014,13:22:00,2.0,05/2014,2020-06-25 16:02:22.0
4,512469901,3000,V10,NUPDRS3,03/2015,,,,,,...,,,,,,03/2015,11:43:00,19.0,03/2015,2020-06-25 16:02:23.0


In [None]:
# Write the first 300 rows of each table to small sample CSV files.
# Both exports pass index=False so the DataFrame's row index is not written
# as an extra unnamed leading column (the blood export previously did this,
# unlike the motor export).
filtered_data.head(300).to_csv('/Users/larsheijnen/Thesis/data/MDS-UPDRS_First_300_Rows.csv', index=False)
data_blood.head(300).to_csv('/Users/larsheijnen/Thesis/data/Blood_data_First_300_Rows.csv', index=False)

  data_blood = pd.read_csv('/Users/larsheijnen/Thesis/data/blood/Current_Biospecimen_Analysis_Results_29Mar2025.csv')


In [5]:
# Spot-check a single participant (PATNO 4035) to see which distinct TYPE
# values appear in the biospecimen table.
is_sample_patno = data_blood['PATNO'] == 4035
data_blood_filtered = data_blood.loc[is_sample_patno]

type_values = data_blood_filtered['TYPE'].unique()
print(type_values)

# Other quick looks that were used while exploring (kept disabled):
# data_blood_filtered.head()
# print(data_blood_filtered['TESTVALUE'].unique())

['DNA' 'Serum' 'miRNA' 'Cerebrospinal Fluid' 'Whole Blood' 'Plasma'
 'Urine']


In [11]:
# Keep only the rows whose TYPE is 'Cerebrospinal Fluid' and list the
# distinct TESTNAME values found among them.
is_csf = data_blood['TYPE'] == 'Cerebrospinal Fluid'
csf_data = data_blood.loc[is_csf]

csf_test_names = csf_data['TESTNAME'].unique()
print(csf_test_names)

# Optional export of the CSF subset (kept disabled):
# csf_data.to_csv('/Users/larsheijnen/Thesis/data/blood/CSF_blood_data.csv', index=False)

['3,4-Dihydroxymandelic acid' '3,4-Dihydroxyphenylacetic acid (DOPAC)'
 '3,4-Dihydroxyphenylalanine (DOPA)' '3,4-Dihydroxyphenylglycol (DOPEG)'
 '3-Methoxytyrosine' '3-O-Methyldopamine'
 '4-Hydroxy-3-methoxymandelic acid'
 '4-Hydroxy-3-methoxyphenylglycol (HMPG)'
 '5-Hydroxy-3-indoleacetic acid (5-HIAA)' 'Adrenaline (Epinephrine)'
 'Dopamine' 'Histamine' 'Homovanillic acid (HVA)' 'Metanephrine'
 'Noradrenaline (Norepinephrine)' 'Normetanephrine' 'Serotonin (5-HT)'
 'CSF Alpha-synuclein' 'ABeta 1-42' 'pTau' 'tTau' 'MTDNA_DELETION'
 'MTDNA_ND1_CN' 'MTDNA_ND4_CN' 'NDNA_B2M_CN' 'NDNA_B2M_CN_v2'
 'CSF Hemoglobin' 'C16 Cer' 'C16 GL2' 'C16 GlcCer' 'C16 SM' 'C18 Cer'
 'C18 GL2' 'C18 GlcCer' 'C18 SM' 'C20 Cer' 'C20 GL2' 'C20 GlcCer' 'C20 SM'
 'C22 Cer' 'C22 GL2' 'C22 GlcCer' 'C22 SM' 'C23 Cer' 'C23 GL2'
 'C23 GlcCer' 'C23 SM' 'C24 Cer' 'C24 GL2' 'C24 GlcCer' 'C24 SM'
 'C24:1 Cer' 'C24:1 GL2' 'C24:1 GlcCer' 'C24:1 SM' 'total Cer' 'total GL2'
 'total GlcCer' 'Total SM' 'a-Synuclein' 'GFAP' 'IL-6'

In [6]:
# Split the biospecimen rows by COHORT label and count the distinct
# participants (PATNO) on each side. Every row whose COHORT is not exactly
# 'PD' lands in the non-PD group.
cohort_labels = data_blood['COHORT']

pd_cohort = data_blood.loc[cohort_labels == 'PD']
non_pd_cohort = data_blood.loc[cohort_labels != 'PD']

unique_patnos_PD = pd_cohort['PATNO'].nunique()
unique_patnos_control = non_pd_cohort['PATNO'].nunique()

print(f"Number of unique PATNOs in non_pd_cohort: {unique_patnos_control}")
print(f"Number of unique PATNOs in pd_cohort: {unique_patnos_PD}")

Number of unique PATNOs in non_pd_cohort: 941
Number of unique PATNOs in pd_cohort: 825


In [7]:
# Count the distinct participants (PATNO) in each dataset.
unique_patnos_data_motor = data_motor['PATNO'].nunique()
unique_patnos_data_blood = data_blood['PATNO'].nunique()

# Participants present in both datasets. The intersection is computed once
# and its size is taken from the resulting set, rather than rebuilding both
# sets and intersecting them a second time just to get the count.
common_patnos = set(data_motor['PATNO']).intersection(data_blood['PATNO'])
common_patnos_num = len(common_patnos)

print(f"Unique PATNOs in data_motor: {unique_patnos_data_motor}")
print(f"Unique PATNOs in data_blood: {unique_patnos_data_blood}")
print(f"PATNOs in both datasets: {common_patnos_num}")

Unique PATNOs in data_motor: 4313
Unique PATNOs in data_blood: 1766
PATNOs in both datasets: 1734


In [8]:
# Biospecimen rows for participants whose PATNO is in `common_patnos`
# (the set of PATNOs found in both the blood and the motor data).
in_both_datasets = data_blood['PATNO'].isin(common_patnos)
common_patnos_blood = data_blood[in_both_datasets]

# Within that subset, count distinct participants labelled 'PD' and distinct
# participants carrying any other COHORT label.
is_pd_row = common_patnos_blood['COHORT'] == 'PD'
is_non_pd_row = common_patnos_blood['COHORT'] != 'PD'

pd_cohort_count = common_patnos_blood.loc[is_pd_row, 'PATNO'].nunique()
non_pd_cohort_count = common_patnos_blood.loc[is_non_pd_row, 'PATNO'].nunique()

# Report both counts.
print(f"PATNOs in both datasets and in PD cohort: {pd_cohort_count}")
print(f"PATNOs in both datasets and in non-PD cohort: {non_pd_cohort_count}")

PATNOs in both datasets and in PD cohort: 822
PATNOs in both datasets and in non-PD cohort: 912
