In [1]:
import pandas as pd
import matplotlib.pyplot as plt

data_motor = pd.read_csv('/Users/larsheijnen/Thesis/data/motor/MDS-UPDRS_Part_III_21Mar2025.csv')
data_blood = pd.read_csv('/Users/larsheijnen/Thesis/data/blood/Current_Biospecimen_Analysis_Results_29Mar2025.csv')

  data_blood = pd.read_csv('/Users/larsheijnen/Thesis/data/blood/Current_Biospecimen_Analysis_Results_29Mar2025.csv')


# Data Filtering Documentation

## Original List
The starting list contains the following fields:
['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'PDTRTMNT', 'PDSTATE', 'HRPOSTMED', 'HRDBSON', 'HRDBSOFF', 'PDMEDYN', 'DBSYN', 'ONOFFORDER', 'OFFEXAM', 'OFFNORSN', 'DBSOFFTM', 'ONEXAM', 'ONNORSN', 'HIFUYN', 'DBSONTM', 'PDMEDDT', 'PDMEDTM', 'EXAMDT', 'EXAMTM', 'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON', 'NP3TOT', 'DYSKPRES', 'DYSKIRAT', 'NHY', 'ORIG_ENTRY', 'LAST_UPDATE']


## Filtering Steps
The filtering process follows these steps:

1. **Keep Relevant Patient/Record Info:**
   - REC_ID
   - PATNO
   - EVENT_ID
   - PAG_NAME
   - ORIG_ENTRY
   - LAST_UPDATE

2. **Keep Examination Date/Time:**
   - EXAMDT
   - EXAMTM
   - INFODT (often the assessment date)

3. **Keep Specified Motor Score:**
   - NP3TOT

4. **Keep Treatment/Status Context (Relevant for interpreting NP3TOT):**
   - PDTRTMNT
   - PDSTATE
   - HRPOSTMED
   - HRDBSON
   - HRDBSOFF
   - PDMEDYN
   - DBSYN
   - ONOFFORDER
   - OFFEXAM
   - OFFNORSN
   - DBSOFFTM
   - ONEXAM
   - ONNORSN
   - HIFUYN
   - DBSONTM
   - PDMEDDT
   - PDMEDTM

5. **Remove Other Motor Scores:**
   - All W other NP3... items
   - DYSKPRES
   - DYSKIRAT
   - NHY

## Resulting Filtered List
The final filtered list contains:
| Field Name   | Description                                           |
|--------------|-------------------------------------------------------|
| REC_ID       | Record Identifier                                     |
| PATNO        | Participant Identifier (Relevant patient info)        |
| EVENT_ID     | Visit Identifier                                      |
| PAG_NAME     | Page Name (Metadata)                                  |
| INFODT       | Assessment Date (Often the primary date)             |
| PDTRTMNT     | On treatment status (Context)                         |
| PDSTATE      | ON/OFF state (Context)                               |
| HRPOSTMED    | Hours post medication (Context)                       |
| HRDBSON      | Hours DBS On (Context)                                |
| HRDBSOFF     | Hours DBS Off (Context)                               |
| PDMEDYN      | On PD Medication? (Context)                          |
| DBSYN        | Has DBS? (Context)                                   |
| ONOFFORDER   | Order of ON/OFF exam (Context)                        |
| OFFEXAM      | OFF Exam Performed? (Context)                        |
| OFFNORSN     | Reason OFF Exam Not Performed (Context)               |
| DBSOFFTM     | Time DBS Turned Off (Context)                         |
| ONEXAM       | ON Exam Performed? (Context)                         |
| ONNORSN      | Reason ON Exam Not Performed (Context)                |
| HIFUYN       | Had HIFU? (Context)                                  |
| DBSONTM      | Time DBS Turned On (Context)                          |
| PDMEDDT      | Date of Last PD Med Dose (Context)                    |
| PDMEDTM      | Time of Last PD Med Dose (Context)                    |
| EXAMDT       | Examination Date (Specifically requested)            |
| EXAMTM       | Examination Time (Specifically requested)            |
| NP3TOT       | MDS-UPDRS Part III Total Score (Specifically requested to keep) |
| ORIG_ENTRY   | Original Entry Date (Metadata)                        |
| LAST_UPDATE  | Last Update Date (Metadata)                           |

In [2]:
columns_to_keep = [
    'REC_ID',        # Record Identifier
    'PATNO',         # Participant Identifier (Relevant patient info)
    'EVENT_ID',      # Visit Identifier
    'PAG_NAME',      # Page Name (Metadata)
    'INFODT',        # Assessment Date (Often the primary date)
    'PDTRTMNT',      # On treatment status (Context)
    'PDSTATE',       # ON/OFF state (Context)
    'HRPOSTMED',     # Hours post medication (Context)
    'HRDBSON',       # Hours DBS On (Context)
    'HRDBSOFF',      # Hours DBS Off (Context)
    'PDMEDYN',       # On PD Medication? (Context)
    'DBSYN',         # Has DBS? (Context)
    'ONOFFORDER',    # Order of ON/OFF exam (Context)
    'OFFEXAM',       # OFF Exam Performed? (Context)
    'OFFNORSN',      # Reason OFF Exam Not Performed (Context)
    'DBSOFFTM',      # Time DBS Turned Off (Context)
    'ONEXAM',        # ON Exam Performed? (Context)
    'ONNORSN',       # Reason ON Exam Not Performed (Context)
    'HIFUYN',        # Had HIFU? (Context)
    'DBSONTM',       # Time DBS Turned On (Context)
    'PDMEDDT',       # Date of Last PD Med Dose (Context)
    'PDMEDTM',       # Time of Last PD Med Dose (Context)
    'EXAMDT',        # Examination Date (Specifically requested)
    'EXAMTM',        # Examination Time (Specifically requested)
    'NP3TOT',        # MDS-UPDRS Part III Total Score (Specifically requested to keep)
    'ORIG_ENTRY',    # Original Entry Date (Metadata)
    'LAST_UPDATE'    # Last Update Date (Metadata)
]

In [3]:
filtered_data = data_motor[columns_to_keep].copy()
# filtered_data.to_csv('/Users/larsheijnen/Thesis/data/MDS-UPDRS_Patient_Data.csv', index=False)

In [None]:
print("\nFiltered DataFrame head:")
filtered_data.head()

In [None]:
filtered_data.head(300).to_csv('/Users/larsheijnen/Thesis/data/MDS-UPDRS_First_300_Rows.csv', index=False)
data_blood.head(300).to_csv('/Users/larsheijnen/Thesis/data/Blood_data_First_300_Rows.csv')

In [5]:
#To check the different types, testvalues

data_blood_filtered = data_blood[data_blood['PATNO'] == 4035]
# data_blood_filtered.head()
print(data_blood_filtered['TYPE'].unique())
# print(data_blood_filtered['TESTVALUE'].unique())

['DNA' 'Serum' 'miRNA' 'Cerebrospinal Fluid' 'Whole Blood' 'Plasma'
 'Urine']


#### Features to Keep in Cerebrospinal Fluid
Based on current evidence, prioritize the following CSF biomarkers:
##### α-Synuclein Metrics
- **a-Synuclein**: Reduced levels are associated with PD and progression.
- **RT-QuIC/SAA metrics (if available)**: Include variables like Time to Threshold, T50, and Area Under the Curve.
##### Tau and Amyloid Pathways
- **pTau**: Phosphorylated tau is linked to rapid motor progression.
- **ABeta 1-42**: Decreased levels predict cognitive decline.
##### Neuroaxonal Damage
- **NFL (Neurofilament Light)**: Elevated levels correlate with axonal degeneration and disease severity.
##### Mitochondrial Dysfunction
- **MTDNA_DELETION**: Reflects mitochondrial damage, a key mechanism in PD.
##### Inflammatory Markers
- **IL-6**: Tracks neuroinflammation, which may accelerate progression.
- **sTREM2 and YKL40**: Indicators of microglial activation and inflammation.
##### Glial Activation
- **GFAP (Glial Fibrillary Acidic Protein)**: Associated with astrocytic activation in PD.
- **S100**: Linked to glial activity and neurodegeneration.

https://www.perplexity.ai/search/dna-serum-mirna-cerebrospinal-4va8HG.iTeiizn9xFrtcrw

In [6]:
csf_data = data_blood[data_blood['TYPE'] == 'Cerebrospinal Fluid']
# print("Testnames in blood data:")
# print(csf_data['TESTNAME'].unique())

# csf_data.to_csv('/Users/larsheijnen/Thesis/data/blood/CSF_blood_data.csv', index=False)

useful_data = csf_data[csf_data['TESTNAME'].str.contains('Alpha-synuclein|pTau|ABeta 1-42|NFL|MTDNA_DELETION|IL-6|sTREM2|YKL40|GFAP|S100', case=False, na=False)]
csf_data = csf_data.sort_values(by='PATNO')
csf_data.to_csv('/Users/larsheijnen/Thesis/data/blood/CSF_useful_features_blood_data.csv', index=False)

# print('\n')
print("Useful testnames in blood data:")
print(useful_data['TESTNAME'].unique())


Useful testnames in blood data:
['CSF Alpha-synuclein' 'ABeta 1-42' 'pTau' 'MTDNA_DELETION' 'GFAP' 'IL-6'
 'S100' 'NFL' 'sTREM2' 'YKL40']


In [14]:
# useful_data['RUNDATE'] = pd.to_datetime(useful_data['RUNDATE']).dt.strftime('%m/%Y') #line up date format with MDS-UPDRS patient data file
filtered_data['INFODT'] = pd.to_datetime(filtered_data['INFODT'], format='%m/%Y', errors='coerce')
useful_data['RUNDATE'] = pd.to_datetime(useful_data['RUNDATE'], format='%m/%Y', errors='coerce')
useful_data = useful_data.sort_values(by='PATNO')
useful_data.to_csv('/Users/larsheijnen/Thesis/data/blood/CSF_useful_features_blood_data_sorted_date.csv', index=False)

In [15]:
# Get the PATNOs present in filtered_data
filtered_patnos = set(filtered_data['PATNO'])

# Filter useful_data to keep only rows with PATNOs present in filtered_data
useful_data = useful_data[useful_data['PATNO'].isin(filtered_patnos)]


# Check if all PATNOs in useful_data are present in filtered_data
overlap_patnos = set(useful_data['PATNO']).issubset(filtered_patnos)

# Print the result
print(f"All PATNOs in useful_data are present in filtered_data: {overlap_patnos}")

All PATNOs in useful_data are present in filtered_data: True


In [18]:
useful_data = useful_data.sort_values(by=['PATNO', 'RUNDATE'])
filtered_data = filtered_data.sort_values(by=['PATNO', 'INFODT'])

useful_data.to_csv('/Users/larsheijnen/Thesis/data/blood/TEST_CSF_useful_features_blood_data_sorted_date.csv', index=False)
filtered_data.to_csv('/Users/larsheijnen/Thesis/data/blood/TEST_MDS-UPDRS_patient_data.csv', index=False)


In [25]:


# Ensure your date columns are parsed correctly:
filtered_data['INFODT'] = pd.to_datetime(filtered_data['INFODT'])
useful_data['RUNDATE'] = pd.to_datetime(useful_data['RUNDATE'])

# Sort the DataFrames by 'PATNO' and the corresponding date column.
filtered_data = filtered_data.sort_values(by=['PATNO', 'INFODT'])
useful_data = useful_data.sort_values(by=['PATNO', 'RUNDATE'])

useful_data.head()

# Merge the data using merge_asof with a tolerance.
# We will merge on the date columns (matching motor INFODT to blood RUNDATE) and group by PATNO.
# Here, we're using a tolerance of 30 days. Adjust pd.Timedelta('30 days') to suit your study's requirements.
# merged_df = pd.merge_asof(
#     filtered_data, 
#     useful_data,
#     left_on='INFODT',
#     right_on='RUNDATE',
#     by='PATNO',
#     tolerance=pd.Timedelta('30 days'),
#     direction='nearest'  # 'nearest' selects the blood record with the closest RUNDATE.
# )

# print(merged_df.head())

Unnamed: 0,PATNO,SEX,COHORT,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
166116,3000,Female,Control,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1798.9,pg/ml,2016-10-01,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
165760,3000,Female,Control,V04,Cerebrospinal Fluid,CSF Alpha-synuclein,1547.0,pg/ml,2016-10-01,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
164781,3000,Female,Control,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1840.4,pg/ml,2016-10-01,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
183987,3000,Female,Control,BL,Cerebrospinal Fluid,MTDNA_DELETION,0.0,percentage,2017-02-01,130,Gavin Hudson,Newcastle University,2017-07-05 08:49:23.0
182571,3000,Female,Control,V08,Cerebrospinal Fluid,MTDNA_DELETION,19.2,percentage,2017-02-01,130,Gavin Hudson,Newcastle University,2017-07-05 08:49:23.0


## Checking PATNOS in motor data, and blood data

In [None]:
NP3TOT_motor_data = data_motor.dropna(subset=['NP3TOT'])
NP3TOT_motor_data_unique_patnos_count = NP3TOT_motor_data['PATNO'].nunique()

unique_pd_patients = useful_data[useful_data['COHORT'] == 'PD']['PATNO'].nunique()
unique_non_pd_patients = useful_data[useful_data['COHORT'] != 'PD']['PATNO'].nunique()

# Find the intersection of PATNOs in data_motor and useful_data
common_patnos_motor_useful = set(data_motor['PATNO']).intersection(set(useful_data['PATNO']))
print(f"Number of unique PD patients in filtered/useful blood data: {unique_pd_patients}")
print(f"Number of unique non-PD patients in filtered/useful blood data: {unique_non_pd_patients}")
print(f"Number of unique patients in filtered/useful blood data: {unique_pd_patients + unique_non_pd_patients}")
print(f"Number of unique PATNOs in NP3TOT_motor_data: {NP3TOT_motor_data_unique_patnos_count}")
print("\n")

# Print the number of common PATNOs
print(f"Number of common PATNOs in data_motor and useful_data: {len(common_patnos_motor_useful)}")

# # Print the row of PATNO 16580 in data_motor
# print("Data from data_motor:")
# print(data_motor[data_motor['PATNO'] == 16580])

# # Print the row of PATNO 16580 in useful_data
# print("\nData from useful_data:")
# print(useful_data[useful_data['PATNO'] == 16580])


In [None]:
# Get the PATNOs of unique non-PD patients in useful_data
non_pd_patnos = useful_data[useful_data['COHORT'] != 'PD']['PATNO'].unique()

# Check if these PATNOs exist in NP3TOT_motor_data
non_pd_patnos_in_motor_data = set(non_pd_patnos).intersection(set(NP3TOT_motor_data['PATNO']))

# Print the result
print(f"Number of unique non-PD patients in useful_data: {unique_non_pd_patients}")
print(f"Number of non-PD patients in useful_data that exist in NP3TOT_motor_data: {len(non_pd_patnos_in_motor_data)}")

In [None]:
# Filter data_motor for PATNO 16580
patno_16580_data = data_motor[data_motor['PATNO'] == 16580]

# Convert the INFODT column to datetime for proper time-based analysis
patno_16580_data['INFODT'] = pd.to_datetime(patno_16580_data['INFODT'], errors='coerce')

# Count cumulative number of visits over time
patno_16580_data = patno_16580_data.sort_values('INFODT')
patno_16580_data['Cumulative_Visits'] = range(1, len(patno_16580_data) + 1)

# Plot the data
plt.figure(figsize=(12, 6))
plt.plot(patno_16580_data['INFODT'], patno_16580_data['Cumulative_Visits'], marker='o', color='skyblue')
plt.title('Cumulative Number of Motor Visits Over Time for PATNO 16580')
plt.xlabel('Date')
plt.ylabel('Cumulative Number of Visits')
plt.grid(True)
plt.tight_layout()
plt.show()