In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display

In [3]:
# Load the two pre-filtered input tables.
#   motor_data: MDS-UPDRS records (per the author: only the relevant per-patient info).
#   blood_data: biospecimen results restricted to TYPE == Cerebrospinal Fluid
#               (per the author: the specimen type found most important in PD research).
# The data directory defaults to the original location but can be redirected by
# setting the THESIS_DATA_DIR environment variable, so the notebook is not tied
# to one machine's absolute paths.
DATA_DIR = Path(os.environ.get('THESIS_DATA_DIR', '/Users/larsheijnen/Thesis/data'))

motor_data = pd.read_csv(DATA_DIR / 'MDS-UPDRS_Patient_Data_only_date_formatted.csv')
# low_memory=False makes pandas read the file in one pass instead of chunk-wise dtype inference.
blood_data = pd.read_csv(DATA_DIR / 'blood' / 'BLOOD_DATE_FORMATTED_CSF_FILTERED.csv', low_memory=False)

In [4]:
# Report the earliest and latest date present in each table.
# NOTE: both date columns are still object dtype (text) at this point, so
# min()/max() compare strings. That matches chronological order only as long
# as every value is a zero-padded YYYY-MM-DD string, as in the recorded output.
motor_dates = motor_data['INFODT']
blood_dates = blood_data['RUNDATE']

motor_start_date, motor_end_date = motor_dates.min(), motor_dates.max()
blood_start_date, blood_end_date = blood_dates.min(), blood_dates.max()

print(f"Motor data range: {motor_start_date} to {motor_end_date}")
print(f"Blood data range: {blood_start_date} to {blood_end_date}")



Motor data range: 2010-07-01 to 2025-03-01
Blood data range: 2016-09-14 to 2021-04-16


In [5]:
# Preview the first three rows of each raw table (motor first, then blood).
display(motor_data.head(3), blood_data.head(3))

Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,ONNORSN,HIFUYN,DBSONTM,PDMEDDT,PDMEDTM,EXAMDT,EXAMTM,NP3TOT,ORIG_ENTRY,LAST_UPDATE
0,272451901,3000,BL,NUPDRS3,2011-02-01,,,,,,...,,,,,,2011-02-01,13:17:00,4.0,2011-02-01,2020-06-25 16:02:19.0
1,338703101,3000,V04,NUPDRS3,2012-03-01,,,,,,...,,,,,,2012-03-01,13:47:00,1.0,2012-03-01,2020-06-25 16:02:22.0
2,385009801,3000,V06,NUPDRS3,2013-02-01,,,,,,...,,,,,,2013-02-01,12:22:00,4.0,2013-02-01,2020-06-25 16:02:22.0


Unnamed: 0,PATNO,SEX,COHORT,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
0,3000,Female,Control,V04,Cerebrospinal Fluid,ABeta 1-42,1060.0,pg/mL,2017-06-27,125,Les Shaw,University of Pennsylvania,2017-10-20 09:04:05.0
1,3000,Female,Control,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1798.9,pg/ml,2016-10-11,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
2,3000,Female,Control,V08,Cerebrospinal Fluid,pTau,21.15,pg/mL,2017-06-27,125,Les Shaw,University of Pennsylvania,2017-10-20 09:04:05.0


In [6]:
# Order each table by patient, then by date within each patient.
motor_sort_keys = ['PATNO', 'INFODT']
blood_sort_keys = ['PATNO', 'RUNDATE']

motor_data = motor_data.sort_values(motor_sort_keys)
blood_data = blood_data.sort_values(blood_sort_keys)

In [7]:
# Preview the first three rows of each table in its current (patient/date) order.
for sorted_frame in (motor_data, blood_data):
    display(sorted_frame.head(3))

Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,ONNORSN,HIFUYN,DBSONTM,PDMEDDT,PDMEDTM,EXAMDT,EXAMTM,NP3TOT,ORIG_ENTRY,LAST_UPDATE
0,272451901,3000,BL,NUPDRS3,2011-02-01,,,,,,...,,,,,,2011-02-01,13:17:00,4.0,2011-02-01,2020-06-25 16:02:19.0
1,338703101,3000,V04,NUPDRS3,2012-03-01,,,,,,...,,,,,,2012-03-01,13:47:00,1.0,2012-03-01,2020-06-25 16:02:22.0
2,385009801,3000,V06,NUPDRS3,2013-02-01,,,,,,...,,,,,,2013-02-01,12:22:00,4.0,2013-02-01,2020-06-25 16:02:22.0


Unnamed: 0,PATNO,SEX,COHORT,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
1,3000,Female,Control,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1798.9,pg/ml,2016-10-11,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
4,3000,Female,Control,V04,Cerebrospinal Fluid,CSF Alpha-synuclein,1547.0,pg/ml,2016-10-11,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
5,3000,Female,Control,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1840.4,pg/ml,2016-10-11,124,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0


In [8]:
# Print the column dtypes of both tables, motor first.
print(motor_data.dtypes, blood_data.dtypes, sep='\n')

REC_ID          object
PATNO            int64
EVENT_ID        object
PAG_NAME        object
INFODT          object
PDTRTMNT       float64
PDSTATE         object
HRPOSTMED      float64
HRDBSON        float64
HRDBSOFF       float64
PDMEDYN        float64
DBSYN          float64
ONOFFORDER     float64
OFFEXAM        float64
OFFNORSN       float64
DBSOFFTM        object
ONEXAM         float64
ONNORSN        float64
HIFUYN         float64
DBSONTM         object
PDMEDDT         object
PDMEDTM         object
EXAMDT          object
EXAMTM          object
NP3TOT         float64
ORIG_ENTRY      object
LAST_UPDATE     object
dtype: object
PATNO              int64
SEX               object
COHORT            object
CLINICAL_EVENT    object
TYPE              object
TESTNAME          object
TESTVALUE         object
UNITS             object
RUNDATE           object
PROJECTID          int64
PI_NAME           object
PI_INSTITUTION    object
update_stamp      object
dtype: object


In [9]:
# Work on independent copies so the frames loaded earlier are not modified.
motor_df, blood_df = motor_data.copy(), blood_data.copy()

In [10]:
# Parse the date columns in place; errors='coerce' turns unparseable values into NaT.
for frame, date_col in ((motor_df, 'INFODT'), (blood_df, 'RUNDATE')):
    frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')

# Confirm both columns now carry a datetime dtype.
print("Data types after conversion:")
print("motor_df['INFODT'] dtype:", motor_df['INFODT'].dtype)
print("blood_df['RUNDATE'] dtype:", blood_df['RUNDATE'].dtype)

# Count missing dates: this includes values that were already missing as well
# as values that failed to parse.
print("\nNull dates after conversion:")
print("motor_df INFODT nulls:", motor_df['INFODT'].isna().sum())
print("blood_df RUNDATE nulls:", blood_df['RUNDATE'].isna().sum())

Data types after conversion:
motor_df['INFODT'] dtype: datetime64[ns]
blood_df['RUNDATE'] dtype: datetime64[ns]

Null dates after conversion:
motor_df INFODT nulls: 0
blood_df RUNDATE nulls: 0


In [11]:
# Precondition: motor_df / blood_df exist and INFODT / RUNDATE are already datetimes.
# This cell reports the key-column dtypes and removes rows with a missing key.
motor_keys = ['PATNO', 'INFODT']
blood_keys = ['PATNO', 'RUNDATE']

print("Data types BEFORE final cleaning/sorting:")
print("motor_df:", motor_df[motor_keys].dtypes)
print("blood_df:", blood_df[blood_keys].dtypes)
print(f"motor_df rows before cleaning: {len(motor_df)}")
print(f"blood_df rows before cleaning: {len(blood_df)}")

# Drop rows where either the patient id or the date is missing.
motor_df = motor_df.dropna(subset=motor_keys)
blood_df = blood_df.dropna(subset=blood_keys)
print(f"motor_df rows after cleaning: {len(motor_df)}")
print(f"blood_df rows after cleaning: {len(blood_df)}")

# In the recorded output the row counts before and after are identical,
# i.e. no rows had a missing key.

Data types BEFORE final cleaning/sorting:
motor_df: PATNO              int64
INFODT    datetime64[ns]
dtype: object
blood_df: PATNO               int64
RUNDATE    datetime64[ns]
dtype: object
motor_df rows before cleaning: 32346
blood_df rows before cleaning: 21731
motor_df rows after cleaning: 32346
blood_df rows after cleaning: 21731


In [12]:
# Make PATNO an integer column in both tables:
#   1. coerce to numeric (non-numeric ids become NaN),
#   2. drop the rows whose id became NaN,
#   3. cast the remaining ids to int.
motor_df = (
    motor_df
    .assign(PATNO=pd.to_numeric(motor_df['PATNO'], errors='coerce'))
    .dropna(subset=['PATNO'])
    .astype({'PATNO': int})
)
blood_df = (
    blood_df
    .assign(PATNO=pd.to_numeric(blood_df['PATNO'], errors='coerce'))
    .dropna(subset=['PATNO'])
    .astype({'PATNO': int})
)

print("motor_df['PATNO'] dtype:", motor_df['PATNO'].dtype)
print("blood_df['PATNO'] dtype:", blood_df['PATNO'].dtype)

motor_df['PATNO'] dtype: int64
blood_df['PATNO'] dtype: int64


In [13]:
# Sort both working frames by patient and then by date right before merging,
# so the merge step sees chronologically ordered rows per patient.
print("\nSorting dataframes...")

motor_order = ['PATNO', 'INFODT']
blood_order = ['PATNO', 'RUNDATE']
motor_df = motor_df.sort_values(motor_order)
blood_df = blood_df.sort_values(blood_order)

print("Sorting complete.")


Sorting dataframes...
Sorting complete.


In [14]:
def _verify_sorted_within_patient(frame, date_col, side, frame_name):
    """Check that ``date_col`` is non-decreasing inside every PATNO group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing a ``PATNO`` column and ``date_col``.
    date_col : str
        Name of the date column to check.
    side : str
        ``'left'`` or ``'right'``; used only in the printed messages.
    frame_name : str
        Variable name of ``frame``; used only in the printed messages.

    Returns
    -------
    bool
        ``True`` when every patient group is sorted.

    Raises
    ------
    ValueError
        If at least one patient group is not sorted; the offending PATNOs
        are printed before raising.
    """
    print(f"\nVerifying {side} dataframe sort order ({date_col} within each PATNO)...")
    # One boolean per patient; reused below to list the offending patients
    # instead of running a second groupby.
    sorted_by_patient = frame.groupby('PATNO')[date_col].is_monotonic_increasing
    if sorted_by_patient.all():
        print(f"{side.capitalize()} dataframe sorting verified successfully.")
        return True

    print(f"ERROR: The {side} dataframe ({frame_name}) is NOT correctly sorted by {date_col} within PATNO groups.")
    print(f"Problematic PATNOs where {date_col} is not sorted:")
    print(sorted_by_patient[~sorted_by_patient].index.tolist())
    raise ValueError(f"{side.capitalize()} dataframe sorting failed verification. Cannot proceed with merge_asof.")


# Left side of the upcoming merge: motor visits ordered by INFODT per patient.
is_sorted_check = _verify_sorted_within_patient(motor_df, 'INFODT', 'left', 'motor_df')

# Right side of the upcoming merge: blood records ordered by RUNDATE per patient.
is_sorted_check_right = _verify_sorted_within_patient(blood_df, 'RUNDATE', 'right', 'blood_df')


Verifying left dataframe sort order (INFODT within each PATNO)...
Left dataframe sorting verified successfully.

Verifying right dataframe sort order (RUNDATE within each PATNO)...
Right dataframe sorting verified successfully.


In [15]:
# Attach to every motor visit the blood record of the same patient whose
# RUNDATE is nearest to the visit's INFODT, provided the gap is within `tolerance`.
#
# NOTE(review): merge_asof is a left join that keeps exactly one right-hand row
# per left-hand row. blood_df holds several TESTNAME rows for the same patient
# and RUNDATE (e.g. ABeta 1-42 and pTau both run on 2017-06-27 in the preview),
# so which analyte ends up on a visit depends on row order. Filtering the result
# by TESTNAME afterwards only finds visits where that analyte happened to be
# picked. Consider restricting blood_df to one TESTNAME before merging - TODO confirm.
# NOTE(review): in the preview, the BL, V04 and V08 samples of one patient share a
# single RUNDATE, which suggests RUNDATE is an assay run date rather than the
# collection date. Matching EVENT_ID to CLINICAL_EVENT may be what is intended - TODO confirm.

# Keep both frames ordered by patient and date (no-op if already sorted).
motor_df = motor_df.sort_values(by=['PATNO', 'INFODT'])
blood_df = blood_df.sort_values(by=['PATNO', 'RUNDATE'])

# Maximum allowed distance between a motor visit and the matched blood record.
tolerance = pd.Timedelta(days=30)

# Split the blood table once by patient instead of re-filtering the whole
# frame with a boolean mask for every patient in the loop.
blood_by_patient = {
    patno: grp.sort_values(by='RUNDATE')
    for patno, grp in blood_df.groupby('PATNO', sort=False)
}

merged_list = []
patients_without_blood = []  # tracked so the exclusion is visible, not silent
for patno, motor_grp in motor_df.groupby('PATNO'):
    blood_grp = blood_by_patient.get(patno)
    if blood_grp is None:
        # Patients with no blood data at all are left out of merged_data.
        patients_without_blood.append(patno)
        continue
    merged_grp = pd.merge_asof(
        motor_grp.sort_values('INFODT'),
        blood_grp,
        left_on='INFODT',
        right_on='RUNDATE',
        tolerance=tolerance,
        direction='nearest',
        # Both sides carry PATNO, so the result has PATNO_motor / PATNO_blood.
        suffixes=('_motor', '_blood'),
    )
    merged_list.append(merged_grp)

# pd.concat raises an opaque "No objects to concatenate" on an empty list.
if not merged_list:
    raise ValueError("No patient appears in both motor_df and blood_df; nothing to merge.")

merged_data = pd.concat(merged_list, ignore_index=True)
print(f"Patients skipped (no blood data): {len(patients_without_blood)}")
print("merge_asof completed successfully!")



merge_asof completed successfully!


In [16]:
# Preview the merged table, then persist it as an intermediate CSV.
display(merged_data.head())

# The output goes to <data dir>/temp/MERGE_1.csv. The data directory defaults to
# the original location and can be redirected with the THESIS_DATA_DIR
# environment variable. The temp directory is created when missing so that
# to_csv does not fail on a fresh checkout.
merge_output_path = (
    Path(os.environ.get('THESIS_DATA_DIR', '/Users/larsheijnen/Thesis/data'))
    / 'temp'
    / 'MERGE_1.csv'
)
merge_output_path.parent.mkdir(parents=True, exist_ok=True)
merged_data.to_csv(merge_output_path, index=False)

Unnamed: 0,REC_ID,PATNO_motor,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
0,272451901,3000,BL,NUPDRS3,2011-02-01,,,,,,...,,,,,,NaT,,,,
1,338703101,3000,V04,NUPDRS3,2012-03-01,,,,,,...,,,,,,NaT,,,,
2,385009801,3000,V06,NUPDRS3,2013-02-01,,,,,,...,,,,,,NaT,,,,
3,437131401,3000,V08,NUPDRS3,2014-03-01,,,,,,...,,,,,,NaT,,,,
4,512469901,3000,V10,NUPDRS3,2015-03-01,,,,,,...,,,,,,NaT,,,,


In [25]:
# Keep only the merged rows whose attached blood record is a CSF alpha-synuclein test.
# Rows without a matched blood record have a missing TESTNAME and are excluded.
is_alpha_synuclein = merged_data['TESTNAME'].eq('CSF Alpha-synuclein')
csf_alpha_synuclein_data = merged_data.loc[is_alpha_synuclein]
display(csf_alpha_synuclein_data)

Unnamed: 0,REC_ID,PATNO_motor,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
111,601935301,3008,V12,NUPDRS3,2016-11-01,,,,,,...,V06,Cerebrospinal Fluid,CSF Alpha-synuclein,2958.9,pg/ml,2016-11-08,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
171,595092201,3012,V11,NUPDRS3,2016-10-01,1.0,ON,4.5000,,,...,V06,Cerebrospinal Fluid,CSF Alpha-synuclein,1507.3,pg/ml,2016-10-21,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
489,605960501,3054,V13,NUPDRS3A,2016-12-01,1.0,ON,2.0000,,,...,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1500.0,pg/ml,2016-11-04,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
490,605958001,3054,V13,NUPDRS3,2016-12-01,1.0,OFF,13.5000,,,...,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1500.0,pg/ml,2016-11-04,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
535,615907001,3056,V13,NUPDRS3A,2017-02-01,1.0,ON,1.8667,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1847.0,pg/ml,2017-01-06,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19376,595717501,60096,V06,NUPDRS3,2016-10-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,3448.2,pg/ml,2016-09-29,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19389,596367401,60100,V06,NUPDRS3,2016-10-01,0.0,,,,,...,V02,Cerebrospinal Fluid,CSF Alpha-synuclein,613.3,pg/ml,2016-09-30,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19455,592114101,60118,V05,NUPDRS3,2016-09-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,398.8,pg/ml,2016-09-14,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19475,593917001,60148,V05,NUPDRS3,2016-10-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1114.1,pg/ml,2016-09-28,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0


In [26]:
# Reorder the alpha-synuclein subset from the highest to the lowest patient number.
csf_alpha_synuclein_data = csf_alpha_synuclein_data.sort_values(
    'PATNO_motor',
    ascending=False,
)
display(csf_alpha_synuclein_data)


Unnamed: 0,REC_ID,PATNO_motor,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
21327,611440501,92490,V06,NUPDRS3,2017-01-01,0.0,,,,,...,V04,Cerebrospinal Fluid,CSF Alpha-synuclein,1465.7,pg/ml,2017-01-06,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19475,593917001,60148,V05,NUPDRS3,2016-10-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1114.1,pg/ml,2016-09-28,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19455,592114101,60118,V05,NUPDRS3,2016-09-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,398.8,pg/ml,2016-09-14,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19389,596367401,60100,V06,NUPDRS3,2016-10-01,0.0,,,,,...,V02,Cerebrospinal Fluid,CSF Alpha-synuclein,613.3,pg/ml,2016-09-30,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
19376,595717501,60096,V06,NUPDRS3,2016-10-01,0.0,,,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,3448.2,pg/ml,2016-09-29,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,615906001,3056,V13,NUPDRS3,2017-02-01,1.0,OFF,15.75,,,...,BL,Cerebrospinal Fluid,CSF Alpha-synuclein,1847.0,pg/ml,2017-01-06,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
489,605960501,3054,V13,NUPDRS3A,2016-12-01,1.0,ON,2.00,,,...,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1500.0,pg/ml,2016-11-04,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
490,605958001,3054,V13,NUPDRS3,2016-12-01,1.0,OFF,13.50,,,...,V08,Cerebrospinal Fluid,CSF Alpha-synuclein,1500.0,pg/ml,2016-11-04,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0
171,595092201,3012,V11,NUPDRS3,2016-10-01,1.0,ON,4.50,,,...,V06,Cerebrospinal Fluid,CSF Alpha-synuclein,1507.3,pg/ml,2016-10-21,124.0,Peggy Taylor,Biolegend,2017-04-03 08:50:27.0


In [27]:
# Count how many distinct patients appear in the alpha-synuclein subset.
patno_motor_ids = csf_alpha_synuclein_data['PATNO_motor']
unique_patno_motor_count = patno_motor_ids.nunique()
print(f"Number of unique PATNO_motor: {unique_patno_motor_count}")

Number of unique PATNO_motor: 183
