In [None]:
import pandas as pd
from IPython.display import display
import numpy as np

In [None]:
# Input CSVs (judging by the file names: MDS-UPDRS motor data and blood-test data).
# NOTE(review): these are absolute, machine-specific paths — consider a configurable data directory.
motor_csv_path = '/Users/larsheijnen/Thesis/data/MDS-UPDRS_Patient_Data_only_date_formatted.csv'
blood_csv_path = '/Users/larsheijnen/Thesis/data/blood/BLOOD_DATE_FORMATTED_CSF_FILTERED.csv'

motor_data = pd.read_csv(motor_csv_path)
# low_memory=False: parse the file in a single pass rather than in internal chunks
blood_data = pd.read_csv(blood_csv_path, low_memory=False)

In [None]:
# Preview the first three rows of each freshly loaded frame (motor first, then blood).
for loaded_frame in (motor_data, blood_data):
    display(loaded_frame.head(3))

In [None]:
# Order each frame by patient and then by its date column.
# The date columns are only converted with pd.to_datetime in a later cell, so the
# ordering here uses whatever dtype read_csv inferred for them.
motor_sort_keys = ['PATNO', 'INFODT']
blood_sort_keys = ['PATNO', 'RUNDATE']
motor_data = motor_data.sort_values(motor_sort_keys)
blood_data = blood_data.sort_values(blood_sort_keys)

In [None]:
# Preview the first three rows again, now that both frames have been sorted.
for sorted_frame in (motor_data, blood_data):
    display(sorted_frame.head(3))

In [None]:
# Print the per-column dtypes of each frame (motor first, then blood).
for inspected_frame in (motor_data, blood_data):
    print(inspected_frame.dtypes)

In [None]:
# Work on copies from here on so motor_data / blood_data keep the loaded-and-sorted state.
motor_df, blood_df = motor_data.copy(), blood_data.copy()

In [None]:
# Parse both date columns; errors='coerce' turns unparseable values into NaT
# instead of raising, so they can be counted below.
infodt_parsed = pd.to_datetime(motor_df['INFODT'], errors='coerce')
rundate_parsed = pd.to_datetime(blood_df['RUNDATE'], errors='coerce')
motor_df['INFODT'] = infodt_parsed
blood_df['RUNDATE'] = rundate_parsed

# --- Report the resulting dtypes ---
print("Data types after conversion:")
print("motor_df['INFODT'] dtype:", infodt_parsed.dtype)
print("blood_df['RUNDATE'] dtype:", rundate_parsed.dtype)

# --- Report how many values are null after parsing (failed parses and originally missing dates) ---
infodt_null_count = infodt_parsed.isnull().sum()
rundate_null_count = rundate_parsed.isnull().sum()
print("\nNull dates after conversion:")
print("motor_df INFODT nulls:", infodt_null_count)
print("blood_df RUNDATE nulls:", rundate_null_count)

In [None]:
# Precondition: motor_df / blood_df exist and INFODT / RUNDATE were already
# converted with pd.to_datetime in an earlier cell.
# Key columns each frame needs: the patient id plus its date column.
motor_key_cols = ['PATNO', 'INFODT']
blood_key_cols = ['PATNO', 'RUNDATE']

print("Data types BEFORE final cleaning/sorting:")
print("motor_df:", motor_df.dtypes.loc[motor_key_cols])
print("blood_df:", blood_df.dtypes.loc[blood_key_cols])

# Discard every row that lacks a value in any of its key columns, then report
# how many rows survive in each frame.
motor_df = motor_df.dropna(subset=motor_key_cols)
blood_df = blood_df.dropna(subset=blood_key_cols)
print(f"motor_df rows after cleaning: {len(motor_df)}")
print(f"blood_df rows after cleaning: {len(blood_df)}")

In [None]:
# Normalise PATNO to an integer column in both frames:
#   1. to_numeric(errors='coerce') turns non-numeric ids into NaN,
#   2. rows whose id became NaN are dropped,
#   3. the remaining ids are cast to int.
motor_df = (
    motor_df
    .assign(PATNO=pd.to_numeric(motor_df['PATNO'], errors='coerce'))
    .dropna(subset=['PATNO'])
    .astype({'PATNO': int})
)
blood_df = (
    blood_df
    .assign(PATNO=pd.to_numeric(blood_df['PATNO'], errors='coerce'))
    .dropna(subset=['PATNO'])
    .astype({'PATNO': int})
)

In [None]:
# Summarise the cleaned frames: dtypes of the key columns, then row counts.
motor_key_dtypes = motor_df.dtypes.loc[['PATNO', 'INFODT']]
blood_key_dtypes = blood_df.dtypes.loc[['PATNO', 'RUNDATE']]

print("\nData types AFTER final cleaning:")
print("motor_df:", motor_key_dtypes)
print("blood_df:", blood_key_dtypes)
print(f"motor_df rows after cleaning: {len(motor_df)}")
print(f"blood_df rows after cleaning: {len(blood_df)}")

In [None]:
# Re-sort both frames (patient first, then date) right before merging, because
# rows were dropped and PATNO was re-typed since the earlier sort.
print("\nSorting dataframes...")
motor_df = motor_df.sort_values(['PATNO', 'INFODT'])
blood_df = blood_df.sort_values(['PATNO', 'RUNDATE'])
print("Sorting complete.")

In [None]:
def _verify_sorted_within_patient(df, date_col, side, frame_name):
    """Check that ``date_col`` never decreases within any PATNO group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``PATNO`` column and the column named by ``date_col``.
    date_col : str
        Name of the date column to check (e.g. ``'INFODT'`` or ``'RUNDATE'``).
    side : str
        ``'left'`` or ``'right'``; used only in the printed and raised messages.
    frame_name : str
        Variable name of the frame; used only in the error message.

    Raises
    ------
    ValueError
        If at least one PATNO group has ``date_col`` out of order. The offending
        PATNOs are printed before the exception is raised.
    """
    print(f"\nVerifying {side} dataframe sort order ({date_col} within each PATNO)...")

    # One boolean per PATNO: True when that patient's dates are non-decreasing.
    # Computed once and reused for both the overall verdict and the offender list.
    sorted_by_patient = df.groupby('PATNO')[date_col].is_monotonic_increasing.astype(bool)

    if sorted_by_patient.all():
        print(f"{side.capitalize()} dataframe sorting verified successfully.")
        return

    print(f"ERROR: The {side} dataframe ({frame_name}) is NOT correctly sorted by {date_col} within PATNO groups.")
    print(f"Problematic PATNOs where {date_col} is not sorted:")
    print(sorted_by_patient[~sorted_by_patient].index.tolist())
    # Stop the notebook here: merge_asof requires sorted keys.
    raise ValueError(f"{side.capitalize()} dataframe sorting failed verification. Cannot proceed with merge_asof.")


# --- Sanity Check: the 'left' frame (motor_df) must be sorted by INFODT within each PATNO ---
_verify_sorted_within_patient(motor_df, 'INFODT', side='left', frame_name='motor_df')

# --- Sanity Check: the 'right' frame (blood_df) must be sorted by RUNDATE within each PATNO ---
_verify_sorted_within_patient(blood_df, 'RUNDATE', side='right', frame_name='blood_df')

In [None]:
# Attach to every motor visit the blood row of the SAME patient whose RUNDATE is
# nearest to the visit's INFODT, provided the gap is within `tolerance`.
# Visits with no blood row inside the tolerance keep NaN/NaT in the blood columns.

# Re-sort here so this cell gives the same result when re-run on its own.
motor_df = motor_df.sort_values(by=['PATNO', 'INFODT'])
blood_df = blood_df.sort_values(by=['PATNO', 'RUNDATE'])

# --- Define the tolerance ---
# Maximum allowed distance between INFODT and the matched RUNDATE.
tolerance = pd.Timedelta(days=30)

# Split the blood data by patient once, instead of re-scanning the whole frame
# with a boolean mask for every patient inside the loop.
blood_by_patno = {patno: grp for patno, grp in blood_df.groupby('PATNO')}

merged_list = []
skipped_patnos = []  # patients present in motor_df but absent from blood_df
for patno, motor_grp in motor_df.groupby('PATNO'):
    blood_grp = blood_by_patno.get(patno)
    if blood_grp is None:
        # No blood data at all for this patient: their motor rows are left out
        # of the result (unchanged behaviour), but the omission is recorded.
        skipped_patnos.append(patno)
        continue
    merged_grp = pd.merge_asof(
        # merge_asof requires both inputs to be sorted on their key column.
        motor_grp.sort_values('INFODT'),
        blood_grp.sort_values(by='RUNDATE'),
        left_on='INFODT',
        right_on='RUNDATE',
        tolerance=tolerance,  # use the value defined above rather than a second literal
        direction='nearest',
        # PATNO (and any other shared column) exists in both frames, hence the suffixes.
        suffixes=('_motor', '_blood')
    )
    merged_list.append(merged_grp)

if not merged_list:
    # pd.concat([]) would raise an opaque "No objects to concatenate" ValueError.
    raise ValueError("No patient appears in both motor_df and blood_df; nothing to merge.")

merged_data = pd.concat(merged_list, ignore_index=True)
if skipped_patnos:
    print(f"Skipped {len(skipped_patnos)} patient(s) with motor data but no blood data.")
print("merge_asof completed successfully!")



merge_asof completed successfully!


Unnamed: 0,REC_ID,PATNO_motor,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
0,272451901,3000,BL,NUPDRS3,2011-02-01,,,,,,...,,,,,,NaT,,,,
1,338703101,3000,V04,NUPDRS3,2012-03-01,,,,,,...,,,,,,NaT,,,,
2,385009801,3000,V06,NUPDRS3,2013-02-01,,,,,,...,,,,,,NaT,,,,
3,437131401,3000,V08,NUPDRS3,2014-03-01,,,,,,...,BL,Plasma,Apolipoprotein A1,186.0,mg/dL,2014-02-27,112.0,Alice Chen-Plotkin,Penn Neurological Institute,2014-05-20 13:30:50.0
4,512469901,3000,V10,NUPDRS3,2015-03-01,,,,,,...,,,,,,NaT,,,,


In [24]:
# Show the first five merged rows, then write the full merge result to CSV
# without the index column.
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
merged_csv_path = '/Users/larsheijnen/Thesis/data/temp/MERGE_1.csv'
display(merged_data.head(5))
merged_data.to_csv(merged_csv_path, index=False)

Unnamed: 0,REC_ID,PATNO_motor,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,CLINICAL_EVENT,TYPE,TESTNAME,TESTVALUE,UNITS,RUNDATE,PROJECTID,PI_NAME,PI_INSTITUTION,update_stamp
0,272451901,3000,BL,NUPDRS3,2011-02-01,,,,,,...,,,,,,NaT,,,,
1,338703101,3000,V04,NUPDRS3,2012-03-01,,,,,,...,,,,,,NaT,,,,
2,385009801,3000,V06,NUPDRS3,2013-02-01,,,,,,...,,,,,,NaT,,,,
3,437131401,3000,V08,NUPDRS3,2014-03-01,,,,,,...,BL,Plasma,Apolipoprotein A1,186.0,mg/dL,2014-02-27,112.0,Alice Chen-Plotkin,Penn Neurological Institute,2014-05-20 13:30:50.0
4,512469901,3000,V10,NUPDRS3,2015-03-01,,,,,,...,,,,,,NaT,,,,
