In [1]:
import pandas as pd

data = pd.read_csv('/Users/larsheijnen/Thesis/data/motor/MDS-UPDRS_Part_III_21Mar2025.csv')

# MDS-UPDRS Part III Data Integrity Analysis

This notebook provides a comprehensive analysis of data integrity and missingness across subjects and visits in the MDS-UPDRS Part III dataset. We'll visualize patterns of missing data and calculate key metrics to understand data completeness.

## Import Required Libraries

Import necessary libraries including pandas, numpy, matplotlib, and seaborn for data manipulation and visualization.

In [2]:
# Import additional required libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

# Set visualization styles
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)

## Load and Prepare the Data

We'll load the MDS-UPDRS Part III data and prepare it for analysis by identifying subject IDs, visit information, and organizing the data structure.

In [4]:
# Display basic information about the dataset
print(f"Dataset shape: {data.shape}")
print("\nFirst few rows:")
display(data.head())

print("\nColumn names:")
display(data.columns.tolist())

# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing values per column:")
display(missing_values[missing_values > 0])

Dataset shape: (32346, 63)

First few rows:


Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,NP3RTARL,NP3RTALL,NP3RTALJ,NP3RTCON,NP3TOT,DYSKPRES,DYSKIRAT,NHY,ORIG_ENTRY,LAST_UPDATE
0,272451901,3000,BL,NUPDRS3,02/2011,,,,,,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2011,2020-06-25 16:02:19.0
1,338703101,3000,V04,NUPDRS3,03/2012,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,03/2012,2020-06-25 16:02:22.0
2,385009801,3000,V06,NUPDRS3,02/2013,,,,,,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2013,2020-06-25 16:02:22.0
3,437131401,3000,V08,NUPDRS3,03/2014,,,,,,...,0.0,0.0,0.0,0.0,2.0,0.0,,0.0,05/2014,2020-06-25 16:02:22.0
4,512469901,3000,V10,NUPDRS3,03/2015,,,,,,...,0.0,0.0,0.0,0.0,19.0,0.0,,0.0,03/2015,2020-06-25 16:02:23.0



Column names:


['REC_ID',
 'PATNO',
 'EVENT_ID',
 'PAG_NAME',
 'INFODT',
 'PDTRTMNT',
 'PDSTATE',
 'HRPOSTMED',
 'HRDBSON',
 'HRDBSOFF',
 'PDMEDYN',
 'DBSYN',
 'ONOFFORDER',
 'OFFEXAM',
 'OFFNORSN',
 'DBSOFFTM',
 'ONEXAM',
 'ONNORSN',
 'HIFUYN',
 'DBSONTM',
 'PDMEDDT',
 'PDMEDTM',
 'EXAMDT',
 'EXAMTM',
 'NP3SPCH',
 'NP3FACXP',
 'NP3RIGN',
 'NP3RIGRU',
 'NP3RIGLU',
 'NP3RIGRL',
 'NP3RIGLL',
 'NP3FTAPR',
 'NP3FTAPL',
 'NP3HMOVR',
 'NP3HMOVL',
 'NP3PRSPR',
 'NP3PRSPL',
 'NP3TTAPR',
 'NP3TTAPL',
 'NP3LGAGR',
 'NP3LGAGL',
 'NP3RISNG',
 'NP3GAIT',
 'NP3FRZGT',
 'NP3PSTBL',
 'NP3POSTR',
 'NP3BRADY',
 'NP3PTRMR',
 'NP3PTRML',
 'NP3KTRMR',
 'NP3KTRML',
 'NP3RTARU',
 'NP3RTALU',
 'NP3RTARL',
 'NP3RTALL',
 'NP3RTALJ',
 'NP3RTCON',
 'NP3TOT',
 'DYSKPRES',
 'DYSKIRAT',
 'NHY',
 'ORIG_ENTRY',
 'LAST_UPDATE']


Missing values per column:


PDTRTMNT       2307
PDSTATE       17921
HRPOSTMED     19916
HRDBSON       31913
HRDBSOFF      32205
PDMEDYN        3333
DBSYN          1030
ONOFFORDER    28302
OFFEXAM       28889
OFFNORSN      30870
DBSOFFTM      32192
ONEXAM        28855
ONNORSN       32107
HIFUYN        29910
DBSONTM       31899
PDMEDDT       19792
PDMEDTM       19803
EXAMDT         1728
EXAMTM         5354
NP3SPCH        1722
NP3FACXP       1722
NP3RIGN        1725
NP3RIGRU       1725
NP3RIGLU       1725
NP3RIGRL       1727
NP3RIGLL       1729
NP3FTAPR       1724
NP3FTAPL       1727
NP3HMOVR       1724
NP3HMOVL       1728
NP3PRSPR       1724
NP3PRSPL       1726
NP3TTAPR       1753
NP3TTAPL       1750
NP3LGAGR       1723
NP3LGAGL       1724
NP3RISNG       1725
NP3GAIT        1724
NP3FRZGT       1738
NP3PSTBL       1752
NP3POSTR       1725
NP3BRADY       1725
NP3PTRMR       1725
NP3PTRML       1726
NP3KTRMR       1726
NP3KTRML       1725
NP3RTARU       1725
NP3RTALU       1725
NP3RTARL       1725
NP3RTALL       1725


In [6]:
# Extract subject IDs and visit information
# Assuming columns like 'PATNO' for subject ID and 'EVENT_ID' for visit
# Modify these if your column names are different
subject_column = 'PATNO' if 'PATNO' in data.columns else 'PATIENT_ID'
visit_column = 'EVENT_ID' if 'EVENT_ID' in data.columns else 'VISIT'

# Get unique subjects and visits
subjects = data[subject_column].unique()
visits = data[visit_column].unique()

print(f"Number of unique subjects: {len(subjects)}")
print(f"Number of unique visits: {len(visits)}")
print(f"Visits: {sorted(visits)}")

# Create a clean dataframe with just the key information
analysis_df = data[[subject_column, visit_column]].copy()

Number of unique subjects: 4313
Number of unique visits: 41
Visits: ['BL', 'PW', 'R01', 'R04', 'R06', 'R08', 'R10', 'R12', 'R13', 'R14', 'R15', 'R16', 'R17', 'R18', 'R19', 'R20', 'RS1', 'SC', 'ST', 'U01', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21']


## Calculate Missingness

Now we'll create a matrix of subjects x visits showing where data is missing, using binary indicators or NaN count metrics.

In [7]:
# Create a pivot table to show which subjects have data for which visits
# 1 = data present, 0 = data missing
presence_matrix = pd.crosstab(
    index=data[subject_column], 
    columns=data[visit_column]
)

# Fill visits that don't exist with 0
for visit in visits:
    if visit not in presence_matrix.columns:
        presence_matrix[visit] = 0

# Sort columns (visits) if they are in a logical order
if all(isinstance(v, str) for v in visits):
    presence_matrix = presence_matrix.reindex(sorted(presence_matrix.columns), axis=1)
elif all(isinstance(v, (int, float)) for v in visits):
    presence_matrix = presence_matrix.reindex(sorted(presence_matrix.columns), axis=1)

# Display the first few rows of the presence matrix
print("Presence matrix (1 = data present, sample of first 10 subjects):")
display(presence_matrix.head(10))

# Calculate the missing data percentage per subject
presence_matrix['completion_rate'] = presence_matrix.mean(axis=1) * 100

Presence matrix (1 = data present, sample of first 10 subjects):


EVENT_ID,BL,PW,R01,R04,R06,R08,R10,R12,R13,R14,...,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21
PATNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000,1,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,0,0,0
3001,1,0,0,0,0,0,0,0,0,0,...,2,0,2,2,0,2,2,2,2,0
3002,1,0,0,0,0,0,0,0,0,0,...,1,0,2,2,0,2,2,0,0,0
3003,1,0,0,0,0,0,0,0,0,0,...,1,1,0,2,0,0,2,2,2,0
3004,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,1,1,0
3005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3006,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3007,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3008,1,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,0,0,0
3009,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,1,1,0


In [None]:
# Calculate detailed missingness metrics

# 1. Overall missingness
total_possible_entries = len(subjects) * len(visits)
actual_entries = presence_matrix.drop(columns=['completion_rate']).sum().sum()
overall_missingness = (1 - actual_entries / total_possible_entries) * 100

# 2. Per-visit missingness
visit_missingness = (1 - presence_matrix.drop(columns=['completion_rate']).mean(axis=0)) * 100

# 3. Per-subject missingness
subject_missingness = (1 - presence_matrix.drop(columns=['completion_rate']).mean(axis=1)) * 100

print(f"Overall data missingness: {overall_missingness:.2f}%")
print("\nMissingness per visit:")
display(visit_missingness)

## Create Heatmap of Subjects x Visits

Generate a heatmap visualization using seaborn that shows the pattern of missingness across all subjects and visits.

In [None]:
# Create a heatmap of the presence matrix
plt.figure(figsize=(12, 10))

# If there are too many subjects, we'll sample a subset for visualization
max_subjects_to_show = 100
if len(subjects) > max_subjects_to_show:
    sampled_subjects = np.random.choice(presence_matrix.index, max_subjects_to_show, replace=False)
    presence_subset = presence_matrix.loc[sampled_subjects].drop(columns=['completion_rate'])
    title_suffix = f" (Sample of {max_subjects_to_show} subjects)"
else:
    presence_subset = presence_matrix.drop(columns=['completion_rate'])
    title_suffix = ""

# Create the heatmap
sns.heatmap(presence_subset, cmap="YlGnBu", cbar_kws={'label': 'Data Present (1) / Missing (0)'})
plt.title(f"Data Presence Across Subjects and Visits{title_suffix}")
plt.xlabel("Visit")
plt.ylabel("Subject ID")
plt.tight_layout()
plt.show()

In [None]:
# Create a clustered heatmap to identify patterns
plt.figure(figsize=(14, 12))
clustered_data = presence_matrix.drop(columns=['completion_rate']).copy()

# Apply clustering to both rows and columns
clustergrid = sns.clustermap(
    clustered_data,
    cmap="YlGnBu",
    figsize=(14, 12),
    row_cluster=True,
    col_cluster=True,
    cbar_kws={'label': 'Data Present (1) / Missing (0)'}
)

plt.title("Clustered Heatmap of Data Presence", pad=20, fontsize=16)
plt.tight_layout()
plt.show()

## Analyze Per-Visit Missingness

Calculate summary statistics for missingness at each visit, including percentage of missing data points per visit.

In [None]:
# Analyze missingness patterns per visit
visit_stats = pd.DataFrame({
    'Total Subjects': len(subjects),
    'Subjects Present': presence_matrix.drop(columns=['completion_rate']).sum(),
    'Subjects Missing': len(subjects) - presence_matrix.drop(columns=['completion_rate']).sum(),
    'Presence Rate (%)': presence_matrix.drop(columns=['completion_rate']).mean() * 100,
    'Missing Rate (%)': (1 - presence_matrix.drop(columns=['completion_rate']).mean()) * 100
})

# Sort by visit order if possible
visit_stats = visit_stats.sort_index()

# Display the visit statistics
display(visit_stats)

# Identify visits with high missingness (e.g., >50% missing data)
high_missingness_visits = visit_stats[visit_stats['Missing Rate (%)'] > 50].sort_values('Missing Rate (%)', ascending=False)

if not high_missingness_visits.empty:
    print("\nVisits with high missingness (>50%):")
    display(high_missingness_visits)
else:
    print("\nNo visits have more than 50% missing data.")

## Visualize Per-Visit Missingness

Create bar charts or line plots showing the trend of missingness across different visits to identify any patterns or problematic timepoints.

In [None]:
# Visualize missingness by visit

plt.figure(figsize=(14, 7))
ax = visit_stats['Missing Rate (%)'].plot(kind='bar', color='skyblue')
plt.title('Missing Data Rate by Visit', fontsize=16)
plt.xlabel('Visit', fontsize=14)
plt.ylabel('Missing Data (%)', fontsize=14)
plt.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='50% Threshold')
plt.axhline(y=visit_stats['Missing Rate (%)'].mean(), color='green', linestyle='--', alpha=0.7, 
           label=f'Mean Missing Rate ({visit_stats["Missing Rate (%)"].mean():.1f}%)')

# Add value labels on top of each bar
for i, v in enumerate(visit_stats['Missing Rate (%)']):
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)

plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Line plot to show the progression of missingness over visits
# This is especially useful if visits have a chronological order

plt.figure(figsize=(14, 7))

# Sort visit columns if they are numeric or have a clear order
numeric_visits = False
try:
    # Try to convert visit names to numeric values for ordering
    visit_order = [float(str(v).replace('V', '').replace('VISIT', '')) 
                  for v in visit_stats.index]
    visit_order_map = {v: i for i, v in enumerate(visit_order)}
    sorted_visits = sorted(visit_stats.index, key=lambda x: visit_order_map.get(x, 0))
    numeric_visits = True
except:
    # If conversion fails, use the original order
    sorted_visits = visit_stats.index

plt.plot(visit_stats.loc[sorted_visits, 'Missing Rate (%)'], marker='o', markersize=8, 
         linewidth=2, color='royalblue')
plt.title('Trend of Missing Data Across Visits', fontsize=16)
plt.xlabel('Visit', fontsize=14)
plt.ylabel('Missing Data (%)', fontsize=14)
plt.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='50% Threshold')
plt.axhline(y=visit_stats['Missing Rate (%)'].mean(), color='green', linestyle='--', alpha=0.7, 
           label=f'Mean Missing Rate ({visit_stats["Missing Rate (%)"].mean():.1f}%)')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

## Subject Completion Analysis

Let's analyze how many subjects completed various numbers of visits to understand the distribution of data completeness.

In [None]:
# Calculate the number of visits completed per subject
visits_per_subject = presence_matrix.drop(columns=['completion_rate']).sum(axis=1)

# Create a histogram of visits completed
plt.figure(figsize=(12, 6))
ax = visits_per_subject.plot(kind='hist', bins=len(visits), color='skyblue', edgecolor='black')
plt.title('Distribution of Completed Visits per Subject', fontsize=16)
plt.xlabel('Number of Visits Completed', fontsize=14)
plt.ylabel('Number of Subjects', fontsize=14)

# Add a vertical line for the mean
plt.axvline(x=visits_per_subject.mean(), color='red', linestyle='--', 
           label=f'Mean: {visits_per_subject.mean():.1f} visits')

plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Summary statistics
print("Summary of visits completed per subject:")
print(f"Mean: {visits_per_subject.mean():.2f}")
print(f"Median: {visits_per_subject.median():.2f}")
print(f"Min: {visits_per_subject.min()}")
print(f"Max: {visits_per_subject.max()}")
print(f"Subjects with complete data (all visits): {(visits_per_subject == len(visits)).sum()} ({(visits_per_subject == len(visits)).sum()/len(subjects)*100:.2f}%)")
print(f"Subjects with no data (0 visits): {(visits_per_subject == 0).sum()} ({(visits_per_subject == 0).sum()/len(subjects)*100:.2f}%)")

## Conclusion

This analysis provides insights into the completeness of the MDS-UPDRS Part III dataset across subjects and visits. Key findings include:

1. Overall missingness patterns and their distribution
2. Identification of visits with especially high or low data completeness
3. Subjects with incomplete data that may affect longitudinal analyses
4. Potential trends in data collection or subject retention over time

These insights can guide decision-making for handling missing data in subsequent analyses, such as imputation strategies or subject/visit exclusion criteria.