In [None]:
import os
import pandas as pd

In [None]:
df = pd.read_csv('cleaned.csv')

In [None]:
# Print the shape of the cleaned data
print(f"cleaned DataFrame shape: {df.shape}")

cleaned DataFrame shape: (4429, 24)


In [None]:
df = df.drop_duplicates()
print(f"DataFrame shape after removing duplicates: {df.shape}")

DataFrame shape after removing duplicates: (4407, 24)


In [None]:
df.head()

Unnamed: 0,PATNO,EVENT_ID,NP1COG,NP1HALL,NP1DPRS,NP1ANXS,NP1APAT,NP2SPCH,NP2SALV,NP2SWAL,...,NP3FRZGT,NP3PSTBL,NP3TOT,AGE_AT_VISIT,SDMTOTAL,HVLTRT1,HVLTRT2,HVLTRT3,HVLTRDLY,HVLTREC
0,3000,BL,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,69.1,53.0,8.0,11.0,11.0,10.0,12.0
1,3001,BL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,12.0,65.1,42.0,9.0,10.0,8.0,12.0,12.0
2,3002,BL,1.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,...,0.0,0.0,17.0,67.6,41.0,6.0,10.0,12.0,11.0,12.0
3,3003,BL,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,29.0,56.7,37.0,7.0,10.0,12.0,11.0,12.0
4,3004,BL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,59.4,47.0,6.0,8.0,8.0,5.0,12.0


In [None]:
# Step 1: Handle Missing Values
# Fill missing values for numerical columns with the median
numerical_columns = [
    "NP1COG", "NP1HALL", "NP1DPRS", "NP1ANXS", "NP1APAT",
    "NP2SPCH", "NP2SALV", "NP2SWAL", "NP2EAT", "NP2DRES",
    "NP3SPCH", "NP3GAIT", "NP3FRZGT", "NP3PSTBL", "NP3TOT",
    "AGE_AT_VISIT", "SDMTOTAL", "HVLTRT1", "HVLTRT2", "HVLTRT3", "HVLTRDLY", "HVLTREC"
]

In [None]:
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())


In [None]:
df = df.drop(columns=["EVENT_ID"])


In [None]:
# Step 3: Scale Numerical Features
# Standardize numerical columns to have mean = 0 and standard deviation = 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [None]:
# Step 4: Feature-Specific Preprocessing
# Drop columns with very low variance (e.g., NP3FRZGT has only 0.0 values)
low_variance_columns = ["NP3FRZGT"]
df = df.drop(columns=low_variance_columns)

In [None]:
import numpy as np  # Import numpy for numerical operations

# Compute the correlation matrix
correlation_matrix = df.corr().abs()

# Create an upper triangle mask using numpy
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# Identify columns to drop based on high correlation (e.g., > 0.9)
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.8)]

# Drop the highly correlated columns
df = df.drop(columns=to_drop)

print(f"Columns dropped due to high correlation: {to_drop}")

Columns dropped due to high correlation: []


In [None]:
df.to_csv("cleaned_preprocessed.csv", index=False)
print("Preprocessed dataset saved as 'cleaned_preprocessed.csv'")

Preprocessed dataset saved as 'cleaned_preprocessed.csv'


In [None]:
df.head()
print(f"cleaned DataFrame shape: {df.shape}")

cleaned DataFrame shape: (4407, 22)


In [None]:
import pandas as pd

# Load the datasets
cleaned_df = pd.read_csv('cleaned_preprocessed.csv')
participant_status_df = pd.read_csv('Participant_Status_30Mar2025.csv')

# Select only the necessary columns from Participant_Status_30Mar2025.csv
participant_status_df = participant_status_df[['PATNO', 'COHORT_DEFINITION']]

# Merge the datasets on the PATNO column
merged_df = cleaned_df.merge(participant_status_df, on='PATNO', how='inner')

# Save the merged dataset to a new CSV file
merged_df.to_csv('merged_dataset.csv', index=False)

print("Merged dataset saved as 'merged_dataset.csv'.")

merged_df.head()

Merged dataset saved as 'merged_dataset.csv'.


Unnamed: 0,PATNO,NP1COG,NP1HALL,NP1DPRS,NP1ANXS,NP1APAT,NP2SPCH,NP2SALV,NP2SWAL,NP2EAT,...,NP3PSTBL,NP3TOT,AGE_AT_VISIT,SDMTOTAL,HVLTRT1,HVLTRT2,HVLTRT3,HVLTRDLY,HVLTREC,COHORT_DEFINITION
0,3000,1.093062,-0.206265,0.379001,0.683703,-0.39711,-0.461978,-0.498558,-0.312543,-0.387215,...,-0.065167,-0.61101,0.483564,0.868034,0.975283,1.157533,0.678225,0.575454,0.607089,Healthy Control
1,3001,-0.57434,-0.206265,-0.219443,-0.656039,-0.39711,-0.461978,-0.498558,-0.312543,-0.387215,...,-0.065167,0.083662,0.03183,-0.167952,1.498863,0.672808,-0.856663,1.264217,0.607089,Parkinson's Disease
2,3002,1.093062,-0.206265,0.379001,-0.656039,1.306943,1.144918,1.84132,1.912624,-0.387215,...,-0.065167,0.517831,0.314164,-0.262133,-0.071878,0.672808,1.189855,0.919836,0.607089,Parkinson's Disease
3,3003,-0.57434,-0.206265,-0.219443,0.683703,-0.39711,-0.461978,-0.498558,-0.312543,1.722124,...,-0.065167,1.559838,-0.916811,-0.638855,0.451702,0.672808,1.189855,0.919836,0.607089,Parkinson's Disease
4,3004,-0.57434,-0.206265,-0.219443,-0.656039,-0.39711,-0.461978,-0.498558,-0.312543,-0.387215,...,-0.065167,-0.784677,-0.611891,0.302951,-0.071878,-0.296643,-0.856663,-1.146454,0.607089,Healthy Control


In [None]:
# Compute correlation of features with the target (COHORT_DEFINITION)
# Convert COHORT_DEFINITION to numeric for correlation analysis
# Load the merged dataset
merged_df = pd.read_csv('merged_dataset.csv')

# Exclude PATNO (identifier) from the correlation matrix
features_df = merged_df.drop(columns=['PATNO'])

merged_df['COHORT_DEFINITION'] = merged_df['COHORT_DEFINITION'].map({
    "Parkinson's Disease": 1,
    "Healthy Control": 0
})

# Compute correlation
correlation = merged_df.corr()['COHORT_DEFINITION'].sort_values(ascending=False)
print(correlation)

COHORT_DEFINITION    1.000000
NP3TOT               0.612393
NP3GAIT              0.390452
NP3SPCH              0.312430
NP2DRES              0.291265
NP2SPCH              0.264534
NP2EAT               0.248967
NP2SALV              0.224557
NP1COG               0.146408
NP2SWAL              0.145890
NP1ANXS              0.141613
NP1APAT              0.134761
NP1DPRS              0.129693
NP3PSTBL             0.122127
NP1HALL              0.088026
PATNO                0.034323
AGE_AT_VISIT         0.011819
HVLTRT3             -0.070855
HVLTREC             -0.086256
HVLTRT2             -0.104233
HVLTRT1             -0.117775
HVLTRDLY            -0.123760
SDMTOTAL            -0.207535
Name: COHORT_DEFINITION, dtype: float64


In [None]:
import pandas as pd

# Load the merged dataset
merged_df = pd.read_csv('merged_dataset.csv')

# Define the selected features based on correlation analysis
selected_features = [
   'PATNO', 'NP3TOT', 'NP3GAIT', 'NP3SPCH', 'NP2DRES', 'NP2SPCH', 'NP2EAT',
    'NP2SALV', 'NP1COG', 'NP2SWAL', 'NP1ANXS', 'NP1APAT', 'NP1DPRS', 'COHORT_DEFINITION'
]

# Filter the dataset to include only the selected features
filtered_df = merged_df[selected_features]

# Save the filtered dataset
filtered_df.to_csv('filtered_dataset.csv', index=False)

print("Filtered dataset saved as 'filtered_dataset.csv'.")

Filtered dataset saved as 'filtered_dataset.csv'.


In [None]:
import pandas as pd

# Load the filtered dataset
filtered_df = pd.read_csv('filtered_dataset.csv')

# Keep only rows where COHORT_DEFINITION is "Healthy Control" or "Parkinson's Disease"
filtered_df = filtered_df[filtered_df['COHORT_DEFINITION'].isin(["Healthy Control", "Parkinson's Disease"])]

# Save the updated dataset
filtered_df.to_csv('Final_clinical_feat.csv', index=False)

print("Rows with invalid COHORT_DEFINITION removed. Cleaned dataset saved as 'Final_clinical_feat.csv'.")

Rows with invalid COHORT_DEFINITION removed. Cleaned dataset saved as 'Final_clinical_feat.csv'.


In [None]:
filtered_df.shape


(2032, 14)

# Random Forest with clinical features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the cleaned dataset
df = pd.read_csv('Final_clinical_feat.csv')

# Convert COHORT_DEFINITION to numeric values
df['COHORT_DEFINITION'] = df['COHORT_DEFINITION'].map({
    "Parkinson's Disease": 1,
    "Healthy Control": 0
})

# Define features (X) and target (y)
X = df.drop(columns=['PATNO', 'COHORT_DEFINITION'])  # Exclude PATNO and target
y = df['COHORT_DEFINITION']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature Importances:")
print(feature_importances)

Model Evaluation Metrics:
Accuracy: 0.97
Precision: 0.97
Recall: 0.99
F1-Score: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        66
           1       0.97      0.99      0.98       341

    accuracy                           0.97       407
   macro avg       0.97      0.93      0.95       407
weighted avg       0.97      0.97      0.97       407


Feature Importances:
NP3TOT     0.718244
NP3GAIT    0.092973
NP3SPCH    0.045246
NP2DRES    0.030439
NP2SPCH    0.025661
NP2EAT     0.019631
NP2SALV    0.017984
NP1ANXS    0.015424
NP1APAT    0.009768
NP1DPRS    0.009193
NP1COG     0.008448
NP2SWAL    0.006989
dtype: float64


Gradient Boosting Classifier Metrics:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90        66
           1       0.97      0.99      0.98       341

    accuracy                           0.97       407
   macro avg       0.96      0.93      0.94       407
weighted avg       0.97      0.97      0.97       407



In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score

# Load the cleaned dataset
df = pd.read_csv('Final_clinical_feat.csv')

# Convert COHORT_DEFINITION to numeric values
df['COHORT_DEFINITION'] = df['COHORT_DEFINITION'].map({
    "Parkinson's Disease": 1,
    "Healthy Control": 0
})

# Define features (X) and target (y)
X = df.drop(columns=['PATNO', 'COHORT_DEFINITION'])  # Exclude PATNO and target
y = df['COHORT_DEFINITION']

# Define k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define models
models = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
}

# Evaluate each model using k-fold cross-validation
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    accuracies = cross_val_score(model, X, y, cv=kfold, scoring=make_scorer(accuracy_score))
    print(f"Accuracies for each fold: {accuracies}")
    print(f"Mean Accuracy: {accuracies.mean():.2f}")


Evaluating Gradient Boosting...
Accuracies for each fold: [0.97542998 0.96560197 0.97044335 0.98522167 0.9679803 ]
Mean Accuracy: 0.97

Evaluating SVM...
Accuracies for each fold: [0.97788698 0.95577396 0.97044335 0.98029557 0.9729064 ]
Mean Accuracy: 0.97

Evaluating Logistic Regression...
Accuracies for each fold: [0.97788698 0.96314496 0.9679803  0.98768473 0.97536946]
Mean Accuracy: 0.97


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reshape the data for LSTM (samples, timesteps, features)
X_lstm = X_scaled.reshape(X_scaled.shape[0], 1, X_scaled.shape[1])

# Define k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize variables to store accuracy for each fold
fold_accuracies = []

# Perform k-fold cross-validation
for train_idx, test_idx in kfold.split(X_lstm, y):
    # Split the data
    X_train, X_test = X_lstm[train_idx], X_lstm[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Build the LSTM model
    lstm_model = Sequential()
    lstm_model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    lstm_model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Evaluate the model
    _, accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
    fold_accuracies.append(accuracy)
    print(f"Fold Accuracy: {accuracy:.2f}")

# Print overall mean accuracy
print(f"\nMean Accuracy for LSTM: {np.mean(fold_accuracies):.2f}")

  super().__init__(**kwargs)


Fold Accuracy: 0.97
Fold Accuracy: 0.95
Fold Accuracy: 0.96
Fold Accuracy: 0.97
Fold Accuracy: 0.96

Mean Accuracy for LSTM: 0.96
