In [1]:
# 📌 Step 1: Import Required Libraries

# Data manipulation
import pandas as pd
import numpy as np

# Plotly for interactive visualization
import plotly.express as px
import plotly.graph_objects as go

# Preprocessing
from sklearn.preprocessing import StandardScaler


In [2]:
# 📌 Step 2: Load the Parkinson’s Dataset

# Read the raw CSV and normalise every header in a single pass:
# trim surrounding whitespace, lowercase, and swap spaces for underscores.
df = pd.read_csv("parkinsons.csv").rename(
    columns=lambda label: label.strip().lower().replace(" ", "_")
)

# Show the first rows as a quick sanity check
df.head()


Unnamed: 0,name,mdvp:fo(hz),mdvp:fhi(hz),mdvp:flo(hz),mdvp:jitter(%),mdvp:jitter(abs),mdvp:rap,mdvp:ppq,jitter:ddp,mdvp:shimmer,...,shimmer:dda,nhr,hnr,status,rpde,dfa,spread1,spread2,d2,ppe
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [3]:
# 📌 Step 3: Basic Inspection – Check for Nulls and Target

# Overview of data types and non-null counts (info() prints its report itself)
df.info()

# Count missing values per column.
# A notebook cell only renders its *last* expression automatically, so this
# intermediate result has to be printed explicitly or it is silently discarded.
print(df.isnull().sum())

# Check value counts of target column 'status' (0 = healthy, 1 = Parkinson’s).
# Left as the final expression so the notebook renders it as the cell output.
df['status'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   mdvp:fo(hz)       195 non-null    float64
 2   mdvp:fhi(hz)      195 non-null    float64
 3   mdvp:flo(hz)      195 non-null    float64
 4   mdvp:jitter(%)    195 non-null    float64
 5   mdvp:jitter(abs)  195 non-null    float64
 6   mdvp:rap          195 non-null    float64
 7   mdvp:ppq          195 non-null    float64
 8   jitter:ddp        195 non-null    float64
 9   mdvp:shimmer      195 non-null    float64
 10  mdvp:shimmer(db)  195 non-null    float64
 11  shimmer:apq3      195 non-null    float64
 12  shimmer:apq5      195 non-null    float64
 13  mdvp:apq          195 non-null    float64
 14  shimmer:dda       195 non-null    float64
 15  nhr               195 non-null    float64
 16  hnr               195 non-null    float64
 1

status
1    147
0     48
Name: count, dtype: int64

In [4]:
# 📌 Step 4: Drop Identifier Column (e.g., 'name')

# 'name' is an identifier, not a feature, so it is removed before modeling.
# errors='ignore' turns this into a no-op when the column is already gone,
# which keeps the cell safe to re-run; rebinding `df` (instead of
# inplace=True) avoids mutating the frame that earlier cells displayed.
df = df.drop(columns=['name'], errors='ignore')


In [5]:
# 📌 Step 5: Feature Scaling and Preprocessing

# Separate the target from the predictor columns
y = df["status"]
X = df.drop(columns="status")

# Standardise every feature column with StandardScaler
# NOTE(review): the scaler is fitted on all rows; if this data is later split
# into train/test sets, confirm that fitting before the split is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled array in a labelled DataFrame for EDA and re-attach the
# target; the target's index is reset so it lines up with the new frame's
# default index.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns).assign(
    status=y.reset_index(drop=True)
)

# Preview the processed data
df_scaled.head()


Unnamed: 0,mdvp:fo(hz),mdvp:fhi(hz),mdvp:flo(hz),mdvp:jitter(%),mdvp:jitter(abs),mdvp:rap,mdvp:ppq,jitter:ddp,mdvp:shimmer,mdvp:shimmer(db),...,shimmer:dda,nhr,hnr,rpde,dfa,spread1,spread2,d2,ppe,status
0,-0.8293,-0.436165,-0.952037,0.334914,0.749759,0.132963,0.7608,0.131755,0.745985,0.739536,...,0.607532,-0.067893,-0.193225,-0.807838,1.760814,0.801323,0.480477,-0.210531,0.868886,1
1,-0.770972,-0.530974,-0.057721,0.715418,1.037674,0.453892,1.276809,0.452684,1.681731,1.768464,...,1.548254,-0.137843,-0.634508,-0.387524,1.837562,1.479853,1.311185,0.275077,1.803605,1
2,-0.909476,-0.723168,-0.109875,0.884991,1.325589,0.72077,1.585687,0.721813,1.202693,1.027636,...,1.175323,-0.291633,-0.27976,-0.662075,1.942048,1.141445,1.017682,-0.103629,1.402661,1
3,-0.909622,-0.649092,-0.114229,0.775389,1.325589,0.578885,1.284076,0.577677,1.340396,1.207698,...,1.340229,-0.280719,-0.281346,-0.613134,1.83238,1.440945,1.29384,0.062145,1.806954,1
4,-0.925657,-0.606245,-0.130608,1.368893,1.901418,1.09575,2.047187,1.096793,1.836448,1.552389,...,1.899461,-0.178026,-0.506745,-0.783021,1.909364,1.78094,0.096195,-0.130026,2.267082,1


In [6]:
# 📌 Step 6: Save Cleaned Dataset to CSV

# Persist the scaled features plus target; the row index is not written out
df_scaled.to_csv(path_or_buf="parkinsons_cleaned.csv", index=False)

# Confirmation message so the reader can see the export ran
print("✅ Cleaned dataset saved as 'parkinsons_cleaned.csv'")


✅ Cleaned dataset saved as 'parkinsons_cleaned.csv'


In [7]:
# 📊 Step 7: Univariate Analysis – Target Class Distribution

# Print the class counts first as a quick check on the 'status' column
status_counts = df_scaled['status'].value_counts()
print(status_counts)

# Histogram of the target, one coloured bar group per status value
fig = px.histogram(
    df_scaled,
    x='status',
    color='status',
    barmode='group',
    template='plotly_white',
    title="Parkinson’s Disease vs Healthy",
    labels={'status': 'Status (0 = Healthy, 1 = Parkinson’s)'},
)

# Axis titles are set explicitly after the figure is built
fig.update_xaxes(title_text="Status")
fig.update_yaxes(title_text="Count")
fig.show()


status
1    147
0     48
Name: count, dtype: int64


In [9]:
# 📊 Step 8: Bivariate Analysis – Example Feature vs Status

# ✅ Confirm the column exists
print(df_scaled.columns.tolist())  # Debug: see all columns

# 📊 Plot Box Plot for "mdvp:fo(hz)" across Parkinson’s status
# df_scaled holds the StandardScaler output from Step 5, so the plotted values
# are standardized scores rather than Hertz; the y-axis title says so to avoid
# readers interpreting the numbers as raw frequencies.
if 'mdvp:fo(hz)' in df_scaled.columns:
    fig = px.box(
        df_scaled, 
        x='status', 
        y='mdvp:fo(hz)', 
        points="all",  # overlay every observation next to its box
        title="Fundamental Frequency vs Parkinson’s Status",
        color='status',
        template="plotly_white"
    )
    fig.update_layout(
        xaxis_title="Status (0 = Healthy, 1 = Parkinson’s)", 
        yaxis_title="Fo (standardized z-score)",
        boxmode='group'
    )
    fig.show()
else:
    print("⚠️ Column 'mdvp:fo(hz)' not found in dataset.")


['mdvp:fo(hz)', 'mdvp:fhi(hz)', 'mdvp:flo(hz)', 'mdvp:jitter(%)', 'mdvp:jitter(abs)', 'mdvp:rap', 'mdvp:ppq', 'jitter:ddp', 'mdvp:shimmer', 'mdvp:shimmer(db)', 'shimmer:apq3', 'shimmer:apq5', 'mdvp:apq', 'shimmer:dda', 'nhr', 'hnr', 'rpde', 'dfa', 'spread1', 'spread2', 'd2', 'ppe', 'status']


In [10]:
# 📊 Step 9: Feature Correlation with Target Variable (Parkinson’s Disease)

if 'status' not in df_scaled.columns:
    # Without the target column there is nothing to correlate against
    print("⚠️ 'status' column not found in dataset.")
else:
    # Take the 'status' column of the correlation matrix, remove the target's
    # correlation with itself, and order from highest to lowest.
    corr = (
        df_scaled.corr(numeric_only=True)
        .loc[:, 'status']
        .drop(labels='status')
        .sort_values(ascending=False)
    )

    # Horizontal bar chart: one bar per feature, length = correlation with status
    fig = px.bar(
        x=corr.values,
        y=corr.index,
        orientation='h',
        template="plotly_white",
        title="Feature Correlation with Parkinson’s Disease",
        labels={'x': 'Correlation', 'y': 'Feature'},
    )
    fig.update_yaxes(categoryorder='total ascending')
    fig.show()


In [11]:
# 📊 Step 10: Multivariate Analysis – Correlation Heatmap (Parkinson’s Dataset)

# Compute the correlation matrix once and take the axis labels from the matrix
# itself, so the labels always match the rows/columns of `z` (labels taken from
# df_scaled.columns would fall out of sync if a non-numeric column were present).
# numeric_only=True mirrors the call used in Step 9.
corr_matrix = df_scaled.corr(numeric_only=True)

# Plot a full heatmap of feature correlations.
# Correlations live in [-1, 1]; pinning zmin/zmax to that range with a
# diverging colour scale puts 0 at the midpoint so the sign of each
# correlation is readable from the colour.
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,
    zmax=1
))
fig.update_layout(title="Correlation Heatmap of Parkinson’s Features")
fig.show()
