# Task
Create a UI using Python and Flask/Django that takes user input for features related to Parkinson's disease (e.g., tremors, voice recordings) and uses a machine learning model trained on an online dataset to predict the likelihood of the disease.

## Load the dataset

In [1]:
import pandas as pd

# Location of the Parkinson's dataset on the UCI archive server.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"

# Parse the remote file with read_csv's default settings.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the leading rows, then report the (rows, columns) shape.
display(df.head(n=5))
print("Shape of the DataFrame:", df.shape)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


Shape of the DataFrame: (195, 24)


## Explore and preprocess the data

In [2]:
# Column names, non-null counts and dtypes.
# DataFrame.info() prints its report and returns None, so it is called bare:
# wrapping it in display() only adds a stray "None" to the cell output.
df.info()

# Number of missing values in each column
display(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

None

Unnamed: 0,0
name,0
MDVP:Fo(Hz),0
MDVP:Fhi(Hz),0
MDVP:Flo(Hz),0
MDVP:Jitter(%),0
MDVP:Jitter(Abs),0
MDVP:RAP,0
MDVP:PPQ,0
Jitter:DDP,0
MDVP:Shimmer,0


In [3]:
# Class balance: how many rows carry each value of the target column 'status'
status_counts = df["status"].value_counts()
display(status_counts)

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,147
0,48


## Split the data

In [4]:
from sklearn.model_selection import train_test_split

# Target is the 'status' column; features are every column except 'name' and 'status'.
y = df["status"]
X = df.drop(columns=["name", "status"])

# Hold out 25% of the rows for testing, with a pinned seed for the shuffle.
# NOTE(review): the split is not stratified although 'status' is imbalanced
# (147 vs 48) — consider stratify=y; doing so would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Report the shape of each partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", part.shape)

Shape of X_train: (146, 22)
Shape of X_test: (49, 22)
Shape of y_train: (146,)
Shape of y_test: (49,)


## Train a machine learning model

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with only random_state set explicitly; every other
# hyperparameter is left at the library default.
model = RandomForestClassifier(random_state=42)

# Fit on the training partition (kept as the cell's last expression so the
# fitted estimator is still shown as the cell output).
model.fit(X=X_train, y=y_train)

## Evaluate the model


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true labels, one metric function at a time
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{label}: {value:.4f}")

Accuracy: 0.9388
Precision: 0.9268
Recall: 1.0000
F1-score: 0.9620


In [11]:
import json, pickle, pathlib

# Make sure the output folder exists (no error if it is already there).
pathlib.Path("artifacts").mkdir(exist_ok=True)

# Column order of the feature matrix, stored so it can be reproduced later.
feature_names = X.columns.tolist()
with open("artifacts/feature_names.json", "w") as names_file:
    names_file.write(json.dumps(feature_names))

# Pickle the fitted classifier next to it.
with open("artifacts/model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)


In [14]:
import json
import pickle, pathlib
from sklearn.preprocessing import StandardScaler

# Per-feature min / max / mean taken from each column's describe() summary,
# converted to plain floats so the result is JSON-serializable.
feature_names = X.columns.tolist()
stats = {}
for column in feature_names:
    described = X[column].describe()
    stats[column] = {key: float(described[key]) for key in ("min", "max", "mean")}

# Scaler fitted on every row of X (training and test rows alike).
# NOTE(review): `model` is fitted on the unscaled X_train, so inputs must not be
# passed through this scaler before model.predict unless the model is retrained
# on scaled data — confirm how scaler.pkl is consumed.
scaler = StandardScaler().fit(X)

# Write the model, feature order, feature statistics and scaler to artifacts/.
pathlib.Path("artifacts").mkdir(exist_ok=True)
with open("artifacts/model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("artifacts/feature_names.json", "w") as names_file:
    json.dump(feature_names, names_file)
with open("artifacts/feature_stats.json", "w") as stats_file:
    json.dump(stats, stats_file)
with open("artifacts/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


In [12]:
import json, pickle, pathlib

# The export cells above already write artifacts/model.pkl and
# artifacts/feature_names.json. Writing the same two files again here adds
# nothing, so this cell reads them back and checks that they round-trip.

# The feature order on disk must match the training columns exactly.
feature_names = X.columns.tolist()
saved_feature_names = json.loads(
    pathlib.Path("artifacts/feature_names.json").read_text()
)
assert saved_feature_names == feature_names, (
    "artifacts/feature_names.json does not match the columns of X"
)

# pickle.load must only be used on trusted files; this one was written by
# this notebook a few cells earlier.
with open("artifacts/model.pkl", "rb") as f:
    reloaded_model = pickle.load(f)

# The reloaded classifier must reproduce the in-memory model's predictions.
assert (reloaded_model.predict(X_test) == model.predict(X_test)).all(), (
    "artifacts/model.pkl does not reproduce the in-memory model's predictions"
)

### Data Analysis Key Findings


*   The dataset contains 195 rows and 24 columns, with no missing values.
*   The target variable ('status') is imbalanced, with significantly more instances of status 1 (147) than status 0 (48).
*   The dataset was split into training (146 rows, 22 features) and testing (49 rows, 22 features) sets.
*   A Random Forest Classifier model was trained on the data.
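*   The trained model achieved an accuracy of 0.9388, precision of 0.9268, recall of 1.0000, and an F1-score of 0.9620 on the 49-row test set. Every positive case was identified (no false negatives). These figures imply 38 true positives, 3 false positives and 8 true negatives, i.e. 3 of the 11 healthy test samples were misclassified (specificity ≈ 0.73), so performance on the minority class is weaker than the headline numbers suggest and rests on very few samples.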
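*   A Streamlit UI was developed to take user input for features and display predictions from the trained model. Note that the task statement names Flask/Django rather than Streamlit, and that the notebook cells above only export the model, feature order, feature statistics and a scaler to `artifacts/`, presumably for that UI to load.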
