# Parkinson's Prediction

This file is all about how to preprocess data with pipelines in `scikit-learn`. I will be using the following modules:

 - `pandas`
 - `sklearn`
 - `matplotlib.pyplot`
 - `numpy`
 - `xgboost`

I will be making visualizations and be potentially be making models that use these preprocessing techniques.

Let the fun begin.

In [1]:
# Importing the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from ucimlrepo import fetch_ucirepo

In [2]:
# Fetching the dataset

parkinsons = fetch_ucirepo(id=174) 
  
# data (as pandas dataframes) 
df = pd.DataFrame(data = parkinsons.data['features'], columns = parkinsons.feature_names)
df['target'] = parkinsons.data['targets']

# Extract features and target
X = df.drop(columns=['target'])
y = df['target'].map({-1: 0})

df

Unnamed: 0,MDVP:Fo,MDVP:Fhi,MDVP:Flo,MDVP:Jitter,MDVP:Jitter.1,MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer.1,...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,target
0,119.992,157.302,74.997,0.00784,0.00784,0.00370,0.00554,0.01109,0.04374,0.04374,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,1
1,122.400,148.650,113.819,0.00968,0.00968,0.00465,0.00696,0.01394,0.06134,0.06134,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674,1
2,116.682,131.111,111.555,0.01050,0.01050,0.00544,0.00781,0.01633,0.05233,0.05233,...,0.08270,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,1
3,116.676,137.871,111.366,0.00997,0.00997,0.00502,0.00698,0.01505,0.05492,0.05492,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,1
4,116.014,141.781,110.655,0.01284,0.01284,0.00655,0.00908,0.01966,0.06425,0.06425,...,0.10470,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,174.188,230.978,94.261,0.00459,0.00459,0.00263,0.00259,0.00790,0.04087,0.04087,...,0.07008,0.02764,19.517,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050,0
191,209.516,253.017,89.488,0.00564,0.00564,0.00331,0.00292,0.00994,0.02751,0.02751,...,0.04812,0.01810,19.147,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895,0
192,174.688,240.005,74.287,0.01360,0.01360,0.00624,0.00564,0.01873,0.02308,0.02308,...,0.03804,0.10715,17.883,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728,0
193,198.764,396.961,74.904,0.00740,0.00740,0.00370,0.00390,0.01109,0.02296,0.02296,...,0.03794,0.07223,19.020,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306,0


In [3]:
# Going to do some EDA

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MDVP:Fo       195 non-null    float64
 1   MDVP:Fhi      195 non-null    float64
 2   MDVP:Flo      195 non-null    float64
 3   MDVP:Jitter   195 non-null    float64
 4   MDVP:Jitter   195 non-null    float64
 5   MDVP:RAP      195 non-null    float64
 6   MDVP:PPQ      195 non-null    float64
 7   Jitter:DDP    195 non-null    float64
 8   MDVP:Shimmer  195 non-null    float64
 9   MDVP:Shimmer  195 non-null    float64
 10  Shimmer:APQ3  195 non-null    float64
 11  Shimmer:APQ5  195 non-null    float64
 12  MDVP:APQ      195 non-null    float64
 13  Shimmer:DDA   195 non-null    float64
 14  NHR           195 non-null    float64
 15  HNR           195 non-null    float64
 16  RPDE          195 non-null    float64
 17  DFA           195 non-null    float64
 18  spread1       195 non-null    

In [4]:
df.shape

(195, 23)

In [5]:
df.describe()

Unnamed: 0,MDVP:Fo,MDVP:Fhi,MDVP:Flo,MDVP:Jitter,MDVP:Jitter.1,MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer.1,...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,target
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,0.00622,0.003306,0.003446,0.00992,0.029709,0.029709,...,0.046993,0.024847,21.885974,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552,0.753846
std,41.390065,91.491548,43.521413,0.004848,0.004848,0.002968,0.002759,0.008903,0.018857,0.018857,...,0.030459,0.040418,4.425764,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119,0.431878
min,88.333,102.145,65.476,0.00168,0.00168,0.00068,0.00092,0.00204,0.00954,0.00954,...,0.01364,0.00065,8.441,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539,0.0
25%,117.572,134.8625,84.291,0.00346,0.00346,0.00166,0.00186,0.004985,0.016505,0.016505,...,0.024735,0.005925,19.198,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451,1.0
50%,148.79,175.829,104.315,0.00494,0.00494,0.0025,0.00269,0.00749,0.02297,0.02297,...,0.03836,0.01166,22.085,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052,1.0
75%,182.769,224.2055,140.0185,0.007365,0.007365,0.003835,0.003955,0.011505,0.037885,0.037885,...,0.060795,0.02564,25.0755,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298,1.0
max,260.105,592.03,239.17,0.03316,0.03316,0.02144,0.01958,0.06433,0.11908,0.11908,...,0.16942,0.31482,33.047,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367,1.0


In [6]:
df.dtypes

MDVP:Fo         float64
MDVP:Fhi        float64
MDVP:Flo        float64
MDVP:Jitter     float64
MDVP:Jitter     float64
MDVP:RAP        float64
MDVP:PPQ        float64
Jitter:DDP      float64
MDVP:Shimmer    float64
MDVP:Shimmer    float64
Shimmer:APQ3    float64
Shimmer:APQ5    float64
MDVP:APQ        float64
Shimmer:DDA     float64
NHR             float64
HNR             float64
RPDE            float64
DFA             float64
spread1         float64
spread2         float64
D2              float64
PPE             float64
target            int64
dtype: object

In [7]:
# Preprocessing the data
X = X.loc[:, ~X.columns.duplicated()]

numeric_features = X.select_dtypes(include=['float64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

X = preprocessor.fit_transform(X)

In [8]:
# Spliting the data into testing and training sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)   

ValueError: Input y contains NaN.

In [18]:
# Printing the shape of the X_test and X_train
print(f'X train Shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

X train Shape: (156, 22)
X test shape: (39, 22)


In [19]:
# Printing shape of y_train and y_test
print(f'y train Shape: {y_train.shape}')
print(f'y test shape: {y_test.shape}')

y train Shape: (156,)
y test shape: (39,)
