In [1]:
import pandas                  as pd
from   sklearn.preprocessing   import StandardScaler
from   sklearn.model_selection import train_test_split
from   sklearn.svm             import SVC
from   sklearn.metrics         import accuracy_score

In [2]:
df = pd.read_csv("parkinson_disease.csv")
df.shape

(195, 24)

In [3]:
# Voir toutes les colonnes
pd.set_option('display.max_columns', 25)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [5]:
df.isnull().sum()

Il n'y a pas de valeur manquante.  
Que des nombres sauf name.  
Name n'a pas d'intérêt pour ce projet.  
Pas de valeur catégorielle.

In [7]:
df.drop(columns='name', inplace=True)

In [8]:
# Diviser en feature et label
y = df['status']
X = df.drop(columns='status')

In [9]:
# Quelles sont les valeur des colonnes de fetures
X['NHR'].min(), X['NHR'].max()
X['MDVP:Flo(Hz)'].min(), X['MDVP:Flo(Hz)'].max()

(65.476, 239.17)

Aucune variables de features n'est normalisées.  
Il est impossible d'alimenter un algo de ML.

Normaliser avec une échelle strandard est une option.

In [11]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

Après la normalisation de df devient un tableau NumPy.  
Le type array NumPy n'est pas un souci pour alimenter le modèle de ML.

Je vais tout de même le convertir en dataframe.

In [13]:
X_columns = df.drop(columns='status').columns
pd.DataFrame(X, columns=X_columns)

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,-0.829300,-0.436165,-0.952037,0.334914,0.749759,0.132963,0.760800,0.131755,0.745985,0.739536,0.607859,1.119147,0.332985,0.607532,-0.067893,-0.193225,-0.807838,1.760814,0.801323,0.480477,-0.210531,0.868886
1,-0.770972,-0.530974,-0.057721,0.715418,1.037674,0.453892,1.276809,0.452684,1.681731,1.768464,1.547912,2.276504,1.159454,1.548254,-0.137843,-0.634508,-0.387524,1.837562,1.479853,1.311185,0.275077,1.803605
2,-0.909476,-0.723168,-0.109875,0.884991,1.325589,0.720770,1.585687,0.721813,1.202693,1.027636,1.175643,1.726176,0.699187,1.175323,-0.291633,-0.279760,-0.662075,1.942048,1.141445,1.017682,-0.103629,1.402661
3,-0.909622,-0.649092,-0.114229,0.775389,1.325589,0.578885,1.284076,0.577677,1.340396,1.207698,1.340547,1.848749,0.806859,1.340229,-0.280719,-0.281346,-0.613134,1.832380,1.440945,1.293840,0.062145,1.806954
4,-0.925657,-0.606245,-0.130608,1.368893,1.901418,1.095750,2.047187,1.096793,1.836448,1.552389,1.899444,2.532491,1.216839,1.899461,-0.178026,-0.506745,-0.783021,1.909364,1.780940,0.096195,-0.130026,2.267082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.483467,0.371185,-0.508265,-0.337173,-0.401899,-0.228505,-0.311189,-0.227459,0.593395,0.631498,0.759926,0.592165,0.199282,0.759930,0.069278,-0.536647,-0.483208,-1.090704,-0.785527,-1.256837,0.721944,-0.817703
191,1.339202,0.612690,-0.618218,-0.120037,-0.401899,0.001213,-0.191272,0.002258,-0.116922,-0.099041,0.037113,-0.109086,-0.313046,0.037108,-0.167360,-0.620463,-0.644916,-0.631503,-0.469859,-1.168475,1.054135,-0.418929
192,0.495578,0.470104,-0.968393,1.526058,1.037674,0.991026,0.797139,0.992069,-0.352453,-0.135053,-0.294670,-0.352565,-0.438466,-0.294679,2.041513,-0.906799,-0.877441,-1.130853,-1.014154,-0.818079,0.780338,-0.832410
193,1.078761,2.190044,-0.954180,0.243924,-0.113985,0.132963,0.164847,0.131755,-0.358834,-0.212223,-0.297633,-0.389254,-0.485202,-0.297970,1.175327,-0.649233,-0.456374,-1.343323,-0.974960,-0.229066,-0.637003,-0.926105


In [14]:
# diviser les features et label en ensembles d’entraînement et en ensembles de test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=34)
x_train.shape, x_test.shape

((156, 22), (39, 22))

In [15]:
y_train.shape, y_test.shape

((156,), (39,))

In [16]:
# 
svc = SVC()
svc.fit(x_train, y_train)
svc_prediction = svc.predict(x_test)

svc_score = accuracy_score(y_test, svc_prediction)
svc_score

0.8974358974358975