In [25]:
#importer les packages necessaire
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
#importer les packages de machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, classification_report, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [27]:
#Chargement du DF
df = pd.read_excel('Copie de Coeur.xlsx')

In [28]:
#Travailler avec une copie du dataset pour éviter de le rechager en cas d'êrreur
df = df.copy()

In [29]:
#Afficher les informations relatives au DF
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AGE          918 non-null    int64  
 1   SEXE         918 non-null    object 
 2   TDT          918 non-null    object 
 3   PAR          918 non-null    int64  
 4   CHOLESTEROL  918 non-null    int64  
 5   GAJ          918 non-null    int64  
 6   ECG          918 non-null    object 
 7   FCMAX        918 non-null    int64  
 8   ANGINE       918 non-null    object 
 9   DEPRESSION   918 non-null    float64
 10  PENTE        918 non-null    object 
 11  CŒUR         918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [30]:
#Standardisation des variables quantitatives avec la fonction std()
for col in df.drop('CŒUR', axis =1).select_dtypes(np.number).columns:
    df[col] = df[col]-df[col].mean()/df[col].std()


In [31]:
#Vérification des résultats
df.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,34.327036,homme,AA,132.848903,287.182556,-0.551041,Normal,166.626568,Non,-0.831979,Ascendant,0
1,43.327036,femme,DNA,152.848903,178.182556,-0.551041,Normal,150.626568,Non,0.168021,Plat,1
2,31.327036,homme,AA,122.848903,281.182556,-0.551041,ST,92.626568,Non,-0.831979,Ascendant,0
3,42.327036,femme,ASY,130.848903,212.182556,-0.551041,Normal,102.626568,Oui,0.668021,Plat,1
4,48.327036,homme,DNA,142.848903,193.182556,-0.551041,Normal,116.626568,Non,-0.831979,Ascendant,0


In [32]:
#Encodage des variables qualitatives
for col in df.select_dtypes('object').columns:
    df[col] = df[col].astype('category').cat.codes

In [33]:
#Vérification des résultats
df.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,34.327036,1,0,132.848903,287.182556,-0.551041,1,166.626568,0,-0.831979,0,0
1,43.327036,0,3,152.848903,178.182556,-0.551041,1,150.626568,0,0.168021,2,1
2,31.327036,1,0,122.848903,281.182556,-0.551041,2,92.626568,0,-0.831979,0,0
3,42.327036,0,1,130.848903,212.182556,-0.551041,1,102.626568,1,0.668021,2,1
4,48.327036,1,3,142.848903,193.182556,-0.551041,1,116.626568,0,-0.831979,0,0


In [34]:
#Séparer la variable cible (coeur) et les variables explicatives
y = df['CŒUR']
x = df.drop('CŒUR', axis = 1)

In [35]:
#preparation des donnees de test et des donnees d'entrainememnt
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state =1)

In [36]:
#Vérifier le résultat
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(734, 11) (184, 11) (734,) (184,)


In [37]:
#Création d'un objet lr de la classe LogisticRegression
lr = LogisticRegression(solver ='newton-cg', random_state =1)

In [38]:
#Apprentissage du modèle
model = lr.fit(x_train, y_train)

In [39]:
#Probabilité d'appartenance à l'une des classes
predict_proba = model.predict_proba(x_test)
predict_proba[:5,:]

array([[0.34770612, 0.65229388],
       [0.17508424, 0.82491576],
       [0.03708029, 0.96291971],
       [0.08135272, 0.91864728],
       [0.18111693, 0.81888307]])

In [40]:
#Application du modèle au données de test
y_pred = model.predict(x_test)
y_pred[:5]

array([1, 1, 1, 1, 1])

In [41]:
#matrice de confusion
mc = confusion_matrix(y_test, y_pred)
mc

array([[ 66,   8],
       [  8, 102]])

In [42]:
#Taux de bonnes prédictions
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9130434782608695

In [43]:
#Sensibilité
recall = recall_score(y_test, y_pred)
recall

0.9272727272727272

In [44]:
#Précision
precision = precision_score(y_test, y_pred)
precision

0.9272727272727272

In [45]:
#plus de détails avec classification report
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        74
           1       0.93      0.93      0.93       110

    accuracy                           0.91       184
   macro avg       0.91      0.91      0.91       184
weighted avg       0.91      0.91      0.91       184



In [46]:
#Modèle Trival
df['CŒUR'].value_counts()/df.shape[0]

1    0.553377
0    0.446623
Name: CŒUR, dtype: float64

In [47]:
#Score d'entrainement (le taux de bonne prédiction calculé sur la base des données d'entrainement)
model.score(x_train, y_train)

0.8460490463215259

In [48]:
#Score d'entrainement (le taux de bonne prédiction calculé sur la base des données de test)
model.score(x_test, y_test)

0.9130434782608695

# La partie de DecisionTree

In [49]:
df

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,34.327036,1,0,132.848903,287.182556,-0.551041,1,166.626568,0,-0.831979,0,0
1,43.327036,0,3,152.848903,178.182556,-0.551041,1,150.626568,0,0.168021,2,1
2,31.327036,1,0,122.848903,281.182556,-0.551041,2,92.626568,0,-0.831979,0,0
3,42.327036,0,1,130.848903,212.182556,-0.551041,1,102.626568,1,0.668021,2,1
4,48.327036,1,3,142.848903,193.182556,-0.551041,1,116.626568,0,-0.831979,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,39.327036,1,2,102.848903,262.182556,-0.551041,1,126.626568,0,0.368021,2,1
914,62.327036,1,1,136.848903,191.182556,0.448959,1,135.626568,0,2.568021,2,1
915,51.327036,1,1,122.848903,129.182556,-0.551041,1,109.626568,1,0.368021,2,1
916,51.327036,0,0,122.848903,234.182556,-0.551041,0,168.626568,0,-0.831979,2,1


In [50]:
#faire une copie du dataset de base pour avoir un autre sur lequel je pourais appliquer mon arbre de décision 
df_DT = df.copy()

In [51]:
df_DT.head(10)

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,34.327036,1,0,132.848903,287.182556,-0.551041,1,166.626568,0,-0.831979,0,0
1,43.327036,0,3,152.848903,178.182556,-0.551041,1,150.626568,0,0.168021,2,1
2,31.327036,1,0,122.848903,281.182556,-0.551041,2,92.626568,0,-0.831979,0,0
3,42.327036,0,1,130.848903,212.182556,-0.551041,1,102.626568,1,0.668021,2,1
4,48.327036,1,3,142.848903,193.182556,-0.551041,1,116.626568,0,-0.831979,0,0
5,33.327036,1,3,112.848903,337.182556,-0.551041,1,164.626568,0,-0.831979,0,0
6,39.327036,0,0,122.848903,235.182556,-0.551041,1,164.626568,0,-0.831979,0,0
7,48.327036,1,0,102.848903,206.182556,-0.551041,1,136.626568,0,-0.831979,0,0
8,31.327036,1,1,132.848903,205.182556,-0.551041,1,124.626568,1,0.668021,2,1
9,42.327036,0,0,112.848903,282.182556,-0.551041,1,114.626568,0,-0.831979,0,0


In [52]:
#Standardisation des variables quantitatives
for col in df_DT.drop('CŒUR', axis =1).select_dtypes(np.number).columns:
    df_DT[col] = df_DT[col]-df_DT[col].mean()/df_DT[col].max()

In [53]:
#Vérification des résultats
df_DT.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,33.656352,0.21024,-0.434641,132.199454,286.854898,0.157098,0.505447,165.958114,-0.404139,-0.842296,-0.535403,0
1,42.656352,-0.78976,2.565359,152.199454,177.854898,0.157098,0.505447,149.958114,-0.404139,0.157704,1.464597,1
2,30.656352,0.21024,-0.434641,122.199454,280.854898,0.157098,1.505447,91.958114,-0.404139,-0.842296,-0.535403,0
3,41.656352,-0.78976,0.565359,130.199454,211.854898,0.157098,0.505447,101.958114,0.595861,0.657704,1.464597,1
4,47.656352,0.21024,2.565359,142.199454,192.854898,0.157098,0.505447,115.958114,-0.404139,-0.842296,-0.535403,0


In [54]:
#Encodage des variables qualitatives dans une variable le
le = LabelEncoder()

In [55]:
#creer une variable df_encoded qui va stocker notre dataframe encoder
df_encoded=df.iloc[:,0:12]

In [56]:
#recodons par exemple le 'PENTE'
le.fit_transform(df_encoded["PENTE"])


array([0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2,
       0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0,
       2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0,
       0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2,
       2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       2, 0, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0,
       2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0,
       2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2,
       2, 0, 2, 0, 2, 1, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2,

In [57]:
#Dans une fonction application cette modification à tout notre DataFrame
for i in df_encoded:
    df_encoded[i]=le.fit_transform(df_encoded[i])


In [58]:
df_encoded.head()

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,12,1,0,41,147,0,1,98,0,10,0,0
1,21,0,3,55,40,0,1,82,0,20,2,1
2,9,1,0,31,141,0,2,25,0,10,0,0
3,20,0,1,39,72,0,1,34,1,25,2,1
4,26,1,3,49,53,0,1,48,0,10,0,0


In [59]:
# jeu de fonctionnalités
x=df_encoded.iloc[:,0:11]
#l'étiquettement de notre jeu
y=df_encoded.iloc[:,11]

In [60]:
#preparation des donnees de test et des donnees d'entrainememnt
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state =1)

In [61]:
#Vérifier le résultat
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(734, 11) (184, 11) (734,) (184,)


In [62]:
#Création d'un objet model de la classe DecisionTreeClassifier
model_DT=DecisionTreeClassifier(criterion='gini')

In [63]:
#Apprentissage du modèle
model_DT.fit(x_train,y_train)

DecisionTreeClassifier()

In [64]:
#Probabilité d'appartenance à l'une des classes
predict_proba = model_DT.predict_proba(x_test)
predict_proba[:5,:]

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [65]:
#Application du modèle au données de test
y_pred = model_DT.predict(x_test)
y_pred[:5]

array([1, 0, 1, 1, 0])

In [66]:
#matrice de confusion
mc = confusion_matrix(y_test, y_pred)
mc

array([[64, 10],
       [28, 82]])

In [67]:
#Taux de bonnes prédictions
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7934782608695652

In [68]:
#Sensibilité
recall_DT = recall_score(y_test, y_pred)
recall_DT

0.7454545454545455

In [69]:
#Précision
precision_DT = precision_score(y_test, y_pred)
precision_DT

0.8913043478260869

In [70]:
#plus de détails avec classification report
cr_DT = classification_report(y_test, y_pred)
print(cr_DT)

              precision    recall  f1-score   support

           0       0.70      0.86      0.77        74
           1       0.89      0.75      0.81       110

    accuracy                           0.79       184
   macro avg       0.79      0.81      0.79       184
weighted avg       0.81      0.79      0.80       184



In [71]:
#Score d'entrainement (le taux de bonne prédiction calculé sur la base des données d'entrainement)
model_DT.score(x_train, y_train)

1.0

In [72]:
#Score d'entrainement (le taux de bonne prédiction calculé sur la base des données de test)
model_DT.score(x_test, y_test)

0.7934782608695652

# Comparaison des scores entre notre arbre de decision(DT) et notre Regression Logistic

In [73]:
#Comparaison au niveau des  Score d'entrainement (le taux de bonne prédiction calculé sur la base des données d'entrainement)
if model.score(x_train, y_train) > model_DT.score(x_train, y_train):
    print("Sur l'entrainement, le resultat de la Régression Logistique est supérieure à celui de l'Arbre de Décision")
else:
    print("Sur l'entrainement, le resultat de la Régression Logistique est inférieure à celui de l'Arbre de Décision")

Sur l'entrainement, le resultat de la Régression Logistique est inférieure à celui de l'Arbre de Décision


In [74]:
#Comparaison au niveau des Score d'entrainement (le taux de bonne prédiction calculé sur la base des données de test)
if model.score(x_test, y_test) > model_DT.score(x_test, y_test):
    print("Sur l'entrainement,le resultat de la Régression Logistique est supérieure à celui de l'Arbre de Décision")
else:
    print("Sur l'entrainement,le resultat de la Régression Logistique est inférieure à celui de l'Arbre de Décision")

Sur l'entrainement,le resultat de la Régression Logistique est inférieure à celui de l'Arbre de Décision


In [75]:
if model.score(x_train, y_train) < model_DT.score(x_train, y_train) and model.score(x_test, y_test) < model_DT.score(x_test, y_test):
    print("notre arbre de decision produit un bon résultat par raport à celle de la Régression Log")

notre arbre de decision produit un bon résultat par raport à celle de la Régression Log
