# Importation des bibliothèques nécessaires

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Chargement du dataset

In [2]:
data = pd.read_csv("/home/hassan/Desktop/HeartDiseaseAnalysis/data/HeartDisease.csv")

# 1. Génération des nouvelles caractéristiques

In [3]:
# Groupes d'âge
age_bins = [0, 29, 39, 49, 59, 69, 79, 89]
age_labels = ['<30', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']
data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)

In [4]:
# Catégories de cholestérol
cholesterol_bins = [0, 200, 240, 600]
cholesterol_labels = ['Desirable', 'Borderline High', 'High']
data['cholesterol_category'] = pd.cut(data['cholestoral'], bins=cholesterol_bins, labels=cholesterol_labels)

In [5]:
# Hypertension binaire
data['hypertension'] = data['resting_blood_pressure'].apply(lambda x: 1 if x > 130 else 0)

In [6]:
# Caractéristique "Senior"
data['senior'] = data['age'].apply(lambda x: 1 if x >= 60 else 0)

In [7]:
# Niveau de fréquence cardiaque
heart_rate_bins = [0, 100, 140, 180, 250]
heart_rate_labels = ['Low', 'Normal', 'High', 'Very High']
data['heart_rate_level'] = pd.cut(data['Max_heart_rate'], bins=heart_rate_bins, labels=heart_rate_labels)

# 2. Préparation des données

In [8]:
# Suppression de la colonne cible si elle n'est pas binaire
target_column = 'target'  # Assurez-vous que la colonne cible est binaire
X = data.drop(columns=[target_column])
y = data[target_column]

In [9]:
# Séparation des données d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Prétraitement des données avec encodage et normalisation

In [10]:
# Sélection des colonnes numériques et catégorielles
numeric_features = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak']
categorical_features = ['age_group', 'cholesterol_category', 'sex', 'chest_pain_type', 
                        'fasting_blood_sugar', 'rest_ecg', 'exercise_induced_angina', 
                        'slope', 'vessels_colored_by_flourosopy', 'thalassemia', 'heart_rate_level']

In [11]:
# Pipeline de transformation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

# 4.1 Modélisation avec la régression logistique

In [12]:
# Pipeline pour le prétraitement et le modèle
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000))])

In [13]:
# Entraînement du modèle
model.fit(X_train, y_train)

In [14]:
# Prédictions sur l'ensemble de test
y_pred = model.predict(X_test)

In [15]:
# Affichage des métriques
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nMatrice de Confusion:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8195121951219512

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       102
           1       0.79      0.87      0.83       103

    accuracy                           0.82       205
   macro avg       0.82      0.82      0.82       205
weighted avg       0.82      0.82      0.82       205


Matrice de Confusion:
[[78 24]
 [13 90]]


In [38]:
# Sauvegarde du modèle
joblib.dump(model, '/home/hassan/Desktop/HeartDiseaseAnalysis/models/log_reg_model.joblib')
print("Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/log_reg_model.joblib'")

Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/log_reg_model.joblib'


# 4.2 Modélisation avec la forêt aléatoire

In [18]:
# Pipeline pour le prétraitement et le modèle
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])


In [19]:
# Entraînement du modèle
model_rf.fit(X_train, y_train)

In [20]:
# Prédictions sur l'ensemble de test
y_pred = model_rf.predict(X_test)

In [21]:
# Affichage des métriques
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nMatrice de Confusion:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9853658536585366

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205


Matrice de Confusion:
[[102   0]
 [  3 100]]


In [22]:
# Extraire et afficher l'importance des caractéristiques
feature_importances = model_rf.named_steps['classifier'].feature_importances_
encoded_feature_names = model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features)
feature_names = list(numeric_features) + list(encoded_feature_names)
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\nImportance des caractéristiques :")
print(feature_importance_df)


Importance des caractéristiques :
                                     Feature  Importance
28             thalassemia_Reversable Defect    0.110866
4                                    oldpeak    0.103863
3                             Max_heart_rate    0.098748
25        vessels_colored_by_flourosopy_Zero    0.092213
15            chest_pain_type_Typical angina    0.086995
0                                        age    0.069492
2                                cholestoral    0.064600
1                     resting_blood_pressure    0.056681
20                                slope_Flat    0.037576
19               exercise_induced_angina_Yes    0.032445
12                                  sex_Male    0.031128
14          chest_pain_type_Non-anginal pain    0.025165
22         vessels_colored_by_flourosopy_One    0.019806
24         vessels_colored_by_flourosopy_Two    0.019681
30                   heart_rate_level_Normal    0.018631
18            rest_ecg_ST-T wave abnormality    0.014

In [41]:
# Sauvegarde du modèle
joblib.dump(model_rf, '/home/hassan/Desktop/HeartDiseaseAnalysis/models/random_forest_model.joblib')
print("Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/random_forest_model.joblib'")

Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/random_forest_model.joblib'


# 4.3 Modélisation avec le SVM

In [24]:
# Pipeline pour le prétraitement et le modèle SVM
model_svm = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42))])


In [25]:
# Entraînement du modèle SVM
model_svm.fit(X_train, y_train)

In [26]:
# Prédictions sur l'ensemble de test
y_pred = model.predict(X_test)

In [27]:
# Affichage des métriques
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nMatrice de Confusion:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8195121951219512

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       102
           1       0.79      0.87      0.83       103

    accuracy                           0.82       205
   macro avg       0.82      0.82      0.82       205
weighted avg       0.82      0.82      0.82       205


Matrice de Confusion:
[[78 24]
 [13 90]]


In [42]:
# Sauvegarde du modèle
joblib.dump(model_svm, '/home/hassan/Desktop/HeartDiseaseAnalysis/models/svm_model.joblib')
print("Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/svm_model.joblib'")

Modèle sauvegardé sous '/home/hassan/Desktop/HeartDiseaseAnalysis/models/svm_model.joblib'
