In [1]:
# Importing necessary libraries for data cleaning
import os

import pandas as pd
from scipy.stats import kurtosis, skew

# Loading the dataset.
# The CSV location can be overridden with the HEART_DISEASE_CSV environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "HEART_DISEASE_CSV",
    "/home/hassan/Desktop/HeartDiseaseAnalysis/data/HeartDisease.csv",
)
data = pd.read_csv(DATA_PATH)

In [2]:
# 1. Basic descriptive statistics across every column (numeric and non-numeric).
# A single print call with a newline separator emits the heading followed by
# the summary table.
print("Statistiques descriptives globales :", data.describe(include='all'), sep="\n")

Statistiques descriptives globales :
                age   sex chest_pain_type  resting_blood_pressure  \
count   1025.000000  1025            1025             1025.000000   
unique          NaN     2               4                     NaN   
top             NaN  Male  Typical angina                     NaN   
freq            NaN   713             497                     NaN   
mean      54.434146   NaN             NaN              131.611707   
std        9.072290   NaN             NaN               17.516718   
min       29.000000   NaN             NaN               94.000000   
25%       48.000000   NaN             NaN              120.000000   
50%       56.000000   NaN             NaN              130.000000   
75%       61.000000   NaN             NaN              140.000000   
max       77.000000   NaN             NaN              200.000000   

        cholestoral   fasting_blood_sugar               rest_ecg  \
count    1025.00000                  1025                   1025  

In [3]:
# 2. Descriptive statistics for the numeric columns only.
# describe() with no arguments keeps just the numeric columns; the result is
# transposed so each column of `data` becomes one row of the summary.
numerical_summary = data.describe().transpose()
print(
    "\nStatistiques descriptives pour les colonnes numériques :",
    numerical_summary,
    sep="\n",
)


Statistiques descriptives pour les colonnes numériques :
                         count        mean        std    min    25%    50%  \
age                     1025.0   54.434146   9.072290   29.0   48.0   56.0   
resting_blood_pressure  1025.0  131.611707  17.516718   94.0  120.0  130.0   
cholestoral             1025.0  246.000000  51.592510  126.0  211.0  240.0   
Max_heart_rate          1025.0  149.114146  23.005724   71.0  132.0  152.0   
oldpeak                 1025.0    1.071512   1.175053    0.0    0.0    0.8   
target                  1025.0    0.513171   0.500070    0.0    0.0    1.0   

                          75%    max  
age                      61.0   77.0  
resting_blood_pressure  140.0  200.0  
cholestoral             275.0  564.0  
Max_heart_rate          166.0  202.0  
oldpeak                   1.8    6.2  
target                    1.0    1.0  


In [4]:
# 3. Descriptive statistics for the categorical (object-dtype) columns only.
# Transposed so each categorical column is one row with its
# count / unique / top / freq values.
categorical_summary = data.describe(include=['object']).transpose()
print(
    "\nStatistiques descriptives pour les colonnes catégorielles :",
    categorical_summary,
    sep="\n",
)


Statistiques descriptives pour les colonnes catégorielles :
                              count unique                    top freq
sex                            1025      2                   Male  713
chest_pain_type                1025      4         Typical angina  497
fasting_blood_sugar            1025      2   Lower than 120 mg/ml  872
rest_ecg                       1025      3  ST-T wave abnormality  513
exercise_induced_angina        1025      2                     No  680
slope                          1025      3                   Flat  482
vessels_colored_by_flourosopy  1025      5                   Zero  578
thalassemia                    1025      4           Fixed Defect  544


In [5]:
# 4. Distinct values and their frequencies for every categorical column.
print("\nValeurs uniques et leur fréquence pour chaque colonne catégorielle :")
# Walk the object-dtype columns as (name, Series) pairs and print each
# column's name followed by its value counts.
for column_name, column_values in data.select_dtypes(include=['object']).items():
    print(f"\n{column_name}:", column_values.value_counts(), sep="\n")


Valeurs uniques et leur fréquence pour chaque colonne catégorielle :

sex:
sex
Male      713
Female    312
Name: count, dtype: int64

chest_pain_type:
chest_pain_type
Typical angina      497
Non-anginal pain    284
Atypical angina     167
Asymptomatic         77
Name: count, dtype: int64

fasting_blood_sugar:
fasting_blood_sugar
Lower than 120 mg/ml      872
Greater than 120 mg/ml    153
Name: count, dtype: int64

rest_ecg:
rest_ecg
ST-T wave abnormality           513
Normal                          497
Left ventricular hypertrophy     15
Name: count, dtype: int64

exercise_induced_angina:
exercise_induced_angina
No     680
Yes    345
Name: count, dtype: int64

slope:
slope
Flat           482
Downsloping    469
Upsloping       74
Name: count, dtype: int64

vessels_colored_by_flourosopy:
vessels_colored_by_flourosopy
Zero     578
One      226
Two      134
Three     69
Four      18
Name: count, dtype: int64

thalassemia:
thalassemia
Fixed Defect         544
Reversable Defect    410
Norm

In [6]:
# 5. Skewness (asymmetry) and kurtosis (tailedness) of the numeric columns.
# `skew` and `kurtosis` are the scipy.stats functions. With its default
# settings scipy's kurtosis uses Fisher's definition, i.e. excess kurtosis
# where a normal distribution scores 0.
print("\nAsymétrie (skewness) et aplatissement (kurtosis) pour les colonnes numériques :")
# 'number' selects every numeric dtype instead of only float64/int64, so no
# numeric column is silently skipped.
for col in data.select_dtypes(include='number').columns:
    # Drop missing values once and reuse the result for both statistics.
    values = data[col].dropna()
    print(f"{col} - Skewness: {skew(values):.2f}, Kurtosis: {kurtosis(values):.2f}")



Asymétrie (skewness) et aplatissement (kurtosis) pour les colonnes numériques :
age - Skewness: -0.25, Kurtosis: -0.53
resting_blood_pressure - Skewness: 0.74, Kurtosis: 0.98
cholestoral - Skewness: 1.07, Kurtosis: 3.97
Max_heart_rate - Skewness: -0.51, Kurtosis: -0.09
oldpeak - Skewness: 1.21, Kurtosis: 1.30
target - Skewness: -0.05, Kurtosis: -2.00


In [7]:
# 6. Missing-value check.
# Count the null entries per column, then report only the columns that
# actually contain at least one missing value.
missing_values = data.isna().sum()
print(
    "\nValeurs manquantes dans chaque colonne :",
    missing_values[missing_values > 0],
    sep="\n",
)


Valeurs manquantes dans chaque colonne :
Series([], dtype: int64)


In [9]:
# Collect the descriptive statistics into one dictionary keyed by summary name.
# The global summary is recomputed here; the other entries reuse the frames
# and the Series built in the earlier cells.
descriptive_summary = dict(
    global_summary=data.describe(include='all'),
    numerical_summary=numerical_summary,
    categorical_summary=categorical_summary,
    missing_values=missing_values[missing_values > 0],
)

In [10]:
# Bare last expression: the notebook shows the dictionary's repr as the cell output.
descriptive_summary

{'global_summary':                 age   sex chest_pain_type  resting_blood_pressure  \
 count   1025.000000  1025            1025             1025.000000   
 unique          NaN     2               4                     NaN   
 top             NaN  Male  Typical angina                     NaN   
 freq            NaN   713             497                     NaN   
 mean      54.434146   NaN             NaN              131.611707   
 std        9.072290   NaN             NaN               17.516718   
 min       29.000000   NaN             NaN               94.000000   
 25%       48.000000   NaN             NaN              120.000000   
 50%       56.000000   NaN             NaN              130.000000   
 75%       61.000000   NaN             NaN              140.000000   
 max       77.000000   NaN             NaN              200.000000   
 
         cholestoral   fasting_blood_sugar               rest_ecg  \
 count    1025.00000                  1025                   1025   
 u