In [10]:
# Réexécution du pipeline complet après réinitialisation de l'état
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [11]:
# Définition des chemins des fichiers
file_heart_exams = "DSA-2024_HeartDisease_HeartExams_20240312.tsv"
file_diagnosis = "DSA-2024_HeartDisease_Diagnosis_20240312.csv"
file_general_exams = "DSA-2024_HeartDisease_GeneralExams_20240312.tsv"

In [12]:
# 1. Chargement des fichiers
df_heart_exams = pd.read_csv(file_heart_exams, sep="\t")
df_diagnosis = pd.read_csv(file_diagnosis)
df_general_exams = pd.read_csv(file_general_exams, sep="\t")

In [13]:
%run CATALOGUE.ipynb

In [14]:
decrire_colonnes(df_heart_exams)

Unnamed: 0,Nom de la colonne,Type de données,Valeurs manquantes,Pourcentage de valeurs manquantes,Valeurs uniques
0,patient_id,object,0,0.0,1190
1,measurement,object,0,0.0,6
2,value,float64,0,0.0,172


In [15]:
decrire_colonnes(df_general_exams)

Unnamed: 0,Nom de la colonne,Type de données,Valeurs manquantes,Pourcentage de valeurs manquantes,Valeurs uniques
0,center_id,object,0,0.0,1
1,patient_id,object,0,0.0,1190
2,sex,object,0,0.0,4
3,weight,float64,70,5.88,471
4,height,float64,54,4.54,96
5,resting.bp.s,int64,0,0.0,67
6,cholesterol,int64,0,0.0,222
7,fasting.blood.sugar,int64,0,0.0,2
8,physical_activity,object,0,0.0,4
9,dob,object,0,0.0,50


In [16]:
decrire_colonnes(df_diagnosis)

Unnamed: 0,Nom de la colonne,Type de données,Valeurs manquantes,Pourcentage de valeurs manquantes,Valeurs uniques
0,pid,int64,0,0.0,1190
1,target,object,0,0.0,2


In [17]:
generer_catalogue(df_heart_exams)


===== DESCRIPTION DES COLONNES =====
  Nom de la colonne Type de données  Valeurs manquantes  \
0        patient_id          object                   0   
1       measurement          object                   0   
2             value         float64                   0   

   Pourcentage de valeurs manquantes  Valeurs uniques  
0                               0.00             1190  
1                               0.00                6  
2                               0.00              172  

===== STATISTIQUES DESCRIPTIVES =====
        value
count 7140.00
mean    24.43
std     52.62
min     -2.60
25%      0.00
50%      1.70
75%      4.00
max    202.00


In [20]:
generer_catalogue(df_heart_exams)


===== DESCRIPTION DES COLONNES =====
  Nom de la colonne Type de données  Valeurs manquantes  \
0        patient_id          object                   0   
1       measurement          object                   0   
2             value         float64                   0   

   Pourcentage de valeurs manquantes  Valeurs uniques  
0                               0.00             1190  
1                               0.00                6  
2                               0.00              172  

===== STATISTIQUES DESCRIPTIVES =====
        value
count 7140.00
mean    24.43
std     52.62
min     -2.60
25%      0.00
50%      1.70
75%      4.00
max    202.00


In [21]:
generer_catalogue(df_general_exams)


===== DESCRIPTION DES COLONNES =====
         Nom de la colonne Type de données  Valeurs manquantes  \
0                center_id          object                   0   
1               patient_id          object                   0   
2                      sex          object                   0   
3                   weight         float64                  70   
4                   height         float64                  54   
5             resting.bp.s           int64                   0   
6              cholesterol           int64                   0   
7      fasting.blood.sugar           int64                   0   
8        physical_activity          object                   0   
9                      dob          object                   0   
10  social_security_number           int64                   0   

    Pourcentage de valeurs manquantes  Valeurs uniques  
0                                0.00                1  
1                                0.00             1190 

In [17]:
# 2. Nettoyage et transformation
df_heart_exams = df_heart_exams.dropna()
df_diagnosis = df_diagnosis.dropna()
df_general_exams = df_general_exams.drop(columns=["social_security_number"], errors='ignore')  # Suppression de données sensibles

In [18]:
# Conversion de `dob` en âge
current_year = pd.to_datetime("today").year
df_general_exams["age"] = current_year - pd.to_datetime(df_general_exams["dob"]).dt.year
df_general_exams = df_general_exams.drop(columns=["dob"])

In [20]:
df_general_exams

Unnamed: 0,center_id,patient_id,sex,weight,height,resting.bp.s,cholesterol,fasting.blood.sugar,physical_activity,age
0,HOSPIT4224,PAT00001,M,49.9,174.0,140,289,0,Low,41
1,HOSPIT4224,PAT00002,F,65.3,180.0,160,180,0,Low,50
2,HOSPIT4224,PAT00003,M,65.5,,130,283,0,Intermediate,38
3,HOSPIT4224,PAT00004,F,77.9,160.0,138,214,0,Low,49
4,HOSPIT4224,PAT00005,M,98.3,175.0,150,195,0,Low,55
...,...,...,...,...,...,...,...,...,...,...
1185,HOSPIT4224,PAT01186,M,84.3,185.0,110,264,0,Low,46
1186,HOSPIT4224,PAT01187,M,75.8,171.0,144,193,1,Low,69
1187,HOSPIT4224,PAT01188,M,71.9,185.0,130,131,0,Low,58
1188,HOSPIT4224,PAT01189,F,56.4,189.0,130,236,0,Low,58


In [21]:
# Transformation des colonnes catégoriques
df_general_exams["sex"] = df_general_exams["sex"].map({'M': 1, 'F': 0})
df_general_exams["physical_activity"] = df_general_exams["physical_activity"].map({'Low': 0, 'Intermediate': 1, 'High': 2})


In [22]:
# Renommage des clés pour fusion
df_diagnosis = df_diagnosis.rename(columns={"pid": "patient_id"})
df_diagnosis["patient_id"] = df_diagnosis["patient_id"].astype(str)
df_heart_exams["patient_id"] = df_heart_exams["patient_id"].astype(str)
df_general_exams["patient_id"] = df_general_exams["patient_id"].astype(str)

In [23]:
# 3. Fusion des datasets
df_fusionne = df_general_exams.merge(df_diagnosis, on="patient_id", how="left")
df_heart_exams_pivot = df_heart_exams.pivot(index="patient_id", columns="measurement", values="value").reset_index()
df_fusionne = df_fusionne.merge(df_heart_exams_pivot, on="patient_id", how="left")

In [24]:
df_heart_exams_pivot

measurement,patient_id,ST.slope,chest.pain.type,exercise.angina,max.heart.rate,oldpeak,resting.ecg
0,PAT00001,1.0,2.0,0.0,172.0,0.0,0.0
1,PAT00002,2.0,3.0,0.0,156.0,1.0,0.0
2,PAT00003,1.0,2.0,0.0,98.0,0.0,1.0
3,PAT00004,2.0,4.0,1.0,108.0,1.5,0.0
4,PAT00005,1.0,3.0,0.0,122.0,0.0,0.0
...,...,...,...,...,...,...,...
1185,PAT01186,2.0,1.0,0.0,132.0,1.2,0.0
1186,PAT01187,2.0,4.0,0.0,141.0,3.4,0.0
1187,PAT01188,2.0,4.0,1.0,115.0,1.2,0.0
1188,PAT01189,2.0,2.0,0.0,174.0,0.0,2.0


In [25]:
df_fusionne

Unnamed: 0,center_id,patient_id,sex,weight,height,resting.bp.s,cholesterol,fasting.blood.sugar,physical_activity,age,target,ST.slope,chest.pain.type,exercise.angina,max.heart.rate,oldpeak,resting.ecg
0,HOSPIT4224,PAT00001,1.0,49.9,174.0,140,289,0,0.0,41,,1.0,2.0,0.0,172.0,0.0,0.0
1,HOSPIT4224,PAT00002,0.0,65.3,180.0,160,180,0,0.0,50,,2.0,3.0,0.0,156.0,1.0,0.0
2,HOSPIT4224,PAT00003,1.0,65.5,,130,283,0,1.0,38,,1.0,2.0,0.0,98.0,0.0,1.0
3,HOSPIT4224,PAT00004,0.0,77.9,160.0,138,214,0,0.0,49,,2.0,4.0,1.0,108.0,1.5,0.0
4,HOSPIT4224,PAT00005,1.0,98.3,175.0,150,195,0,0.0,55,,1.0,3.0,0.0,122.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,HOSPIT4224,PAT01186,1.0,84.3,185.0,110,264,0,0.0,46,,2.0,1.0,0.0,132.0,1.2,0.0
1186,HOSPIT4224,PAT01187,1.0,75.8,171.0,144,193,1,0.0,69,,2.0,4.0,0.0,141.0,3.4,0.0
1187,HOSPIT4224,PAT01188,1.0,71.9,185.0,130,131,0,0.0,58,,2.0,4.0,1.0,115.0,1.2,0.0
1188,HOSPIT4224,PAT01189,0.0,56.4,189.0,130,236,0,0.0,58,,2.0,2.0,0.0,174.0,0.0,2.0


In [26]:
# 4. Normalisation des colonnes numériques
numerical_cols = df_fusionne.select_dtypes(include=['int64', 'float64']).columns.tolist()
scaler = MinMaxScaler()
df_fusionne[numerical_cols] = scaler.fit_transform(df_fusionne[numerical_cols])

In [30]:
catalogue = DataCatalogue(df_fusionne)
catalogue.generate_catalogue()

NameError: name 'DataCatalogue' is not defined

In [28]:
df_fusionne.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   center_id            1190 non-null   object 
 1   patient_id           1190 non-null   object 
 2   sex                  1186 non-null   float64
 3   weight               1120 non-null   float64
 4   height               1136 non-null   float64
 5   resting.bp.s         1190 non-null   float64
 6   cholesterol          1190 non-null   float64
 7   fasting.blood.sugar  1190 non-null   float64
 8   physical_activity    1185 non-null   float64
 9   age                  1190 non-null   int32  
 10  target               0 non-null      object 
 11  ST.slope             1190 non-null   float64
 12  chest.pain.type      1190 non-null   float64
 13  exercise.angina      1190 non-null   float64
 14  max.heart.rate       1190 non-null   float64
 15  oldpeak              1190 non-null   f

### Cette fonction utilise GridSearchCV pour tester différentes combinaisons d'hyperparamètres et trouver les meilleures valeurs.