In [67]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

In [68]:
# Load the raw dataset; later cells filter and re-encode it under the same name `df`.
df = pd.read_csv("../data/data.csv")

# 1. Nettoyage des valeurs aberrantes pour la colonne BMI

In [69]:
# Keep only the rows whose BMI lies in the plausible range [18, 50], bounds included.
# Rows with a missing BMI fail the test and are dropped as well.
# .copy() detaches the filtered result from the frame it was taken from, so the
# column assignments in the following cells write to an independent DataFrame
# instead of triggering pandas' SettingWithCopyWarning.
df = df[df['BMI'].between(18, 50)].copy()

# 2. Encodage des colonnes catégorielles

In [70]:
# Ordinal encoding of GenHlth: labels listed from worst to best are numbered 1..5.
health_levels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
health_mapping = {level: rank for rank, level in enumerate(health_levels, start=1)}
df['GenHlth'] = df['GenHlth'].map(health_mapping)

In [71]:
# Numeric encoding of the Age brackets: each bracket is replaced by a
# representative age (the midpoint of the bracket).
# The first and last brackets are irregular and are spelled out by hand;
# the five-year brackets '25 to 29' ... '75 to 79' are generated.
age_mapping = {'18 to 24': 21}
age_mapping.update(
    {f'{start} to {start + 4}': start + 2 for start in range(25, 80, 5)}
)
age_mapping['80 or older'] = 85
df['Age'] = df['Age'].map(age_mapping)

In [72]:
# Binary encoding of the target column: 'Non-Diabetic' -> 0, 'Diabetic' -> 1.
diabetes_mapping = dict(zip(('Non-Diabetic', 'Diabetic'), (0, 1)))
target_encoded = df['Diabetes_binary'].map(diabetes_mapping)
df['Diabetes_binary'] = target_encoded

In [73]:
# Preview the first rows after the GenHlth, Age and Diabetes_binary encodings.
df.head()

Unnamed: 0,ID,BMI,PhysHlth,Age,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,...,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,DiffWalk,Sex,Education,Income,Diabetes_binary
0,114414,29.0,0.0,67,0,1,1,0,0,0,...,0,1,1,1,0,0,0,6,7,0
1,168896,32.0,0.0,85,1,1,1,0,0,0,...,0,1,0,1,0,0,0,Some College Degree,4,0
2,68354,25.0,5.0,67,1,0,1,1,0,0,...,0,1,0,2,0,0,1,6,2,0
3,121194,24.0,0.0,85,1,0,1,0,0,0,...,0,1,0,4,0,1,0,Advanced Degree,5,0
4,141150,31.0,0.0,27,0,0,1,0,0,1,...,0,1,0,4,5,0,0,6,6,1


In [74]:
# List the distinct raw Education values before deciding how to encode them.
df["Education"].unique()

array(['6', 'Some College Degree', 'Advanced Degree', 'High School',
       'Elementary', 'Never Attended School'], dtype=object)

In [75]:
# Drop the rows whose Education value is the string '6', the only observed
# value that is not a descriptive label.
# NOTE(review): this discards every such row rather than assigning it a level;
# confirm that '6' really is unusable and not an unlabelled education category.
# .copy() detaches the filtered result so the column assignment below writes to
# an independent DataFrame instead of triggering pandas' SettingWithCopyWarning.
df = df[df['Education'] != '6'].copy()

# Ordinal encoding of the remaining labels, from least (0) to most (4) schooling.
education_mapping = {
    'Never Attended School': 0,
    'Elementary': 1,
    'High School': 2,
    'Some College Degree': 3,
    'Advanced Degree': 4
}

# Replace each label by its rank; a label missing from the mapping would become NaN.
df['Education'] = df['Education'].map(education_mapping)

In [76]:
# Check the distinct Education values again, now that the column has been encoded.
df["Education"].unique()

array([3, 4, 2, 1, 0], dtype=int64)

In [77]:
# Remove the ID column so it is not part of the feature matrix built later.
# The result is reassigned instead of using inplace=True: `df` is the product of
# earlier row filters, and in-place mutation of such a frame is what pandas warns
# about; reassignment also keeps the statement chainable.
df = df.drop(columns=['ID'])

In [78]:
# Preview the first rows after the Education encoding and the removal of ID.
df.head()

Unnamed: 0,BMI,PhysHlth,Age,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,DiffWalk,Sex,Education,Income,Diabetes_binary
1,32.0,0.0,85,1,1,1,0,0,0,0,...,0,1,0,1,0,0,0,3,4,0
3,24.0,0.0,85,1,0,1,0,0,0,0,...,0,1,0,4,0,1,0,4,5,0
5,20.0,0.0,57,0,1,1,1,0,0,0,...,0,1,0,4,3,1,0,3,3,0
6,35.0,0.0,72,1,1,1,1,0,0,0,...,0,1,0,3,0,0,0,4,6,1
8,29.0,2.0,62,1,1,1,1,0,0,1,...,0,1,0,3,0,0,1,3,4,1


# 3. Balancing the target variable

In [79]:
# Show how many rows belong to each class before any resampling.
class_counts = df['Diabetes_binary'].value_counts()
print("Répartition des classes avant équilibrage :", class_counts, sep="\n")

Répartition des classes avant équilibrage :
Diabetes_binary
0    91666
1    18721
Name: count, dtype: int64


In [80]:
# Separate the target column (y) from all remaining columns, which form the
# feature matrix (X).
target_column = 'Diabetes_binary'
y = df[target_column]
X = df.drop(target_column, axis=1)

In [81]:
# Oversample the minority class with SMOTE (fixed seed for reproducibility) so
# that both classes end up with the same number of rows.
# NOTE(review): SMOTE builds synthetic rows by interpolating between existing
# ones, and X contains 0/1 flags and ordinal codes as well as continuous values;
# check what values the synthetic rows carry in those columns and consider a
# variant that handles categorical features (e.g. imblearn's SMOTENC).
# NOTE(review): the whole dataset is resampled here, before any train/test split
# visible in this notebook; confirm downstream evaluation does not use synthetic rows.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Free-standing string literal used as a prose note (it is not assigned to anything).
# English gist: SMOTE generates synthetic minority-class examples by creating
# interpolated points. This can work better than plain random oversampling, but
# model performance must be monitored after applying it.
'''SMOTE génère des exemples synthétiques pour la classe minoritaire en créant des points interpolés. 
Cette approche peut être plus efficace qu'un simple suréchantillonnage aléatoire. Cependant, il est essentiel de surveiller les performances après application.'''

In [82]:
# Reassemble the resampled features and target into a single DataFrame,
# with the target as the last column.
resampled_features = pd.DataFrame(X_resampled, columns=X.columns)
resampled_target = pd.DataFrame(y_resampled, columns=['Diabetes_binary'])
balanced_data = pd.concat([resampled_features, resampled_target], axis=1)

In [83]:
# Show how many rows belong to each class once the data has been resampled.
balanced_counts = balanced_data['Diabetes_binary'].value_counts()
print("\nRépartition des classes après équilibrage :", balanced_counts, sep="\n")


Répartition des classes après équilibrage :
Diabetes_binary
0    91666
1    91666
Name: count, dtype: int64


In [84]:
# Preview the last rows of the cleaned frame `df` (not the resampled `balanced_data`).
df.tail()

Unnamed: 0,BMI,PhysHlth,Age,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,DiffWalk,Sex,Education,Income,Diabetes_binary
194807,30.0,0.0,37,1,0,1,1,0,0,0,...,0,1,1,2,5,0,1,4,7,0
194809,24.0,0.0,42,1,0,1,1,0,1,0,...,0,0,0,4,0,0,1,1,3,0
194810,19.0,0.0,67,1,0,1,1,0,1,1,...,0,1,0,4,0,0,0,3,5,0
194820,28.0,28.0,52,1,1,1,0,0,0,1,...,0,1,0,4,15,1,1,4,6,1
194821,25.0,0.0,62,0,0,1,1,0,0,1,...,0,1,1,2,0,0,0,4,7,0


# Save the cleaned data to the specified path

In [39]:
# Write the cleaned (not resampled) frame to CSV; index=False leaves the row
# index out of the file.
cleaned_data_path = '../data/clean_data.csv'
df.to_csv(cleaned_data_path, index=False)


In [85]:
# Write the balanced dataset to CSV (row index omitted), then print a confirmation message.
balanced_data.to_csv('../data/balanced_clean_data.csv', index=False)
print("\nDataset équilibré enregistré sous le nom 'balanced_clean_data.csv'")


Dataset équilibré enregistré sous le nom 'balanced_clean_data.csv'
