In [8]:
import numpy as np
import pandas as pd 
import matplotlib as plot

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544) 
  
# data (as pandas dataframes) 
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features 
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets 
  

X.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [None]:
X["Gender"] = X["Gender"].str.strip()
X["Gender"] = X["Gender"].map({"Female":0,"Male":1})

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,0,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,1,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,1,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,1,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [14]:
X.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,0,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,1,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,1,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,1,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [15]:
# Encodage binaire (0 et 1) pour les colonnes booléennes
binary_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
binary_mapping = {'Male': 0, 'Female': 1, 'yes': 1, 'no': 0}
for col in binary_columns:
    X.loc[:, col] = X[col].map(binary_mapping)

# Encodage ordinal pour certaines colonnes
ordinal_columns = {
    'CAEC': {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3},
    'CALC': {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3},
    'NObeyesdad': {
        'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3,
        'Obesity_Type_I': 4, 'Obesity_Type_II': 5, 'Obesity_Type_III': 6
    }
}
for col, mapping in ordinal_columns.items():
    if col in X.columns:
        X.loc[:, col] = X[col].map(mapping)
    elif col in y.columns:
        y.loc[:, col] = y[col].map(mapping)

# Vérification de l'existence de la colonne MTRANS avant transformation
if 'MTRANS' in X.columns:
    # One-Hot Encoding manuel pour les variables nominales (MTRANS)
    transport_modes = ['Public_Transportation', 'Walking', 'Automobile', 'Motorbike', 'Bike']
    for mode in transport_modes[1:]:  # Exclure la première catégorie pour éviter le dummy variable trap
        X.loc[:, f'MTRANS_{mode}'] = (X['MTRANS'] == mode).astype(int)
    X.drop(columns=['MTRANS'], inplace=True)
else:
    print("La colonne 'MTRANS' n'existe pas dans X. Vérifiez les données.")

# Sauvegarder les données transformées au format CSV
X.to_csv("X_transformed.csv", index=False)
y.to_csv("y_transformed.csv", index=False)

# Fusionner X et y en un seul tableau
full_data = pd.concat([X, y], axis=1)

# Sauvegarder les données transformées au format CSV
full_data.to_csv("full_data_transformed.csv", index=False)

# Afficher les 5 premières lignes après transformation
print(full_data.head())

# Afficher les 5 premières lignes après transformation
print(X.head())
print(y.head())

  X.loc[:, col] = X[col].map(binary_mapping)


   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0     NaN  21.0    1.62    64.0                              1    0   2.0   
1     NaN  21.0    1.52    56.0                              1    0   3.0   
2     NaN  23.0    1.80    77.0                              1    0   2.0   
3     NaN  27.0    1.80    87.0                              0    0   3.0   
4     NaN  22.0    1.78    89.8                              0    0   2.0   

   NCP CAEC SMOKE  CH2O SCC  FAF  TUE CALC  MTRANS_Walking  MTRANS_Automobile  \
0  3.0    1     0   2.0   0  0.0  1.0    0               0                  0   
1  3.0    1     1   3.0   1  3.0  0.0    1               0                  0   
2  3.0    1     0   2.0   0  2.0  1.0    2               0                  0   
3  3.0    1     0   2.0   0  2.0  0.0    2               1                  0   
4  1.0    1     0   2.0   0  0.0  0.0    1               0                  0   

   MTRANS_Motorbike  MTRANS_Bike NObeyesdad  
0   

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visualisation des outliers à l'aide de boxplots
sns.boxplot(data=data)
plt.show()

ModuleNotFoundError: No module named 'seaborn'