On installe les librairies:

In [None]:
!pip install ipympl
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install seaborn
!pip install seaborn --upgrade

On importe les librairies :

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.preprocessing import *
from sklearn.model_selection import *
# The notebook requires seaborn >= 0.11. Compare (major, minor) as integers:
# the previous float(version[2:]) test read "1.0.0" as 0.0 (spurious warning)
# and raised ValueError on suffixed versions such as "0.12.0rc1".
def _leading_int(text):
    """Return the integer formed by the leading digits of ``text`` (0 if none)."""
    digits = ""
    for char in text:
        if not char.isdigit():
            break
        digits += char
    return int(digits) if digits else 0


_sns_major_minor = tuple(_leading_int(part) for part in sns.__version__.split(".")[:2])
if _sns_major_minor < (0, 11):
    print("Sns version must be superior to 0.11.0 ;your current version:",sns.__version__)

# Default figure size (inches) and resolution for every matplotlib figure below.
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

In [None]:
# Read the comma-separated dataset from the working directory and preview
# its first rows.
dfObesity = pd.read_csv(
    filepath_or_buffer="ObesityDataSet_raw_and_data_sinthetic.csv",
    sep=",",
)
dfObesity.head()

# Préparation des données

## Détection des valeurs non définies (il n'y en a pas)

In [None]:
# Count the missing cells of each column, then report the grand total.
testNA = dfObesity.isna().sum()
print(testNA)
total_missing = testNA.sum()
print("\nIl y a", total_missing, "avec des valeurs NA")

## Compréhension des données

In [None]:
# Display the dtype pandas inferred for each column of the raw frame.
dfObesity.dtypes

On crée un dataset dans lequel les variables catégorielles sont remplacées par des nombres :

In [None]:
# Work on a copy so the raw frame keeps its original string labels.
dfObesity2 = dfObesity.copy()

# Every object-typed column is replaced by the integer codes of its categories.
object_columns = [
    name for name in dfObesity2.columns.to_numpy()
    if dfObesity2[name].dtype == "object"
]
for name in object_columns:
    as_category = dfObesity2[name].astype("category")
    dfObesity2[name] = as_category.cat.codes

dfObesity2.dtypes

In [None]:
# Summary statistics of the encoded frame, one row per column.
dfObesity2.describe().transpose()

In [None]:
# Switch seaborn to the "ticks" style, then draw all pairwise relationships
# of the encoded frame, coloured by the NObeyesdad column.
sns.set(style="ticks")
sns.pairplot(data=dfObesity2, hue="NObeyesdad")

Il semble que l'âge n'ait pas beaucoup d'influence sur le type d'obésité :

In [None]:
# Age of each respondent against the obesity class.
sns.scatterplot(data=dfObesity, x="Age", y="NObeyesdad");

In [None]:
# Filled kernel-density estimate of Age, one curve per obesity class.
sns.displot(data=dfObesity, x="Age", hue="NObeyesdad", kind="kde", fill=True)

Lien entre la taille et le type d'obésité:

In [None]:
# Height of each respondent against the obesity class.
sns.scatterplot(data=dfObesity, x="Height", y="NObeyesdad");

In [None]:
# Filled kernel-density estimate of Height, one curve per obesity class.
sns.displot(data=dfObesity, x="Height", hue="NObeyesdad", kind="kde", fill=True)

Lien entre le Poids et le type d'obésité:

In [None]:
# Weight of each respondent against the obesity class.
sns.scatterplot(data=dfObesity, x="Weight", y="NObeyesdad");

In [None]:
# Filled kernel-density estimate of Weight, one curve per obesity class.
sns.displot(data=dfObesity, x="Weight", hue="NObeyesdad", kind="kde", fill=True)

### Création de nouvelles variables:

On peut voir que le poids et la taille sont liés quasi linéairement :

In [None]:
# Weight against Height, coloured by obesity class: first as raw points,
# then as a filled bivariate density in a separate figure.
sns.scatterplot(data=dfObesity, x="Height", y="Weight", hue="NObeyesdad");
sns.displot(
    data=dfObesity,
    x="Height",
    y="Weight",
    hue="NObeyesdad",
    kind="kde",
    fill=True,
)

### On peut lier le poids et la taille de différentes façons :

En utilisant l'IMC, qui lie le poids et la taille: Poids/Taille²

In [None]:
# IMC (body-mass index) = Weight / Height², stored as a new column of the raw frame.
height_squared = dfObesity["Height"].pow(2)
dfObesity["IMC"] = dfObesity["Weight"].div(height_squared)
sns.scatterplot(data=dfObesity, x="IMC", y="NObeyesdad");

In [None]:
# Filled kernel-density estimate of IMC, one curve per obesity class.
sns.displot(data=dfObesity, x="IMC", hue="NObeyesdad", kind="kde", fill=True)

On peut voir que cela sépare déjà très bien les différents types d'obésité.

### En utilisant une regression linéaire:

In [None]:
from sklearn.linear_model import *

# Fit one Weight ~ Height line per obesity class and collect the slopes.
lCoef = []
for label in dfObesity["NObeyesdad"].unique():
    in_class = dfObesity["NObeyesdad"] == label
    x = dfObesity["Height"][in_class].to_numpy()
    y = dfObesity["Weight"][in_class].to_numpy()
    # scikit-learn expects 2-D inputs: one column, one row per sample.
    reg = LinearRegression().fit(x.reshape(-1, 1), y.reshape(-1, 1))
    lCoef.append(reg.coef_[0][0])
    plt.plot(x, y, "o")

# Open question: is it better to average the slopes directly, or to average
# the slope angles and convert back? Both are computed for comparison.
meanCoef1 = np.mean(lCoef)
slope_angles = np.arctan(lCoef)
meanCoef2 = np.tan(np.mean(slope_angles))
print("Coefficient directeur moyen:", meanCoef1, meanCoef2)

# Draw both average slopes across the observed height range.
# NOTE(review): the -90 intercept is a fixed constant, apparently hand-picked
# for display only — it is not used in the features computed below.
height_span = np.array([np.amin(dfObesity["Height"]), np.amax(dfObesity["Height"])])
for slope, colour in ((meanCoef1, "b"), (meanCoef2, "r")):
    plt.plot(height_span, height_span * slope - 90, colour)

# New features: weight minus the part predicted by height, once per average slope.
dfObesity["Regression1"] = dfObesity["Weight"] - dfObesity["Height"] * meanCoef1
dfObesity["Regression2"] = dfObesity["Weight"] - dfObesity["Height"] * meanCoef2

Nous allons maintenant voir quelle est la meilleure des 3 variables créées (IMC, Regression1 et Regression2) :

In [None]:


# Standardise each candidate feature (subtract its mean, divide by its
# standard deviation) so the three can share one x-axis.
candidates = ["IMC", "Regression1", "Regression2"]
for feature in candidates:
    column = dfObesity[feature]
    dfObesity[feature + "_Test"] = (column - column.mean()) / column.std()

# Overlay the three standardised features against the obesity class.
for feature in candidates:
    sns.scatterplot(data=dfObesity, x=feature + "_Test", y="NObeyesdad")
plt.legend(labels=['IMC', 'Regression1', 'Regression2'])

In [None]:
# One filled KDE figure per candidate feature, split by obesity class.
sns.displot(data=dfObesity, x="IMC", hue="NObeyesdad", kind="kde", fill=True)
sns.displot(data=dfObesity, x="Regression1", hue="NObeyesdad", kind="kde", fill=True)
sns.displot(data=dfObesity, x="Regression2", hue="NObeyesdad", kind="kde", fill=True)

# Modélisation et Prédiction:

## Utiliser IMC

In [None]:
def IMC(x):
    """Map a body-mass-index value to a weight-class label.

    Each threshold is an exclusive upper bound: the label of the first bound
    that ``x`` is strictly below is returned.

    Args:
        x: numeric IMC value.

    Returns:
        The class label as a string, or ``None`` when ``x`` is below none of
        the bounds (positive infinity or NaN).
    """
    upper_bounds = (
        (18.5, "Insufficient_Weight"),
        (25, "Normal_Weight"),
        (30, "Overweight_Level_I"),
        (35, "Overweight_Level_II"),
        (40, "Obesity_Type_I"),
        (float('inf'), "Obesity_Type_II"),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return None

# Predict a class for every row from its IMC alone.
dfObesity["IMCPrediction"] = [IMC(x) for x in dfObesity['IMC']]

# Row/column order of the confusion matrix (rows = true class, columns = prediction).
# NOTE(review): only these six labels are counted; rows of NObeyesdad holding
# any other class are ignored by confusion_matrix — confirm this is intended.
imc_labels = ["Insufficient_Weight", "Normal_Weight", "Overweight_Level_I",
              "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II"]
cm = confusion_matrix(dfObesity["NObeyesdad"], dfObesity["IMCPrediction"], labels=imc_labels)
print(cm)

# The matrix image and the categorical scatter use unrelated coordinate
# systems, so each gets its own axes instead of being drawn on top of each other.
fig, (ax_cm, ax_scatter) = plt.subplots(1, 2, figsize=(18, 7))

ax_cm.imshow(cm)
ax_cm.set_xticks(range(len(imc_labels)))
ax_cm.set_xticklabels(imc_labels, rotation=45, ha="right")
ax_cm.set_yticks(range(len(imc_labels)))
ax_cm.set_yticklabels(imc_labels)
ax_cm.set_xlabel("IMCPrediction")
ax_cm.set_ylabel("NObeyesdad")

sns.scatterplot(x="NObeyesdad", y="IMCPrediction", hue='NObeyesdad', data=dfObesity, ax=ax_scatter)
ax_scatter.tick_params(axis="x", rotation=45)
fig.tight_layout()

In [None]:
# 2-D histogram of the true class against the IMC-based prediction.
sns.displot(data=dfObesity, x="NObeyesdad", y="IMCPrediction", hue="NObeyesdad")

In [None]:
# Share of rows whose IMC-based prediction equals the true class.
is_correct = dfObesity["NObeyesdad"] == dfObesity["IMCPrediction"]
result = is_correct.sum() / len(dfObesity["IMCPrediction"])
print("Il y a", round(result * 100), "% de bons résultats !")

Nous allons essayer de modifier manuellement les seuils de l'IMC, pour voir si nous pouvons obtenir de meilleurs résultats :

In [None]:
def IMC2(x):
    """Map a body-mass-index value to a weight-class label (adjusted thresholds).

    Same labels as ``IMC`` but with the upper bounds 18.5 / 25 / 27 / 30 / 35.
    Each bound is exclusive; the first one that ``x`` is strictly below wins.

    Args:
        x: numeric IMC value.

    Returns:
        The class label as a string, or ``None`` when ``x`` is below none of
        the bounds (positive infinity or NaN).
    """
    if x < 18.5:
        return "Insufficient_Weight"
    if x < 25:
        return "Normal_Weight"
    if x < 27:
        return "Overweight_Level_I"
    if x < 30:
        return "Overweight_Level_II"
    if x < 35:
        return "Obesity_Type_I"
    if x < float('inf'):
        return "Obesity_Type_II"
    return None

# Predict a class for every row from its IMC, using the adjusted thresholds.
dfObesity["IMCPrediction2"] = [IMC2(x) for x in dfObesity['IMC']]

# Row/column order of the confusion matrix (rows = true class, columns = prediction).
# NOTE(review): only these six labels are counted; rows of NObeyesdad holding
# any other class are ignored by confusion_matrix — confirm this is intended.
imc2_labels = ["Insufficient_Weight", "Normal_Weight", "Overweight_Level_I",
               "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II"]
cm = confusion_matrix(dfObesity["NObeyesdad"], dfObesity["IMCPrediction2"], labels=imc2_labels)

# The matrix image and the categorical scatter use unrelated coordinate
# systems, so each gets its own axes instead of being drawn on top of each other.
fig, (ax_cm, ax_scatter) = plt.subplots(1, 2, figsize=(18, 7))

ax_cm.imshow(cm)
ax_cm.set_xticks(range(len(imc2_labels)))
ax_cm.set_xticklabels(imc2_labels, rotation=45, ha="right")
ax_cm.set_yticks(range(len(imc2_labels)))
ax_cm.set_yticklabels(imc2_labels)
ax_cm.set_xlabel("IMCPrediction2")
ax_cm.set_ylabel("NObeyesdad")

sns.scatterplot(x="NObeyesdad", y="IMCPrediction2", hue='NObeyesdad', data=dfObesity, ax=ax_scatter)
ax_scatter.tick_params(axis="x", rotation=45)
fig.tight_layout()

In [None]:
# 2-D histogram of the true class against the adjusted-threshold prediction.
# The column is named "IMCPrediction2" where it is created; the previous
# "IMC2Prediction" does not exist in the frame.
sns.displot(x="NObeyesdad", y="IMCPrediction2",hue='NObeyesdad',data=dfObesity)

In [None]:
# Share of rows whose adjusted-threshold prediction equals the true class.
result=sum(dfObesity["NObeyesdad"]==dfObesity["IMCPrediction2"])/len(dfObesity["IMCPrediction2"])
# The value is a percentage: print the "%" sign, as the first IMC report does.
print("Il y a",round(result*100),"% de bons résultats !")

## Modélisation KNN

In [None]:
# K-nearest-neighbours classifier on the encoded original features.
#
# Fixes compared with the previous version of this cell:
# * Target leakage: by this point dfObesity has gained derived columns (IMC,
#   Regression*, *_Test, IMCPrediction, IMCPrediction2), so `iloc[:, :-1]`
#   fed column 16 — the very column used as `y` — and the IMC-based
#   predictions to the model as features. Features and target are now
#   selected by name from dfObesity2, the encoded copy of the raw columns.
#   (Assumes column 16 of the CSV is NObeyesdad, the target used by every
#   other cell — confirm.)
# * Continuous columns are no longer replaced by category codes; only the
#   object columns were encoded when dfObesity2 was built.
# * The train/test split is seeded so the reported figures are reproducible.
# * The K sweep stores accuracies, so names and plot labels now say so.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

df = dfObesity2.copy()
title_cat2 = df.columns.drop("NObeyesdad")  # feature column names

X = df[title_cat2].values
y = df["NObeyesdad"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

knn_cm = confusion_matrix(y_test, y_pred)
print(knn_cm)
plt.imshow(knn_cm)
print(classification_report(y_test, y_pred))

# Test-set accuracy for K = 1..24.
# NOTE: the best K is chosen on the test split itself, so the figure printed
# at the end is an optimistic estimate.
k_values = range(1, 25)
accuracy_by_k = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    accuracy_by_k.append(np.mean(pred_k == y_test))
error = accuracy_by_k  # legacy name kept for compatibility; it holds accuracies

plt.figure(figsize=(15, 6))
plt.plot(k_values, accuracy_by_k, marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K Value')
plt.ylabel('Mean success')

print("Il y a",round(max(accuracy_by_k)*100),"% de bons résultats !")

## RandomForest:

In [None]:
# Random forest on the encoded frame: NObeyesdad is the target, every other
# column of dfObesity2 is a feature.
y = dfObesity2["NObeyesdad"]
X = dfObesity2.drop("NObeyesdad", axis=1)

# Seed the split as well as the forest; with an unseeded split the metrics
# below changed on every run even though the model itself was seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

rfc = RandomForestClassifier(n_estimators=500, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Held-out evaluation: confusion matrix, per-class report, overall accuracy.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
