In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [119]:
donnees = pd.read_csv("df_final_assurance.csv")
donnees_originales = donnees.copy()
donnees.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,True,False,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,True,False,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,True,False,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,True,False,0.0,1.0,0.0,0.0


In [120]:
donnees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1337 non-null   int64  
 1   bmi               1337 non-null   float64
 2   children          1337 non-null   int64  
 3   charges           1337 non-null   float64
 4   sex_male          1337 non-null   bool   
 5   smoker_yes        1337 non-null   bool   
 6   region_northeast  1337 non-null   float64
 7   region_northwest  1337 non-null   float64
 8   region_southeast  1337 non-null   float64
 9   region_southwest  1337 non-null   float64
dtypes: bool(2), float64(6), int64(2)
memory usage: 86.3 KB


#### fonction pour tester le modele :

In [121]:
def test_model(model, age, bmi, children, sex_male, smoker_yes, region_northeast, region_northwest, region_southeast, region_southwest, scaler=None) : 
    x = np.array([age, bmi, children, sex_male, smoker_yes, region_northeast, region_northwest, region_southeast, region_southwest]).reshape(1,-1)
    return f"montant de l'assurance : {model.predict(x)[0]}"


#### trainset et testset

In [122]:
X = donnees.drop("charges", axis=1)     # features
y = donnees["charges"]                  # target
# print(X.isna().sum())
# print(y.isna().sum())

# 1. dummy Model : LinearRegression

In [123]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker_yes"])

In [124]:
from sklearn.linear_model import LinearRegression

dummy_model = LinearRegression()
dummy_model.fit(X_train, y_train)

In [125]:
dummy_model.score(X_test, y_test)

0.8265441393970117

#### test 

In [126]:
# test_model(model=dummy_model,age=34,bmi=24.2,children=1,sex_male=1,smoker_yes=0,region_northeast=1,region_northwest=0,region_southeast=0,region_southwest=0)

In [127]:
data = pd.read_csv("dataset_assurance.csv")
data.drop_duplicates(inplace=True)

X = data.drop("charges", axis = 1)
y = data["charges"]
print(data.shape)
data.head()


(1337, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# pre-processing

## encodage 


In [128]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_smoker.classes_ = ["yes", "non"]

encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_sex.classes_ = np.array(['male', 'female'])  # on définit l'ordre

X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
X["sex"] = encodeur_sex.fit_transform(X["sex"])
X.rename(columns={"sex" : "sex_male"}, inplace=True)
X

encodeur_region = OneHotEncoder(sparse_output=False)
region_encodee = encodeur_region.fit_transform(X[["region"]])
region_encodee

df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index = X.index)
df_region_encodee

X = pd.concat([X, df_region_encodee], axis = 1)

# suppression de la colonne region
X.drop("region", axis = 1, inplace=True)
X



Unnamed: 0,age,sex_male,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0.0,0.0,0.0,1.0
1,18,1,33.770,1,0,0.0,0.0,1.0,0.0
2,28,1,33.000,3,0,0.0,0.0,1.0,0.0
3,33,1,22.705,0,0,0.0,1.0,0.0,0.0
4,32,1,28.880,0,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0.0,1.0,0.0,0.0
1334,18,0,31.920,0,0,1.0,0.0,0.0,0.0
1335,18,0,36.850,0,0,0.0,0.0,1.0,0.0
1336,21,0,25.800,0,0,0.0,0.0,0.0,1.0


# 2. model Lasso

split testset et dataset

In [129]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

# X_train

## normalisation

Robuste scaler qui est peu sensible aux valeurs aberrantes car il soustrait les données à la médiane qu'il divise par le IQR

In [130]:
from sklearn.preprocessing import RobustScaler, StandardScaler

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# X_train_stand[0]


### premier model lasso

In [131]:
from sklearn.linear_model import Lasso

lasso1 = Lasso()
lasso1.fit(X_train, y_train)

In [132]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

def metriques(model, X_test, y_test) : 
    pred_y = model.predict(X_test)
    mse = mean_squared_error(y_test, pred_y)
    mae = mean_absolute_error(y_test, pred_y)
    r2 = r2_score(y_test, pred_y)
    med_ae = median_absolute_error(y_test, pred_y)
    
    
    print(f"MSE : {mse}")
    print(f"RMSE : {np.sqrt(mse)}")
    print(f"MAE : {mae}")
    print(f"R2 : {r2}")
    print(f"MedAE : {med_ae}")      # cette métrique est moi sensible aux valeurs aberrantes
    
    return mse, mae, r2, med_ae


In [133]:
def coefficients(model, X) : 
    coefs = model.coef_
    noms_cols = X.columns
    coefs_df = pd.DataFrame({"variables" : noms_cols, "coef" : coefs})
    print("coefficients des variables")
    print(coefs_df)


In [134]:
metriques(model=lasso1, X_test=X_test, y_test=y_test)

MSE : 25129843.60096479
RMSE : 5012.967544375765
MAE : 3495.6195820917465
R2 : 0.8265900228347994
MedAE : 2064.9818309451466


(25129843.60096479,
 3495.6195820917465,
 0.8265900228347994,
 np.float64(2064.9818309451466))

In [135]:
coefficients(model=lasso1, X=X)

coefficients des variables
          variables          coef
0               age   6497.985226
1          sex_male   -191.185448
2               bmi   2814.016726
3          children   1072.366677
4            smoker  23815.897468
5  region_northeast   1049.299565
6  region_northwest    632.450334
7  region_southeast   -138.985357
8  region_southwest    -98.724463


## Sélection du meilleur alpha

avec grid search cv, model lassoCV


In [136]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params = {
    "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_cv = GridSearchCV(lasso1, param_grid=params, cv=5, n_jobs=-1)     # n_jobs pour déterminer le nombre de cœurs CPU utilisés pour exécuter les calculs en parallèle. -1 pour tous les coeurs


In [137]:
lasso_cv.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [138]:
print("meilleurs parametres : ", lasso_cv.best_params_)
lasso_cv.best_estimator_

meilleurs parametres :  {'alpha': 1}


### nouveau model avec le meilleur alpha
il sera le même que le premier, car alpha par défaut=1

In [139]:
lasso2 = Lasso(alpha=1)
lasso2.fit(X_train, y_train)

In [140]:
metriques(model=lasso2, X_test=X_test, y_test=y_test)


MSE : 25129843.60096479
RMSE : 5012.967544375765
MAE : 3495.6195820917465
R2 : 0.8265900228347994
MedAE : 2064.9818309451466


(25129843.60096479,
 3495.6195820917465,
 0.8265900228347994,
 np.float64(2064.9818309451466))

In [141]:
coefficients(model=lasso2, X=X)

coefficients des variables
          variables          coef
0               age   6497.985226
1          sex_male   -191.185448
2               bmi   2814.016726
3          children   1072.366677
4            smoker  23815.897468
5  region_northeast   1049.299565
6  region_northwest    632.450334
7  region_southeast   -138.985357
8  region_southwest    -98.724463


## feature engineering

#### création de polynomes 
PolynomialFeatures

utiliser un modele qui fixe les coefficients à zero lorsque les poids ne sont pas utiles (il fait du feature selection)


In [142]:
from sklearn.preprocessing import PolynomialFeatures


poly = PolynomialFeatures(degree=2, include_bias=False)
# poly = PolynomialFeatures(degree=2)

X_fe = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])
type(X_fe)


numpy.ndarray

#### création de nouvelles variables 

In [143]:
# X_fe = X.copy()
# y_fe = y.copy()

# X_fe["age_carre"] = X_fe["age"]**2  
# X_fe["bmi_carre"] = X_fe["bmi"]**2  
# X_fe["age_bmi"] = X_fe["age"] * X_fe["bmi"]
# X_fe["smoker_bmi"] = X_fe["smoker"] * X_fe["bmi"]
# X_fe["smoker_age"] = X_fe["smoker"] * X_fe["age"]
# X_fe["children_bmi"] = X_fe["children"] * X_fe["bmi"]

en se basant sur une logique métier dans le domaine de l'assurance :

    age_squared :

    L'impact de l'âge sur les prix n'est pas forcément constant
    Le risque santé peut augmenter plus rapidement avec l'âge
    Par exemple : la différence de risque entre 60-61 ans peut être plus importante qu'entre 20-21 ans

    bmi_squared :

    L'impact du BMI sur la santé n'est pas linéaire
    Les risques de santé augmentent plus rapidement quand le BMI devient très élevé
    Par exemple : passer d'un BMI de 35 à 36 est plus risqué que passer de 20 à 21

    smoker_bmi :

    Les fumeurs avec un BMI élevé peuvent présenter des risques plus importants
    Cette interaction est particulièrement pertinente pour les risques cardio-vasculaires

J'ai choisi ces interactions spécifiques car elles ont un sens médical et assurantiel, plutôt que de créer toutes les interactions possibles qui n'auraient pas forcément de justification métier.

In [144]:
# X.shape
X_fe.shape

(1337, 54)

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

### normalisation du nouveau X_fe 

In [146]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [147]:
lasso3 = Lasso()
lasso3.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [148]:
metriques(model=lasso3, X_test=X_test, y_test=y_test)


MSE : 14591951.297023077
RMSE : 3819.941268792372
MAE : 2500.1770729225523
R2 : 0.8993073740771175
MedAE : 1671.673997720989


(14591951.297023077,
 2500.1770729225523,
 0.8993073740771175,
 np.float64(1671.673997720989))

In [149]:
# coefficients(model=lasso3, X=X_fe)

### recherche du meilleur alpha

In [150]:
params2 = {
    "alpha" : [i for i in range(10,100)]
}

lasso_cv2 = GridSearchCV(lasso3, param_grid=params2, cv=5, n_jobs=-1)


In [151]:
lasso_cv2.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [152]:
print("meilleur score : ", lasso_cv2.best_score_)
best_aplha = lasso_cv2.best_params_["alpha"]
print("meilleur alpha : ", best_aplha)
lasso_cv2.best_estimator_

meilleur score :  0.8248768449209383
meilleur alpha :  27


In [153]:
lasso4 = Lasso(alpha=best_aplha)
lasso4.fit(X_train, y_train)

metriques(model=lasso4, X_test=X_test, y_test=y_test)

MSE : 14108468.969603997
RMSE : 3756.1241951783218
MAE : 2438.247766661416
R2 : 0.9026436725710048
MedAE : 1535.9400483577474


  model = cd_fast.enet_coordinate_descent(


(14108468.969603997,
 2438.247766661416,
 0.9026436725710048,
 np.float64(1535.9400483577474))

In [154]:
# coefficients(model=lasso4, X=X_fe)

    R² est passé de 0.83 à 0.90 
    RMSE a baissé de 5013 à 3798 
    MAE a baissé de 3496 à 2476 

    points intéressants :

    smoker_bmi a un fort impact positif (+1438), confirmant que l'effet du tabagisme augmente avec le BMI
    age_squared est positif (+4.16), montrant un effet accéléré avec l'âge
    bmi_squared est négatif (-9.19), suggérant un effet qui ralentit pour les BMI très élevés
    Les interactions créées ont permis de capturer des relations plus complexes

    Changements notables :

    Le coefficient de smoker est devenu négatif car son effet est maintenant capturé via les interactions
    L'effet direct de l'âge est devenu négatif mais est compensé par age_squared

    Ces résultats montrent que l'ajout des interactions était pertinent et a significativement amélioré le modèle tout en gardant son interprétabilité.

# 3. model Ridge

In [155]:
X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [156]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train


array([[-0.84      , -1.        ,  0.09040892, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.        ,  0.08446097, ...,  0.        ,
         0.        ,  1.        ],
       [-0.8       ,  0.        , -0.32773234, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.28      ,  0.        ,  0.80237918, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.8       , -1.        , -0.36758364, ...,  0.        ,
         0.        ,  1.        ],
       [-0.64      ,  0.        , -0.72327138, ...,  0.        ,
         0.        ,  0.        ]], shape=(1136, 9))

In [157]:
from sklearn.linear_model import Ridge

ridge1 = Ridge()
ridge1.fit(X_train, y_train)


In [158]:
metriques(model=ridge1, X_test=X_test, y_test=y_test)


MSE : 25137497.743033744
RMSE : 5013.7309204856365
MAE : 3502.921122421009
R2 : 0.8265372049731974
MedAE : 2102.5924347290374


(25137497.743033744,
 3502.921122421009,
 0.8265372049731974,
 np.float64(2102.5924347290374))

In [159]:
X.columns

Index(['age', 'sex_male', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [160]:
ridge1.coef_
coefficients(model=ridge1, X=X)

coefficients des variables
          variables          coef
0               age   6479.871068
1          sex_male   -189.448806
2               bmi   2810.890201
3          children   1073.667523
4            smoker  23692.184520
5  region_northeast    688.799517
6  region_northwest    271.464553
7  region_southeast   -494.550030
8  region_southwest   -465.714040


## amélioration

recherche du meilleur nombre de cv

In [161]:
params = {
    "alpha" : np.arange(1, 20)
}

ridge_cv = GridSearchCV(estimator=ridge1, param_grid=params, cv=5, n_jobs=-1)
ridge_cv.fit(X_train, y_train)


In [162]:
metriques(model=ridge_cv, X_test=X_test, y_test=y_test)

MSE : 25137497.743033744
RMSE : 5013.7309204856365
MAE : 3502.921122421009
R2 : 0.8265372049731974
MedAE : 2102.5924347290374


(25137497.743033744,
 3502.921122421009,
 0.8265372049731974,
 np.float64(2102.5924347290374))

#### création de nouvelles variables 

In [163]:
X_fe = X.copy()
y_fe = y.copy()

X_fe["age_carre"] = X_fe["age"]**2  
X_fe["bmi_carre"] = X_fe["bmi"]**2  
X_fe["age_bmi"] = X_fe["age"] * X_fe["bmi"]
X_fe["smoker_bmi"] = X_fe["smoker"] * X_fe["bmi"]
X_fe["smoker_age"] = X_fe["smoker"] * X_fe["age"]
X_fe["children_bmi"] = X_fe["children"] * X_fe["bmi"]

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [165]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [166]:
ridge3 = Ridge(alpha=1)
ridge3.fit(X_train, y_train)
metriques(model=ridge3, X_test=X_test, y_test=y_test)
ridge3.score(X_test, y_test)

MSE : 14427977.833823793
RMSE : 3798.4178066431546
MAE : 2465.920009876347
R2 : 0.9004388826913611
MedAE : 1618.8103614039028


0.9004388826913611

# 4. model ElasticNet

In [167]:
from sklearn.linear_model import ElasticNet

elastic1 = ElasticNet(random_state=42)

elastic1.fit(X_train, y_train)

metriques(model=elastic1, X_test=X_test, y_test=y_test)


MSE : 14925589.301651062
RMSE : 3863.3650230920534
MAE : 2459.7247636870975
R2 : 0.8970050852255561
MedAE : 1555.5096864398965


(14925589.301651062,
 2459.7247636870975,
 0.8970050852255561,
 np.float64(1555.5096864398965))

In [168]:
params = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95]
}
elastic_cv = GridSearchCV(elastic1, param_grid=params, cv=5, n_jobs=-1, scoring='r2')
elastic_cv.fit(X_train, y_train)

best_alpha = elastic_cv.best_params_['alpha']
best_l1_ratio = elastic_cv.best_params_['l1_ratio']
final_elastic = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio)
final_elastic.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [169]:
metriques(model=final_elastic, X_test=X_test, y_test=y_test)

MSE : 14495448.772098348
RMSE : 3807.2889005299226
MAE : 2479.0026982624313
R2 : 0.899973295477557
MedAE : 1611.6730951106183


(14495448.772098348,
 2479.0026982624313,
 0.899973295477557,
 np.float64(1611.6730951106183))

In [170]:
# X = X.copy()
# y = y.copy()
# y = np.log(y)
# X["bmi"] = np.log(X["bmi"])


# X_fe["age_carre"] = X["age"]**2  
# X_fe["bmi_carre"] = X["bmi"]**2  
# X_fe["age_bmi"] = X["age"] * X["bmi"]
# X_fe["smoker_bmi"] = X["smoker"] * X["bmi"]
# X_fe["smoker_age"] = X["smoker"] * X["age"]
# X_fe["children_bmi"] = X["children"] * X["bmi"]
# X_fe['children_smoker'] = X['children'] * X['smoker']
# X_fe['bmi_log'] = np.log(X['bmi'])
# X_fe['age_log'] = np.log(X['age'])


# X_train, X_test, y_train, y_test = train_test_split(X_fe, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

# X_fe

In [171]:
X = X.copy()
y = y.copy()
y_fe = np.log(y)


poly = PolynomialFeatures(degree=2, include_bias=False)

X_fe = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

X_fe

array([[19.  ,  0.  , 27.9 , ...,  0.  ,  0.  ,  1.  ],
       [18.  ,  1.  , 33.77, ...,  1.  ,  0.  ,  0.  ],
       [28.  ,  1.  , 33.  , ...,  1.  ,  0.  ,  0.  ],
       ...,
       [18.  ,  0.  , 36.85, ...,  1.  ,  0.  ,  0.  ],
       [21.  ,  0.  , 25.8 , ...,  0.  ,  0.  ,  1.  ],
       [61.  ,  0.  , 29.07, ...,  0.  ,  0.  ,  0.  ]], shape=(1337, 54))

In [172]:
scaler = RobustScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [173]:
# elastic_cv2 = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, random_state=42)
elastic_cv2 = ElasticNet(alpha=0.001, l1_ratio=0.9, random_state=42)

elastic_cv2.fit(X_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [174]:
metriques(model=elastic_cv2, X_test=X_test, y_test=y_test)

MSE : 0.0740747999543564
RMSE : 0.27216686049987127
MAE : 0.17061520030238128
R2 : 0.91315930147267
MedAE : 0.11887631060737824


(0.0740747999543564,
 0.17061520030238128,
 0.91315930147267,
 np.float64(0.11887631060737824))

# groupement des ages et bmi en catégories

In [175]:
def categorie_age(bmi) :
    if bmi < 25:
        return "24 et moins"
    elif bmi < 35:
        return "25-34"
    elif bmi < 45:
        return "35-44"
    elif bmi < 55:
        return "45-54"
    else:
        return "55 et plus"

def categorie_bmi(bmi):
    if bmi < 18.5 :
        return "insuffisance pondérale"
    elif bmi < 24.5:
        return "normal"
    elif bmi < 30:
        return "surpoids"
    elif bmi < 35:
        return "obésité I"
    elif bmi < 40:
        return "obésité II"
    else:
        return "obésité III"
    


In [176]:
X3 = X.copy()
y3 = np.log(y.copy())
X3['bmi_cat'] = X3['bmi'].apply(categorie_bmi)
# X3['age_cat'] = X3['age'].apply(categorie_age)



# X3 = pd.get_dummies(X3, columns=['bmi_cat', 'age_cat'], drop_first=True)
X3 = pd.get_dummies(X3, columns=['bmi_cat'], drop_first=True)
X3["bmi"] = np.log(X3["bmi"])

X3


Unnamed: 0,age,sex_male,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest,bmi_cat_normal,bmi_cat_obésité I,bmi_cat_obésité II,bmi_cat_obésité III,bmi_cat_surpoids
0,19,0,3.328627,0,1,0.0,0.0,0.0,1.0,False,False,False,False,True
1,18,1,3.519573,1,0,0.0,0.0,1.0,0.0,False,True,False,False,False
2,28,1,3.496508,3,0,0.0,0.0,1.0,0.0,False,True,False,False,False
3,33,1,3.122585,0,0,0.0,1.0,0.0,0.0,True,False,False,False,False
4,32,1,3.363149,0,0,0.0,1.0,0.0,0.0,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,1,3.433019,3,0,0.0,1.0,0.0,0.0,False,True,False,False,False
1334,18,0,3.463233,0,0,1.0,0.0,0.0,0.0,False,True,False,False,False
1335,18,0,3.606856,0,0,0.0,0.0,1.0,0.0,False,False,True,False,False
1336,21,0,3.250374,0,0,0.0,0.0,0.0,1.0,False,False,False,False,True


In [177]:
poly = PolynomialFeatures(degree=2, include_bias=False)

X3 = poly.fit_transform(X3)
X_train, X_test, y_train, y_test = train_test_split(X3, y3, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [178]:
lasso5 = Lasso(alpha=0.001, random_state=42)
lasso5.fit(X_train, y_train)

metriques(model=lasso5, X_test=X_test, y_test=y_test)

MSE : 0.07414220008657645
RMSE : 0.27229065368935534
MAE : 0.16235995915264817
R2 : 0.9130802857403774
MedAE : 0.10825565649659197


  model = cd_fast.enet_coordinate_descent(


(0.07414220008657645,
 0.16235995915264817,
 0.9130802857403774,
 np.float64(0.10825565649659197))

In [179]:
elasticnet5 = ElasticNet(random_state=42, alpha=0.001, l1_ratio=0.9)
elasticnet5.fit(X_train, y_train)

metriques(model=elasticnet5, X_test=X_test, y_test=y_test)


MSE : 0.07393427327625193
RMSE : 0.2719085752164722
MAE : 0.1624973901704779
R2 : 0.9133240462292651
MedAE : 0.10954756797313259


  model = cd_fast.enet_coordinate_descent(


(0.07393427327625193,
 0.1624973901704779,
 0.9133240462292651,
 np.float64(0.10954756797313259))

# meilleur résultat obtenu

In [186]:
X3 = X.copy()
y3 = y.copy()
X3["bmi_cat"] = X3["bmi"].apply(categorie_bmi)
# X3.drop("bmi", inplace=True)

X3 = pd.get_dummies(X3, columns=["bmi_cat"], drop_first=True)

poly = PolynomialFeatures(degree=2, include_bias=False)

X3 = poly.fit_transform(X3)
X_train, X_test, y_train, y_test = train_test_split(X3, y3, shuffle=True, train_size=0.85, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# lasso5 = Lasso(alpha=168, random_state=42)
lasso5 = Lasso(alpha=170, random_state=42)
lasso5.fit(X_train, y_train)

print("scores lasso : ")
metriques(model=lasso5, X_test=X_test, y_test=y_test)

elasticnet5 = ElasticNet(random_state=42, alpha=0.05, l1_ratio=0.6)
elasticnet5.fit(X_train, y_train)
elasticnet5.score

print("scores elasticnet : ")
metriques(model=elasticnet5, X_test=X_test, y_test=y_test)



scores lasso : 
MSE : 18285198.237631723
RMSE : 4276.1195303255645
MAE : 2372.2725834212342
R2 : 0.8972071863618539
MedAE : 1449.7046170697308
scores elasticnet : 
MSE : 17668281.21044563
RMSE : 4203.36546239387
MAE : 2374.9308149011754
R2 : 0.9006752721972718
MedAE : 1402.8185419320635


  model = cd_fast.enet_coordinate_descent(


(17668281.21044563,
 2374.9308149011754,
 0.9006752721972718,
 np.float64(1402.8185419320635))

# 5. model SVR (Support Vector Regression)

In [181]:
from sklearn.svm import SVR

